Bug Summary

File: build/source/llvm/lib/Target/X86/X86ISelLowering.cpp
Warning: line 17608, column 31
Division by zero

Annotated Source Code

clang -cc1 -cc1 -triple x86_64-pc-linux-gnu -analyze -disable-free -clear-ast-before-backend -disable-llvm-verifier -discard-value-names -main-file-name X86ISelLowering.cpp -analyzer-checker=core -analyzer-checker=apiModeling -analyzer-checker=unix -analyzer-checker=deadcode -analyzer-checker=cplusplus -analyzer-checker=security.insecureAPI.UncheckedReturn -analyzer-checker=security.insecureAPI.getpw -analyzer-checker=security.insecureAPI.gets -analyzer-checker=security.insecureAPI.mktemp -analyzer-checker=security.insecureAPI.mkstemp -analyzer-checker=security.insecureAPI.vfork -analyzer-checker=nullability.NullPassedToNonnull -analyzer-checker=nullability.NullReturnedFromNonnull -analyzer-output plist -w -setup-static-analyzer -analyzer-config-compatibility-mode=true -mrelocation-model pic -pic-level 2 -mframe-pointer=none -fmath-errno -ffp-contract=on -fno-rounding-math -mconstructor-aliases -funwind-tables=2 -target-cpu x86-64 -tune-cpu generic -debugger-tuning=gdb -ffunction-sections -fdata-sections -fcoverage-compilation-dir=/build/source/build-llvm -resource-dir /usr/lib/llvm-17/lib/clang/17 -I lib/Target/X86 -I /build/source/llvm/lib/Target/X86 -I include -I /build/source/llvm/include -D _DEBUG -D _GLIBCXX_ASSERTIONS -D _GNU_SOURCE -D _LIBCPP_ENABLE_ASSERTIONS -D __STDC_CONSTANT_MACROS -D __STDC_FORMAT_MACROS -D __STDC_LIMIT_MACROS -D _FORTIFY_SOURCE=2 -D NDEBUG -U NDEBUG -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/10/../../../../include/c++/10 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/10/../../../../include/x86_64-linux-gnu/c++/10 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/10/../../../../include/c++/10/backward -internal-isystem /usr/lib/llvm-17/lib/clang/17/include -internal-isystem /usr/local/include -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/10/../../../../x86_64-linux-gnu/include -internal-externc-isystem /usr/include/x86_64-linux-gnu -internal-externc-isystem /include -internal-externc-isystem /usr/include -fmacro-prefix-map=/build/source/build-llvm=build-llvm -fmacro-prefix-map=/build/source/= -fcoverage-prefix-map=/build/source/build-llvm=build-llvm -fcoverage-prefix-map=/build/source/= -O3 -Wno-unused-command-line-argument -Wno-unused-parameter -Wwrite-strings -Wno-missing-field-initializers -Wno-long-long -Wno-maybe-uninitialized -Wno-class-memaccess -Wno-redundant-move -Wno-pessimizing-move -Wno-noexcept-type -Wno-comment -Wno-misleading-indentation -std=c++17 -fdeprecated-macro -fdebug-compilation-dir=/build/source/build-llvm -fdebug-prefix-map=/build/source/build-llvm=build-llvm -fdebug-prefix-map=/build/source/= -fdebug-prefix-map=/build/source/build-llvm=build-llvm -fdebug-prefix-map=/build/source/= -ferror-limit 19 -fvisibility=hidden -fvisibility-inlines-hidden -stack-protector 2 -fgnuc-version=4.2.1 -fcolor-diagnostics -vectorize-loops -vectorize-slp -analyzer-output=html -analyzer-config stable-report-filename=true -faddrsig -D__GCC_HAVE_DWARF2_CFI_ASM=1 -o /tmp/scan-build-2023-04-16-131055-16441-1 -x c++ /build/source/llvm/lib/Target/X86/X86ISelLowering.cpp
1//===-- X86ISelLowering.cpp - X86 DAG Lowering Implementation -------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file defines the interfaces that X86 uses to lower LLVM code into a
10// selection DAG.
11//
12//===----------------------------------------------------------------------===//
13
14#include "X86ISelLowering.h"
15#include "MCTargetDesc/X86ShuffleDecode.h"
16#include "X86.h"
17#include "X86CallingConv.h"
18#include "X86FrameLowering.h"
19#include "X86InstrBuilder.h"
20#include "X86IntrinsicsInfo.h"
21#include "X86MachineFunctionInfo.h"
22#include "X86TargetMachine.h"
23#include "X86TargetObjectFile.h"
24#include "llvm/ADT/SmallBitVector.h"
25#include "llvm/ADT/SmallSet.h"
26#include "llvm/ADT/Statistic.h"
27#include "llvm/ADT/StringExtras.h"
28#include "llvm/ADT/StringSwitch.h"
29#include "llvm/Analysis/BlockFrequencyInfo.h"
30#include "llvm/Analysis/ObjCARCUtil.h"
31#include "llvm/Analysis/ProfileSummaryInfo.h"
32#include "llvm/Analysis/VectorUtils.h"
33#include "llvm/CodeGen/IntrinsicLowering.h"
34#include "llvm/CodeGen/MachineFrameInfo.h"
35#include "llvm/CodeGen/MachineFunction.h"
36#include "llvm/CodeGen/MachineInstrBuilder.h"
37#include "llvm/CodeGen/MachineJumpTableInfo.h"
38#include "llvm/CodeGen/MachineLoopInfo.h"
39#include "llvm/CodeGen/MachineModuleInfo.h"
40#include "llvm/CodeGen/MachineRegisterInfo.h"
41#include "llvm/CodeGen/TargetLowering.h"
42#include "llvm/CodeGen/WinEHFuncInfo.h"
43#include "llvm/IR/CallingConv.h"
44#include "llvm/IR/Constants.h"
45#include "llvm/IR/DerivedTypes.h"
46#include "llvm/IR/DiagnosticInfo.h"
47#include "llvm/IR/EHPersonalities.h"
48#include "llvm/IR/Function.h"
49#include "llvm/IR/GlobalAlias.h"
50#include "llvm/IR/GlobalVariable.h"
51#include "llvm/IR/IRBuilder.h"
52#include "llvm/IR/Instructions.h"
53#include "llvm/IR/Intrinsics.h"
54#include "llvm/IR/PatternMatch.h"
55#include "llvm/MC/MCAsmInfo.h"
56#include "llvm/MC/MCContext.h"
57#include "llvm/MC/MCExpr.h"
58#include "llvm/MC/MCSymbol.h"
59#include "llvm/Support/CommandLine.h"
60#include "llvm/Support/Debug.h"
61#include "llvm/Support/ErrorHandling.h"
62#include "llvm/Support/KnownBits.h"
63#include "llvm/Support/MathExtras.h"
64#include "llvm/Target/TargetOptions.h"
65#include <algorithm>
66#include <bitset>
67#include <cctype>
68#include <numeric>
69using namespace llvm;
70
71#define DEBUG_TYPE "x86-isel"
72
73STATISTIC(NumTailCalls, "Number of tail calls");
74
75static cl::opt<int> ExperimentalPrefInnermostLoopAlignment(
76 "x86-experimental-pref-innermost-loop-alignment", cl::init(4),
77 cl::desc(
78 "Sets the preferable loop alignment for experiments (as log2 bytes) "
79 "for innermost loops only. If specified, this option overrides "
80 "alignment set by x86-experimental-pref-loop-alignment."),
81 cl::Hidden);
82
83static cl::opt<bool> MulConstantOptimization(
84 "mul-constant-optimization", cl::init(true),
85 cl::desc("Replace 'mul x, Const' with more effective instructions like "
86 "SHIFT, LEA, etc."),
87 cl::Hidden);
88
89static cl::opt<bool> ExperimentalUnorderedISEL(
90 "x86-experimental-unordered-atomic-isel", cl::init(false),
91 cl::desc("Use LoadSDNode and StoreSDNode instead of "
92 "AtomicSDNode for unordered atomic loads and "
93 "stores respectively."),
94 cl::Hidden);
95
96/// Call this when the user attempts to do something unsupported, like
97/// returning a double without SSE2 enabled on x86_64. This is not fatal, unlike
98/// report_fatal_error, so calling code should attempt to recover without
99/// crashing.
100static void errorUnsupported(SelectionDAG &DAG, const SDLoc &dl,
101 const char *Msg) {
102 MachineFunction &MF = DAG.getMachineFunction();
103 DAG.getContext()->diagnose(
104 DiagnosticInfoUnsupported(MF.getFunction(), Msg, dl.getDebugLoc()));
105}
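A hedged usage sketch for the errorUnsupported helper above (a hypothetical caller, not part of this file; the subtarget check, message text, and return value are illustrative): the lowering code reports the diagnostic and then recovers with a placeholder instead of crashing.

    // Diagnose the unsupported case, then keep lowering with a safe
    // placeholder value rather than aborting (illustrative only).
    if (!Subtarget.hasSSE2()) {
      errorUnsupported(DAG, dl, "SSE2 register return with SSE2 disabled");
      return DAG.getUNDEF(VT); // recover with an undef result
    }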
106
107/// Returns true if a CC can dynamically exclude a register from the list of
108/// callee-saved-registers (TargetRegisterInfo::getCalleeSavedRegs()) based on
109/// the return registers.
110static bool shouldDisableRetRegFromCSR(CallingConv::ID CC) {
111 switch (CC) {
112 default:
113 return false;
114 case CallingConv::X86_RegCall:
115 case CallingConv::PreserveMost:
116 case CallingConv::PreserveAll:
117 return true;
118 }
119}
120
121/// Returns true if a CC can dynamically exclude a register from the list of
122/// callee-saved-registers (TargetRegisterInfo::getCalleeSavedRegs()) based on
123/// the parameters.
124static bool shouldDisableArgRegFromCSR(CallingConv::ID CC) {
125 return CC == CallingConv::X86_RegCall;
126}
127
128X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
129 const X86Subtarget &STI)
130 : TargetLowering(TM), Subtarget(STI) {
131 bool UseX87 = !Subtarget.useSoftFloat() && Subtarget.hasX87();
132 MVT PtrVT = MVT::getIntegerVT(TM.getPointerSizeInBits(0));
133
134 // Set up the TargetLowering object.
135
136 // X86 is weird. It always uses i8 for shift amounts and setcc results.
137 setBooleanContents(ZeroOrOneBooleanContent);
138 // X86-SSE is even stranger. It uses -1 or 0 for vector masks.
139 setBooleanVectorContents(ZeroOrNegativeOneBooleanContent);
140
141 // For 64-bit, since we have so many registers, use the ILP scheduler.
142 // For 32-bit, use the register pressure specific scheduling.
143 // For Atom, always use ILP scheduling.
144 if (Subtarget.isAtom())
145 setSchedulingPreference(Sched::ILP);
146 else if (Subtarget.is64Bit())
147 setSchedulingPreference(Sched::ILP);
148 else
149 setSchedulingPreference(Sched::RegPressure);
150 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
151 setStackPointerRegisterToSaveRestore(RegInfo->getStackRegister());
152
153 // Bypass expensive divides and use cheaper ones.
154 if (TM.getOptLevel() >= CodeGenOpt::Default) {
155 if (Subtarget.hasSlowDivide32())
156 addBypassSlowDiv(32, 8);
157 if (Subtarget.hasSlowDivide64() && Subtarget.is64Bit())
158 addBypassSlowDiv(64, 32);
159 }
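A hedged sketch of what addBypassSlowDiv(32, 8) asks for (illustrative only; the actual rewrite is performed later by the slow-division bypass transform, not by this constructor): each 32-bit divide gets a cheap width test and uses an 8-bit divide when both operands fit.

    // Conceptually, a 32-bit divide is guarded like this when bypassing is
    // enabled; values that fit in 8 bits take the cheap narrow divide.
    uint32_t bypassedDiv(uint32_t a, uint32_t b) {
      if (((a | b) & 0xFFFFFF00u) == 0)           // both operands fit in 8 bits
        return uint32_t(uint8_t(a) / uint8_t(b)); // cheap 8-bit divide
      return a / b;                               // full-width divide
    }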
160
161 // Setup Windows compiler runtime calls.
162 if (Subtarget.isTargetWindowsMSVC() || Subtarget.isTargetWindowsItanium()) {
163 static const struct {
164 const RTLIB::Libcall Op;
165 const char * const Name;
166 const CallingConv::ID CC;
167 } LibraryCalls[] = {
168 { RTLIB::SDIV_I64, "_alldiv", CallingConv::X86_StdCall },
169 { RTLIB::UDIV_I64, "_aulldiv", CallingConv::X86_StdCall },
170 { RTLIB::SREM_I64, "_allrem", CallingConv::X86_StdCall },
171 { RTLIB::UREM_I64, "_aullrem", CallingConv::X86_StdCall },
172 { RTLIB::MUL_I64, "_allmul", CallingConv::X86_StdCall },
173 };
174
175 for (const auto &LC : LibraryCalls) {
176 setLibcallName(LC.Op, LC.Name);
177 setLibcallCallingConv(LC.Op, LC.CC);
178 }
179 }
180
181 if (Subtarget.getTargetTriple().isOSMSVCRT()) {
182 // MSVCRT doesn't have powi; fall back to pow
183 setLibcallName(RTLIB::POWI_F32, nullptr);
184 setLibcallName(RTLIB::POWI_F64, nullptr);
185 }
186
187 // If we don't have cmpxchg8b (meaning this is a 386/486), limit atomic size to
188 // 32 bits so the AtomicExpandPass will expand it so we don't need cmpxchg8b.
189 // FIXME: Should we be limiting the atomic size on other configs? Default is
190 // 1024.
191 if (!Subtarget.canUseCMPXCHG8B())
192 setMaxAtomicSizeInBitsSupported(32);
193
194 setMaxDivRemBitWidthSupported(Subtarget.is64Bit() ? 128 : 64);
195
196 setMaxLargeFPConvertBitWidthSupported(128);
197
198 // Set up the register classes.
199 addRegisterClass(MVT::i8, &X86::GR8RegClass);
200 addRegisterClass(MVT::i16, &X86::GR16RegClass);
201 addRegisterClass(MVT::i32, &X86::GR32RegClass);
202 if (Subtarget.is64Bit())
203 addRegisterClass(MVT::i64, &X86::GR64RegClass);
204
205 for (MVT VT : MVT::integer_valuetypes())
206 setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote);
207
208 // We don't accept any truncstore of integer registers.
209 setTruncStoreAction(MVT::i64, MVT::i32, Expand);
210 setTruncStoreAction(MVT::i64, MVT::i16, Expand);
211 setTruncStoreAction(MVT::i64, MVT::i8 , Expand);
212 setTruncStoreAction(MVT::i32, MVT::i16, Expand);
213 setTruncStoreAction(MVT::i32, MVT::i8 , Expand);
214 setTruncStoreAction(MVT::i16, MVT::i8, Expand);
215
216 setTruncStoreAction(MVT::f64, MVT::f32, Expand);
217
218 // SETOEQ and SETUNE require checking two conditions.
219 for (auto VT : {MVT::f32, MVT::f64, MVT::f80}) {
220 setCondCodeAction(ISD::SETOEQ, VT, Expand);
221 setCondCodeAction(ISD::SETUNE, VT, Expand);
222 }
223
224 // Integer absolute.
225 if (Subtarget.canUseCMOV()) {
226 setOperationAction(ISD::ABS , MVT::i16 , Custom);
227 setOperationAction(ISD::ABS , MVT::i32 , Custom);
228 if (Subtarget.is64Bit())
229 setOperationAction(ISD::ABS , MVT::i64 , Custom);
230 }
231
232 // Signed saturation subtraction.
233 setOperationAction(ISD::SSUBSAT , MVT::i8 , Custom);
234 setOperationAction(ISD::SSUBSAT , MVT::i16 , Custom);
235 setOperationAction(ISD::SSUBSAT , MVT::i32 , Custom);
236 if (Subtarget.is64Bit())
237 setOperationAction(ISD::SSUBSAT , MVT::i64 , Custom);
238
239 // Funnel shifts.
240 for (auto ShiftOp : {ISD::FSHL, ISD::FSHR}) {
241 // For slow shld targets we only lower for code size.
242 LegalizeAction ShiftDoubleAction = Subtarget.isSHLDSlow() ? Custom : Legal;
243
244 setOperationAction(ShiftOp , MVT::i8 , Custom);
245 setOperationAction(ShiftOp , MVT::i16 , Custom);
246 setOperationAction(ShiftOp , MVT::i32 , ShiftDoubleAction);
247 if (Subtarget.is64Bit())
248 setOperationAction(ShiftOp , MVT::i64 , ShiftDoubleAction);
249 }
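A hedged illustration of the funnel-shift operation being configured above (not part of this file): ISD::FSHL concatenates its first operand (high bits) with its second (low bits), shifts left by the amount modulo the bit width, and keeps the high half, which is what x86's SHLD computes.

    // 32-bit funnel shift left; s % 32 == 0 simply returns x.
    uint32_t fshl32(uint32_t x, uint32_t y, uint32_t s) {
      s &= 31;
      return s ? (x << s) | (y >> (32 - s)) : x;
    }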
250
251 if (!Subtarget.useSoftFloat()) {
252 // Promote all UINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have this
253 // operation.
254 setOperationAction(ISD::UINT_TO_FP, MVT::i8, Promote);
255 setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::i8, Promote);
256 setOperationAction(ISD::UINT_TO_FP, MVT::i16, Promote);
257 setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::i16, Promote);
258 // We have an algorithm for SSE2, and we turn this into a 64-bit
259 // FILD or VCVTUSI2SS/SD for other targets.
260 setOperationAction(ISD::UINT_TO_FP, MVT::i32, Custom);
261 setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::i32, Custom);
262 // We have an algorithm for SSE2->double, and we turn this into a
263 // 64-bit FILD followed by conditional FADD for other targets.
264 setOperationAction(ISD::UINT_TO_FP, MVT::i64, Custom);
265 setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::i64, Custom);
266
267 // Promote i8 SINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have
268 // this operation.
269 setOperationAction(ISD::SINT_TO_FP, MVT::i8, Promote);
270 setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::i8, Promote);
271 // SSE has no i16 to fp conversion, only i32. We promote in the handler
272 // to allow f80 to use i16 and f64 to use i16 with sse1 only
273 setOperationAction(ISD::SINT_TO_FP, MVT::i16, Custom);
274 setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::i16, Custom);
275 // f32 and f64 cases are Legal with SSE1/SSE2, f80 case is not
276 setOperationAction(ISD::SINT_TO_FP, MVT::i32, Custom);
277 setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::i32, Custom);
278 // In 32-bit mode these are custom lowered. In 64-bit mode F32 and F64
279 // are Legal, f80 is custom lowered.
280 setOperationAction(ISD::SINT_TO_FP, MVT::i64, Custom);
281 setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::i64, Custom);
282
283 // Promote i8 FP_TO_SINT to larger FP_TO_SINTS's, as X86 doesn't have
284 // this operation.
285 setOperationAction(ISD::FP_TO_SINT, MVT::i8, Promote);
286 // FIXME: This doesn't generate invalid exception when it should. PR44019.
287 setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::i8, Promote);
288 setOperationAction(ISD::FP_TO_SINT, MVT::i16, Custom);
289 setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::i16, Custom);
290 setOperationAction(ISD::FP_TO_SINT, MVT::i32, Custom);
291 setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::i32, Custom);
292 // In 32-bit mode these are custom lowered. In 64-bit mode F32 and F64
293 // are Legal, f80 is custom lowered.
294 setOperationAction(ISD::FP_TO_SINT, MVT::i64, Custom);
295 setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::i64, Custom);
296
297 // Handle FP_TO_UINT by promoting the destination to a larger signed
298 // conversion.
299 setOperationAction(ISD::FP_TO_UINT, MVT::i8, Promote);
300 // FIXME: This doesn't generate invalid exception when it should. PR44019.
301 setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::i8, Promote);
302 setOperationAction(ISD::FP_TO_UINT, MVT::i16, Promote);
303 // FIXME: This doesn't generate invalid exception when it should. PR44019.
304 setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::i16, Promote);
305 setOperationAction(ISD::FP_TO_UINT, MVT::i32, Custom);
306 setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::i32, Custom);
307 setOperationAction(ISD::FP_TO_UINT, MVT::i64, Custom);
308 setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::i64, Custom);
309
310 setOperationAction(ISD::LRINT, MVT::f32, Custom);
311 setOperationAction(ISD::LRINT, MVT::f64, Custom);
312 setOperationAction(ISD::LLRINT, MVT::f32, Custom);
313 setOperationAction(ISD::LLRINT, MVT::f64, Custom);
314
315 if (!Subtarget.is64Bit()) {
316 setOperationAction(ISD::LRINT, MVT::i64, Custom);
317 setOperationAction(ISD::LLRINT, MVT::i64, Custom);
318 }
319 }
320
321 if (Subtarget.hasSSE2()) {
322 // Custom lowering for saturating float to int conversions.
323 // We handle promotion to larger result types manually.
324 for (MVT VT : { MVT::i8, MVT::i16, MVT::i32 }) {
325 setOperationAction(ISD::FP_TO_UINT_SAT, VT, Custom);
326 setOperationAction(ISD::FP_TO_SINT_SAT, VT, Custom);
327 }
328 if (Subtarget.is64Bit()) {
329 setOperationAction(ISD::FP_TO_UINT_SAT, MVT::i64, Custom);
330 setOperationAction(ISD::FP_TO_SINT_SAT, MVT::i64, Custom);
331 }
332 }
333
334 // Handle address space casts between mixed sized pointers.
335 setOperationAction(ISD::ADDRSPACECAST, MVT::i32, Custom);
336 setOperationAction(ISD::ADDRSPACECAST, MVT::i64, Custom);
337
338 // TODO: when we have SSE, these could be more efficient, by using movd/movq.
339 if (!Subtarget.hasSSE2()) {
340 setOperationAction(ISD::BITCAST , MVT::f32 , Expand);
341 setOperationAction(ISD::BITCAST , MVT::i32 , Expand);
342 if (Subtarget.is64Bit()) {
343 setOperationAction(ISD::BITCAST , MVT::f64 , Expand);
344 // Without SSE, i64->f64 goes through memory.
345 setOperationAction(ISD::BITCAST , MVT::i64 , Expand);
346 }
347 } else if (!Subtarget.is64Bit())
348 setOperationAction(ISD::BITCAST , MVT::i64 , Custom);
349
350 // Scalar integer divide and remainder are lowered to use operations that
351 // produce two results, to match the available instructions. This exposes
352 // the two-result form to trivial CSE, which is able to combine x/y and x%y
353 // into a single instruction.
354 //
355 // Scalar integer multiply-high is also lowered to use two-result
356 // operations, to match the available instructions. However, plain multiply
357 // (low) operations are left as Legal, as there are single-result
358 // instructions for this in x86. Using the two-result multiply instructions
359 // when both high and low results are needed must be arranged by dagcombine.
360 for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
361 setOperationAction(ISD::MULHS, VT, Expand);
362 setOperationAction(ISD::MULHU, VT, Expand);
363 setOperationAction(ISD::SDIV, VT, Expand);
364 setOperationAction(ISD::UDIV, VT, Expand);
365 setOperationAction(ISD::SREM, VT, Expand);
366 setOperationAction(ISD::UREM, VT, Expand);
367 }
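A hedged illustration of the two-result point in the comment above (not part of this file): when a quotient and a remainder share the same operands, expanding to the two-result divrem form lets CSE fold them into one divide.

    // Both results typically come from a single x86 idiv, which produces the
    // quotient (EAX) and the remainder (EDX) at the same time.
    void quotRem(int x, int y, int &q, int &r) {
      q = x / y;
      r = x % y;
    }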
368
369 setOperationAction(ISD::BR_JT , MVT::Other, Expand);
370 setOperationAction(ISD::BRCOND , MVT::Other, Custom);
371 for (auto VT : { MVT::f32, MVT::f64, MVT::f80, MVT::f128,
372 MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
373 setOperationAction(ISD::BR_CC, VT, Expand);
374 setOperationAction(ISD::SELECT_CC, VT, Expand);
375 }
376 if (Subtarget.is64Bit())
377 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i32, Legal);
378 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16 , Legal);
379 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8 , Legal);
380 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1 , Expand);
381
382 setOperationAction(ISD::FREM , MVT::f32 , Expand);
383 setOperationAction(ISD::FREM , MVT::f64 , Expand);
384 setOperationAction(ISD::FREM , MVT::f80 , Expand);
385 setOperationAction(ISD::FREM , MVT::f128 , Expand);
386
387 if (!Subtarget.useSoftFloat() && Subtarget.hasX87()) {
388 setOperationAction(ISD::GET_ROUNDING , MVT::i32 , Custom);
389 setOperationAction(ISD::SET_ROUNDING , MVT::Other, Custom);
390 }
391
392 // Promote the i8 variants and force them on up to i32 which has a shorter
393 // encoding.
394 setOperationPromotedToType(ISD::CTTZ , MVT::i8 , MVT::i32);
395 setOperationPromotedToType(ISD::CTTZ_ZERO_UNDEF, MVT::i8 , MVT::i32);
396 // Promoted i16. tzcntw has a false dependency on Intel CPUs. For BSF, we emit
397 // a REP prefix to encode it as TZCNT for modern CPUs so it makes sense to
398 // promote that too.
399 setOperationPromotedToType(ISD::CTTZ , MVT::i16 , MVT::i32);
400 setOperationPromotedToType(ISD::CTTZ_ZERO_UNDEF, MVT::i16 , MVT::i32);
401
402 if (!Subtarget.hasBMI()) {
403 setOperationAction(ISD::CTTZ , MVT::i32 , Custom);
404 setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i32 , Legal);
405 if (Subtarget.is64Bit()) {
406 setOperationAction(ISD::CTTZ , MVT::i64 , Custom);
407 setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i64, Legal);
408 }
409 }
410
411 if (Subtarget.hasLZCNT()) {
412 // When promoting the i8 variants, force them to i32 for a shorter
413 // encoding.
414 setOperationPromotedToType(ISD::CTLZ , MVT::i8 , MVT::i32);
415 setOperationPromotedToType(ISD::CTLZ_ZERO_UNDEF, MVT::i8 , MVT::i32);
416 } else {
417 for (auto VT : {MVT::i8, MVT::i16, MVT::i32, MVT::i64}) {
418 if (VT == MVT::i64 && !Subtarget.is64Bit())
419 continue;
420 setOperationAction(ISD::CTLZ , VT, Custom);
421 setOperationAction(ISD::CTLZ_ZERO_UNDEF, VT, Custom);
422 }
423 }
424
425 for (auto Op : {ISD::FP16_TO_FP, ISD::STRICT_FP16_TO_FP, ISD::FP_TO_FP16,
426 ISD::STRICT_FP_TO_FP16}) {
427 // Special handling for half-precision floating point conversions.
428 // If we don't have F16C support, then lower half float conversions
429 // into library calls.
430 setOperationAction(
431 Op, MVT::f32,
432 (!Subtarget.useSoftFloat() && Subtarget.hasF16C()) ? Custom : Expand);
433 // There's never any support for operations beyond MVT::f32.
434 setOperationAction(Op, MVT::f64, Expand);
435 setOperationAction(Op, MVT::f80, Expand);
436 setOperationAction(Op, MVT::f128, Expand);
437 }
438
439 for (MVT VT : {MVT::f32, MVT::f64, MVT::f80, MVT::f128}) {
440 setLoadExtAction(ISD::EXTLOAD, VT, MVT::f16, Expand);
441 setLoadExtAction(ISD::EXTLOAD, VT, MVT::bf16, Expand);
442 setTruncStoreAction(VT, MVT::f16, Expand);
443 setTruncStoreAction(VT, MVT::bf16, Expand);
444
445 setOperationAction(ISD::BF16_TO_FP, VT, Expand);
446 setOperationAction(ISD::FP_TO_BF16, VT, Custom);
447 }
448
449 setOperationAction(ISD::PARITY, MVT::i8, Custom);
450 setOperationAction(ISD::PARITY, MVT::i16, Custom);
451 setOperationAction(ISD::PARITY, MVT::i32, Custom);
452 if (Subtarget.is64Bit())
453 setOperationAction(ISD::PARITY, MVT::i64, Custom);
454 if (Subtarget.hasPOPCNT()) {
455 setOperationPromotedToType(ISD::CTPOP, MVT::i8, MVT::i32);
456 // popcntw is longer to encode than popcntl and also has a false dependency
457 // on the dest that popcntl hasn't had since Cannon Lake.
458 setOperationPromotedToType(ISD::CTPOP, MVT::i16, MVT::i32);
459 } else {
460 setOperationAction(ISD::CTPOP , MVT::i8 , Expand);
461 setOperationAction(ISD::CTPOP , MVT::i16 , Expand);
462 setOperationAction(ISD::CTPOP , MVT::i32 , Expand);
463 if (Subtarget.is64Bit())
464 setOperationAction(ISD::CTPOP , MVT::i64 , Expand);
465 else
466 setOperationAction(ISD::CTPOP , MVT::i64 , Custom);
467 }
468
469 setOperationAction(ISD::READCYCLECOUNTER , MVT::i64 , Custom);
470
471 if (!Subtarget.hasMOVBE())
472 setOperationAction(ISD::BSWAP , MVT::i16 , Expand);
473
474 // X86 wants to expand cmov itself.
475 for (auto VT : { MVT::f32, MVT::f64, MVT::f80, MVT::f128 }) {
476 setOperationAction(ISD::SELECT, VT, Custom);
477 setOperationAction(ISD::SETCC, VT, Custom);
478 setOperationAction(ISD::STRICT_FSETCC, VT, Custom);
479 setOperationAction(ISD::STRICT_FSETCCS, VT, Custom);
480 }
481 for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
482 if (VT == MVT::i64 && !Subtarget.is64Bit())
483 continue;
484 setOperationAction(ISD::SELECT, VT, Custom);
485 setOperationAction(ISD::SETCC, VT, Custom);
486 }
487
488 // Custom action for SELECT MMX and expand action for SELECT_CC MMX
489 setOperationAction(ISD::SELECT, MVT::x86mmx, Custom);
490 setOperationAction(ISD::SELECT_CC, MVT::x86mmx, Expand);
491
492 setOperationAction(ISD::EH_RETURN , MVT::Other, Custom);
493 // NOTE: EH_SJLJ_SETJMP/_LONGJMP are not recommended, since
494 // LLVM/Clang supports zero-cost DWARF and SEH exception handling.
495 setOperationAction(ISD::EH_SJLJ_SETJMP, MVT::i32, Custom);
496 setOperationAction(ISD::EH_SJLJ_LONGJMP, MVT::Other, Custom);
497 setOperationAction(ISD::EH_SJLJ_SETUP_DISPATCH, MVT::Other, Custom);
498 if (TM.Options.ExceptionModel == ExceptionHandling::SjLj)
499 setLibcallName(RTLIB::UNWIND_RESUME, "_Unwind_SjLj_Resume");
500
501 // Darwin ABI issue.
502 for (auto VT : { MVT::i32, MVT::i64 }) {
503 if (VT == MVT::i64 && !Subtarget.is64Bit())
504 continue;
505 setOperationAction(ISD::ConstantPool , VT, Custom);
506 setOperationAction(ISD::JumpTable , VT, Custom);
507 setOperationAction(ISD::GlobalAddress , VT, Custom);
508 setOperationAction(ISD::GlobalTLSAddress, VT, Custom);
509 setOperationAction(ISD::ExternalSymbol , VT, Custom);
510 setOperationAction(ISD::BlockAddress , VT, Custom);
511 }
512
513 // 64-bit shl, sra, srl (iff 32-bit x86)
514 for (auto VT : { MVT::i32, MVT::i64 }) {
515 if (VT == MVT::i64 && !Subtarget.is64Bit())
516 continue;
517 setOperationAction(ISD::SHL_PARTS, VT, Custom);
518 setOperationAction(ISD::SRA_PARTS, VT, Custom);
519 setOperationAction(ISD::SRL_PARTS, VT, Custom);
520 }
521
522 if (Subtarget.hasSSEPrefetch() || Subtarget.hasThreeDNow())
523 setOperationAction(ISD::PREFETCH , MVT::Other, Legal);
524
525 setOperationAction(ISD::ATOMIC_FENCE , MVT::Other, Custom);
526
527 // Expand certain atomics
528 for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
529 setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, VT, Custom);
530 setOperationAction(ISD::ATOMIC_LOAD_SUB, VT, Custom);
531 setOperationAction(ISD::ATOMIC_LOAD_ADD, VT, Custom);
532 setOperationAction(ISD::ATOMIC_LOAD_OR, VT, Custom);
533 setOperationAction(ISD::ATOMIC_LOAD_XOR, VT, Custom);
534 setOperationAction(ISD::ATOMIC_LOAD_AND, VT, Custom);
535 setOperationAction(ISD::ATOMIC_STORE, VT, Custom);
536 }
537
538 if (!Subtarget.is64Bit())
539 setOperationAction(ISD::ATOMIC_LOAD, MVT::i64, Custom);
540
541 if (Subtarget.canUseCMPXCHG16B())
542 setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, MVT::i128, Custom);
543
544 // FIXME - use subtarget debug flags
545 if (!Subtarget.isTargetDarwin() && !Subtarget.isTargetELF() &&
546 !Subtarget.isTargetCygMing() && !Subtarget.isTargetWin64() &&
547 TM.Options.ExceptionModel != ExceptionHandling::SjLj) {
548 setOperationAction(ISD::EH_LABEL, MVT::Other, Expand);
549 }
550
551 setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i32, Custom);
552 setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i64, Custom);
553
554 setOperationAction(ISD::INIT_TRAMPOLINE, MVT::Other, Custom);
555 setOperationAction(ISD::ADJUST_TRAMPOLINE, MVT::Other, Custom);
556
557 setOperationAction(ISD::TRAP, MVT::Other, Legal);
558 setOperationAction(ISD::DEBUGTRAP, MVT::Other, Legal);
559 if (Subtarget.isTargetPS())
560 setOperationAction(ISD::UBSANTRAP, MVT::Other, Expand);
561 else
562 setOperationAction(ISD::UBSANTRAP, MVT::Other, Legal);
563
564 // VASTART needs to be custom lowered to use the VarArgsFrameIndex
565 setOperationAction(ISD::VASTART , MVT::Other, Custom);
566 setOperationAction(ISD::VAEND , MVT::Other, Expand);
567 bool Is64Bit = Subtarget.is64Bit();
568 setOperationAction(ISD::VAARG, MVT::Other, Is64Bit ? Custom : Expand);
569 setOperationAction(ISD::VACOPY, MVT::Other, Is64Bit ? Custom : Expand);
570
571 setOperationAction(ISD::STACKSAVE, MVT::Other, Expand);
572 setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand);
573
574 setOperationAction(ISD::DYNAMIC_STACKALLOC, PtrVT, Custom);
575
576 // GC_TRANSITION_START and GC_TRANSITION_END need custom lowering.
577 setOperationAction(ISD::GC_TRANSITION_START, MVT::Other, Custom);
578 setOperationAction(ISD::GC_TRANSITION_END, MVT::Other, Custom);
579
580 setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f64, Legal);
581
582 auto setF16Action = [&] (MVT VT, LegalizeAction Action) {
583 setOperationAction(ISD::FABS, VT, Action);
584 setOperationAction(ISD::FNEG, VT, Action);
585 setOperationAction(ISD::FCOPYSIGN, VT, Expand);
586 setOperationAction(ISD::FREM, VT, Action);
587 setOperationAction(ISD::FMA, VT, Action);
588 setOperationAction(ISD::FMINNUM, VT, Action);
589 setOperationAction(ISD::FMAXNUM, VT, Action);
590 setOperationAction(ISD::FMINIMUM, VT, Action);
591 setOperationAction(ISD::FMAXIMUM, VT, Action);
592 setOperationAction(ISD::FSIN, VT, Action);
593 setOperationAction(ISD::FCOS, VT, Action);
594 setOperationAction(ISD::FSINCOS, VT, Action);
595 setOperationAction(ISD::FSQRT, VT, Action);
596 setOperationAction(ISD::FPOW, VT, Action);
597 setOperationAction(ISD::FLOG, VT, Action);
598 setOperationAction(ISD::FLOG2, VT, Action);
599 setOperationAction(ISD::FLOG10, VT, Action);
600 setOperationAction(ISD::FEXP, VT, Action);
601 setOperationAction(ISD::FEXP2, VT, Action);
602 setOperationAction(ISD::FCEIL, VT, Action);
603 setOperationAction(ISD::FFLOOR, VT, Action);
604 setOperationAction(ISD::FNEARBYINT, VT, Action);
605 setOperationAction(ISD::FRINT, VT, Action);
606 setOperationAction(ISD::BR_CC, VT, Action);
607 setOperationAction(ISD::SETCC, VT, Action);
608 setOperationAction(ISD::SELECT, VT, Custom);
609 setOperationAction(ISD::SELECT_CC, VT, Action);
610 setOperationAction(ISD::FROUND, VT, Action);
611 setOperationAction(ISD::FROUNDEVEN, VT, Action);
612 setOperationAction(ISD::FTRUNC, VT, Action);
613 };
614
615 if (!Subtarget.useSoftFloat() && Subtarget.hasSSE2()) {
616 // f16, f32 and f64 use SSE.
617 // Set up the FP register classes.
618 addRegisterClass(MVT::f16, Subtarget.hasAVX512() ? &X86::FR16XRegClass
619 : &X86::FR16RegClass);
620 addRegisterClass(MVT::f32, Subtarget.hasAVX512() ? &X86::FR32XRegClass
621 : &X86::FR32RegClass);
622 addRegisterClass(MVT::f64, Subtarget.hasAVX512() ? &X86::FR64XRegClass
623 : &X86::FR64RegClass);
624
625 // Disable f32->f64 extload as we can only generate this in one instruction
626 // under optsize. So it's easier to pattern match (fpext (load)) for that
627 // case instead of needing to emit 2 instructions for extload in the
628 // non-optsize case.
629 setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f32, Expand);
630
631 for (auto VT : { MVT::f32, MVT::f64 }) {
632 // Use ANDPD to simulate FABS.
633 setOperationAction(ISD::FABS, VT, Custom);
634
635 // Use XORP to simulate FNEG.
636 setOperationAction(ISD::FNEG, VT, Custom);
637
638 // Use ANDPD and ORPD to simulate FCOPYSIGN.
639 setOperationAction(ISD::FCOPYSIGN, VT, Custom);
640
641 // These might be better off as horizontal vector ops.
642 setOperationAction(ISD::FADD, VT, Custom);
643 setOperationAction(ISD::FSUB, VT, Custom);
644
645 // We don't support sin/cos/fmod
646 setOperationAction(ISD::FSIN , VT, Expand);
647 setOperationAction(ISD::FCOS , VT, Expand);
648 setOperationAction(ISD::FSINCOS, VT, Expand);
649 }
650
651 // Half type will be promoted by default.
652 setF16Action(MVT::f16, Promote);
653 setOperationAction(ISD::FADD, MVT::f16, Promote);
654 setOperationAction(ISD::FSUB, MVT::f16, Promote);
655 setOperationAction(ISD::FMUL, MVT::f16, Promote);
656 setOperationAction(ISD::FDIV, MVT::f16, Promote);
657 setOperationAction(ISD::FP_ROUND, MVT::f16, Custom);
658 setOperationAction(ISD::FP_EXTEND, MVT::f32, Custom);
659 setOperationAction(ISD::FP_EXTEND, MVT::f64, Custom);
660
661 setOperationAction(ISD::STRICT_FADD, MVT::f16, Promote);
662 setOperationAction(ISD::STRICT_FSUB, MVT::f16, Promote);
663 setOperationAction(ISD::STRICT_FMUL, MVT::f16, Promote);
664 setOperationAction(ISD::STRICT_FDIV, MVT::f16, Promote);
665 setOperationAction(ISD::STRICT_FMA, MVT::f16, Promote);
666 setOperationAction(ISD::STRICT_FMINNUM, MVT::f16, Promote);
667 setOperationAction(ISD::STRICT_FMAXNUM, MVT::f16, Promote);
668 setOperationAction(ISD::STRICT_FMINIMUM, MVT::f16, Promote);
669 setOperationAction(ISD::STRICT_FMAXIMUM, MVT::f16, Promote);
670 setOperationAction(ISD::STRICT_FSQRT, MVT::f16, Promote);
671 setOperationAction(ISD::STRICT_FPOW, MVT::f16, Promote);
672 setOperationAction(ISD::STRICT_FLOG, MVT::f16, Promote);
673 setOperationAction(ISD::STRICT_FLOG2, MVT::f16, Promote);
674 setOperationAction(ISD::STRICT_FLOG10, MVT::f16, Promote);
675 setOperationAction(ISD::STRICT_FEXP, MVT::f16, Promote);
676 setOperationAction(ISD::STRICT_FEXP2, MVT::f16, Promote);
677 setOperationAction(ISD::STRICT_FCEIL, MVT::f16, Promote);
678 setOperationAction(ISD::STRICT_FFLOOR, MVT::f16, Promote);
679 setOperationAction(ISD::STRICT_FNEARBYINT, MVT::f16, Promote);
680 setOperationAction(ISD::STRICT_FRINT, MVT::f16, Promote);
681 setOperationAction(ISD::STRICT_FSETCC, MVT::f16, Promote);
682 setOperationAction(ISD::STRICT_FSETCCS, MVT::f16, Promote);
683 setOperationAction(ISD::STRICT_FROUND, MVT::f16, Promote);
684 setOperationAction(ISD::STRICT_FROUNDEVEN, MVT::f16, Promote);
685 setOperationAction(ISD::STRICT_FTRUNC, MVT::f16, Promote);
686 setOperationAction(ISD::STRICT_FP_ROUND, MVT::f16, Custom);
687 setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f32, Custom);
688 setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f64, Custom);
689
690 setLibcallName(RTLIB::FPROUND_F32_F16, "__truncsfhf2");
691 setLibcallName(RTLIB::FPEXT_F16_F32, "__extendhfsf2");
692
693 // Lower this to MOVMSK plus an AND.
694 setOperationAction(ISD::FGETSIGN, MVT::i64, Custom);
695 setOperationAction(ISD::FGETSIGN, MVT::i32, Custom);
696
697 } else if (!Subtarget.useSoftFloat() && Subtarget.hasSSE1() &&
698 (UseX87 || Is64Bit)) {
699 // Use SSE for f32, x87 for f64.
700 // Set up the FP register classes.
701 addRegisterClass(MVT::f32, &X86::FR32RegClass);
702 if (UseX87)
703 addRegisterClass(MVT::f64, &X86::RFP64RegClass);
704
705 // Use ANDPS to simulate FABS.
706 setOperationAction(ISD::FABS , MVT::f32, Custom);
707
708 // Use XORP to simulate FNEG.
709 setOperationAction(ISD::FNEG , MVT::f32, Custom);
710
711 if (UseX87)
712 setOperationAction(ISD::UNDEF, MVT::f64, Expand);
713
714 // Use ANDPS and ORPS to simulate FCOPYSIGN.
715 if (UseX87)
716 setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
717 setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom);
718
719 // We don't support sin/cos/fmod
720 setOperationAction(ISD::FSIN , MVT::f32, Expand);
721 setOperationAction(ISD::FCOS , MVT::f32, Expand);
722 setOperationAction(ISD::FSINCOS, MVT::f32, Expand);
723
724 if (UseX87) {
725 // Always expand sin/cos functions even though x87 has an instruction.
726 setOperationAction(ISD::FSIN, MVT::f64, Expand);
727 setOperationAction(ISD::FCOS, MVT::f64, Expand);
728 setOperationAction(ISD::FSINCOS, MVT::f64, Expand);
729 }
730 } else if (UseX87) {
731 // f32 and f64 in x87.
732 // Set up the FP register classes.
733 addRegisterClass(MVT::f64, &X86::RFP64RegClass);
734 addRegisterClass(MVT::f32, &X86::RFP32RegClass);
735
736 for (auto VT : { MVT::f32, MVT::f64 }) {
737 setOperationAction(ISD::UNDEF, VT, Expand);
738 setOperationAction(ISD::FCOPYSIGN, VT, Expand);
739
740 // Always expand sin/cos functions even though x87 has an instruction.
741 setOperationAction(ISD::FSIN , VT, Expand);
742 setOperationAction(ISD::FCOS , VT, Expand);
743 setOperationAction(ISD::FSINCOS, VT, Expand);
744 }
745 }
746
747 // Expand FP32 immediates into loads from the stack, save special cases.
748 if (isTypeLegal(MVT::f32)) {
749 if (UseX87 && (getRegClassFor(MVT::f32) == &X86::RFP32RegClass)) {
750 addLegalFPImmediate(APFloat(+0.0f)); // FLD0
751 addLegalFPImmediate(APFloat(+1.0f)); // FLD1
752 addLegalFPImmediate(APFloat(-0.0f)); // FLD0/FCHS
753 addLegalFPImmediate(APFloat(-1.0f)); // FLD1/FCHS
754 } else // SSE immediates.
755 addLegalFPImmediate(APFloat(+0.0f)); // xorps
756 }
757 // Expand FP64 immediates into loads from the stack, save special cases.
758 if (isTypeLegal(MVT::f64)) {
759 if (UseX87 && getRegClassFor(MVT::f64) == &X86::RFP64RegClass) {
760 addLegalFPImmediate(APFloat(+0.0)); // FLD0
761 addLegalFPImmediate(APFloat(+1.0)); // FLD1
762 addLegalFPImmediate(APFloat(-0.0)); // FLD0/FCHS
763 addLegalFPImmediate(APFloat(-1.0)); // FLD1/FCHS
764 } else // SSE immediates.
765 addLegalFPImmediate(APFloat(+0.0)); // xorpd
766 }
767 // Support fp16 0 immediate.
768 if (isTypeLegal(MVT::f16))
769 addLegalFPImmediate(APFloat::getZero(APFloat::IEEEhalf()));
770
771 // Handle constrained floating-point operations of scalar.
772 setOperationAction(ISD::STRICT_FADD, MVT::f32, Legal);
773 setOperationAction(ISD::STRICT_FADD, MVT::f64, Legal);
774 setOperationAction(ISD::STRICT_FSUB, MVT::f32, Legal);
775 setOperationAction(ISD::STRICT_FSUB, MVT::f64, Legal);
776 setOperationAction(ISD::STRICT_FMUL, MVT::f32, Legal);
777 setOperationAction(ISD::STRICT_FMUL, MVT::f64, Legal);
778 setOperationAction(ISD::STRICT_FDIV, MVT::f32, Legal);
779 setOperationAction(ISD::STRICT_FDIV, MVT::f64, Legal);
780 setOperationAction(ISD::STRICT_FP_ROUND, MVT::f32, Legal);
781 setOperationAction(ISD::STRICT_FP_ROUND, MVT::f64, Legal);
782 setOperationAction(ISD::STRICT_FSQRT, MVT::f32, Legal);
783 setOperationAction(ISD::STRICT_FSQRT, MVT::f64, Legal);
784
785 // We don't support FMA.
786 setOperationAction(ISD::FMA, MVT::f64, Expand);
787 setOperationAction(ISD::FMA, MVT::f32, Expand);
788
789 // f80 always uses X87.
790 if (UseX87) {
791 addRegisterClass(MVT::f80, &X86::RFP80RegClass);
792 setOperationAction(ISD::UNDEF, MVT::f80, Expand);
793 setOperationAction(ISD::FCOPYSIGN, MVT::f80, Expand);
794 {
795 APFloat TmpFlt = APFloat::getZero(APFloat::x87DoubleExtended());
796 addLegalFPImmediate(TmpFlt); // FLD0
797 TmpFlt.changeSign();
798 addLegalFPImmediate(TmpFlt); // FLD0/FCHS
799
800 bool ignored;
801 APFloat TmpFlt2(+1.0);
802 TmpFlt2.convert(APFloat::x87DoubleExtended(), APFloat::rmNearestTiesToEven,
803 &ignored);
804 addLegalFPImmediate(TmpFlt2); // FLD1
805 TmpFlt2.changeSign();
806 addLegalFPImmediate(TmpFlt2); // FLD1/FCHS
807 }
808
809 // Always expand sin/cos functions even though x87 has an instruction.
810 setOperationAction(ISD::FSIN , MVT::f80, Expand);
811 setOperationAction(ISD::FCOS , MVT::f80, Expand);
812 setOperationAction(ISD::FSINCOS, MVT::f80, Expand);
813
814 setOperationAction(ISD::FFLOOR, MVT::f80, Expand);
815 setOperationAction(ISD::FCEIL, MVT::f80, Expand);
816 setOperationAction(ISD::FTRUNC, MVT::f80, Expand);
817 setOperationAction(ISD::FRINT, MVT::f80, Expand);
818 setOperationAction(ISD::FNEARBYINT, MVT::f80, Expand);
819 setOperationAction(ISD::FMA, MVT::f80, Expand);
820 setOperationAction(ISD::LROUND, MVT::f80, Expand);
821 setOperationAction(ISD::LLROUND, MVT::f80, Expand);
822 setOperationAction(ISD::LRINT, MVT::f80, Custom);
823 setOperationAction(ISD::LLRINT, MVT::f80, Custom);
824
825 // Handle constrained floating-point operations of scalar.
826 setOperationAction(ISD::STRICT_FADD , MVT::f80, Legal);
827 setOperationAction(ISD::STRICT_FSUB , MVT::f80, Legal);
828 setOperationAction(ISD::STRICT_FMUL , MVT::f80, Legal);
829 setOperationAction(ISD::STRICT_FDIV , MVT::f80, Legal);
830 setOperationAction(ISD::STRICT_FSQRT , MVT::f80, Legal);
831 if (isTypeLegal(MVT::f16)) {
832 setOperationAction(ISD::FP_EXTEND, MVT::f80, Custom);
833 setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f80, Custom);
834 } else {
835 setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f80, Legal);
836 }
837 // FIXME: When the target is 64-bit, STRICT_FP_ROUND will be overwritten
838 // as Custom.
839 setOperationAction(ISD::STRICT_FP_ROUND, MVT::f80, Legal);
840 }
841
842 // f128 uses xmm registers, but most operations require libcalls.
843 if (!Subtarget.useSoftFloat() && Subtarget.is64Bit() && Subtarget.hasSSE1()) {
844 addRegisterClass(MVT::f128, Subtarget.hasVLX() ? &X86::VR128XRegClass
845 : &X86::VR128RegClass);
846
847 addLegalFPImmediate(APFloat::getZero(APFloat::IEEEquad())); // xorps
848
849 setOperationAction(ISD::FADD, MVT::f128, LibCall);
850 setOperationAction(ISD::STRICT_FADD, MVT::f128, LibCall);
851 setOperationAction(ISD::FSUB, MVT::f128, LibCall);
852 setOperationAction(ISD::STRICT_FSUB, MVT::f128, LibCall);
853 setOperationAction(ISD::FDIV, MVT::f128, LibCall);
854 setOperationAction(ISD::STRICT_FDIV, MVT::f128, LibCall);
855 setOperationAction(ISD::FMUL, MVT::f128, LibCall);
856 setOperationAction(ISD::STRICT_FMUL, MVT::f128, LibCall);
857 setOperationAction(ISD::FMA, MVT::f128, LibCall);
858 setOperationAction(ISD::STRICT_FMA, MVT::f128, LibCall);
859
860 setOperationAction(ISD::FABS, MVT::f128, Custom);
861 setOperationAction(ISD::FNEG, MVT::f128, Custom);
862 setOperationAction(ISD::FCOPYSIGN, MVT::f128, Custom);
863
864 setOperationAction(ISD::FSIN, MVT::f128, LibCall);
865 setOperationAction(ISD::STRICT_FSIN, MVT::f128, LibCall);
866 setOperationAction(ISD::FCOS, MVT::f128, LibCall);
867 setOperationAction(ISD::STRICT_FCOS, MVT::f128, LibCall);
868 setOperationAction(ISD::FSINCOS, MVT::f128, LibCall);
869 // No STRICT_FSINCOS
870 setOperationAction(ISD::FSQRT, MVT::f128, LibCall);
871 setOperationAction(ISD::STRICT_FSQRT, MVT::f128, LibCall);
872
873 setOperationAction(ISD::FP_EXTEND, MVT::f128, Custom);
874 setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f128, Custom);
875 // We need to custom handle any FP_ROUND with an f128 input, but
876 // LegalizeDAG uses the result type to know when to run a custom handler.
877 // So we have to list all legal floating point result types here.
878 if (isTypeLegal(MVT::f32)) {
879 setOperationAction(ISD::FP_ROUND, MVT::f32, Custom);
880 setOperationAction(ISD::STRICT_FP_ROUND, MVT::f32, Custom);
881 }
882 if (isTypeLegal(MVT::f64)) {
883 setOperationAction(ISD::FP_ROUND, MVT::f64, Custom);
884 setOperationAction(ISD::STRICT_FP_ROUND, MVT::f64, Custom);
885 }
886 if (isTypeLegal(MVT::f80)) {
887 setOperationAction(ISD::FP_ROUND, MVT::f80, Custom);
888 setOperationAction(ISD::STRICT_FP_ROUND, MVT::f80, Custom);
889 }
890
891 setOperationAction(ISD::SETCC, MVT::f128, Custom);
892
893 setLoadExtAction(ISD::EXTLOAD, MVT::f128, MVT::f32, Expand);
894 setLoadExtAction(ISD::EXTLOAD, MVT::f128, MVT::f64, Expand);
895 setLoadExtAction(ISD::EXTLOAD, MVT::f128, MVT::f80, Expand);
896 setTruncStoreAction(MVT::f128, MVT::f32, Expand);
897 setTruncStoreAction(MVT::f128, MVT::f64, Expand);
898 setTruncStoreAction(MVT::f128, MVT::f80, Expand);
899 }
900
901 // Always use a library call for pow.
902 setOperationAction(ISD::FPOW , MVT::f32 , Expand);
903 setOperationAction(ISD::FPOW , MVT::f64 , Expand);
904 setOperationAction(ISD::FPOW , MVT::f80 , Expand);
905 setOperationAction(ISD::FPOW , MVT::f128 , Expand);
906
907 setOperationAction(ISD::FLOG, MVT::f80, Expand);
908 setOperationAction(ISD::FLOG2, MVT::f80, Expand);
909 setOperationAction(ISD::FLOG10, MVT::f80, Expand);
910 setOperationAction(ISD::FEXP, MVT::f80, Expand);
911 setOperationAction(ISD::FEXP2, MVT::f80, Expand);
912 setOperationAction(ISD::FMINNUM, MVT::f80, Expand);
913 setOperationAction(ISD::FMAXNUM, MVT::f80, Expand);
914
915 // Some FP actions are always expanded for vector types.
916 for (auto VT : { MVT::v8f16, MVT::v16f16, MVT::v32f16,
917 MVT::v4f32, MVT::v8f32, MVT::v16f32,
918 MVT::v2f64, MVT::v4f64, MVT::v8f64 }) {
919 setOperationAction(ISD::FSIN, VT, Expand);
920 setOperationAction(ISD::FSINCOS, VT, Expand);
921 setOperationAction(ISD::FCOS, VT, Expand);
922 setOperationAction(ISD::FREM, VT, Expand);
923 setOperationAction(ISD::FCOPYSIGN, VT, Expand);
924 setOperationAction(ISD::FPOW, VT, Expand);
925 setOperationAction(ISD::FLOG, VT, Expand);
926 setOperationAction(ISD::FLOG2, VT, Expand);
927 setOperationAction(ISD::FLOG10, VT, Expand);
928 setOperationAction(ISD::FEXP, VT, Expand);
929 setOperationAction(ISD::FEXP2, VT, Expand);
930 }
931
932 // First set operation action for all vector types to either promote
933 // (for widening) or expand (for scalarization). Then we will selectively
934 // turn on ones that can be effectively codegen'd.
935 for (MVT VT : MVT::fixedlen_vector_valuetypes()) {
936 setOperationAction(ISD::SDIV, VT, Expand);
937 setOperationAction(ISD::UDIV, VT, Expand);
938 setOperationAction(ISD::SREM, VT, Expand);
939 setOperationAction(ISD::UREM, VT, Expand);
940 setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT,Expand);
941 setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Expand);
942 setOperationAction(ISD::EXTRACT_SUBVECTOR, VT,Expand);
943 setOperationAction(ISD::INSERT_SUBVECTOR, VT,Expand);
944 setOperationAction(ISD::FMA, VT, Expand);
945 setOperationAction(ISD::FFLOOR, VT, Expand);
946 setOperationAction(ISD::FCEIL, VT, Expand);
947 setOperationAction(ISD::FTRUNC, VT, Expand);
948 setOperationAction(ISD::FRINT, VT, Expand);
949 setOperationAction(ISD::FNEARBYINT, VT, Expand);
950 setOperationAction(ISD::SMUL_LOHI, VT, Expand);
951 setOperationAction(ISD::MULHS, VT, Expand);
952 setOperationAction(ISD::UMUL_LOHI, VT, Expand);
953 setOperationAction(ISD::MULHU, VT, Expand);
954 setOperationAction(ISD::SDIVREM, VT, Expand);
955 setOperationAction(ISD::UDIVREM, VT, Expand);
956 setOperationAction(ISD::CTPOP, VT, Expand);
957 setOperationAction(ISD::CTTZ, VT, Expand);
958 setOperationAction(ISD::CTLZ, VT, Expand);
959 setOperationAction(ISD::ROTL, VT, Expand);
960 setOperationAction(ISD::ROTR, VT, Expand);
961 setOperationAction(ISD::BSWAP, VT, Expand);
962 setOperationAction(ISD::SETCC, VT, Expand);
963 setOperationAction(ISD::FP_TO_UINT, VT, Expand);
964 setOperationAction(ISD::FP_TO_SINT, VT, Expand);
965 setOperationAction(ISD::UINT_TO_FP, VT, Expand);
966 setOperationAction(ISD::SINT_TO_FP, VT, Expand);
967 setOperationAction(ISD::SIGN_EXTEND_INREG, VT,Expand);
968 setOperationAction(ISD::TRUNCATE, VT, Expand);
969 setOperationAction(ISD::SIGN_EXTEND, VT, Expand);
970 setOperationAction(ISD::ZERO_EXTEND, VT, Expand);
971 setOperationAction(ISD::ANY_EXTEND, VT, Expand);
972 setOperationAction(ISD::SELECT_CC, VT, Expand);
973 for (MVT InnerVT : MVT::fixedlen_vector_valuetypes()) {
974 setTruncStoreAction(InnerVT, VT, Expand);
975
976 setLoadExtAction(ISD::SEXTLOAD, InnerVT, VT, Expand);
977 setLoadExtAction(ISD::ZEXTLOAD, InnerVT, VT, Expand);
978
979 // N.b. ISD::EXTLOAD legality is basically ignored except for i1-like
980 // types, we have to deal with them whether we ask for Expansion or not.
981 // Setting Expand causes its own optimisation problems though, so leave
982 // them legal.
983 if (VT.getVectorElementType() == MVT::i1)
984 setLoadExtAction(ISD::EXTLOAD, InnerVT, VT, Expand);
985
986 // EXTLOAD for MVT::f16 vectors is not legal because f16 vectors are
987 // split/scalarized right now.
988 if (VT.getVectorElementType() == MVT::f16 ||
989 VT.getVectorElementType() == MVT::bf16)
990 setLoadExtAction(ISD::EXTLOAD, InnerVT, VT, Expand);
991 }
992 }
993
994 // FIXME: In order to prevent SSE instructions being expanded to MMX ones
995 // with -msoft-float, disable use of MMX as well.
996 if (!Subtarget.useSoftFloat() && Subtarget.hasMMX()) {
997 addRegisterClass(MVT::x86mmx, &X86::VR64RegClass);
998 // No operations on x86mmx supported, everything uses intrinsics.
999 }
1000
1001 if (!Subtarget.useSoftFloat() && Subtarget.hasSSE1()) {
1002 addRegisterClass(MVT::v4f32, Subtarget.hasVLX() ? &X86::VR128XRegClass
1003 : &X86::VR128RegClass);
1004
1005 setOperationAction(ISD::FNEG, MVT::v4f32, Custom);
1006 setOperationAction(ISD::FABS, MVT::v4f32, Custom);
1007 setOperationAction(ISD::FCOPYSIGN, MVT::v4f32, Custom);
1008 setOperationAction(ISD::BUILD_VECTOR, MVT::v4f32, Custom);
1009 setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4f32, Custom);
1010 setOperationAction(ISD::VSELECT, MVT::v4f32, Custom);
1011 setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom);
1012 setOperationAction(ISD::SELECT, MVT::v4f32, Custom);
1013
1014 setOperationAction(ISD::LOAD, MVT::v2f32, Custom);
1015 setOperationAction(ISD::STORE, MVT::v2f32, Custom);
1016
1017 setOperationAction(ISD::STRICT_FADD, MVT::v4f32, Legal);
1018 setOperationAction(ISD::STRICT_FSUB, MVT::v4f32, Legal);
1019 setOperationAction(ISD::STRICT_FMUL, MVT::v4f32, Legal);
1020 setOperationAction(ISD::STRICT_FDIV, MVT::v4f32, Legal);
1021 setOperationAction(ISD::STRICT_FSQRT, MVT::v4f32, Legal);
1022 }
1023
1024 if (!Subtarget.useSoftFloat() && Subtarget.hasSSE2()) {
1025 addRegisterClass(MVT::v2f64, Subtarget.hasVLX() ? &X86::VR128XRegClass
1026 : &X86::VR128RegClass);
1027
1028 // FIXME: Unfortunately, -soft-float and -no-implicit-float mean XMM
1029 // registers cannot be used even for integer operations.
1030 addRegisterClass(MVT::v16i8, Subtarget.hasVLX() ? &X86::VR128XRegClass
1031 : &X86::VR128RegClass);
1032 addRegisterClass(MVT::v8i16, Subtarget.hasVLX() ? &X86::VR128XRegClass
1033 : &X86::VR128RegClass);
1034 addRegisterClass(MVT::v8f16, Subtarget.hasVLX() ? &X86::VR128XRegClass
1035 : &X86::VR128RegClass);
1036 addRegisterClass(MVT::v4i32, Subtarget.hasVLX() ? &X86::VR128XRegClass
1037 : &X86::VR128RegClass);
1038 addRegisterClass(MVT::v2i64, Subtarget.hasVLX() ? &X86::VR128XRegClass
1039 : &X86::VR128RegClass);
1040
1041 for (auto VT : { MVT::v2i8, MVT::v4i8, MVT::v8i8,
1042 MVT::v2i16, MVT::v4i16, MVT::v2i32 }) {
1043 setOperationAction(ISD::SDIV, VT, Custom);
1044 setOperationAction(ISD::SREM, VT, Custom);
1045 setOperationAction(ISD::UDIV, VT, Custom);
1046 setOperationAction(ISD::UREM, VT, Custom);
1047 }
1048
1049 setOperationAction(ISD::MUL, MVT::v2i8, Custom);
1050 setOperationAction(ISD::MUL, MVT::v4i8, Custom);
1051 setOperationAction(ISD::MUL, MVT::v8i8, Custom);
1052
1053 setOperationAction(ISD::MUL, MVT::v16i8, Custom);
1054 setOperationAction(ISD::MUL, MVT::v4i32, Custom);
1055 setOperationAction(ISD::MUL, MVT::v2i64, Custom);
1056 setOperationAction(ISD::MULHU, MVT::v4i32, Custom);
1057 setOperationAction(ISD::MULHS, MVT::v4i32, Custom);
1058 setOperationAction(ISD::MULHU, MVT::v16i8, Custom);
1059 setOperationAction(ISD::MULHS, MVT::v16i8, Custom);
1060 setOperationAction(ISD::MULHU, MVT::v8i16, Legal);
1061 setOperationAction(ISD::MULHS, MVT::v8i16, Legal);
1062 setOperationAction(ISD::MUL, MVT::v8i16, Legal);
1063 setOperationAction(ISD::AVGCEILU, MVT::v16i8, Legal);
1064 setOperationAction(ISD::AVGCEILU, MVT::v8i16, Legal);
1065
1066 setOperationAction(ISD::SMULO, MVT::v16i8, Custom);
1067 setOperationAction(ISD::UMULO, MVT::v16i8, Custom);
1068 setOperationAction(ISD::UMULO, MVT::v2i32, Custom);
1069
1070 setOperationAction(ISD::FNEG, MVT::v2f64, Custom);
1071 setOperationAction(ISD::FABS, MVT::v2f64, Custom);
1072 setOperationAction(ISD::FCOPYSIGN, MVT::v2f64, Custom);
1073
1074 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
1075 setOperationAction(ISD::SMAX, VT, VT == MVT::v8i16 ? Legal : Custom);
1076 setOperationAction(ISD::SMIN, VT, VT == MVT::v8i16 ? Legal : Custom);
1077 setOperationAction(ISD::UMAX, VT, VT == MVT::v16i8 ? Legal : Custom);
1078 setOperationAction(ISD::UMIN, VT, VT == MVT::v16i8 ? Legal : Custom);
1079 }
1080
1081 setOperationAction(ISD::ABDU, MVT::v16i8, Custom);
1082 setOperationAction(ISD::ABDS, MVT::v8i16, Custom);
1083
1084 setOperationAction(ISD::UADDSAT, MVT::v16i8, Legal);
1085 setOperationAction(ISD::SADDSAT, MVT::v16i8, Legal);
1086 setOperationAction(ISD::USUBSAT, MVT::v16i8, Legal);
1087 setOperationAction(ISD::SSUBSAT, MVT::v16i8, Legal);
1088 setOperationAction(ISD::UADDSAT, MVT::v8i16, Legal);
1089 setOperationAction(ISD::SADDSAT, MVT::v8i16, Legal);
1090 setOperationAction(ISD::USUBSAT, MVT::v8i16, Legal);
1091 setOperationAction(ISD::SSUBSAT, MVT::v8i16, Legal);
1092 setOperationAction(ISD::USUBSAT, MVT::v4i32, Custom);
1093 setOperationAction(ISD::USUBSAT, MVT::v2i64, Custom);
1094
1095 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v16i8, Custom);
1096 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v8i16, Custom);
1097 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i32, Custom);
1098 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f32, Custom);
1099
1100 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
1101 setOperationAction(ISD::SETCC, VT, Custom);
1102 setOperationAction(ISD::STRICT_FSETCC, VT, Custom);
1103 setOperationAction(ISD::STRICT_FSETCCS, VT, Custom);
1104 setOperationAction(ISD::CTPOP, VT, Custom);
1105 setOperationAction(ISD::ABS, VT, Custom);
1106
1107 // The condition codes aren't legal in SSE/AVX and under AVX512 we use
1108 // setcc all the way to isel and prefer SETGT in some isel patterns.
1109 setCondCodeAction(ISD::SETLT, VT, Custom);
1110 setCondCodeAction(ISD::SETLE, VT, Custom);
1111 }
1112
1113 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32 }) {
1114 setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Custom);
1115 setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
1116 setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
1117 setOperationAction(ISD::VSELECT, VT, Custom);
1118 setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
1119 }
1120
1121 for (auto VT : { MVT::v8f16, MVT::v2f64, MVT::v2i64 }) {
1122 setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
1123 setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
1124 setOperationAction(ISD::VSELECT, VT, Custom);
1125
1126 if (VT == MVT::v2i64 && !Subtarget.is64Bit())
1127 continue;
1128
1129 setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
1130 setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
1131 }
1132 setF16Action(MVT::v8f16, Expand);
1133 setOperationAction(ISD::FADD, MVT::v8f16, Expand);
1134 setOperationAction(ISD::FSUB, MVT::v8f16, Expand);
1135 setOperationAction(ISD::FMUL, MVT::v8f16, Expand);
1136 setOperationAction(ISD::FDIV, MVT::v8f16, Expand);
1137
1138 // Custom lower v2i64 and v2f64 selects.
1139 setOperationAction(ISD::SELECT, MVT::v2f64, Custom);
1140 setOperationAction(ISD::SELECT, MVT::v2i64, Custom);
1141 setOperationAction(ISD::SELECT, MVT::v4i32, Custom);
1142 setOperationAction(ISD::SELECT, MVT::v8i16, Custom);
1143 setOperationAction(ISD::SELECT, MVT::v8f16, Custom);
1144 setOperationAction(ISD::SELECT, MVT::v16i8, Custom);
1145
1146 setOperationAction(ISD::FP_TO_SINT, MVT::v4i32, Custom);
1147 setOperationAction(ISD::FP_TO_UINT, MVT::v4i32, Custom);
1148 setOperationAction(ISD::FP_TO_SINT, MVT::v2i32, Custom);
1149 setOperationAction(ISD::FP_TO_UINT, MVT::v2i32, Custom);
1150 setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v4i32, Custom);
1151 setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v2i32, Custom);
1152
1153 // Custom legalize these to avoid over promotion or custom promotion.
1154 for (auto VT : {MVT::v2i8, MVT::v4i8, MVT::v8i8, MVT::v2i16, MVT::v4i16}) {
1155 setOperationAction(ISD::FP_TO_SINT, VT, Custom);
1156 setOperationAction(ISD::FP_TO_UINT, VT, Custom);
1157 setOperationAction(ISD::STRICT_FP_TO_SINT, VT, Custom);
1158 setOperationAction(ISD::STRICT_FP_TO_UINT, VT, Custom);
1159 }
1160
1161 setOperationAction(ISD::SINT_TO_FP, MVT::v4i32, Custom);
1162 setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v4i32, Custom);
1163 setOperationAction(ISD::SINT_TO_FP, MVT::v2i32, Custom);
1164 setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v2i32, Custom);
1165
1166 setOperationAction(ISD::UINT_TO_FP, MVT::v2i32, Custom);
1167 setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v2i32, Custom);
1168
1169 setOperationAction(ISD::UINT_TO_FP, MVT::v4i32, Custom);
1170 setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v4i32, Custom);
1171
1172 // Fast v2f32 UINT_TO_FP( v2i32 ) custom conversion.
1173 setOperationAction(ISD::SINT_TO_FP, MVT::v2f32, Custom);
1174 setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v2f32, Custom);
1175 setOperationAction(ISD::UINT_TO_FP, MVT::v2f32, Custom);
1176 setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v2f32, Custom);
1177
1178 setOperationAction(ISD::FP_EXTEND, MVT::v2f32, Custom);
1179 setOperationAction(ISD::STRICT_FP_EXTEND, MVT::v2f32, Custom);
1180 setOperationAction(ISD::FP_ROUND, MVT::v2f32, Custom);
1181 setOperationAction(ISD::STRICT_FP_ROUND, MVT::v2f32, Custom);
1182
1183 // We want to legalize this to an f64 load rather than an i64 load on
1184 // 64-bit targets and two 32-bit loads on a 32-bit target. Similar for
1185 // store.
1186 setOperationAction(ISD::LOAD, MVT::v2i32, Custom);
1187 setOperationAction(ISD::LOAD, MVT::v4i16, Custom);
1188 setOperationAction(ISD::LOAD, MVT::v8i8, Custom);
1189 setOperationAction(ISD::STORE, MVT::v2i32, Custom);
1190 setOperationAction(ISD::STORE, MVT::v4i16, Custom);
1191 setOperationAction(ISD::STORE, MVT::v8i8, Custom);
1192
1193 // Add 32-bit vector stores to help vectorization opportunities.
1194 setOperationAction(ISD::STORE, MVT::v2i16, Custom);
1195 setOperationAction(ISD::STORE, MVT::v4i8, Custom);
1196
1197 setOperationAction(ISD::BITCAST, MVT::v2i32, Custom);
1198 setOperationAction(ISD::BITCAST, MVT::v4i16, Custom);
1199 setOperationAction(ISD::BITCAST, MVT::v8i8, Custom);
1200 if (!Subtarget.hasAVX512())
1201 setOperationAction(ISD::BITCAST, MVT::v16i1, Custom);
1202
1203 setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v2i64, Custom);
1204 setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v4i32, Custom);
1205 setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v8i16, Custom);
1206
1207 setOperationAction(ISD::SIGN_EXTEND, MVT::v4i64, Custom);
1208
1209 setOperationAction(ISD::TRUNCATE, MVT::v2i8, Custom);
1210 setOperationAction(ISD::TRUNCATE, MVT::v2i16, Custom);
1211 setOperationAction(ISD::TRUNCATE, MVT::v2i32, Custom);
1212 setOperationAction(ISD::TRUNCATE, MVT::v4i8, Custom);
1213 setOperationAction(ISD::TRUNCATE, MVT::v4i16, Custom);
1214 setOperationAction(ISD::TRUNCATE, MVT::v8i8, Custom);
1215
1216 // In the customized shift lowering, the legal v4i32/v2i64 cases
1217 // in AVX2 will be recognized.
1218 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
1219 setOperationAction(ISD::SRL, VT, Custom);
1220 setOperationAction(ISD::SHL, VT, Custom);
1221 setOperationAction(ISD::SRA, VT, Custom);
1222 if (VT == MVT::v2i64) continue;
1223 setOperationAction(ISD::ROTL, VT, Custom);
1224 setOperationAction(ISD::ROTR, VT, Custom);
1225 setOperationAction(ISD::FSHL, VT, Custom);
1226 setOperationAction(ISD::FSHR, VT, Custom);
1227 }
1228
1229 setOperationAction(ISD::STRICT_FSQRT, MVT::v2f64, Legal);
1230 setOperationAction(ISD::STRICT_FADD, MVT::v2f64, Legal);
1231 setOperationAction(ISD::STRICT_FSUB, MVT::v2f64, Legal);
1232 setOperationAction(ISD::STRICT_FMUL, MVT::v2f64, Legal);
1233 setOperationAction(ISD::STRICT_FDIV, MVT::v2f64, Legal);
1234 }
1235
1236 if (!Subtarget.useSoftFloat() && Subtarget.hasSSSE3()) {
1237 setOperationAction(ISD::ABS, MVT::v16i8, Legal);
1238 setOperationAction(ISD::ABS, MVT::v8i16, Legal);
1239 setOperationAction(ISD::ABS, MVT::v4i32, Legal);
1240 setOperationAction(ISD::BITREVERSE, MVT::v16i8, Custom);
1241 setOperationAction(ISD::CTLZ, MVT::v16i8, Custom);
1242 setOperationAction(ISD::CTLZ, MVT::v8i16, Custom);
1243 setOperationAction(ISD::CTLZ, MVT::v4i32, Custom);
1244 setOperationAction(ISD::CTLZ, MVT::v2i64, Custom);
1245
1246 // These might be better off as horizontal vector ops.
1247 setOperationAction(ISD::ADD, MVT::i16, Custom);
1248 setOperationAction(ISD::ADD, MVT::i32, Custom);
1249 setOperationAction(ISD::SUB, MVT::i16, Custom);
1250 setOperationAction(ISD::SUB, MVT::i32, Custom);
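    // (For example, an i32 add whose operands are two extracted elements of the
    // same vector can then be matched to a single PHADDD instead of extracting
    // both scalars first.)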
1251 }
1252
1253 if (!Subtarget.useSoftFloat() && Subtarget.hasSSE41()) {
1254 for (MVT RoundedTy : {MVT::f32, MVT::f64, MVT::v4f32, MVT::v2f64}) {
1255 setOperationAction(ISD::FFLOOR, RoundedTy, Legal);
1256 setOperationAction(ISD::STRICT_FFLOOR, RoundedTy, Legal);
1257 setOperationAction(ISD::FCEIL, RoundedTy, Legal);
1258 setOperationAction(ISD::STRICT_FCEIL, RoundedTy, Legal);
1259 setOperationAction(ISD::FTRUNC, RoundedTy, Legal);
1260 setOperationAction(ISD::STRICT_FTRUNC, RoundedTy, Legal);
1261 setOperationAction(ISD::FRINT, RoundedTy, Legal);
1262 setOperationAction(ISD::STRICT_FRINT, RoundedTy, Legal);
1263 setOperationAction(ISD::FNEARBYINT, RoundedTy, Legal);
1264 setOperationAction(ISD::STRICT_FNEARBYINT, RoundedTy, Legal);
1265 setOperationAction(ISD::FROUNDEVEN, RoundedTy, Legal);
1266 setOperationAction(ISD::STRICT_FROUNDEVEN, RoundedTy, Legal);
1267
1268 setOperationAction(ISD::FROUND, RoundedTy, Custom);
1269 }
1270
1271 setOperationAction(ISD::SMAX, MVT::v16i8, Legal);
1272 setOperationAction(ISD::SMAX, MVT::v4i32, Legal);
1273 setOperationAction(ISD::UMAX, MVT::v8i16, Legal);
1274 setOperationAction(ISD::UMAX, MVT::v4i32, Legal);
1275 setOperationAction(ISD::SMIN, MVT::v16i8, Legal);
1276 setOperationAction(ISD::SMIN, MVT::v4i32, Legal);
1277 setOperationAction(ISD::UMIN, MVT::v8i16, Legal);
1278 setOperationAction(ISD::UMIN, MVT::v4i32, Legal);
1279
1280 for (auto VT : {MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64}) {
1281 setOperationAction(ISD::ABDS, VT, Custom);
1282 setOperationAction(ISD::ABDU, VT, Custom);
1283 }
1284
1285 setOperationAction(ISD::UADDSAT, MVT::v4i32, Custom);
1286 setOperationAction(ISD::SADDSAT, MVT::v2i64, Custom);
1287 setOperationAction(ISD::SSUBSAT, MVT::v2i64, Custom);
1288
1289 // FIXME: Do we need to handle scalar-to-vector here?
1290 setOperationAction(ISD::MUL, MVT::v4i32, Legal);
1291 setOperationAction(ISD::SMULO, MVT::v2i32, Custom);
1292
1293 // We directly match byte blends in the backend as they match the VSELECT
1294 // condition form.
1295 setOperationAction(ISD::VSELECT, MVT::v16i8, Legal);
1296
1297 // SSE41 brings specific instructions for doing vector sign extend even in
1298 // cases where we don't have SRA.
1299 for (auto VT : { MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
1300 setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, VT, Legal);
1301 setOperationAction(ISD::ZERO_EXTEND_VECTOR_INREG, VT, Legal);
1302 }
1303
1304 // SSE41 also has vector sign/zero extending loads, PMOV[SZ]X
1305 for (auto LoadExtOp : { ISD::SEXTLOAD, ISD::ZEXTLOAD }) {
1306 setLoadExtAction(LoadExtOp, MVT::v8i16, MVT::v8i8, Legal);
1307 setLoadExtAction(LoadExtOp, MVT::v4i32, MVT::v4i8, Legal);
1308 setLoadExtAction(LoadExtOp, MVT::v2i64, MVT::v2i8, Legal);
1309 setLoadExtAction(LoadExtOp, MVT::v4i32, MVT::v4i16, Legal);
1310 setLoadExtAction(LoadExtOp, MVT::v2i64, MVT::v2i16, Legal);
1311 setLoadExtAction(LoadExtOp, MVT::v2i64, MVT::v2i32, Legal);
1312 }
1313
1314 if (Subtarget.is64Bit() && !Subtarget.hasAVX512()) {
1315      // We need to scalarize v4i64->v4f32 uint_to_fp using cvtsi2ss, but we can
1316 // do the pre and post work in the vector domain.
1317 setOperationAction(ISD::UINT_TO_FP, MVT::v4i64, Custom);
1318 setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v4i64, Custom);
1319 // We need to mark SINT_TO_FP as Custom even though we want to expand it
1320 // so that DAG combine doesn't try to turn it into uint_to_fp.
1321 setOperationAction(ISD::SINT_TO_FP, MVT::v4i64, Custom);
1322 setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v4i64, Custom);
1323 }
1324 }
1325
1326 if (!Subtarget.useSoftFloat() && Subtarget.hasSSE42()) {
1327 setOperationAction(ISD::UADDSAT, MVT::v2i64, Custom);
1328 }
1329
1330 if (!Subtarget.useSoftFloat() && Subtarget.hasXOP()) {
1331 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64,
1332 MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
1333 setOperationAction(ISD::ROTL, VT, Custom);
1334 setOperationAction(ISD::ROTR, VT, Custom);
1335 }
1336
1337 // XOP can efficiently perform BITREVERSE with VPPERM.
1338 for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 })
1339 setOperationAction(ISD::BITREVERSE, VT, Custom);
1340
1341 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64,
1342 MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 })
1343 setOperationAction(ISD::BITREVERSE, VT, Custom);
1344 }
1345
1346 if (!Subtarget.useSoftFloat() && Subtarget.hasAVX()) {
1347 bool HasInt256 = Subtarget.hasInt256();
1348
1349 addRegisterClass(MVT::v32i8, Subtarget.hasVLX() ? &X86::VR256XRegClass
1350 : &X86::VR256RegClass);
1351 addRegisterClass(MVT::v16i16, Subtarget.hasVLX() ? &X86::VR256XRegClass
1352 : &X86::VR256RegClass);
1353 addRegisterClass(MVT::v16f16, Subtarget.hasVLX() ? &X86::VR256XRegClass
1354 : &X86::VR256RegClass);
1355 addRegisterClass(MVT::v8i32, Subtarget.hasVLX() ? &X86::VR256XRegClass
1356 : &X86::VR256RegClass);
1357 addRegisterClass(MVT::v8f32, Subtarget.hasVLX() ? &X86::VR256XRegClass
1358 : &X86::VR256RegClass);
1359 addRegisterClass(MVT::v4i64, Subtarget.hasVLX() ? &X86::VR256XRegClass
1360 : &X86::VR256RegClass);
1361 addRegisterClass(MVT::v4f64, Subtarget.hasVLX() ? &X86::VR256XRegClass
1362 : &X86::VR256RegClass);
1363
1364 for (auto VT : { MVT::v8f32, MVT::v4f64 }) {
1365 setOperationAction(ISD::FFLOOR, VT, Legal);
1366 setOperationAction(ISD::STRICT_FFLOOR, VT, Legal);
1367 setOperationAction(ISD::FCEIL, VT, Legal);
1368 setOperationAction(ISD::STRICT_FCEIL, VT, Legal);
1369 setOperationAction(ISD::FTRUNC, VT, Legal);
1370 setOperationAction(ISD::STRICT_FTRUNC, VT, Legal);
1371 setOperationAction(ISD::FRINT, VT, Legal);
1372 setOperationAction(ISD::STRICT_FRINT, VT, Legal);
1373 setOperationAction(ISD::FNEARBYINT, VT, Legal);
1374 setOperationAction(ISD::STRICT_FNEARBYINT, VT, Legal);
1375 setOperationAction(ISD::FROUNDEVEN, VT, Legal);
1376 setOperationAction(ISD::STRICT_FROUNDEVEN, VT, Legal);
1377
1378 setOperationAction(ISD::FROUND, VT, Custom);
1379
1380 setOperationAction(ISD::FNEG, VT, Custom);
1381 setOperationAction(ISD::FABS, VT, Custom);
1382 setOperationAction(ISD::FCOPYSIGN, VT, Custom);
1383 }
1384
1385 // (fp_to_int:v8i16 (v8f32 ..)) requires the result type to be promoted
1386 // even though v8i16 is a legal type.
1387 setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v8i16, MVT::v8i32);
1388 setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v8i16, MVT::v8i32);
1389 setOperationPromotedToType(ISD::STRICT_FP_TO_SINT, MVT::v8i16, MVT::v8i32);
1390 setOperationPromotedToType(ISD::STRICT_FP_TO_UINT, MVT::v8i16, MVT::v8i32);
1391 setOperationAction(ISD::FP_TO_SINT, MVT::v8i32, Custom);
1392 setOperationAction(ISD::FP_TO_UINT, MVT::v8i32, Custom);
1393 setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v8i32, Custom);
1394
1395 setOperationAction(ISD::SINT_TO_FP, MVT::v8i32, Custom);
1396 setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v8i32, Custom);
1397 setOperationAction(ISD::FP_EXTEND, MVT::v8f32, Expand);
1398 setOperationAction(ISD::FP_ROUND, MVT::v8f16, Expand);
1399 setOperationAction(ISD::FP_EXTEND, MVT::v4f64, Custom);
1400 setOperationAction(ISD::STRICT_FP_EXTEND, MVT::v4f64, Custom);
1401
1402 setOperationAction(ISD::STRICT_FP_ROUND, MVT::v4f32, Legal);
1403 setOperationAction(ISD::STRICT_FADD, MVT::v8f32, Legal);
1404 setOperationAction(ISD::STRICT_FADD, MVT::v4f64, Legal);
1405 setOperationAction(ISD::STRICT_FSUB, MVT::v8f32, Legal);
1406 setOperationAction(ISD::STRICT_FSUB, MVT::v4f64, Legal);
1407 setOperationAction(ISD::STRICT_FMUL, MVT::v8f32, Legal);
1408 setOperationAction(ISD::STRICT_FMUL, MVT::v4f64, Legal);
1409 setOperationAction(ISD::STRICT_FDIV, MVT::v8f32, Legal);
1410 setOperationAction(ISD::STRICT_FDIV, MVT::v4f64, Legal);
1411 setOperationAction(ISD::STRICT_FSQRT, MVT::v8f32, Legal);
1412 setOperationAction(ISD::STRICT_FSQRT, MVT::v4f64, Legal);
1413
1414 if (!Subtarget.hasAVX512())
1415 setOperationAction(ISD::BITCAST, MVT::v32i1, Custom);
1416
1417 // In the customized shift lowering, the legal v8i32/v4i64 cases
1418 // in AVX2 will be recognized.
1419 for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
1420 setOperationAction(ISD::SRL, VT, Custom);
1421 setOperationAction(ISD::SHL, VT, Custom);
1422 setOperationAction(ISD::SRA, VT, Custom);
1423 setOperationAction(ISD::ABDS, VT, Custom);
1424 setOperationAction(ISD::ABDU, VT, Custom);
1425 if (VT == MVT::v4i64) continue;
1426 setOperationAction(ISD::ROTL, VT, Custom);
1427 setOperationAction(ISD::ROTR, VT, Custom);
1428 setOperationAction(ISD::FSHL, VT, Custom);
1429 setOperationAction(ISD::FSHR, VT, Custom);
1430 }
1431
1432 // These types need custom splitting if their input is a 128-bit vector.
1433 setOperationAction(ISD::SIGN_EXTEND, MVT::v8i64, Custom);
1434 setOperationAction(ISD::SIGN_EXTEND, MVT::v16i32, Custom);
1435 setOperationAction(ISD::ZERO_EXTEND, MVT::v8i64, Custom);
1436 setOperationAction(ISD::ZERO_EXTEND, MVT::v16i32, Custom);
1437
1438 setOperationAction(ISD::SELECT, MVT::v4f64, Custom);
1439 setOperationAction(ISD::SELECT, MVT::v4i64, Custom);
1440 setOperationAction(ISD::SELECT, MVT::v8i32, Custom);
1441 setOperationAction(ISD::SELECT, MVT::v16i16, Custom);
1442 setOperationAction(ISD::SELECT, MVT::v16f16, Custom);
1443 setOperationAction(ISD::SELECT, MVT::v32i8, Custom);
1444 setOperationAction(ISD::SELECT, MVT::v8f32, Custom);
1445
1446 for (auto VT : { MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
1447 setOperationAction(ISD::SIGN_EXTEND, VT, Custom);
1448 setOperationAction(ISD::ZERO_EXTEND, VT, Custom);
1449 setOperationAction(ISD::ANY_EXTEND, VT, Custom);
1450 }
1451
1452 setOperationAction(ISD::TRUNCATE, MVT::v16i8, Custom);
1453 setOperationAction(ISD::TRUNCATE, MVT::v8i16, Custom);
1454 setOperationAction(ISD::TRUNCATE, MVT::v4i32, Custom);
1455 setOperationAction(ISD::BITREVERSE, MVT::v32i8, Custom);
1456
1457 for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
1458 setOperationAction(ISD::SETCC, VT, Custom);
1459 setOperationAction(ISD::STRICT_FSETCC, VT, Custom);
1460 setOperationAction(ISD::STRICT_FSETCCS, VT, Custom);
1461 setOperationAction(ISD::CTPOP, VT, Custom);
1462 setOperationAction(ISD::CTLZ, VT, Custom);
1463
1464      // The condition codes aren't legal in SSE/AVX, and under AVX512 we use
1465      // setcc all the way to isel and prefer SETGT in some isel patterns.
1466 setCondCodeAction(ISD::SETLT, VT, Custom);
1467 setCondCodeAction(ISD::SETLE, VT, Custom);
1468 }
1469
1470 if (Subtarget.hasAnyFMA()) {
1471 for (auto VT : { MVT::f32, MVT::f64, MVT::v4f32, MVT::v8f32,
1472 MVT::v2f64, MVT::v4f64 }) {
1473 setOperationAction(ISD::FMA, VT, Legal);
1474 setOperationAction(ISD::STRICT_FMA, VT, Legal);
1475 }
1476 }
1477
1478 for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
1479 setOperationAction(ISD::ADD, VT, HasInt256 ? Legal : Custom);
1480 setOperationAction(ISD::SUB, VT, HasInt256 ? Legal : Custom);
1481 }
1482
1483 setOperationAction(ISD::MUL, MVT::v4i64, Custom);
1484 setOperationAction(ISD::MUL, MVT::v8i32, HasInt256 ? Legal : Custom);
1485 setOperationAction(ISD::MUL, MVT::v16i16, HasInt256 ? Legal : Custom);
1486 setOperationAction(ISD::MUL, MVT::v32i8, Custom);
1487
1488 setOperationAction(ISD::MULHU, MVT::v8i32, Custom);
1489 setOperationAction(ISD::MULHS, MVT::v8i32, Custom);
1490 setOperationAction(ISD::MULHU, MVT::v16i16, HasInt256 ? Legal : Custom);
1491 setOperationAction(ISD::MULHS, MVT::v16i16, HasInt256 ? Legal : Custom);
1492 setOperationAction(ISD::MULHU, MVT::v32i8, Custom);
1493 setOperationAction(ISD::MULHS, MVT::v32i8, Custom);
1494 setOperationAction(ISD::AVGCEILU, MVT::v16i16, HasInt256 ? Legal : Custom);
1495 setOperationAction(ISD::AVGCEILU, MVT::v32i8, HasInt256 ? Legal : Custom);
1496
1497 setOperationAction(ISD::SMULO, MVT::v32i8, Custom);
1498 setOperationAction(ISD::UMULO, MVT::v32i8, Custom);
1499
1500 setOperationAction(ISD::ABS, MVT::v4i64, Custom);
1501 setOperationAction(ISD::SMAX, MVT::v4i64, Custom);
1502 setOperationAction(ISD::UMAX, MVT::v4i64, Custom);
1503 setOperationAction(ISD::SMIN, MVT::v4i64, Custom);
1504 setOperationAction(ISD::UMIN, MVT::v4i64, Custom);
1505
1506 setOperationAction(ISD::UADDSAT, MVT::v32i8, HasInt256 ? Legal : Custom);
1507 setOperationAction(ISD::SADDSAT, MVT::v32i8, HasInt256 ? Legal : Custom);
1508 setOperationAction(ISD::USUBSAT, MVT::v32i8, HasInt256 ? Legal : Custom);
1509 setOperationAction(ISD::SSUBSAT, MVT::v32i8, HasInt256 ? Legal : Custom);
1510 setOperationAction(ISD::UADDSAT, MVT::v16i16, HasInt256 ? Legal : Custom);
1511 setOperationAction(ISD::SADDSAT, MVT::v16i16, HasInt256 ? Legal : Custom);
1512 setOperationAction(ISD::USUBSAT, MVT::v16i16, HasInt256 ? Legal : Custom);
1513 setOperationAction(ISD::SSUBSAT, MVT::v16i16, HasInt256 ? Legal : Custom);
1514 setOperationAction(ISD::UADDSAT, MVT::v8i32, Custom);
1515 setOperationAction(ISD::USUBSAT, MVT::v8i32, Custom);
1516 setOperationAction(ISD::UADDSAT, MVT::v4i64, Custom);
1517 setOperationAction(ISD::USUBSAT, MVT::v4i64, Custom);
1518
1519 for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32 }) {
1520 setOperationAction(ISD::ABS, VT, HasInt256 ? Legal : Custom);
1521 setOperationAction(ISD::SMAX, VT, HasInt256 ? Legal : Custom);
1522 setOperationAction(ISD::UMAX, VT, HasInt256 ? Legal : Custom);
1523 setOperationAction(ISD::SMIN, VT, HasInt256 ? Legal : Custom);
1524 setOperationAction(ISD::UMIN, VT, HasInt256 ? Legal : Custom);
1525 }
1526
1527 for (auto VT : {MVT::v16i16, MVT::v8i32, MVT::v4i64}) {
1528 setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, VT, Custom);
1529 setOperationAction(ISD::ZERO_EXTEND_VECTOR_INREG, VT, Custom);
1530 }
1531
1532 if (HasInt256) {
1533 // The custom lowering for UINT_TO_FP for v8i32 becomes interesting
1534 // when we have a 256bit-wide blend with immediate.
1535 setOperationAction(ISD::UINT_TO_FP, MVT::v8i32, Custom);
1536 setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v8i32, Custom);
1537
1538 // AVX2 also has wider vector sign/zero extending loads, VPMOV[SZ]X
1539 for (auto LoadExtOp : { ISD::SEXTLOAD, ISD::ZEXTLOAD }) {
1540 setLoadExtAction(LoadExtOp, MVT::v16i16, MVT::v16i8, Legal);
1541 setLoadExtAction(LoadExtOp, MVT::v8i32, MVT::v8i8, Legal);
1542 setLoadExtAction(LoadExtOp, MVT::v4i64, MVT::v4i8, Legal);
1543 setLoadExtAction(LoadExtOp, MVT::v8i32, MVT::v8i16, Legal);
1544 setLoadExtAction(LoadExtOp, MVT::v4i64, MVT::v4i16, Legal);
1545 setLoadExtAction(LoadExtOp, MVT::v4i64, MVT::v4i32, Legal);
1546 }
1547 }
1548
1549 for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
1550 MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64 }) {
1551 setOperationAction(ISD::MLOAD, VT, Subtarget.hasVLX() ? Legal : Custom);
1552 setOperationAction(ISD::MSTORE, VT, Legal);
1553 }
1554
1555 // Extract subvector is special because the value type
1556 // (result) is 128-bit but the source is 256-bit wide.
1557 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64,
1558 MVT::v8f16, MVT::v4f32, MVT::v2f64 }) {
1559 setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Legal);
1560 }
1561
1562 // Custom lower several nodes for 256-bit types.
1563 for (MVT VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64,
1564 MVT::v16f16, MVT::v8f32, MVT::v4f64 }) {
1565 setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
1566 setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
1567 setOperationAction(ISD::VSELECT, VT, Custom);
1568 setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
1569 setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
1570 setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Custom);
1571 setOperationAction(ISD::INSERT_SUBVECTOR, VT, Legal);
1572 setOperationAction(ISD::CONCAT_VECTORS, VT, Custom);
1573 setOperationAction(ISD::STORE, VT, Custom);
1574 }
1575 setF16Action(MVT::v16f16, Expand);
1576 setOperationAction(ISD::FADD, MVT::v16f16, Expand);
1577 setOperationAction(ISD::FSUB, MVT::v16f16, Expand);
1578 setOperationAction(ISD::FMUL, MVT::v16f16, Expand);
1579 setOperationAction(ISD::FDIV, MVT::v16f16, Expand);
1580
1581 if (HasInt256) {
1582 setOperationAction(ISD::VSELECT, MVT::v32i8, Legal);
1583
1584 // Custom legalize 2x32 to get a little better code.
1585 setOperationAction(ISD::MGATHER, MVT::v2f32, Custom);
1586 setOperationAction(ISD::MGATHER, MVT::v2i32, Custom);
1587
1588 for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
1589 MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64 })
1590 setOperationAction(ISD::MGATHER, VT, Custom);
1591 }
1592 }
1593
1594 if (!Subtarget.useSoftFloat() && !Subtarget.hasFP16() &&
1595 Subtarget.hasF16C()) {
1596 for (MVT VT : { MVT::f16, MVT::v2f16, MVT::v4f16, MVT::v8f16 }) {
1597 setOperationAction(ISD::FP_ROUND, VT, Custom);
1598 setOperationAction(ISD::STRICT_FP_ROUND, VT, Custom);
1599 }
1600 for (MVT VT : { MVT::f32, MVT::v2f32, MVT::v4f32 }) {
1601 setOperationAction(ISD::FP_EXTEND, VT, Custom);
1602 setOperationAction(ISD::STRICT_FP_EXTEND, VT, Custom);
1603 }
1604 for (unsigned Opc : {ISD::FADD, ISD::FSUB, ISD::FMUL, ISD::FDIV}) {
1605 setOperationPromotedToType(Opc, MVT::v8f16, MVT::v8f32);
1606 setOperationPromotedToType(Opc, MVT::v16f16, MVT::v16f32);
1607 }
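    // With F16C but without AVX512-FP16 there is no native half arithmetic, so
    // these ops are promoted to the corresponding f32 vector type; the Custom
    // FP_EXTEND/FP_ROUND handling above supplies the conversions (roughly,
    // VCVTPH2PS before the operation and VCVTPS2PH after it).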
1608
1609 setOperationAction(ISD::FP_EXTEND, MVT::v8f32, Legal);
1610 setOperationAction(ISD::STRICT_FP_EXTEND, MVT::v8f32, Legal);
1611 }
1612
1613 // This block controls legalization of the mask vector sizes that are
1614 // available with AVX512. 512-bit vectors are in a separate block controlled
1615 // by useAVX512Regs.
1616 if (!Subtarget.useSoftFloat() && Subtarget.hasAVX512()) {
1617 addRegisterClass(MVT::v1i1, &X86::VK1RegClass);
1618 addRegisterClass(MVT::v2i1, &X86::VK2RegClass);
1619 addRegisterClass(MVT::v4i1, &X86::VK4RegClass);
1620 addRegisterClass(MVT::v8i1, &X86::VK8RegClass);
1621 addRegisterClass(MVT::v16i1, &X86::VK16RegClass);
1622
1623 setOperationAction(ISD::SELECT, MVT::v1i1, Custom);
1624 setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v1i1, Custom);
1625 setOperationAction(ISD::BUILD_VECTOR, MVT::v1i1, Custom);
1626
1627 setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v8i1, MVT::v8i32);
1628 setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v8i1, MVT::v8i32);
1629 setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v4i1, MVT::v4i32);
1630 setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v4i1, MVT::v4i32);
1631 setOperationPromotedToType(ISD::STRICT_FP_TO_SINT, MVT::v8i1, MVT::v8i32);
1632 setOperationPromotedToType(ISD::STRICT_FP_TO_UINT, MVT::v8i1, MVT::v8i32);
1633 setOperationPromotedToType(ISD::STRICT_FP_TO_SINT, MVT::v4i1, MVT::v4i32);
1634 setOperationPromotedToType(ISD::STRICT_FP_TO_UINT, MVT::v4i1, MVT::v4i32);
1635 setOperationAction(ISD::FP_TO_SINT, MVT::v2i1, Custom);
1636 setOperationAction(ISD::FP_TO_UINT, MVT::v2i1, Custom);
1637 setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v2i1, Custom);
1638 setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v2i1, Custom);
1639
1640 // There is no byte sized k-register load or store without AVX512DQ.
1641 if (!Subtarget.hasDQI()) {
1642 setOperationAction(ISD::LOAD, MVT::v1i1, Custom);
1643 setOperationAction(ISD::LOAD, MVT::v2i1, Custom);
1644 setOperationAction(ISD::LOAD, MVT::v4i1, Custom);
1645 setOperationAction(ISD::LOAD, MVT::v8i1, Custom);
1646
1647 setOperationAction(ISD::STORE, MVT::v1i1, Custom);
1648 setOperationAction(ISD::STORE, MVT::v2i1, Custom);
1649 setOperationAction(ISD::STORE, MVT::v4i1, Custom);
1650 setOperationAction(ISD::STORE, MVT::v8i1, Custom);
1651 }
1652
1653 // Extends of v16i1/v8i1/v4i1/v2i1 to 128-bit vectors.
1654 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
1655 setOperationAction(ISD::SIGN_EXTEND, VT, Custom);
1656 setOperationAction(ISD::ZERO_EXTEND, VT, Custom);
1657 setOperationAction(ISD::ANY_EXTEND, VT, Custom);
1658 }
1659
1660 for (auto VT : { MVT::v1i1, MVT::v2i1, MVT::v4i1, MVT::v8i1, MVT::v16i1 })
1661 setOperationAction(ISD::VSELECT, VT, Expand);
1662
1663 for (auto VT : { MVT::v2i1, MVT::v4i1, MVT::v8i1, MVT::v16i1 }) {
1664 setOperationAction(ISD::SETCC, VT, Custom);
1665 setOperationAction(ISD::STRICT_FSETCC, VT, Custom);
1666 setOperationAction(ISD::STRICT_FSETCCS, VT, Custom);
1667 setOperationAction(ISD::SELECT, VT, Custom);
1668 setOperationAction(ISD::TRUNCATE, VT, Custom);
1669
1670 setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
1671 setOperationAction(ISD::CONCAT_VECTORS, VT, Custom);
1672 setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
1673 setOperationAction(ISD::INSERT_SUBVECTOR, VT, Custom);
1674 setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
1675 setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
1676 }
1677
1678 for (auto VT : { MVT::v1i1, MVT::v2i1, MVT::v4i1, MVT::v8i1 })
1679 setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);
1680 }
1681
1682 // This block controls legalization for 512-bit operations with 32/64 bit
1683  // elements. 512-bit operations can be disabled based on prefer-vector-width and
1684 // required-vector-width function attributes.
1685 if (!Subtarget.useSoftFloat() && Subtarget.useAVX512Regs()) {
1686 bool HasBWI = Subtarget.hasBWI();
1687
1688 addRegisterClass(MVT::v16i32, &X86::VR512RegClass);
1689 addRegisterClass(MVT::v16f32, &X86::VR512RegClass);
1690 addRegisterClass(MVT::v8i64, &X86::VR512RegClass);
1691 addRegisterClass(MVT::v8f64, &X86::VR512RegClass);
1692 addRegisterClass(MVT::v32i16, &X86::VR512RegClass);
1693 addRegisterClass(MVT::v32f16, &X86::VR512RegClass);
1694 addRegisterClass(MVT::v64i8, &X86::VR512RegClass);
1695
1696 for (auto ExtType : {ISD::ZEXTLOAD, ISD::SEXTLOAD}) {
1697 setLoadExtAction(ExtType, MVT::v16i32, MVT::v16i8, Legal);
1698 setLoadExtAction(ExtType, MVT::v16i32, MVT::v16i16, Legal);
1699 setLoadExtAction(ExtType, MVT::v8i64, MVT::v8i8, Legal);
1700 setLoadExtAction(ExtType, MVT::v8i64, MVT::v8i16, Legal);
1701 setLoadExtAction(ExtType, MVT::v8i64, MVT::v8i32, Legal);
1702 if (HasBWI)
1703 setLoadExtAction(ExtType, MVT::v32i16, MVT::v32i8, Legal);
1704 }
1705
1706 for (MVT VT : { MVT::v16f32, MVT::v8f64 }) {
1707 setOperationAction(ISD::FNEG, VT, Custom);
1708 setOperationAction(ISD::FABS, VT, Custom);
1709 setOperationAction(ISD::FMA, VT, Legal);
1710 setOperationAction(ISD::STRICT_FMA, VT, Legal);
1711 setOperationAction(ISD::FCOPYSIGN, VT, Custom);
1712 }
1713
1714 for (MVT VT : { MVT::v16i1, MVT::v16i8 }) {
1715 setOperationPromotedToType(ISD::FP_TO_SINT , VT, MVT::v16i32);
1716 setOperationPromotedToType(ISD::FP_TO_UINT , VT, MVT::v16i32);
1717 setOperationPromotedToType(ISD::STRICT_FP_TO_SINT, VT, MVT::v16i32);
1718 setOperationPromotedToType(ISD::STRICT_FP_TO_UINT, VT, MVT::v16i32);
1719 }
1720
1721 for (MVT VT : { MVT::v16i16, MVT::v16i32 }) {
1722 setOperationAction(ISD::FP_TO_SINT, VT, Custom);
1723 setOperationAction(ISD::FP_TO_UINT, VT, Custom);
1724 setOperationAction(ISD::STRICT_FP_TO_SINT, VT, Custom);
1725 setOperationAction(ISD::STRICT_FP_TO_UINT, VT, Custom);
1726 }
1727
1728 setOperationAction(ISD::SINT_TO_FP, MVT::v16i32, Custom);
1729 setOperationAction(ISD::UINT_TO_FP, MVT::v16i32, Custom);
1730 setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v16i32, Custom);
1731 setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v16i32, Custom);
1732 setOperationAction(ISD::FP_EXTEND, MVT::v8f64, Custom);
1733 setOperationAction(ISD::STRICT_FP_EXTEND, MVT::v8f64, Custom);
1734
1735 setOperationAction(ISD::STRICT_FADD, MVT::v16f32, Legal);
1736 setOperationAction(ISD::STRICT_FADD, MVT::v8f64, Legal);
1737 setOperationAction(ISD::STRICT_FSUB, MVT::v16f32, Legal);
1738 setOperationAction(ISD::STRICT_FSUB, MVT::v8f64, Legal);
1739 setOperationAction(ISD::STRICT_FMUL, MVT::v16f32, Legal);
1740 setOperationAction(ISD::STRICT_FMUL, MVT::v8f64, Legal);
1741 setOperationAction(ISD::STRICT_FDIV, MVT::v16f32, Legal);
1742 setOperationAction(ISD::STRICT_FDIV, MVT::v8f64, Legal);
1743 setOperationAction(ISD::STRICT_FSQRT, MVT::v16f32, Legal);
1744 setOperationAction(ISD::STRICT_FSQRT, MVT::v8f64, Legal);
1745 setOperationAction(ISD::STRICT_FP_ROUND, MVT::v8f32, Legal);
1746
1747 setTruncStoreAction(MVT::v8i64, MVT::v8i8, Legal);
1748 setTruncStoreAction(MVT::v8i64, MVT::v8i16, Legal);
1749 setTruncStoreAction(MVT::v8i64, MVT::v8i32, Legal);
1750 setTruncStoreAction(MVT::v16i32, MVT::v16i8, Legal);
1751 setTruncStoreAction(MVT::v16i32, MVT::v16i16, Legal);
1752 if (HasBWI)
1753 setTruncStoreAction(MVT::v32i16, MVT::v32i8, Legal);
1754
1755 // With 512-bit vectors and no VLX, we prefer to widen MLOAD/MSTORE
1756 // to 512-bit rather than use the AVX2 instructions so that we can use
1757 // k-masks.
1758 if (!Subtarget.hasVLX()) {
1759 for (auto VT : {MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
1760 MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64}) {
1761 setOperationAction(ISD::MLOAD, VT, Custom);
1762 setOperationAction(ISD::MSTORE, VT, Custom);
1763 }
1764 }
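    // Roughly: a masked load/store of, say, v8f32 is widened to the v16f32 form
    // with the extra mask lanes cleared, so the AVX512 k-register masked
    // instructions can be used instead of the AVX/AVX2 VMASKMOV-style forms.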
1765
1766 setOperationAction(ISD::TRUNCATE, MVT::v8i32, Legal);
1767 setOperationAction(ISD::TRUNCATE, MVT::v16i16, Legal);
1768 setOperationAction(ISD::TRUNCATE, MVT::v32i8, HasBWI ? Legal : Custom);
1769 setOperationAction(ISD::TRUNCATE, MVT::v16i64, Custom);
1770 setOperationAction(ISD::ZERO_EXTEND, MVT::v32i16, Custom);
1771 setOperationAction(ISD::ZERO_EXTEND, MVT::v16i32, Custom);
1772 setOperationAction(ISD::ZERO_EXTEND, MVT::v8i64, Custom);
1773 setOperationAction(ISD::ANY_EXTEND, MVT::v32i16, Custom);
1774 setOperationAction(ISD::ANY_EXTEND, MVT::v16i32, Custom);
1775 setOperationAction(ISD::ANY_EXTEND, MVT::v8i64, Custom);
1776 setOperationAction(ISD::SIGN_EXTEND, MVT::v32i16, Custom);
1777 setOperationAction(ISD::SIGN_EXTEND, MVT::v16i32, Custom);
1778 setOperationAction(ISD::SIGN_EXTEND, MVT::v8i64, Custom);
1779
1780 if (HasBWI) {
1781 // Extends from v64i1 masks to 512-bit vectors.
1782 setOperationAction(ISD::SIGN_EXTEND, MVT::v64i8, Custom);
1783 setOperationAction(ISD::ZERO_EXTEND, MVT::v64i8, Custom);
1784 setOperationAction(ISD::ANY_EXTEND, MVT::v64i8, Custom);
1785 }
1786
1787 for (auto VT : { MVT::v16f32, MVT::v8f64 }) {
1788 setOperationAction(ISD::FFLOOR, VT, Legal);
1789 setOperationAction(ISD::STRICT_FFLOOR, VT, Legal);
1790 setOperationAction(ISD::FCEIL, VT, Legal);
1791 setOperationAction(ISD::STRICT_FCEIL, VT, Legal);
1792 setOperationAction(ISD::FTRUNC, VT, Legal);
1793 setOperationAction(ISD::STRICT_FTRUNC, VT, Legal);
1794 setOperationAction(ISD::FRINT, VT, Legal);
1795 setOperationAction(ISD::STRICT_FRINT, VT, Legal);
1796 setOperationAction(ISD::FNEARBYINT, VT, Legal);
1797 setOperationAction(ISD::STRICT_FNEARBYINT, VT, Legal);
1798 setOperationAction(ISD::FROUNDEVEN, VT, Legal);
1799 setOperationAction(ISD::STRICT_FROUNDEVEN, VT, Legal);
1800
1801 setOperationAction(ISD::FROUND, VT, Custom);
1802 }
1803
1804 for (auto VT : {MVT::v32i16, MVT::v16i32, MVT::v8i64}) {
1805 setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, VT, Custom);
1806 setOperationAction(ISD::ZERO_EXTEND_VECTOR_INREG, VT, Custom);
1807 }
1808
1809 setOperationAction(ISD::ADD, MVT::v32i16, HasBWI ? Legal : Custom);
1810 setOperationAction(ISD::SUB, MVT::v32i16, HasBWI ? Legal : Custom);
1811 setOperationAction(ISD::ADD, MVT::v64i8, HasBWI ? Legal : Custom);
1812 setOperationAction(ISD::SUB, MVT::v64i8, HasBWI ? Legal : Custom);
1813
1814 setOperationAction(ISD::MUL, MVT::v8i64, Custom);
1815 setOperationAction(ISD::MUL, MVT::v16i32, Legal);
1816 setOperationAction(ISD::MUL, MVT::v32i16, HasBWI ? Legal : Custom);
1817 setOperationAction(ISD::MUL, MVT::v64i8, Custom);
1818
1819 setOperationAction(ISD::MULHU, MVT::v16i32, Custom);
1820 setOperationAction(ISD::MULHS, MVT::v16i32, Custom);
1821 setOperationAction(ISD::MULHS, MVT::v32i16, HasBWI ? Legal : Custom);
1822 setOperationAction(ISD::MULHU, MVT::v32i16, HasBWI ? Legal : Custom);
1823 setOperationAction(ISD::MULHS, MVT::v64i8, Custom);
1824 setOperationAction(ISD::MULHU, MVT::v64i8, Custom);
1825 setOperationAction(ISD::AVGCEILU, MVT::v32i16, HasBWI ? Legal : Custom);
1826 setOperationAction(ISD::AVGCEILU, MVT::v64i8, HasBWI ? Legal : Custom);
1827
1828 setOperationAction(ISD::SMULO, MVT::v64i8, Custom);
1829 setOperationAction(ISD::UMULO, MVT::v64i8, Custom);
1830
1831 setOperationAction(ISD::BITREVERSE, MVT::v64i8, Custom);
1832
1833 for (auto VT : { MVT::v64i8, MVT::v32i16, MVT::v16i32, MVT::v8i64 }) {
1834 setOperationAction(ISD::SRL, VT, Custom);
1835 setOperationAction(ISD::SHL, VT, Custom);
1836 setOperationAction(ISD::SRA, VT, Custom);
1837 setOperationAction(ISD::ROTL, VT, Custom);
1838 setOperationAction(ISD::ROTR, VT, Custom);
1839 setOperationAction(ISD::SETCC, VT, Custom);
1840 setOperationAction(ISD::ABDS, VT, Custom);
1841 setOperationAction(ISD::ABDU, VT, Custom);
1842
1843      // The condition codes aren't legal in SSE/AVX, and under AVX512 we use
1844      // setcc all the way to isel and prefer SETGT in some isel patterns.
1845 setCondCodeAction(ISD::SETLT, VT, Custom);
1846 setCondCodeAction(ISD::SETLE, VT, Custom);
1847 }
1848 for (auto VT : { MVT::v16i32, MVT::v8i64 }) {
1849 setOperationAction(ISD::SMAX, VT, Legal);
1850 setOperationAction(ISD::UMAX, VT, Legal);
1851 setOperationAction(ISD::SMIN, VT, Legal);
1852 setOperationAction(ISD::UMIN, VT, Legal);
1853 setOperationAction(ISD::ABS, VT, Legal);
1854 setOperationAction(ISD::CTPOP, VT, Custom);
1855 setOperationAction(ISD::STRICT_FSETCC, VT, Custom);
1856 setOperationAction(ISD::STRICT_FSETCCS, VT, Custom);
1857 }
1858
1859 for (auto VT : { MVT::v64i8, MVT::v32i16 }) {
1860 setOperationAction(ISD::ABS, VT, HasBWI ? Legal : Custom);
1861 setOperationAction(ISD::CTPOP, VT, Subtarget.hasBITALG() ? Legal : Custom);
1862 setOperationAction(ISD::CTLZ, VT, Custom);
1863 setOperationAction(ISD::SMAX, VT, HasBWI ? Legal : Custom);
1864 setOperationAction(ISD::UMAX, VT, HasBWI ? Legal : Custom);
1865 setOperationAction(ISD::SMIN, VT, HasBWI ? Legal : Custom);
1866 setOperationAction(ISD::UMIN, VT, HasBWI ? Legal : Custom);
1867 setOperationAction(ISD::UADDSAT, VT, HasBWI ? Legal : Custom);
1868 setOperationAction(ISD::SADDSAT, VT, HasBWI ? Legal : Custom);
1869 setOperationAction(ISD::USUBSAT, VT, HasBWI ? Legal : Custom);
1870 setOperationAction(ISD::SSUBSAT, VT, HasBWI ? Legal : Custom);
1871 }
1872
1873 setOperationAction(ISD::FSHL, MVT::v64i8, Custom);
1874 setOperationAction(ISD::FSHR, MVT::v64i8, Custom);
1875 setOperationAction(ISD::FSHL, MVT::v32i16, Custom);
1876 setOperationAction(ISD::FSHR, MVT::v32i16, Custom);
1877 setOperationAction(ISD::FSHL, MVT::v16i32, Custom);
1878 setOperationAction(ISD::FSHR, MVT::v16i32, Custom);
1879
1880 if (Subtarget.hasDQI()) {
1881 for (auto Opc : {ISD::SINT_TO_FP, ISD::UINT_TO_FP, ISD::STRICT_SINT_TO_FP,
1882 ISD::STRICT_UINT_TO_FP, ISD::FP_TO_SINT, ISD::FP_TO_UINT,
1883 ISD::STRICT_FP_TO_SINT, ISD::STRICT_FP_TO_UINT})
1884 setOperationAction(Opc, MVT::v8i64, Custom);
1885 setOperationAction(ISD::MUL, MVT::v8i64, Legal);
1886 }
1887
1888 if (Subtarget.hasCDI()) {
1889 // NonVLX sub-targets extend 128/256 vectors to use the 512 version.
1890 for (auto VT : { MVT::v16i32, MVT::v8i64} ) {
1891 setOperationAction(ISD::CTLZ, VT, Legal);
1892 }
1893 } // Subtarget.hasCDI()
1894
1895 if (Subtarget.hasVPOPCNTDQ()) {
1896 for (auto VT : { MVT::v16i32, MVT::v8i64 })
1897 setOperationAction(ISD::CTPOP, VT, Legal);
1898 }
1899
1900 // Extract subvector is special because the value type
1901 // (result) is 256-bit but the source is 512-bit wide.
1902 // 128-bit was made Legal under AVX1.
1903 for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64,
1904 MVT::v16f16, MVT::v8f32, MVT::v4f64 })
1905 setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Legal);
1906
1907 for (auto VT : { MVT::v64i8, MVT::v32i16, MVT::v16i32, MVT::v8i64,
1908 MVT::v32f16, MVT::v16f32, MVT::v8f64 }) {
1909 setOperationAction(ISD::CONCAT_VECTORS, VT, Custom);
1910 setOperationAction(ISD::INSERT_SUBVECTOR, VT, Legal);
1911 setOperationAction(ISD::SELECT, VT, Custom);
1912 setOperationAction(ISD::VSELECT, VT, Custom);
1913 setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
1914 setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
1915 setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
1916 setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Custom);
1917 setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
1918 }
1919 setF16Action(MVT::v32f16, Expand);
1920 setOperationAction(ISD::FP_ROUND, MVT::v16f16, Custom);
1921 setOperationAction(ISD::STRICT_FP_ROUND, MVT::v16f16, Custom);
1922 setOperationAction(ISD::FP_EXTEND, MVT::v16f32, Legal);
1923 setOperationAction(ISD::STRICT_FP_EXTEND, MVT::v16f32, Legal);
1924 for (unsigned Opc : {ISD::FADD, ISD::FSUB, ISD::FMUL, ISD::FDIV}) {
1925 setOperationPromotedToType(Opc, MVT::v16f16, MVT::v16f32);
1926 setOperationPromotedToType(Opc, MVT::v32f16, MVT::v32f32);
1927 }
1928
1929 for (auto VT : { MVT::v16i32, MVT::v8i64, MVT::v16f32, MVT::v8f64 }) {
1930 setOperationAction(ISD::MLOAD, VT, Legal);
1931 setOperationAction(ISD::MSTORE, VT, Legal);
1932 setOperationAction(ISD::MGATHER, VT, Custom);
1933 setOperationAction(ISD::MSCATTER, VT, Custom);
1934 }
1935 if (HasBWI) {
1936 for (auto VT : { MVT::v64i8, MVT::v32i16 }) {
1937 setOperationAction(ISD::MLOAD, VT, Legal);
1938 setOperationAction(ISD::MSTORE, VT, Legal);
1939 }
1940 } else {
1941 setOperationAction(ISD::STORE, MVT::v32i16, Custom);
1942 setOperationAction(ISD::STORE, MVT::v64i8, Custom);
1943 }
1944
1945 if (Subtarget.hasVBMI2()) {
1946 for (auto VT : { MVT::v8i16, MVT::v4i32, MVT::v2i64,
1947 MVT::v16i16, MVT::v8i32, MVT::v4i64,
1948 MVT::v32i16, MVT::v16i32, MVT::v8i64 }) {
1949 setOperationAction(ISD::FSHL, VT, Custom);
1950 setOperationAction(ISD::FSHR, VT, Custom);
1951 }
1952
1953 setOperationAction(ISD::ROTL, MVT::v32i16, Custom);
1954 setOperationAction(ISD::ROTR, MVT::v8i16, Custom);
1955 setOperationAction(ISD::ROTR, MVT::v16i16, Custom);
1956 setOperationAction(ISD::ROTR, MVT::v32i16, Custom);
1957 }
1958 }// useAVX512Regs
1959
1960 // This block controls legalization for operations that don't have
1961 // pre-AVX512 equivalents. Without VLX we use 512-bit operations for
1962 // narrower widths.
1963 if (!Subtarget.useSoftFloat() && Subtarget.hasAVX512()) {
1964 // These operations are handled on non-VLX by artificially widening in
1965 // isel patterns.
1966
1967 setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v8i32, Custom);
1968 setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v4i32, Custom);
1969 setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v2i32, Custom);
1970
1971 if (Subtarget.hasDQI()) {
1972 // Fast v2f32 SINT_TO_FP( v2i64 ) custom conversion.
1973 // v2f32 UINT_TO_FP is already custom under SSE2.
1974      assert(isOperationCustom(ISD::UINT_TO_FP, MVT::v2f32) &&
1975             isOperationCustom(ISD::STRICT_UINT_TO_FP, MVT::v2f32) &&
1976             "Unexpected operation action!");
1977 // v2i64 FP_TO_S/UINT(v2f32) custom conversion.
1978 setOperationAction(ISD::FP_TO_SINT, MVT::v2f32, Custom);
1979 setOperationAction(ISD::FP_TO_UINT, MVT::v2f32, Custom);
1980 setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v2f32, Custom);
1981 setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v2f32, Custom);
1982 }
1983
1984 for (auto VT : { MVT::v2i64, MVT::v4i64 }) {
1985 setOperationAction(ISD::SMAX, VT, Legal);
1986 setOperationAction(ISD::UMAX, VT, Legal);
1987 setOperationAction(ISD::SMIN, VT, Legal);
1988 setOperationAction(ISD::UMIN, VT, Legal);
1989 setOperationAction(ISD::ABS, VT, Legal);
1990 }
1991
1992 for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64 }) {
1993 setOperationAction(ISD::ROTL, VT, Custom);
1994 setOperationAction(ISD::ROTR, VT, Custom);
1995 }
1996
1997 // Custom legalize 2x32 to get a little better code.
1998 setOperationAction(ISD::MSCATTER, MVT::v2f32, Custom);
1999 setOperationAction(ISD::MSCATTER, MVT::v2i32, Custom);
2000
2001 for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
2002 MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64 })
2003 setOperationAction(ISD::MSCATTER, VT, Custom);
2004
2005 if (Subtarget.hasDQI()) {
2006 for (auto Opc : {ISD::SINT_TO_FP, ISD::UINT_TO_FP, ISD::STRICT_SINT_TO_FP,
2007 ISD::STRICT_UINT_TO_FP, ISD::FP_TO_SINT, ISD::FP_TO_UINT,
2008 ISD::STRICT_FP_TO_SINT, ISD::STRICT_FP_TO_UINT}) {
2009 setOperationAction(Opc, MVT::v2i64, Custom);
2010 setOperationAction(Opc, MVT::v4i64, Custom);
2011 }
2012 setOperationAction(ISD::MUL, MVT::v2i64, Legal);
2013 setOperationAction(ISD::MUL, MVT::v4i64, Legal);
2014 }
2015
2016 if (Subtarget.hasCDI()) {
2017 for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64 }) {
2018 setOperationAction(ISD::CTLZ, VT, Legal);
2019 }
2020 } // Subtarget.hasCDI()
2021
2022 if (Subtarget.hasVPOPCNTDQ()) {
2023 for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64 })
2024 setOperationAction(ISD::CTPOP, VT, Legal);
2025 }
2026 }
2027
2028  // This block controls legalization of v32i1/v64i1, which are available with
2029 // AVX512BW. 512-bit v32i16 and v64i8 vector legalization is controlled with
2030 // useBWIRegs.
2031 if (!Subtarget.useSoftFloat() && Subtarget.hasBWI()) {
2032 addRegisterClass(MVT::v32i1, &X86::VK32RegClass);
2033 addRegisterClass(MVT::v64i1, &X86::VK64RegClass);
2034
2035 for (auto VT : { MVT::v32i1, MVT::v64i1 }) {
2036 setOperationAction(ISD::VSELECT, VT, Expand);
2037 setOperationAction(ISD::TRUNCATE, VT, Custom);
2038 setOperationAction(ISD::SETCC, VT, Custom);
2039 setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
2040 setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
2041 setOperationAction(ISD::SELECT, VT, Custom);
2042 setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
2043 setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
2044 setOperationAction(ISD::CONCAT_VECTORS, VT, Custom);
2045 setOperationAction(ISD::INSERT_SUBVECTOR, VT, Custom);
2046 }
2047
2048 for (auto VT : { MVT::v16i1, MVT::v32i1 })
2049 setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);
2050
2051 // Extends from v32i1 masks to 256-bit vectors.
2052 setOperationAction(ISD::SIGN_EXTEND, MVT::v32i8, Custom);
2053 setOperationAction(ISD::ZERO_EXTEND, MVT::v32i8, Custom);
2054 setOperationAction(ISD::ANY_EXTEND, MVT::v32i8, Custom);
2055
2056 for (auto VT : { MVT::v32i8, MVT::v16i8, MVT::v16i16, MVT::v8i16 }) {
2057 setOperationAction(ISD::MLOAD, VT, Subtarget.hasVLX() ? Legal : Custom);
2058 setOperationAction(ISD::MSTORE, VT, Subtarget.hasVLX() ? Legal : Custom);
2059 }
2060
2061 // These operations are handled on non-VLX by artificially widening in
2062 // isel patterns.
2063 // TODO: Custom widen in lowering on non-VLX and drop the isel patterns?
2064
2065 if (Subtarget.hasBITALG()) {
2066 for (auto VT : { MVT::v16i8, MVT::v32i8, MVT::v8i16, MVT::v16i16 })
2067 setOperationAction(ISD::CTPOP, VT, Legal);
2068 }
2069 }
2070
2071 if (!Subtarget.useSoftFloat() && Subtarget.hasFP16()) {
2072 auto setGroup = [&] (MVT VT) {
2073 setOperationAction(ISD::FADD, VT, Legal);
2074 setOperationAction(ISD::STRICT_FADD, VT, Legal);
2075 setOperationAction(ISD::FSUB, VT, Legal);
2076 setOperationAction(ISD::STRICT_FSUB, VT, Legal);
2077 setOperationAction(ISD::FMUL, VT, Legal);
2078 setOperationAction(ISD::STRICT_FMUL, VT, Legal);
2079 setOperationAction(ISD::FDIV, VT, Legal);
2080 setOperationAction(ISD::STRICT_FDIV, VT, Legal);
2081 setOperationAction(ISD::FSQRT, VT, Legal);
2082 setOperationAction(ISD::STRICT_FSQRT, VT, Legal);
2083
2084 setOperationAction(ISD::FFLOOR, VT, Legal);
2085 setOperationAction(ISD::STRICT_FFLOOR, VT, Legal);
2086 setOperationAction(ISD::FCEIL, VT, Legal);
2087 setOperationAction(ISD::STRICT_FCEIL, VT, Legal);
2088 setOperationAction(ISD::FTRUNC, VT, Legal);
2089 setOperationAction(ISD::STRICT_FTRUNC, VT, Legal);
2090 setOperationAction(ISD::FRINT, VT, Legal);
2091 setOperationAction(ISD::STRICT_FRINT, VT, Legal);
2092 setOperationAction(ISD::FNEARBYINT, VT, Legal);
2093 setOperationAction(ISD::STRICT_FNEARBYINT, VT, Legal);
2094
2095 setOperationAction(ISD::FROUND, VT, Custom);
2096
2097 setOperationAction(ISD::LOAD, VT, Legal);
2098 setOperationAction(ISD::STORE, VT, Legal);
2099
2100 setOperationAction(ISD::FMA, VT, Legal);
2101 setOperationAction(ISD::STRICT_FMA, VT, Legal);
2102 setOperationAction(ISD::VSELECT, VT, Legal);
2103 setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
2104 setOperationAction(ISD::SELECT, VT, Custom);
2105
2106 setOperationAction(ISD::FNEG, VT, Custom);
2107 setOperationAction(ISD::FABS, VT, Custom);
2108 setOperationAction(ISD::FCOPYSIGN, VT, Custom);
2109 setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
2110 setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
2111 };
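    // setGroup is applied to MVT::f16 just below, and to v8f16/v16f16/v32f16
    // further down depending on hasVLX()/useAVX512Regs(), so all FP16 types
    // share this common set of Legal/Custom actions.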
2112
2113 // AVX512_FP16 scalar operations
2114 setGroup(MVT::f16);
2115 setOperationAction(ISD::FREM, MVT::f16, Promote);
2116 setOperationAction(ISD::STRICT_FREM, MVT::f16, Promote);
2117 setOperationAction(ISD::SELECT_CC, MVT::f16, Expand);
2118 setOperationAction(ISD::BR_CC, MVT::f16, Expand);
2119 setOperationAction(ISD::SETCC, MVT::f16, Custom);
2120 setOperationAction(ISD::STRICT_FSETCC, MVT::f16, Custom);
2121 setOperationAction(ISD::STRICT_FSETCCS, MVT::f16, Custom);
2122 setOperationAction(ISD::STRICT_FROUND, MVT::f16, Promote);
2123 setOperationAction(ISD::FROUNDEVEN, MVT::f16, Legal);
2124 setOperationAction(ISD::STRICT_FROUNDEVEN, MVT::f16, Legal);
2125 setOperationAction(ISD::FP_ROUND, MVT::f16, Custom);
2126 setOperationAction(ISD::STRICT_FP_ROUND, MVT::f16, Custom);
2127 setOperationAction(ISD::FP_EXTEND, MVT::f32, Legal);
2128 setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f32, Legal);
2129
2130 setCondCodeAction(ISD::SETOEQ, MVT::f16, Expand);
2131 setCondCodeAction(ISD::SETUNE, MVT::f16, Expand);
2132
2133 if (Subtarget.useAVX512Regs()) {
2134 setGroup(MVT::v32f16);
2135 setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v32f16, Custom);
2136 setOperationAction(ISD::SINT_TO_FP, MVT::v32i16, Legal);
2137 setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v32i16, Legal);
2138 setOperationAction(ISD::UINT_TO_FP, MVT::v32i16, Legal);
2139 setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v32i16, Legal);
2140 setOperationAction(ISD::FP_ROUND, MVT::v16f16, Legal);
2141 setOperationAction(ISD::STRICT_FP_ROUND, MVT::v16f16, Legal);
2142 setOperationAction(ISD::FP_EXTEND, MVT::v16f32, Legal);
2143 setOperationAction(ISD::STRICT_FP_EXTEND, MVT::v16f32, Legal);
2144 setOperationAction(ISD::FP_EXTEND, MVT::v8f64, Legal);
2145 setOperationAction(ISD::STRICT_FP_EXTEND, MVT::v8f64, Legal);
2146 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v32f16, Custom);
2147
2148 setOperationAction(ISD::FP_TO_SINT, MVT::v32i16, Custom);
2149 setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v32i16, Custom);
2150 setOperationAction(ISD::FP_TO_UINT, MVT::v32i16, Custom);
2151 setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v32i16, Custom);
2152 setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v32i8, MVT::v32i16);
2153 setOperationPromotedToType(ISD::STRICT_FP_TO_SINT, MVT::v32i8,
2154 MVT::v32i16);
2155 setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v32i8, MVT::v32i16);
2156 setOperationPromotedToType(ISD::STRICT_FP_TO_UINT, MVT::v32i8,
2157 MVT::v32i16);
2158 setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v32i1, MVT::v32i16);
2159 setOperationPromotedToType(ISD::STRICT_FP_TO_SINT, MVT::v32i1,
2160 MVT::v32i16);
2161 setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v32i1, MVT::v32i16);
2162 setOperationPromotedToType(ISD::STRICT_FP_TO_UINT, MVT::v32i1,
2163 MVT::v32i16);
2164
2165 setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v16f16, Legal);
2166 setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v32f16, Legal);
2167 setOperationAction(ISD::CONCAT_VECTORS, MVT::v32f16, Custom);
2168
2169 setLoadExtAction(ISD::EXTLOAD, MVT::v8f64, MVT::v8f16, Legal);
2170 setLoadExtAction(ISD::EXTLOAD, MVT::v16f32, MVT::v16f16, Legal);
2171
2172 setOperationAction(ISD::STRICT_FSETCC, MVT::v32i1, Custom);
2173 setOperationAction(ISD::STRICT_FSETCCS, MVT::v32i1, Custom);
2174 }
2175
2176 if (Subtarget.hasVLX()) {
2177 setGroup(MVT::v8f16);
2178 setGroup(MVT::v16f16);
2179
2180 setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v8f16, Legal);
2181 setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v16f16, Custom);
2182 setOperationAction(ISD::SINT_TO_FP, MVT::v16i16, Legal);
2183 setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v16i16, Legal);
2184 setOperationAction(ISD::SINT_TO_FP, MVT::v8i16, Legal);
2185 setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v8i16, Legal);
2186 setOperationAction(ISD::UINT_TO_FP, MVT::v16i16, Legal);
2187 setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v16i16, Legal);
2188 setOperationAction(ISD::UINT_TO_FP, MVT::v8i16, Legal);
2189 setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v8i16, Legal);
2190
2191 setOperationAction(ISD::FP_TO_SINT, MVT::v8i16, Custom);
2192 setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v8i16, Custom);
2193 setOperationAction(ISD::FP_TO_UINT, MVT::v8i16, Custom);
2194 setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v8i16, Custom);
2195 setOperationAction(ISD::FP_ROUND, MVT::v8f16, Legal);
2196 setOperationAction(ISD::STRICT_FP_ROUND, MVT::v8f16, Legal);
2197 setOperationAction(ISD::FP_EXTEND, MVT::v8f32, Legal);
2198 setOperationAction(ISD::STRICT_FP_EXTEND, MVT::v8f32, Legal);
2199 setOperationAction(ISD::FP_EXTEND, MVT::v4f64, Legal);
2200 setOperationAction(ISD::STRICT_FP_EXTEND, MVT::v4f64, Legal);
2201
2202 // INSERT_VECTOR_ELT v8f16 extended to VECTOR_SHUFFLE
2203 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v8f16, Custom);
2204 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v16f16, Custom);
2205
2206 setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v8f16, Legal);
2207 setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v16f16, Legal);
2208 setOperationAction(ISD::CONCAT_VECTORS, MVT::v16f16, Custom);
2209
2210 setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4f16, Legal);
2211 setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2f16, Legal);
2212 setLoadExtAction(ISD::EXTLOAD, MVT::v8f32, MVT::v8f16, Legal);
2213 setLoadExtAction(ISD::EXTLOAD, MVT::v4f32, MVT::v4f16, Legal);
2214
2215 // Need to custom widen these to prevent scalarization.
2216 setOperationAction(ISD::LOAD, MVT::v4f16, Custom);
2217 setOperationAction(ISD::STORE, MVT::v4f16, Custom);
2218 }
2219 }
2220
2221 if (!Subtarget.useSoftFloat() &&
2222 (Subtarget.hasAVXNECONVERT() || Subtarget.hasBF16())) {
2223 addRegisterClass(MVT::v8bf16, &X86::VR128XRegClass);
2224 addRegisterClass(MVT::v16bf16, &X86::VR256XRegClass);
2225 // We set the type action of bf16 to TypeSoftPromoteHalf, but we don't
2226 // provide the method to promote BUILD_VECTOR. Set the operation action
2227    // to Custom so the customization can be done later.
2228 setOperationAction(ISD::BUILD_VECTOR, MVT::bf16, Custom);
2229 for (auto VT : {MVT::v8bf16, MVT::v16bf16}) {
2230 setF16Action(VT, Expand);
2231 setOperationAction(ISD::FADD, VT, Expand);
2232 setOperationAction(ISD::FSUB, VT, Expand);
2233 setOperationAction(ISD::FMUL, VT, Expand);
2234 setOperationAction(ISD::FDIV, VT, Expand);
2235 setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
2236 }
2237 addLegalFPImmediate(APFloat::getZero(APFloat::BFloat()));
2238 }
2239
2240 if (!Subtarget.useSoftFloat() && Subtarget.hasBF16()) {
2241 addRegisterClass(MVT::v32bf16, &X86::VR512RegClass);
2242 setF16Action(MVT::v32bf16, Expand);
2243 setOperationAction(ISD::FADD, MVT::v32bf16, Expand);
2244 setOperationAction(ISD::FSUB, MVT::v32bf16, Expand);
2245 setOperationAction(ISD::FMUL, MVT::v32bf16, Expand);
2246 setOperationAction(ISD::FDIV, MVT::v32bf16, Expand);
2247 setOperationAction(ISD::BUILD_VECTOR, MVT::v32bf16, Custom);
2248 }
2249
2250 if (!Subtarget.useSoftFloat() && Subtarget.hasVLX()) {
2251 setTruncStoreAction(MVT::v4i64, MVT::v4i8, Legal);
2252 setTruncStoreAction(MVT::v4i64, MVT::v4i16, Legal);
2253 setTruncStoreAction(MVT::v4i64, MVT::v4i32, Legal);
2254 setTruncStoreAction(MVT::v8i32, MVT::v8i8, Legal);
2255 setTruncStoreAction(MVT::v8i32, MVT::v8i16, Legal);
2256
2257 setTruncStoreAction(MVT::v2i64, MVT::v2i8, Legal);
2258 setTruncStoreAction(MVT::v2i64, MVT::v2i16, Legal);
2259 setTruncStoreAction(MVT::v2i64, MVT::v2i32, Legal);
2260 setTruncStoreAction(MVT::v4i32, MVT::v4i8, Legal);
2261 setTruncStoreAction(MVT::v4i32, MVT::v4i16, Legal);
2262
2263 if (Subtarget.hasBWI()) {
2264 setTruncStoreAction(MVT::v16i16, MVT::v16i8, Legal);
2265 setTruncStoreAction(MVT::v8i16, MVT::v8i8, Legal);
2266 }
2267
2268 if (Subtarget.hasFP16()) {
2269 // vcvttph2[u]dq v4f16 -> v4i32/64, v2f16 -> v2i32/64
2270 setOperationAction(ISD::FP_TO_SINT, MVT::v2f16, Custom);
2271 setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v2f16, Custom);
2272 setOperationAction(ISD::FP_TO_UINT, MVT::v2f16, Custom);
2273 setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v2f16, Custom);
2274 setOperationAction(ISD::FP_TO_SINT, MVT::v4f16, Custom);
2275 setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v4f16, Custom);
2276 setOperationAction(ISD::FP_TO_UINT, MVT::v4f16, Custom);
2277 setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v4f16, Custom);
2278 // vcvt[u]dq2ph v4i32/64 -> v4f16, v2i32/64 -> v2f16
2279 setOperationAction(ISD::SINT_TO_FP, MVT::v2f16, Custom);
2280 setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v2f16, Custom);
2281 setOperationAction(ISD::UINT_TO_FP, MVT::v2f16, Custom);
2282 setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v2f16, Custom);
2283 setOperationAction(ISD::SINT_TO_FP, MVT::v4f16, Custom);
2284 setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v4f16, Custom);
2285 setOperationAction(ISD::UINT_TO_FP, MVT::v4f16, Custom);
2286 setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v4f16, Custom);
2287 // vcvtps2phx v4f32 -> v4f16, v2f32 -> v2f16
2288 setOperationAction(ISD::FP_ROUND, MVT::v2f16, Custom);
2289 setOperationAction(ISD::STRICT_FP_ROUND, MVT::v2f16, Custom);
2290 setOperationAction(ISD::FP_ROUND, MVT::v4f16, Custom);
2291 setOperationAction(ISD::STRICT_FP_ROUND, MVT::v4f16, Custom);
2292 // vcvtph2psx v4f16 -> v4f32, v2f16 -> v2f32
2293 setOperationAction(ISD::FP_EXTEND, MVT::v2f16, Custom);
2294 setOperationAction(ISD::STRICT_FP_EXTEND, MVT::v2f16, Custom);
2295 setOperationAction(ISD::FP_EXTEND, MVT::v4f16, Custom);
2296 setOperationAction(ISD::STRICT_FP_EXTEND, MVT::v4f16, Custom);
2297 }
2298
2299 setOperationAction(ISD::TRUNCATE, MVT::v16i32, Custom);
2300 setOperationAction(ISD::TRUNCATE, MVT::v8i64, Custom);
2301 setOperationAction(ISD::TRUNCATE, MVT::v16i64, Custom);
2302 }
2303
2304 if (Subtarget.hasAMXTILE()) {
2305 addRegisterClass(MVT::x86amx, &X86::TILERegClass);
2306 }
2307
2308 // We want to custom lower some of our intrinsics.
2309 setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
2310 setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom);
2311 setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom);
2312 if (!Subtarget.is64Bit()) {
2313 setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::i64, Custom);
2314 }
2315
2316 // Only custom-lower 64-bit SADDO and friends on 64-bit because we don't
2317 // handle type legalization for these operations here.
2318 //
2319 // FIXME: We really should do custom legalization for addition and
2320 // subtraction on x86-32 once PR3203 is fixed. We really can't do much better
2321 // than generic legalization for 64-bit multiplication-with-overflow, though.
2322 for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
2323 if (VT == MVT::i64 && !Subtarget.is64Bit())
2324 continue;
2325 // Add/Sub/Mul with overflow operations are custom lowered.
2326 setOperationAction(ISD::SADDO, VT, Custom);
2327 setOperationAction(ISD::UADDO, VT, Custom);
2328 setOperationAction(ISD::SSUBO, VT, Custom);
2329 setOperationAction(ISD::USUBO, VT, Custom);
2330 setOperationAction(ISD::SMULO, VT, Custom);
2331 setOperationAction(ISD::UMULO, VT, Custom);
2332
2333 // Support carry in as value rather than glue.
2334 setOperationAction(ISD::ADDCARRY, VT, Custom);
2335 setOperationAction(ISD::SUBCARRY, VT, Custom);
2336 setOperationAction(ISD::SETCCCARRY, VT, Custom);
2337 setOperationAction(ISD::SADDO_CARRY, VT, Custom);
2338 setOperationAction(ISD::SSUBO_CARRY, VT, Custom);
2339 }
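  // ADDCARRY/SUBCARRY model the carry as an ordinary boolean value operand and
  // result (ultimately selected to ADC/SBB) rather than threading it through a
  // glue edge, which is what "carry in as value rather than glue" refers to.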
2340
2341 if (!Subtarget.is64Bit()) {
2342 // These libcalls are not available in 32-bit.
2343 setLibcallName(RTLIB::SHL_I128, nullptr);
2344 setLibcallName(RTLIB::SRL_I128, nullptr);
2345 setLibcallName(RTLIB::SRA_I128, nullptr);
2346 setLibcallName(RTLIB::MUL_I128, nullptr);
2347 // The MULO libcall is not part of libgcc, only compiler-rt.
2348 setLibcallName(RTLIB::MULO_I64, nullptr);
2349 }
2350 // The MULO libcall is not part of libgcc, only compiler-rt.
2351 setLibcallName(RTLIB::MULO_I128, nullptr);
2352
2353 // Combine sin / cos into _sincos_stret if it is available.
2354 if (getLibcallName(RTLIB::SINCOS_STRET_F32) != nullptr &&
2355 getLibcallName(RTLIB::SINCOS_STRET_F64) != nullptr) {
2356 setOperationAction(ISD::FSINCOS, MVT::f64, Custom);
2357 setOperationAction(ISD::FSINCOS, MVT::f32, Custom);
2358 }
2359
2360 if (Subtarget.isTargetWin64()) {
2361 setOperationAction(ISD::SDIV, MVT::i128, Custom);
2362 setOperationAction(ISD::UDIV, MVT::i128, Custom);
2363 setOperationAction(ISD::SREM, MVT::i128, Custom);
2364 setOperationAction(ISD::UREM, MVT::i128, Custom);
2365 setOperationAction(ISD::FP_TO_SINT, MVT::i128, Custom);
2366 setOperationAction(ISD::FP_TO_UINT, MVT::i128, Custom);
2367 setOperationAction(ISD::SINT_TO_FP, MVT::i128, Custom);
2368 setOperationAction(ISD::UINT_TO_FP, MVT::i128, Custom);
2369 setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::i128, Custom);
2370 setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::i128, Custom);
2371 setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::i128, Custom);
2372 setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::i128, Custom);
2373 }
2374
2375 // On 32 bit MSVC, `fmodf(f32)` is not defined - only `fmod(f64)`
2376 // is. We should promote the value to 64-bits to solve this.
2377 // This is what the CRT headers do - `fmodf` is an inline header
2378 // function casting to f64 and calling `fmod`.
2379 if (Subtarget.is32Bit() &&
2380 (Subtarget.isTargetWindowsMSVC() || Subtarget.isTargetWindowsItanium()))
2381 for (ISD::NodeType Op :
2382 {ISD::FCEIL, ISD::STRICT_FCEIL,
2383 ISD::FCOS, ISD::STRICT_FCOS,
2384 ISD::FEXP, ISD::STRICT_FEXP,
2385 ISD::FFLOOR, ISD::STRICT_FFLOOR,
2386 ISD::FREM, ISD::STRICT_FREM,
2387 ISD::FLOG, ISD::STRICT_FLOG,
2388 ISD::FLOG10, ISD::STRICT_FLOG10,
2389 ISD::FPOW, ISD::STRICT_FPOW,
2390 ISD::FSIN, ISD::STRICT_FSIN})
2391 if (isOperationExpand(Op, MVT::f32))
2392 setOperationAction(Op, MVT::f32, Promote);
2393
2394 // We have target-specific dag combine patterns for the following nodes:
2395 setTargetDAGCombine({ISD::VECTOR_SHUFFLE,
2396 ISD::SCALAR_TO_VECTOR,
2397 ISD::INSERT_VECTOR_ELT,
2398 ISD::EXTRACT_VECTOR_ELT,
2399 ISD::CONCAT_VECTORS,
2400 ISD::INSERT_SUBVECTOR,
2401 ISD::EXTRACT_SUBVECTOR,
2402 ISD::BITCAST,
2403 ISD::VSELECT,
2404 ISD::SELECT,
2405 ISD::SHL,
2406 ISD::SRA,
2407 ISD::SRL,
2408 ISD::OR,
2409 ISD::AND,
2410 ISD::ADD,
2411 ISD::FADD,
2412 ISD::FSUB,
2413 ISD::FNEG,
2414 ISD::FMA,
2415 ISD::STRICT_FMA,
2416 ISD::FMINNUM,
2417 ISD::FMAXNUM,
2418 ISD::SUB,
2419 ISD::LOAD,
2420 ISD::MLOAD,
2421 ISD::STORE,
2422 ISD::MSTORE,
2423 ISD::TRUNCATE,
2424 ISD::ZERO_EXTEND,
2425 ISD::ANY_EXTEND,
2426 ISD::SIGN_EXTEND,
2427 ISD::SIGN_EXTEND_INREG,
2428 ISD::ANY_EXTEND_VECTOR_INREG,
2429 ISD::SIGN_EXTEND_VECTOR_INREG,
2430 ISD::ZERO_EXTEND_VECTOR_INREG,
2431 ISD::SINT_TO_FP,
2432 ISD::UINT_TO_FP,
2433 ISD::STRICT_SINT_TO_FP,
2434 ISD::STRICT_UINT_TO_FP,
2435 ISD::SETCC,
2436 ISD::MUL,
2437 ISD::XOR,
2438 ISD::MSCATTER,
2439 ISD::MGATHER,
2440 ISD::FP16_TO_FP,
2441 ISD::FP_EXTEND,
2442 ISD::STRICT_FP_EXTEND,
2443 ISD::FP_ROUND,
2444 ISD::STRICT_FP_ROUND});
2445
2446 computeRegisterProperties(Subtarget.getRegisterInfo());
2447
2448 MaxStoresPerMemset = 16; // For @llvm.memset -> sequence of stores
2449 MaxStoresPerMemsetOptSize = 8;
2450 MaxStoresPerMemcpy = 8; // For @llvm.memcpy -> sequence of stores
2451 MaxStoresPerMemcpyOptSize = 4;
2452 MaxStoresPerMemmove = 8; // For @llvm.memmove -> sequence of stores
2453 MaxStoresPerMemmoveOptSize = 4;
2454
2455 // TODO: These control memcmp expansion in CGP and could be raised higher, but
2456  // that needs to be benchmarked and balanced with the potential use of vector
2457 // load/store types (PR33329, PR33914).
2458 MaxLoadsPerMemcmp = 2;
2459 MaxLoadsPerMemcmpOptSize = 2;
2460
2461 // Default loop alignment, which can be overridden by -align-loops.
2462 setPrefLoopAlignment(Align(16));
2463
2464 // An out-of-order CPU can speculatively execute past a predictable branch,
2465 // but a conditional move could be stalled by an expensive earlier operation.
2466 PredictableSelectIsExpensive = Subtarget.getSchedModel().isOutOfOrder();
2467 EnableExtLdPromotion = true;
2468 setPrefFunctionAlignment(Align(16));
2469
2470 verifyIntrinsicTables();
2471
2472 // Default to having -disable-strictnode-mutation on
2473 IsStrictFPEnabled = true;
2474}
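
The MaxStoresPer* limits above cap how many stores the backend will emit when it expands @llvm.memset/@llvm.memcpy/@llvm.memmove inline instead of calling the library routine. A rough standalone sketch of that budgeting decision follows (not the actual SelectionDAG code; the helper name and the fixed 16-byte store width are assumptions for illustration):

#include <cstdint>
#include <cstdio>

// Inline the copy only if it fits in the configured store budget.
static bool shouldInlineMemcpy(uint64_t Size, unsigned StoreBytes,
                               unsigned MaxStores) {
  uint64_t NumStores = (Size + StoreBytes - 1) / StoreBytes;
  return NumStores <= MaxStores;
}

int main() {
  // With MaxStoresPerMemcpy == 8 and 16-byte vector stores, copies up to
  // 128 bytes are expanded inline; larger ones fall back to a libcall.
  printf("%d %d\n", shouldInlineMemcpy(128, 16, 8),   // 1
                    shouldInlineMemcpy(256, 16, 8));  // 0
}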
2475
2476// This has so far only been implemented for 64-bit MachO.
2477bool X86TargetLowering::useLoadStackGuardNode() const {
2478 return Subtarget.isTargetMachO() && Subtarget.is64Bit();
2479}
2480
2481bool X86TargetLowering::useStackGuardXorFP() const {
2482 // Currently only MSVC CRTs XOR the frame pointer into the stack guard value.
2483 return Subtarget.getTargetTriple().isOSMSVCRT() && !Subtarget.isTargetMachO();
2484}
2485
2486SDValue X86TargetLowering::emitStackGuardXorFP(SelectionDAG &DAG, SDValue Val,
2487 const SDLoc &DL) const {
2488 EVT PtrTy = getPointerTy(DAG.getDataLayout());
2489 unsigned XorOp = Subtarget.is64Bit() ? X86::XOR64_FP : X86::XOR32_FP;
2490 MachineSDNode *Node = DAG.getMachineNode(XorOp, DL, PtrTy, Val);
2491 return SDValue(Node, 0);
2492}
2493
2494TargetLoweringBase::LegalizeTypeAction
2495X86TargetLowering::getPreferredVectorAction(MVT VT) const {
2496 if ((VT == MVT::v32i1 || VT == MVT::v64i1) && Subtarget.hasAVX512() &&
2497 !Subtarget.hasBWI())
2498 return TypeSplitVector;
2499
2500 if (!VT.isScalableVector() && VT.getVectorNumElements() != 1 &&
2501 !Subtarget.hasF16C() && VT.getVectorElementType() == MVT::f16)
2502 return TypeSplitVector;
2503
2504 if (!VT.isScalableVector() && VT.getVectorNumElements() != 1 &&
2505 VT.getVectorElementType() != MVT::i1)
2506 return TypeWidenVector;
2507
2508 return TargetLoweringBase::getPreferredVectorAction(VT);
2509}
2510
2511static std::pair<MVT, unsigned>
2512handleMaskRegisterForCallingConv(unsigned NumElts, CallingConv::ID CC,
2513 const X86Subtarget &Subtarget) {
2514 // v2i1/v4i1/v8i1/v16i1 all pass in xmm registers unless the calling
2515 // convention is one that uses k registers.
2516 if (NumElts == 2)
2517 return {MVT::v2i64, 1};
2518 if (NumElts == 4)
2519 return {MVT::v4i32, 1};
2520 if (NumElts == 8 && CC != CallingConv::X86_RegCall &&
2521 CC != CallingConv::Intel_OCL_BI)
2522 return {MVT::v8i16, 1};
2523 if (NumElts == 16 && CC != CallingConv::X86_RegCall &&
2524 CC != CallingConv::Intel_OCL_BI)
2525 return {MVT::v16i8, 1};
2526 // v32i1 passes in ymm unless we have BWI and the calling convention is
2527 // regcall.
2528 if (NumElts == 32 && (!Subtarget.hasBWI() || CC != CallingConv::X86_RegCall))
2529 return {MVT::v32i8, 1};
2530 // Split v64i1 vectors if we don't have v64i8 available.
2531 if (NumElts == 64 && Subtarget.hasBWI() && CC != CallingConv::X86_RegCall) {
2532 if (Subtarget.useAVX512Regs())
2533 return {MVT::v64i8, 1};
2534 return {MVT::v32i8, 2};
2535 }
2536
2537 // Break wide or odd vXi1 vectors into scalars to match avx2 behavior.
2538 if (!isPowerOf2_32(NumElts) || (NumElts == 64 && !Subtarget.hasBWI()) ||
2539 NumElts > 64)
2540 return {MVT::i8, NumElts};
2541
2542 return {MVT::INVALID_SIMPLE_VALUE_TYPE, 0};
2543}
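
To make the NumElts -> (register type, count) mapping above easier to scan, here is a simplified standalone model that uses plain strings in place of MVTs. It only covers the non-RegCall path and is illustrative, not a drop-in replacement for the helper:

#include <cstdio>
#include <string>
#include <utility>

static std::pair<std::string, unsigned>
maskRegisterFor(unsigned NumElts, bool HasBWI, bool UseAVX512Regs) {
  if (NumElts == 2)  return {"v2i64", 1};
  if (NumElts == 4)  return {"v4i32", 1};
  if (NumElts == 8)  return {"v8i16", 1};
  if (NumElts == 16) return {"v16i8", 1};
  if (NumElts == 32) return {"v32i8", 1};
  if (NumElts == 64 && HasBWI)
    return UseAVX512Regs ? std::make_pair(std::string("v64i8"), 1u)
                         : std::make_pair(std::string("v32i8"), 2u);
  // Wide or odd mask vectors are scalarized to i8 elements.
  return {"i8", NumElts};
}

int main() {
  for (unsigned N : {2u, 8u, 32u, 64u, 3u}) {
    auto R = maskRegisterFor(N, /*HasBWI=*/true, /*UseAVX512Regs=*/false);
    printf("v%ui1 -> %u x %s\n", N, R.second, R.first.c_str());
  }
}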
2544
2545MVT X86TargetLowering::getRegisterTypeForCallingConv(LLVMContext &Context,
2546 CallingConv::ID CC,
2547 EVT VT) const {
2548 if (VT.isVector()) {
2549 if (VT.getVectorElementType() == MVT::i1 && Subtarget.hasAVX512()) {
2550 unsigned NumElts = VT.getVectorNumElements();
2551
2552 MVT RegisterVT;
2553 unsigned NumRegisters;
2554 std::tie(RegisterVT, NumRegisters) =
2555 handleMaskRegisterForCallingConv(NumElts, CC, Subtarget);
2556 if (RegisterVT != MVT::INVALID_SIMPLE_VALUE_TYPE)
2557 return RegisterVT;
2558 }
2559
2560 if (VT.getVectorElementType() == MVT::f16 && VT.getVectorNumElements() < 8)
2561 return MVT::v8f16;
2562 }
2563
2564 // We will use more GPRs for f64 and f80 on 32 bits when x87 is disabled.
2565 if ((VT == MVT::f64 || VT == MVT::f80) && !Subtarget.is64Bit() &&
2566 !Subtarget.hasX87())
2567 return MVT::i32;
2568
2569 if (VT.isVector() && VT.getVectorElementType() == MVT::bf16)
2570 return getRegisterTypeForCallingConv(Context, CC,
2571 VT.changeVectorElementTypeToInteger());
2572
2573 return TargetLowering::getRegisterTypeForCallingConv(Context, CC, VT);
2574}
2575
2576unsigned X86TargetLowering::getNumRegistersForCallingConv(LLVMContext &Context,
2577 CallingConv::ID CC,
2578 EVT VT) const {
2579 if (VT.isVector()) {
2580 if (VT.getVectorElementType() == MVT::i1 && Subtarget.hasAVX512()) {
2581 unsigned NumElts = VT.getVectorNumElements();
2582
2583 MVT RegisterVT;
2584 unsigned NumRegisters;
2585 std::tie(RegisterVT, NumRegisters) =
2586 handleMaskRegisterForCallingConv(NumElts, CC, Subtarget);
2587 if (RegisterVT != MVT::INVALID_SIMPLE_VALUE_TYPE)
2588 return NumRegisters;
2589 }
2590
2591 if (VT.getVectorElementType() == MVT::f16 && VT.getVectorNumElements() < 8)
2592 return 1;
2593 }
2594
2595 // We have to split f64 to 2 registers and f80 to 3 registers on 32 bits if
2596 // x87 is disabled.
2597 if (!Subtarget.is64Bit() && !Subtarget.hasX87()) {
2598 if (VT == MVT::f64)
2599 return 2;
2600 if (VT == MVT::f80)
2601 return 3;
2602 }
2603
2604 if (VT.isVector() && VT.getVectorElementType() == MVT::bf16)
2605 return getNumRegistersForCallingConv(Context, CC,
2606 VT.changeVectorElementTypeToInteger());
2607
2608 return TargetLowering::getNumRegistersForCallingConv(Context, CC, VT);
2609}
2610
2611unsigned X86TargetLowering::getVectorTypeBreakdownForCallingConv(
2612 LLVMContext &Context, CallingConv::ID CC, EVT VT, EVT &IntermediateVT,
2613 unsigned &NumIntermediates, MVT &RegisterVT) const {
2614 // Break wide or odd vXi1 vectors into scalars to match avx2 behavior.
2615 if (VT.isVector() && VT.getVectorElementType() == MVT::i1 &&
2616 Subtarget.hasAVX512() &&
2617 (!isPowerOf2_32(VT.getVectorNumElements()) ||
2618 (VT.getVectorNumElements() == 64 && !Subtarget.hasBWI()) ||
2619 VT.getVectorNumElements() > 64)) {
2620 RegisterVT = MVT::i8;
2621 IntermediateVT = MVT::i1;
2622 NumIntermediates = VT.getVectorNumElements();
2623 return NumIntermediates;
2624 }
2625
2626 // Split v64i1 vectors if we don't have v64i8 available.
2627 if (VT == MVT::v64i1 && Subtarget.hasBWI() && !Subtarget.useAVX512Regs() &&
2628 CC != CallingConv::X86_RegCall) {
2629 RegisterVT = MVT::v32i8;
2630 IntermediateVT = MVT::v32i1;
2631 NumIntermediates = 2;
2632 return 2;
2633 }
2634
2635 return TargetLowering::getVectorTypeBreakdownForCallingConv(Context, CC, VT, IntermediateVT,
2636 NumIntermediates, RegisterVT);
2637}
2638
2639EVT X86TargetLowering::getSetCCResultType(const DataLayout &DL,
2640 LLVMContext& Context,
2641 EVT VT) const {
2642 if (!VT.isVector())
2643 return MVT::i8;
2644
2645 if (Subtarget.hasAVX512()) {
2646 // Figure out what this type will be legalized to.
2647 EVT LegalVT = VT;
2648 while (getTypeAction(Context, LegalVT) != TypeLegal)
2649 LegalVT = getTypeToTransformTo(Context, LegalVT);
2650
2651 // If we got a 512-bit vector then we'll definitely have a vXi1 compare.
2652 if (LegalVT.getSimpleVT().is512BitVector())
2653 return EVT::getVectorVT(Context, MVT::i1, VT.getVectorElementCount());
2654
2655 if (LegalVT.getSimpleVT().isVector() && Subtarget.hasVLX()) {
2656 // If we legalized to less than a 512-bit vector, then we will use a vXi1
2657 // compare for vXi32/vXi64 for sure. If we have BWI we will also support
2658 // vXi16/vXi8.
2659 MVT EltVT = LegalVT.getSimpleVT().getVectorElementType();
2660 if (Subtarget.hasBWI() || EltVT.getSizeInBits() >= 32)
2661 return EVT::getVectorVT(Context, MVT::i1, VT.getVectorElementCount());
2662 }
2663 }
2664
2665 return VT.changeVectorElementTypeToInteger();
2666}
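
A simplified standalone model of the decision above, with booleans standing in for the subtarget and legalization queries; it is only meant to show when the compare result becomes a vXi1 mask versus an integer vector of the compared width:

#include <cstdio>

static const char *setCCResultElt(bool HasAVX512, bool Is512BitLegal,
                                  bool HasVLX, bool HasBWI, unsigned EltBits) {
  if (HasAVX512) {
    if (Is512BitLegal)
      return "vXi1 mask";                        // 512-bit vectors always compare into mask regs
    if (HasVLX && (HasBWI || EltBits >= 32))
      return "vXi1 mask";                        // narrower vectors need VLX (and BWI for i8/i16)
  }
  return "integer vector of the compared width"; // pre-AVX512 style result
}

int main() {
  printf("%s\n", setCCResultElt(true, false, true, false, 32)); // vXi1 mask
  printf("%s\n", setCCResultElt(true, false, true, false, 16)); // integer vector of the compared width
}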
2667
2668/// Helper for getByValTypeAlignment to determine
2669/// the desired ByVal argument alignment.
2670static void getMaxByValAlign(Type *Ty, Align &MaxAlign) {
2671 if (MaxAlign == 16)
2672 return;
2673 if (VectorType *VTy = dyn_cast<VectorType>(Ty)) {
2674 if (VTy->getPrimitiveSizeInBits().getFixedValue() == 128)
2675 MaxAlign = Align(16);
2676 } else if (ArrayType *ATy = dyn_cast<ArrayType>(Ty)) {
2677 Align EltAlign;
2678 getMaxByValAlign(ATy->getElementType(), EltAlign);
2679 if (EltAlign > MaxAlign)
2680 MaxAlign = EltAlign;
2681 } else if (StructType *STy = dyn_cast<StructType>(Ty)) {
2682 for (auto *EltTy : STy->elements()) {
2683 Align EltAlign;
2684 getMaxByValAlign(EltTy, EltAlign);
2685 if (EltAlign > MaxAlign)
2686 MaxAlign = EltAlign;
2687 if (MaxAlign == 16)
2688 break;
2689 }
2690 }
2691}
2692
2693/// Return the desired alignment for ByVal aggregate
2694/// function arguments in the caller parameter area. For X86, aggregates
2695/// that contain SSE vectors are placed at 16-byte boundaries while the rest
2696/// are at 4-byte boundaries.
2697uint64_t X86TargetLowering::getByValTypeAlignment(Type *Ty,
2698 const DataLayout &DL) const {
2699 if (Subtarget.is64Bit()) {
2700 // Max of 8 and alignment of type.
2701 Align TyAlign = DL.getABITypeAlign(Ty);
2702 if (TyAlign > 8)
2703 return TyAlign.value();
2704 return 8;
2705 }
2706
2707 Align Alignment(4);
2708 if (Subtarget.hasSSE1())
2709 getMaxByValAlign(Ty, Alignment);
2710 return Alignment.value();
2711}
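
A small standalone model of the rule stated above: on 64-bit targets the byval alignment is max(8, the type's natural alignment); on 32-bit targets it is 16 bytes when SSE is available and the aggregate contains a 128-bit vector, otherwise 4. The boolean parameters replace the subtarget and type queries and are assumptions for illustration:

#include <algorithm>
#include <cstdint>
#include <cstdio>

static uint64_t byValAlignment(bool Is64Bit, uint64_t NaturalAlign,
                               bool HasSSE1, bool ContainsSSEVector) {
  if (Is64Bit)
    return std::max<uint64_t>(8, NaturalAlign);
  return (HasSSE1 && ContainsSSEVector) ? 16 : 4;
}

int main() {
  printf("%llu %llu %llu\n",
         (unsigned long long)byValAlignment(true, 4, true, false),   // 8
         (unsigned long long)byValAlignment(false, 4, true, true),   // 16
         (unsigned long long)byValAlignment(false, 4, false, true)); // 4
}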
2712
2713/// It returns EVT::Other if the type should be determined using generic
2714/// target-independent logic.
2715/// For vector ops we check that the overall size isn't larger than our
2716/// preferred vector width.
2717EVT X86TargetLowering::getOptimalMemOpType(
2718 const MemOp &Op, const AttributeList &FuncAttributes) const {
2719 if (!FuncAttributes.hasFnAttr(Attribute::NoImplicitFloat)) {
2720 if (Op.size() >= 16 &&
2721 (!Subtarget.isUnalignedMem16Slow() || Op.isAligned(Align(16)))) {
2722 // FIXME: Check if unaligned 64-byte accesses are slow.
2723 if (Op.size() >= 64 && Subtarget.hasAVX512() &&
2724 (Subtarget.getPreferVectorWidth() >= 512)) {
2725 return Subtarget.hasBWI() ? MVT::v64i8 : MVT::v16i32;
2726 }
2727 // FIXME: Check if unaligned 32-byte accesses are slow.
2728 if (Op.size() >= 32 && Subtarget.hasAVX() &&
2729 Subtarget.useLight256BitInstructions()) {
2730 // Although this isn't a well-supported type for AVX1, we'll let
2731 // legalization and shuffle lowering produce the optimal codegen. If we
2732 // choose an optimal type with a vector element larger than a byte,
2733 // getMemsetStores() may create an intermediate splat (using an integer
2734 // multiply) before we splat as a vector.
2735 return MVT::v32i8;
2736 }
2737 if (Subtarget.hasSSE2() && (Subtarget.getPreferVectorWidth() >= 128))
2738 return MVT::v16i8;
2739 // TODO: Can SSE1 handle a byte vector?
2740 // If we have SSE1 registers we should be able to use them.
2741 if (Subtarget.hasSSE1() && (Subtarget.is64Bit() || Subtarget.hasX87()) &&
2742 (Subtarget.getPreferVectorWidth() >= 128))
2743 return MVT::v4f32;
2744 } else if (((Op.isMemcpy() && !Op.isMemcpyStrSrc()) || Op.isZeroMemset()) &&
2745 Op.size() >= 8 && !Subtarget.is64Bit() && Subtarget.hasSSE2()) {
2746 // Do not use f64 to lower memcpy if source is string constant. It's
2747 // better to use i32 to avoid the loads.
2748 // Also, do not use f64 to lower memset unless this is a memset of zeros.
2749 // The gymnastics of splatting a byte value into an XMM register and then
2750 // only using 8-byte stores (because this is a CPU with slow unaligned
2751 // 16-byte accesses) makes that a loser.
2752 return MVT::f64;
2753 }
2754 }
2755 // This is a compromise. If we reach here, unaligned accesses may be slow on
2756 // this target. However, creating smaller, aligned accesses could be even
2757 // slower and would certainly be a lot more code.
2758 if (Subtarget.is64Bit() && Op.size() >= 8)
2759 return MVT::i64;
2760 return MVT::i32;
2761}
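
A simplified standalone decision table mirroring the preference order above: the widest vector the subtarget is happy with, otherwise i64 or i32. The feature booleans stand in for the subtarget queries, and the NoImplicitFloat, string-source and zero-memset special cases are deliberately left out:

#include <cstdint>
#include <cstdio>

static const char *optimalMemOpWidth(uint64_t Size, bool FastUnaligned16,
                                     bool HasAVX512, bool HasAVX,
                                     bool HasSSE2, bool Is64Bit) {
  if (Size >= 16 && FastUnaligned16) {
    if (Size >= 64 && HasAVX512) return "64-byte vector";
    if (Size >= 32 && HasAVX)    return "32-byte vector";
    if (HasSSE2)                 return "16-byte vector";
  }
  if (Is64Bit && Size >= 8) return "i64";
  return "i32";
}

int main() {
  printf("%s\n", optimalMemOpWidth(256, true, false, true, true, true));  // 32-byte vector
  printf("%s\n", optimalMemOpWidth(12, true, false, false, false, true)); // i64
}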
2762
2763bool X86TargetLowering::isSafeMemOpType(MVT VT) const {
2764 if (VT == MVT::f32)
2765 return Subtarget.hasSSE1();
2766 if (VT == MVT::f64)
2767 return Subtarget.hasSSE2();
2768 return true;
2769}
2770
2771static bool isBitAligned(Align Alignment, uint64_t SizeInBits) {
2772 return (8 * Alignment.value()) % SizeInBits == 0;
2773}
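
A tiny standalone check of the arithmetic above, with a plain byte count in place of llvm::Align: an access of SizeInBits is considered bit-aligned when 8 * Alignment is a multiple of it.

#include <cstdint>
#include <cstdio>

static bool bitAligned(uint64_t AlignBytes, uint64_t SizeInBits) {
  return (8 * AlignBytes) % SizeInBits == 0;
}

int main() {
  printf("%d\n", bitAligned(16, 128)); // 1: a 128-bit access on a 16-byte boundary
  printf("%d\n", bitAligned(8, 128));  // 0: only 64-bit aligned
}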
2774
2775bool X86TargetLowering::isMemoryAccessFast(EVT VT, Align Alignment) const {
2776 if (isBitAligned(Alignment, VT.getSizeInBits()))
2777 return true;
2778 switch (VT.getSizeInBits()) {
2779 default:
2780 // 8-byte and under are always assumed to be fast.
2781 return true;
2782 case 128:
2783 return !Subtarget.isUnalignedMem16Slow();
2784 case 256:
2785 return !Subtarget.isUnalignedMem32Slow();
2786 // TODO: What about AVX-512 (512-bit) accesses?
2787 }
2788}
2789
2790bool X86TargetLowering::allowsMisalignedMemoryAccesses(
2791 EVT VT, unsigned, Align Alignment, MachineMemOperand::Flags Flags,
2792 unsigned *Fast) const {
2793 if (Fast)
2794 *Fast = isMemoryAccessFast(VT, Alignment);
2795 // NonTemporal vector memory ops must be aligned.
2796 if (!!(Flags & MachineMemOperand::MONonTemporal) && VT.isVector()) {
2797    // NT loads can only be vector aligned, so if it's less aligned than the
2798 // minimum vector size (which we can split the vector down to), we might as
2799 // well use a regular unaligned vector load.
2800 // We don't have any NT loads pre-SSE41.
2801 if (!!(Flags & MachineMemOperand::MOLoad))
2802 return (Alignment < 16 || !Subtarget.hasSSE41());
2803 return false;
2804 }
2805 // Misaligned accesses of any size are always allowed.
2806 return true;
2807}
2808
2809bool X86TargetLowering::allowsMemoryAccess(LLVMContext &Context,
2810 const DataLayout &DL, EVT VT,
2811 unsigned AddrSpace, Align Alignment,
2812 MachineMemOperand::Flags Flags,
2813 unsigned *Fast) const {
2814 if (Fast)
2815 *Fast = isMemoryAccessFast(VT, Alignment);
2816 if (!!(Flags & MachineMemOperand::MONonTemporal) && VT.isVector()) {
2817 if (allowsMisalignedMemoryAccesses(VT, AddrSpace, Alignment, Flags,
2818 /*Fast=*/nullptr))
2819 return true;
2820 // NonTemporal vector memory ops are special, and must be aligned.
2821 if (!isBitAligned(Alignment, VT.getSizeInBits()))
2822 return false;
2823 switch (VT.getSizeInBits()) {
2824 case 128:
2825 if (!!(Flags & MachineMemOperand::MOLoad) && Subtarget.hasSSE41())
2826 return true;
2827 if (!!(Flags & MachineMemOperand::MOStore) && Subtarget.hasSSE2())
2828 return true;
2829 return false;
2830 case 256:
2831 if (!!(Flags & MachineMemOperand::MOLoad) && Subtarget.hasAVX2())
2832 return true;
2833 if (!!(Flags & MachineMemOperand::MOStore) && Subtarget.hasAVX())
2834 return true;
2835 return false;
2836 case 512:
2837 if (Subtarget.hasAVX512())
2838 return true;
2839 return false;
2840 default:
2841 return false; // Don't have NonTemporal vector memory ops of this size.
2842 }
2843 }
2844 return true;
2845}
2846
2847/// Return the entry encoding for a jump table in the
2848/// current function. The returned value is a member of the
2849/// MachineJumpTableInfo::JTEntryKind enum.
2850unsigned X86TargetLowering::getJumpTableEncoding() const {
2851 // In GOT pic mode, each entry in the jump table is emitted as a @GOTOFF
2852 // symbol.
2853 if (isPositionIndependent() && Subtarget.isPICStyleGOT())
2854 return MachineJumpTableInfo::EK_Custom32;
2855
2856 // Otherwise, use the normal jump table encoding heuristics.
2857 return TargetLowering::getJumpTableEncoding();
2858}
2859
2860bool X86TargetLowering::splitValueIntoRegisterParts(
2861 SelectionDAG &DAG, const SDLoc &DL, SDValue Val, SDValue *Parts,
2862 unsigned NumParts, MVT PartVT, std::optional<CallingConv::ID> CC) const {
2863 bool IsABIRegCopy = CC.has_value();
2864 EVT ValueVT = Val.getValueType();
2865 if (IsABIRegCopy && ValueVT == MVT::bf16 && PartVT == MVT::f32) {
2866 unsigned ValueBits = ValueVT.getSizeInBits();
2867 unsigned PartBits = PartVT.getSizeInBits();
2868 Val = DAG.getNode(ISD::BITCAST, DL, MVT::getIntegerVT(ValueBits), Val);
2869 Val = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::getIntegerVT(PartBits), Val);
2870 Val = DAG.getNode(ISD::BITCAST, DL, PartVT, Val);
2871 Parts[0] = Val;
2872 return true;
2873 }
2874 return false;
2875}
2876
2877SDValue X86TargetLowering::joinRegisterPartsIntoValue(
2878 SelectionDAG &DAG, const SDLoc &DL, const SDValue *Parts, unsigned NumParts,
2879 MVT PartVT, EVT ValueVT, std::optional<CallingConv::ID> CC) const {
2880 bool IsABIRegCopy = CC.has_value();
2881 if (IsABIRegCopy && ValueVT == MVT::bf16 && PartVT == MVT::f32) {
2882 unsigned ValueBits = ValueVT.getSizeInBits();
2883 unsigned PartBits = PartVT.getSizeInBits();
2884 SDValue Val = Parts[0];
2885
2886 Val = DAG.getNode(ISD::BITCAST, DL, MVT::getIntegerVT(PartBits), Val);
2887 Val = DAG.getNode(ISD::TRUNCATE, DL, MVT::getIntegerVT(ValueBits), Val);
2888 Val = DAG.getNode(ISD::BITCAST, DL, ValueVT, Val);
2889 return Val;
2890 }
2891 return SDValue();
2892}
2893
2894bool X86TargetLowering::useSoftFloat() const {
2895 return Subtarget.useSoftFloat();
2896}
2897
2898void X86TargetLowering::markLibCallAttributes(MachineFunction *MF, unsigned CC,
2899 ArgListTy &Args) const {
2900
2901 // Only relabel X86-32 for C / Stdcall CCs.
2902 if (Subtarget.is64Bit())
2903 return;
2904 if (CC != CallingConv::C && CC != CallingConv::X86_StdCall)
2905 return;
2906 unsigned ParamRegs = 0;
2907 if (auto *M = MF->getFunction().getParent())
2908 ParamRegs = M->getNumberRegisterParameters();
2909
2910  // Mark the first N integer arguments as being passed in registers.
2911 for (auto &Arg : Args) {
2912 Type *T = Arg.Ty;
2913 if (T->isIntOrPtrTy())
2914 if (MF->getDataLayout().getTypeAllocSize(T) <= 8) {
2915 unsigned numRegs = 1;
2916 if (MF->getDataLayout().getTypeAllocSize(T) > 4)
2917 numRegs = 2;
2918 if (ParamRegs < numRegs)
2919 return;
2920 ParamRegs -= numRegs;
2921 Arg.IsInReg = true;
2922 }
2923 }
2924}
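
The loop above hands out the module's regparm budget: each integer or pointer argument of 4 bytes or less costs one register, 5-8 bytes cost two, and marking stops once the budget is exhausted. A standalone model of that bookkeeping, with argument sizes passed directly instead of querying a DataLayout (names are illustrative):

#include <cstdio>
#include <vector>

static std::vector<bool> markInRegArgs(unsigned ParamRegs,
                                       const std::vector<unsigned> &ArgSizes) {
  std::vector<bool> InReg(ArgSizes.size(), false);
  for (size_t I = 0; I < ArgSizes.size(); ++I) {
    if (ArgSizes[I] > 8)
      continue;                                  // not int/pointer sized in this model
    unsigned NumRegs = ArgSizes[I] > 4 ? 2 : 1;
    if (ParamRegs < NumRegs)
      break;                                     // budget exhausted; the rest stay on the stack
    ParamRegs -= NumRegs;
    InReg[I] = true;
  }
  return InReg;
}

int main() {
  // regparm(3) with 4-, 8- and 4-byte arguments: the first two fit, the third does not.
  for (bool B : markInRegArgs(3, {4, 8, 4}))
    printf("%d ", (int)B);                       // prints: 1 1 0
  printf("\n");
}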
2925
2926const MCExpr *
2927X86TargetLowering::LowerCustomJumpTableEntry(const MachineJumpTableInfo *MJTI,
2928 const MachineBasicBlock *MBB,
2929 unsigned uid,MCContext &Ctx) const{
2930  assert(isPositionIndependent() && Subtarget.isPICStyleGOT());
2931 // In 32-bit ELF systems, our jump table entries are formed with @GOTOFF
2932 // entries.
2933 return MCSymbolRefExpr::create(MBB->getSymbol(),
2934 MCSymbolRefExpr::VK_GOTOFF, Ctx);
2935}
2936
2937/// Returns relocation base for the given PIC jumptable.
2938SDValue X86TargetLowering::getPICJumpTableRelocBase(SDValue Table,
2939 SelectionDAG &DAG) const {
2940 if (!Subtarget.is64Bit())
2941 // This doesn't have SDLoc associated with it, but is not really the
2942 // same as a Register.
2943 return DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(),
2944 getPointerTy(DAG.getDataLayout()));
2945 return Table;
2946}
2947
2948/// This returns the relocation base for the given PIC jumptable,
2949/// the same as getPICJumpTableRelocBase, but as an MCExpr.
2950const MCExpr *X86TargetLowering::
2951getPICJumpTableRelocBaseExpr(const MachineFunction *MF, unsigned JTI,
2952 MCContext &Ctx) const {
2953 // X86-64 uses RIP relative addressing based on the jump table label.
2954 if (Subtarget.isPICStyleRIPRel())
2955 return TargetLowering::getPICJumpTableRelocBaseExpr(MF, JTI, Ctx);
2956
2957 // Otherwise, the reference is relative to the PIC base.
2958 return MCSymbolRefExpr::create(MF->getPICBaseSymbol(), Ctx);
2959}
2960
2961std::pair<const TargetRegisterClass *, uint8_t>
2962X86TargetLowering::findRepresentativeClass(const TargetRegisterInfo *TRI,
2963 MVT VT) const {
2964 const TargetRegisterClass *RRC = nullptr;
2965 uint8_t Cost = 1;
2966 switch (VT.SimpleTy) {
2967 default:
2968 return TargetLowering::findRepresentativeClass(TRI, VT);
2969 case MVT::i8: case MVT::i16: case MVT::i32: case MVT::i64:
2970 RRC = Subtarget.is64Bit() ? &X86::GR64RegClass : &X86::GR32RegClass;
2971 break;
2972 case MVT::x86mmx:
2973 RRC = &X86::VR64RegClass;
2974 break;
2975 case MVT::f32: case MVT::f64:
2976 case MVT::v16i8: case MVT::v8i16: case MVT::v4i32: case MVT::v2i64:
2977 case MVT::v4f32: case MVT::v2f64:
2978 case MVT::v32i8: case MVT::v16i16: case MVT::v8i32: case MVT::v4i64:
2979 case MVT::v8f32: case MVT::v4f64:
2980 case MVT::v64i8: case MVT::v32i16: case MVT::v16i32: case MVT::v8i64:
2981 case MVT::v16f32: case MVT::v8f64:
2982 RRC = &X86::VR128XRegClass;
2983 break;
2984 }
2985 return std::make_pair(RRC, Cost);
2986}
2987
2988unsigned X86TargetLowering::getAddressSpace() const {
2989 if (Subtarget.is64Bit())
2990 return (getTargetMachine().getCodeModel() == CodeModel::Kernel) ? 256 : 257;
2991 return 256;
2992}
2993
2994static bool hasStackGuardSlotTLS(const Triple &TargetTriple) {
2995 return TargetTriple.isOSGlibc() || TargetTriple.isOSFuchsia() ||
2996 (TargetTriple.isAndroid() && !TargetTriple.isAndroidVersionLT(17));
2997}
2998
2999static Constant* SegmentOffset(IRBuilderBase &IRB,
3000 int Offset, unsigned AddressSpace) {
3001 return ConstantExpr::getIntToPtr(
3002 ConstantInt::get(Type::getInt32Ty(IRB.getContext()), Offset),
3003 Type::getInt8PtrTy(IRB.getContext())->getPointerTo(AddressSpace));
3004}
3005
3006Value *X86TargetLowering::getIRStackGuard(IRBuilderBase &IRB) const {
3007 // glibc, bionic, and Fuchsia have a special slot for the stack guard in
3008 // tcbhead_t; use it instead of the usual global variable (see
3009 // sysdeps/{i386,x86_64}/nptl/tls.h)
3010 if (hasStackGuardSlotTLS(Subtarget.getTargetTriple())) {
3011 if (Subtarget.isTargetFuchsia()) {
3012 // <zircon/tls.h> defines ZX_TLS_STACK_GUARD_OFFSET with this value.
3013 return SegmentOffset(IRB, 0x10, getAddressSpace());
3014 } else {
3015 unsigned AddressSpace = getAddressSpace();
3016 Module *M = IRB.GetInsertBlock()->getParent()->getParent();
3017      // In particular, users may customize the base register and offset.
3018 int Offset = M->getStackProtectorGuardOffset();
3019 // If we don't set -stack-protector-guard-offset value:
3020 // %fs:0x28, unless we're using a Kernel code model, in which case
3021      // it's %gs:0x28. It's %gs:0x14 on i386.
3022      if (Offset == INT_MAX)
3023 Offset = (Subtarget.is64Bit()) ? 0x28 : 0x14;
3024
3025 StringRef GuardReg = M->getStackProtectorGuardReg();
3026 if (GuardReg == "fs")
3027 AddressSpace = X86AS::FS;
3028 else if (GuardReg == "gs")
3029 AddressSpace = X86AS::GS;
3030
3031      // Use the guard symbol if the user specified one.
3032 StringRef GuardSymb = M->getStackProtectorGuardSymbol();
3033 if (!GuardSymb.empty()) {
3034 GlobalVariable *GV = M->getGlobalVariable(GuardSymb);
3035 if (!GV) {
3036 Type *Ty = Subtarget.is64Bit() ? Type::getInt64Ty(M->getContext())
3037 : Type::getInt32Ty(M->getContext());
3038 GV = new GlobalVariable(*M, Ty, false, GlobalValue::ExternalLinkage,
3039 nullptr, GuardSymb, nullptr,
3040 GlobalValue::NotThreadLocal, AddressSpace);
3041 }
3042 return GV;
3043 }
3044
3045 return SegmentOffset(IRB, Offset, AddressSpace);
3046 }
3047 }
3048 return TargetLowering::getIRStackGuard(IRB);
3049}
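
A standalone sketch of the TLS stack-guard slot selection spelled out in the comments above. INT_MAX plays the same "not overridden" sentinel role as in the real code; the 256/257 address-space constants are the conventional %gs/%fs pair, and the booleans replace the subtarget and code-model queries:

#include <climits>
#include <cstdio>

struct GuardLocation {
  unsigned AddressSpace; // 256 == %gs, 257 == %fs
  int Offset;
};

static GuardLocation pickGuardSlot(bool Is64Bit, bool KernelCodeModel,
                                   int ExplicitOffset = INT_MAX) {
  GuardLocation Loc;
  // %fs for 64-bit user code, %gs for the kernel code model and for i386.
  Loc.AddressSpace = (Is64Bit && !KernelCodeModel) ? 257 : 256;
  Loc.Offset = ExplicitOffset != INT_MAX ? ExplicitOffset
                                         : (Is64Bit ? 0x28 : 0x14);
  return Loc;
}

int main() {
  GuardLocation L = pickGuardSlot(/*Is64Bit=*/true, /*KernelCodeModel=*/false);
  printf("as=%u offset=0x%x\n", L.AddressSpace, L.Offset); // as=257 offset=0x28
}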
3050
3051void X86TargetLowering::insertSSPDeclarations(Module &M) const {
3052 // MSVC CRT provides functionalities for stack protection.
3053 if (Subtarget.getTargetTriple().isWindowsMSVCEnvironment() ||
3054 Subtarget.getTargetTriple().isWindowsItaniumEnvironment()) {
3055 // MSVC CRT has a global variable holding security cookie.
3056 M.getOrInsertGlobal("__security_cookie",
3057 Type::getInt8PtrTy(M.getContext()));
3058
3059 // MSVC CRT has a function to validate security cookie.
3060 FunctionCallee SecurityCheckCookie = M.getOrInsertFunction(
3061 "__security_check_cookie", Type::getVoidTy(M.getContext()),
3062 Type::getInt8PtrTy(M.getContext()));
3063 if (Function *F = dyn_cast<Function>(SecurityCheckCookie.getCallee())) {
3064 F->setCallingConv(CallingConv::X86_FastCall);
3065 F->addParamAttr(0, Attribute::AttrKind::InReg);
3066 }
3067 return;
3068 }
3069
3070 StringRef GuardMode = M.getStackProtectorGuard();
3071
3072 // glibc, bionic, and Fuchsia have a special slot for the stack guard.
3073 if ((GuardMode == "tls" || GuardMode.empty()) &&
3074 hasStackGuardSlotTLS(Subtarget.getTargetTriple()))
3075 return;
3076 TargetLowering::insertSSPDeclarations(M);
3077}
3078
3079Value *X86TargetLowering::getSDagStackGuard(const Module &M) const {
3080 // MSVC CRT has a global variable holding security cookie.
3081 if (Subtarget.getTargetTriple().isWindowsMSVCEnvironment() ||
3082 Subtarget.getTargetTriple().isWindowsItaniumEnvironment()) {
3083 return M.getGlobalVariable("__security_cookie");
3084 }
3085 return TargetLowering::getSDagStackGuard(M);
3086}
3087
3088Function *X86TargetLowering::getSSPStackGuardCheck(const Module &M) const {
3089 // MSVC CRT has a function to validate security cookie.
3090 if (Subtarget.getTargetTriple().isWindowsMSVCEnvironment() ||
3091 Subtarget.getTargetTriple().isWindowsItaniumEnvironment()) {
3092 return M.getFunction("__security_check_cookie");
3093 }
3094 return TargetLowering::getSSPStackGuardCheck(M);
3095}
3096
3097Value *
3098X86TargetLowering::getSafeStackPointerLocation(IRBuilderBase &IRB) const {
3099 if (Subtarget.getTargetTriple().isOSContiki())
3100 return getDefaultSafeStackPointerLocation(IRB, false);
3101
3102 // Android provides a fixed TLS slot for the SafeStack pointer. See the
3103 // definition of TLS_SLOT_SAFESTACK in
3104 // https://android.googlesource.com/platform/bionic/+/master/libc/private/bionic_tls.h
3105 if (Subtarget.isTargetAndroid()) {
3106    // %fs:0x48, unless we're using a Kernel code model, in which case it's
3107    // %gs:0x48. It's %gs:0x24 on i386.
3108 int Offset = (Subtarget.is64Bit()) ? 0x48 : 0x24;
3109 return SegmentOffset(IRB, Offset, getAddressSpace());
3110 }
3111
3112 // Fuchsia is similar.
3113 if (Subtarget.isTargetFuchsia()) {
3114 // <zircon/tls.h> defines ZX_TLS_UNSAFE_SP_OFFSET with this value.
3115 return SegmentOffset(IRB, 0x18, getAddressSpace());
3116 }
3117
3118 return TargetLowering::getSafeStackPointerLocation(IRB);
3119}
3120
3121//===----------------------------------------------------------------------===//
3122// Return Value Calling Convention Implementation
3123//===----------------------------------------------------------------------===//
3124
3125bool X86TargetLowering::CanLowerReturn(
3126 CallingConv::ID CallConv, MachineFunction &MF, bool isVarArg,
3127 const SmallVectorImpl<ISD::OutputArg> &Outs, LLVMContext &Context) const {
3128 SmallVector<CCValAssign, 16> RVLocs;
3129 CCState CCInfo(CallConv, isVarArg, MF, RVLocs, Context);
3130 return CCInfo.CheckReturn(Outs, RetCC_X86);
3131}
3132
3133const MCPhysReg *X86TargetLowering::getScratchRegisters(CallingConv::ID) const {
3134 static const MCPhysReg ScratchRegs[] = { X86::R11, 0 };
3135 return ScratchRegs;
3136}
3137
3138ArrayRef<MCPhysReg> X86TargetLowering::getRoundingControlRegisters() const {
3139 // FIXME: We should def X86::FPCW for x87 as well. But it affects a lot of lit
3140 // tests at the moment, which is not what we expected.
3141 static const MCPhysReg RCRegs[] = {X86::MXCSR};
3142 return RCRegs;
3143}
3144
3145/// Lowers mask values (v*i1) to the local register values.
3146/// \returns the DAG node after lowering to the register type.
3147static SDValue lowerMasksToReg(const SDValue &ValArg, const EVT &ValLoc,
3148 const SDLoc &Dl, SelectionDAG &DAG) {
3149 EVT ValVT = ValArg.getValueType();
3150
3151 if (ValVT == MVT::v1i1)
3152 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, Dl, ValLoc, ValArg,
3153 DAG.getIntPtrConstant(0, Dl));
3154
3155 if ((ValVT == MVT::v8i1 && (ValLoc == MVT::i8 || ValLoc == MVT::i32)) ||
3156 (ValVT == MVT::v16i1 && (ValLoc == MVT::i16 || ValLoc == MVT::i32))) {
3157 // Two stage lowering might be required
3158 // bitcast: v8i1 -> i8 / v16i1 -> i16
3159 // anyextend: i8 -> i32 / i16 -> i32
3160 EVT TempValLoc = ValVT == MVT::v8i1 ? MVT::i8 : MVT::i16;
3161 SDValue ValToCopy = DAG.getBitcast(TempValLoc, ValArg);
3162 if (ValLoc == MVT::i32)
3163 ValToCopy = DAG.getNode(ISD::ANY_EXTEND, Dl, ValLoc, ValToCopy);
3164 return ValToCopy;
3165 }
3166
3167 if ((ValVT == MVT::v32i1 && ValLoc == MVT::i32) ||
3168 (ValVT == MVT::v64i1 && ValLoc == MVT::i64)) {
3169 // One stage lowering is required
3170 // bitcast: v32i1 -> i32 / v64i1 -> i64
3171 return DAG.getBitcast(ValLoc, ValArg);
3172 }
3173
3174 return DAG.getNode(ISD::ANY_EXTEND, Dl, ValLoc, ValArg);
3175}
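
A standalone illustration of the two-stage lowering in the helper above, using ordinary integers: a v8i1 mask is first "bitcast" by packing its bits into a uint8_t, then widened to the 32-bit location type (the real code uses ANY_EXTEND; a plain integral widening stands in for it here). The packing order is illustrative only:

#include <array>
#include <cstdint>
#include <cstdio>

static uint32_t lowerMaskToI32(const std::array<bool, 8> &Mask) {
  uint8_t Packed = 0;                                  // stage 1: v8i1 -> i8
  for (unsigned I = 0; I < 8; ++I)
    Packed |= static_cast<uint8_t>(Mask[I]) << I;
  return Packed;                                       // stage 2: i8 -> i32
}

int main() {
  std::array<bool, 8> M = {true, false, true, true, false, false, false, true};
  printf("0x%02x\n", lowerMaskToI32(M));               // prints 0x8d
}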
3176
3177/// Breaks v64i1 value into two registers and adds the new node to the DAG
3178static void Passv64i1ArgInRegs(
3179 const SDLoc &Dl, SelectionDAG &DAG, SDValue &Arg,
3180 SmallVectorImpl<std::pair<Register, SDValue>> &RegsToPass, CCValAssign &VA,
3181 CCValAssign &NextVA, const X86Subtarget &Subtarget) {
3182  assert(Subtarget.hasBWI() && "Expected AVX512BW target!");
3183  assert(Subtarget.is32Bit() && "Expecting 32 bit target");
3184  assert(Arg.getValueType() == MVT::i64 && "Expecting 64 bit value");
3185  assert(VA.isRegLoc() && NextVA.isRegLoc() &&
3186         "The value should reside in two registers");
3187
3188 // Before splitting the value we cast it to i64
3189 Arg = DAG.getBitcast(MVT::i64, Arg);
3190
3191 // Splitting the value into two i32 types
3192 SDValue Lo, Hi;
3193 std::tie(Lo, Hi) = DAG.SplitScalar(Arg, Dl, MVT::i32, MVT::i32);
3194
3195  // Attach the two i32 halves to the corresponding registers.
3196 RegsToPass.push_back(std::make_pair(VA.getLocReg(), Lo));
3197 RegsToPass.push_back(std::make_pair(NextVA.getLocReg(), Hi));
3198}
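
A standalone model of the split performed above: the 64-bit mask value is viewed as an i64 and broken into low and high 32-bit halves, which end up in the two GPRs chosen by the two CCValAssign slots:

#include <cstdint>
#include <cstdio>
#include <utility>

static std::pair<uint32_t, uint32_t> splitScalar64(uint64_t Mask) {
  uint32_t Lo = static_cast<uint32_t>(Mask);           // goes in VA's register
  uint32_t Hi = static_cast<uint32_t>(Mask >> 32);     // goes in NextVA's register
  return {Lo, Hi};
}

int main() {
  auto [Lo, Hi] = splitScalar64(0x0123456789abcdefULL);
  printf("lo=0x%08x hi=0x%08x\n", Lo, Hi);             // lo=0x89abcdef hi=0x01234567
}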
3199
3200SDValue
3201X86TargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
3202 bool isVarArg,
3203 const SmallVectorImpl<ISD::OutputArg> &Outs,
3204 const SmallVectorImpl<SDValue> &OutVals,
3205 const SDLoc &dl, SelectionDAG &DAG) const {
3206 MachineFunction &MF = DAG.getMachineFunction();
3207 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
3208
3209 // In some cases we need to disable registers from the default CSR list.
3210 // For example, when they are used as return registers (preserve_* and X86's
3211 // regcall) or for argument passing (X86's regcall).
3212 bool ShouldDisableCalleeSavedRegister =
3213 shouldDisableRetRegFromCSR(CallConv) ||
3214 MF.getFunction().hasFnAttribute("no_caller_saved_registers");
3215
3216 if (CallConv == CallingConv::X86_INTR && !Outs.empty())
3217 report_fatal_error("X86 interrupts may not return any value");
3218
3219 SmallVector<CCValAssign, 16> RVLocs;
3220 CCState CCInfo(CallConv, isVarArg, MF, RVLocs, *DAG.getContext());
3221 CCInfo.AnalyzeReturn(Outs, RetCC_X86);
3222
3223 SmallVector<std::pair<Register, SDValue>, 4> RetVals;
3224 for (unsigned I = 0, OutsIndex = 0, E = RVLocs.size(); I != E;
3225 ++I, ++OutsIndex) {
3226 CCValAssign &VA = RVLocs[I];
3227    assert(VA.isRegLoc() && "Can only return in registers!");
3228
3229 // Add the register to the CalleeSaveDisableRegs list.
3230 if (ShouldDisableCalleeSavedRegister)
3231 MF.getRegInfo().disableCalleeSavedRegister(VA.getLocReg());
3232
3233 SDValue ValToCopy = OutVals[OutsIndex];
3234 EVT ValVT = ValToCopy.getValueType();
3235
3236 // Promote values to the appropriate types.
3237 if (VA.getLocInfo() == CCValAssign::SExt)
3238 ValToCopy = DAG.getNode(ISD::SIGN_EXTEND, dl, VA.getLocVT(), ValToCopy);
3239 else if (VA.getLocInfo() == CCValAssign::ZExt)
3240 ValToCopy = DAG.getNode(ISD::ZERO_EXTEND, dl, VA.getLocVT(), ValToCopy);
3241 else if (VA.getLocInfo() == CCValAssign::AExt) {
3242 if (ValVT.isVector() && ValVT.getVectorElementType() == MVT::i1)
3243 ValToCopy = lowerMasksToReg(ValToCopy, VA.getLocVT(), dl, DAG);
3244 else
3245 ValToCopy = DAG.getNode(ISD::ANY_EXTEND, dl, VA.getLocVT(), ValToCopy);
3246 }
3247 else if (VA.getLocInfo() == CCValAssign::BCvt)
3248 ValToCopy = DAG.getBitcast(VA.getLocVT(), ValToCopy);
3249
3250    assert(VA.getLocInfo() != CCValAssign::FPExt &&
3251           "Unexpected FP-extend for return value.");
3252
3253 // Report an error if we have attempted to return a value via an XMM
3254 // register and SSE was disabled.
3255 if (!Subtarget.hasSSE1() && X86::FR32XRegClass.contains(VA.getLocReg())) {
3256 errorUnsupported(DAG, dl, "SSE register return with SSE disabled");
3257 VA.convertToReg(X86::FP0); // Set reg to FP0, avoid hitting asserts.
3258 } else if (!Subtarget.hasSSE2() &&
3259 X86::FR64XRegClass.contains(VA.getLocReg()) &&
3260 ValVT == MVT::f64) {
3261 // When returning a double via an XMM register, report an error if SSE2 is
3262 // not enabled.
3263 errorUnsupported(DAG, dl, "SSE2 register return with SSE2 disabled");
3264 VA.convertToReg(X86::FP0); // Set reg to FP0, avoid hitting asserts.
3265 }
3266
3267 // Returns in ST0/ST1 are handled specially: these are pushed as operands to
3268 // the RET instruction and handled by the FP Stackifier.
3269 if (VA.getLocReg() == X86::FP0 ||
3270 VA.getLocReg() == X86::FP1) {
3271 // If this is a copy from an xmm register to ST(0), use an FPExtend to
3272 // change the value to the FP stack register class.
3273 if (isScalarFPTypeInSSEReg(VA.getValVT()))
3274 ValToCopy = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f80, ValToCopy);
3275 RetVals.push_back(std::make_pair(VA.getLocReg(), ValToCopy));
3276 // Don't emit a copytoreg.
3277 continue;
3278 }
3279
3280 // 64-bit vector (MMX) values are returned in XMM0 / XMM1 except for v1i64
3281 // which is returned in RAX / RDX.
3282 if (Subtarget.is64Bit()) {
3283 if (ValVT == MVT::x86mmx) {
3284 if (VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1) {
3285 ValToCopy = DAG.getBitcast(MVT::i64, ValToCopy);
3286 ValToCopy = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64,
3287 ValToCopy);
3288 // If we don't have SSE2 available, convert to v4f32 so the generated
3289 // register is legal.
3290 if (!Subtarget.hasSSE2())
3291 ValToCopy = DAG.getBitcast(MVT::v4f32, ValToCopy);
3292 }
3293 }
3294 }
3295
3296 if (VA.needsCustom()) {
3297      assert(VA.getValVT() == MVT::v64i1 &&
3298             "Currently the only custom case is when we split v64i1 to 2 regs");
3299
3300 Passv64i1ArgInRegs(dl, DAG, ValToCopy, RetVals, VA, RVLocs[++I],
3301 Subtarget);
3302
3303 // Add the second register to the CalleeSaveDisableRegs list.
3304 if (ShouldDisableCalleeSavedRegister)
3305 MF.getRegInfo().disableCalleeSavedRegister(RVLocs[I].getLocReg());
3306 } else {
3307 RetVals.push_back(std::make_pair(VA.getLocReg(), ValToCopy));
3308 }
3309 }
3310
3311 SDValue Glue;
3312 SmallVector<SDValue, 6> RetOps;
3313 RetOps.push_back(Chain); // Operand #0 = Chain (updated below)
3314 // Operand #1 = Bytes To Pop
3315 RetOps.push_back(DAG.getTargetConstant(FuncInfo->getBytesToPopOnReturn(), dl,
3316 MVT::i32));
3317
3318 // Copy the result values into the output registers.
3319 for (auto &RetVal : RetVals) {
3320 if (RetVal.first == X86::FP0 || RetVal.first == X86::FP1) {
3321 RetOps.push_back(RetVal.second);
3322 continue; // Don't emit a copytoreg.
3323 }
3324
3325 Chain = DAG.getCopyToReg(Chain, dl, RetVal.first, RetVal.second, Glue);
3326 Glue = Chain.getValue(1);
3327 RetOps.push_back(
3328 DAG.getRegister(RetVal.first, RetVal.second.getValueType()));
3329 }
3330
3331 // Swift calling convention does not require we copy the sret argument
3332 // into %rax/%eax for the return, and SRetReturnReg is not set for Swift.
3333
3334 // All x86 ABIs require that for returning structs by value we copy
3335 // the sret argument into %rax/%eax (depending on ABI) for the return.
3336 // We saved the argument into a virtual register in the entry block,
3337 // so now we copy the value out and into %rax/%eax.
3338 //
3339 // Checking Function.hasStructRetAttr() here is insufficient because the IR
3340 // may not have an explicit sret argument. If FuncInfo.CanLowerReturn is
3341 // false, then an sret argument may be implicitly inserted in the SelDAG. In
3342 // either case FuncInfo->setSRetReturnReg() will have been called.
3343 if (Register SRetReg = FuncInfo->getSRetReturnReg()) {
3344 // When we have both sret and another return value, we should use the
3345 // original Chain stored in RetOps[0], instead of the current Chain updated
3346 // in the above loop. If we only have sret, RetOps[0] equals to Chain.
3347
3348 // For the case of sret and another return value, we have
3349 // Chain_0 at the function entry
3350 // Chain_1 = getCopyToReg(Chain_0) in the above loop
3351 // If we use Chain_1 in getCopyFromReg, we will have
3352 // Val = getCopyFromReg(Chain_1)
3353 // Chain_2 = getCopyToReg(Chain_1, Val) from below
3354
3355 // getCopyToReg(Chain_0) will be glued together with
3356 // getCopyToReg(Chain_1, Val) into Unit A, getCopyFromReg(Chain_1) will be
3357 // in Unit B, and we will have cyclic dependency between Unit A and Unit B:
3358 // Data dependency from Unit B to Unit A due to usage of Val in
3359 // getCopyToReg(Chain_1, Val)
3360 // Chain dependency from Unit A to Unit B
3361
3362 // So here, we use RetOps[0] (i.e Chain_0) for getCopyFromReg.
3363 SDValue Val = DAG.getCopyFromReg(RetOps[0], dl, SRetReg,
3364 getPointerTy(MF.getDataLayout()));
3365
3366 Register RetValReg
3367 = (Subtarget.is64Bit() && !Subtarget.isTarget64BitILP32()) ?
3368 X86::RAX : X86::EAX;
3369 Chain = DAG.getCopyToReg(Chain, dl, RetValReg, Val, Glue);
3370 Glue = Chain.getValue(1);
3371
3372 // RAX/EAX now acts like a return value.
3373 RetOps.push_back(
3374 DAG.getRegister(RetValReg, getPointerTy(DAG.getDataLayout())));
3375
3376 // Add the returned register to the CalleeSaveDisableRegs list. Don't do
3377 // this however for preserve_most/preserve_all to minimize the number of
3378 // callee-saved registers for these CCs.
3379 if (ShouldDisableCalleeSavedRegister &&
3380 CallConv != CallingConv::PreserveAll &&
3381 CallConv != CallingConv::PreserveMost)
3382 MF.getRegInfo().disableCalleeSavedRegister(RetValReg);
3383 }
3384
3385 const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
3386 const MCPhysReg *I =
3387 TRI->getCalleeSavedRegsViaCopy(&DAG.getMachineFunction());
3388 if (I) {
3389 for (; *I; ++I) {
3390 if (X86::GR64RegClass.contains(*I))
3391 RetOps.push_back(DAG.getRegister(*I, MVT::i64));
3392 else
3393        llvm_unreachable("Unexpected register class in CSRsViaCopy!");
3394 }
3395 }
3396
3397 RetOps[0] = Chain; // Update chain.
3398
3399 // Add the glue if we have it.
3400 if (Glue.getNode())
3401 RetOps.push_back(Glue);
3402
3403 X86ISD::NodeType opcode = X86ISD::RET_GLUE;
3404 if (CallConv == CallingConv::X86_INTR)
3405 opcode = X86ISD::IRET;
3406 return DAG.getNode(opcode, dl, MVT::Other, RetOps);
3407}
3408
3409bool X86TargetLowering::isUsedByReturnOnly(SDNode *N, SDValue &Chain) const {
3410 if (N->getNumValues() != 1 || !N->hasNUsesOfValue(1, 0))
3411 return false;
3412
3413 SDValue TCChain = Chain;
3414 SDNode *Copy = *N->use_begin();
3415 if (Copy->getOpcode() == ISD::CopyToReg) {
3416 // If the copy has a glue operand, we conservatively assume it isn't safe to
3417 // perform a tail call.
3418 if (Copy->getOperand(Copy->getNumOperands()-1).getValueType() == MVT::Glue)
3419 return false;
3420 TCChain = Copy->getOperand(0);
3421 } else if (Copy->getOpcode() != ISD::FP_EXTEND)
3422 return false;
3423
3424 bool HasRet = false;
3425 for (const SDNode *U : Copy->uses()) {
3426 if (U->getOpcode() != X86ISD::RET_GLUE)
3427 return false;
3428 // If we are returning more than one value, we can definitely
3430    // not make a tail call; see PR19530.
3430 if (U->getNumOperands() > 4)
3431 return false;
3432 if (U->getNumOperands() == 4 &&
3433 U->getOperand(U->getNumOperands() - 1).getValueType() != MVT::Glue)
3434 return false;
3435 HasRet = true;
3436 }
3437
3438 if (!HasRet)
3439 return false;
3440
3441 Chain = TCChain;
3442 return true;
3443}
3444
3445EVT X86TargetLowering::getTypeForExtReturn(LLVMContext &Context, EVT VT,
3446 ISD::NodeType ExtendKind) const {
3447 MVT ReturnMVT = MVT::i32;
3448
3449 bool Darwin = Subtarget.getTargetTriple().isOSDarwin();
3450 if (VT == MVT::i1 || (!Darwin && (VT == MVT::i8 || VT == MVT::i16))) {
3451 // The ABI does not require i1, i8 or i16 to be extended.
3452 //
3453 // On Darwin, there is code in the wild relying on Clang's old behaviour of
3454 // always extending i8/i16 return values, so keep doing that for now.
3455 // (PR26665).
3456 ReturnMVT = MVT::i8;
3457 }
3458
3459 EVT MinVT = getRegisterType(Context, ReturnMVT);
3460 return VT.bitsLT(MinVT) ? MinVT : VT;
3461}
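
A standalone sketch of the rule above, with bit widths in place of EVTs: i1 always gets the 8-bit minimum, i8/i16 get it everywhere except Darwin (where the historical behaviour keeps the 32-bit minimum), and anything already at least as wide as the minimum is returned unchanged:

#include <algorithm>
#include <cstdio>

static unsigned extReturnBits(unsigned ValueBits, bool IsDarwin) {
  unsigned MinBits = 32;
  if (ValueBits == 1 || (!IsDarwin && (ValueBits == 8 || ValueBits == 16)))
    MinBits = 8;
  return std::max(ValueBits, MinBits);
}

int main() {
  printf("%u %u %u\n", extReturnBits(8, false),        // 8
                       extReturnBits(8, true),         // 32
                       extReturnBits(1, true));        // 8
}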
3462
3463/// Reads two 32-bit registers and creates a 64-bit mask value.
3464/// \param VA The current 32-bit value that needs to be assigned.
3465/// \param NextVA The next 32-bit value that needs to be assigned.
3466/// \param Root The parent DAG node.
3467/// \param [in,out] InGlue Represents the SDValue in the parent DAG node used
3468///                        for glue purposes. If the DAG is already using a
3469///                        physical register instead of a virtual one, we
3470///                        should glue our new SDValue to the InGlue SDValue.
3471/// \return a new 64-bit SDValue.
3472static SDValue getv64i1Argument(CCValAssign &VA, CCValAssign &NextVA,
3473 SDValue &Root, SelectionDAG &DAG,
3474 const SDLoc &Dl, const X86Subtarget &Subtarget,
3475 SDValue *InGlue = nullptr) {
3476  assert((Subtarget.hasBWI()) && "Expected AVX512BW target!");
3477  assert(Subtarget.is32Bit() && "Expecting 32 bit target");
3478  assert(VA.getValVT() == MVT::v64i1 &&
3479         "Expecting first location of 64 bit width type");
3480  assert(NextVA.getValVT() == VA.getValVT() &&
3481         "The locations should have the same type");
3482  assert(VA.isRegLoc() && NextVA.isRegLoc() &&
3483         "The values should reside in two registers");
3484
3485 SDValue Lo, Hi;
3486 SDValue ArgValueLo, ArgValueHi;
3487
3488 MachineFunction &MF = DAG.getMachineFunction();
3489 const TargetRegisterClass *RC = &X86::GR32RegClass;
3490
3491 // Read a 32 bit value from the registers.
3492 if (nullptr == InGlue) {
3493 // When no physical register is present,
3494 // create an intermediate virtual register.
3495 Register Reg = MF.addLiveIn(VA.getLocReg(), RC);
3496 ArgValueLo = DAG.getCopyFromReg(Root, Dl, Reg, MVT::i32);
3497 Reg = MF.addLiveIn(NextVA.getLocReg(), RC);
3498 ArgValueHi = DAG.getCopyFromReg(Root, Dl, Reg, MVT::i32);
3499 } else {
3500 // When a physical register is available read the value from it and glue
3501 // the reads together.
3502 ArgValueLo =
3503 DAG.getCopyFromReg(Root, Dl, VA.getLocReg(), MVT::i32, *InGlue);
3504 *InGlue = ArgValueLo.getValue(2);
3505 ArgValueHi =
3506 DAG.getCopyFromReg(Root, Dl, NextVA.getLocReg(), MVT::i32, *InGlue);
3507 *InGlue = ArgValueHi.getValue(2);
3508 }
3509
3510 // Convert the i32 type into v32i1 type.
3511 Lo = DAG.getBitcast(MVT::v32i1, ArgValueLo);
3512
3513 // Convert the i32 type into v32i1 type.
3514 Hi = DAG.getBitcast(MVT::v32i1, ArgValueHi);
3515
3516 // Concatenate the two values together.
3517 return DAG.getNode(ISD::CONCAT_VECTORS, Dl, MVT::v64i1, Lo, Hi);
3518}
3519
3520/// Lowers a register of various sizes (8/16/32/64 bits)
3521/// to a mask value of the expected size (v8i1/v16i1/v32i1/v64i1).
3522/// \returns a DAG node containing the operand after lowering to the mask type.
3523static SDValue lowerRegToMasks(const SDValue &ValArg, const EVT &ValVT,
3524 const EVT &ValLoc, const SDLoc &Dl,
3525 SelectionDAG &DAG) {
3526 SDValue ValReturned = ValArg;
3527
3528 if (ValVT == MVT::v1i1)
3529 return DAG.getNode(ISD::SCALAR_TO_VECTOR, Dl, MVT::v1i1, ValReturned);
3530
3531 if (ValVT == MVT::v64i1) {
3532    // On a 32-bit machine, this case is handled by getv64i1Argument.
3533    assert(ValLoc == MVT::i64 && "Expecting only i64 locations");
3534    // On a 64-bit machine, there is no need to truncate the value, only bitcast it.
3535 } else {
3536 MVT maskLen;
3537 switch (ValVT.getSimpleVT().SimpleTy) {
3538 case MVT::v8i1:
3539 maskLen = MVT::i8;
3540 break;
3541 case MVT::v16i1:
3542 maskLen = MVT::i16;
3543 break;
3544 case MVT::v32i1:
3545 maskLen = MVT::i32;
3546 break;
3547 default:
3548      llvm_unreachable("Expecting a vector of i1 types");
3549 }
3550
3551 ValReturned = DAG.getNode(ISD::TRUNCATE, Dl, maskLen, ValReturned);
3552 }
3553 return DAG.getBitcast(ValVT, ValReturned);
3554}
3555
3556/// Lower the result values of a call into the
3557/// appropriate copies out of appropriate physical registers.
3558///
3559SDValue X86TargetLowering::LowerCallResult(
3560 SDValue Chain, SDValue InGlue, CallingConv::ID CallConv, bool isVarArg,
3561 const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
3562 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals,
3563 uint32_t *RegMask) const {
3564
3565 const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
3566 // Assign locations to each value returned by this call.
3567 SmallVector<CCValAssign, 16> RVLocs;
3568 CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
3569 *DAG.getContext());
3570 CCInfo.AnalyzeCallResult(Ins, RetCC_X86);
3571
3572 // Copy all of the result registers out of their specified physreg.
3573 for (unsigned I = 0, InsIndex = 0, E = RVLocs.size(); I != E;
3574 ++I, ++InsIndex) {
3575 CCValAssign &VA = RVLocs[I];
3576 EVT CopyVT = VA.getLocVT();
3577
3578 // In some calling conventions we need to remove the used registers
3579 // from the register mask.
3580 if (RegMask) {
3581 for (MCSubRegIterator SubRegs(VA.getLocReg(), TRI, /*IncludeSelf=*/true);
3582 SubRegs.isValid(); ++SubRegs)
3583 RegMask[*SubRegs / 32] &= ~(1u << (*SubRegs % 32));
3584 }
3585
3586 // Report an error if there was an attempt to return FP values via XMM
3587 // registers.
3588 if (!Subtarget.hasSSE1() && X86::FR32XRegClass.contains(VA.getLocReg())) {
3589 errorUnsupported(DAG, dl, "SSE register return with SSE disabled");
3590 if (VA.getLocReg() == X86::XMM1)
3591 VA.convertToReg(X86::FP1); // Set reg to FP1, avoid hitting asserts.
3592 else
3593 VA.convertToReg(X86::FP0); // Set reg to FP0, avoid hitting asserts.
3594 } else if (!Subtarget.hasSSE2() &&
3595 X86::FR64XRegClass.contains(VA.getLocReg()) &&
3596 CopyVT == MVT::f64) {
3597 errorUnsupported(DAG, dl, "SSE2 register return with SSE2 disabled");
3598 if (VA.getLocReg() == X86::XMM1)
3599 VA.convertToReg(X86::FP1); // Set reg to FP1, avoid hitting asserts.
3600 else
3601 VA.convertToReg(X86::FP0); // Set reg to FP0, avoid hitting asserts.
3602 }
3603
3604 // If we prefer to use the value in xmm registers, copy it out as f80 and
3605 // use a truncate to move it from fp stack reg to xmm reg.
3606 bool RoundAfterCopy = false;
3607 if ((VA.getLocReg() == X86::FP0 || VA.getLocReg() == X86::FP1) &&
3608 isScalarFPTypeInSSEReg(VA.getValVT())) {
3609 if (!Subtarget.hasX87())
3610 report_fatal_error("X87 register return with X87 disabled");
3611 CopyVT = MVT::f80;
3612 RoundAfterCopy = (CopyVT != VA.getLocVT());
3613 }
3614
3615 SDValue Val;
3616 if (VA.needsCustom()) {
3617      assert(VA.getValVT() == MVT::v64i1 &&
3618             "Currently the only custom case is when we split v64i1 to 2 regs");
3619 Val =
3620 getv64i1Argument(VA, RVLocs[++I], Chain, DAG, dl, Subtarget, &InGlue);
3621 } else {
3622 Chain = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), CopyVT, InGlue)
3623 .getValue(1);
3624 Val = Chain.getValue(0);
3625 InGlue = Chain.getValue(2);
3626 }
3627
3628 if (RoundAfterCopy)
3629 Val = DAG.getNode(ISD::FP_ROUND, dl, VA.getValVT(), Val,
3630 // This truncation won't change the value.
3631 DAG.getIntPtrConstant(1, dl, /*isTarget=*/true));
3632
3633 if (VA.isExtInLoc()) {
3634 if (VA.getValVT().isVector() &&
3635 VA.getValVT().getScalarType() == MVT::i1 &&
3636 ((VA.getLocVT() == MVT::i64) || (VA.getLocVT() == MVT::i32) ||
3637 (VA.getLocVT() == MVT::i16) || (VA.getLocVT() == MVT::i8))) {
3638 // promoting a mask type (v*i1) into a register of type i64/i32/i16/i8
3639 Val = lowerRegToMasks(Val, VA.getValVT(), VA.getLocVT(), dl, DAG);
3640 } else
3641 Val = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), Val);
3642 }
3643
3644 if (VA.getLocInfo() == CCValAssign::BCvt)
3645 Val = DAG.getBitcast(VA.getValVT(), Val);
3646
3647 InVals.push_back(Val);
3648 }
3649
3650 return Chain;
3651}
3652
3653//===----------------------------------------------------------------------===//
3654// C & StdCall & Fast Calling Convention implementation
3655//===----------------------------------------------------------------------===//
3656// The StdCall calling convention is standard for many Windows API
3657// routines. It differs from the C calling convention only slightly: the
3658// callee cleans up the stack rather than the caller, and symbols are also
3659// decorated in some fancy way :) It doesn't support any vector arguments.
3660// For info on fast calling convention see Fast Calling Convention (tail call)
3661// implementation LowerX86_32FastCCCallTo.
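// As a rough illustration (background, not code from this file): a 32-bit
// stdcall function such as
//   int __stdcall Foo(int a, int b);
// is conventionally emitted with a decorated symbol like _Foo@8, where the
// @8 suffix records the 8 argument bytes the callee pops with `ret 8`.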
3662
3663/// Determines whether Args, either a set of outgoing arguments to a call, or a
3664/// set of incoming args of a call, contains an sret pointer that the callee
3665/// pops
3666template <typename T>
3667static bool hasCalleePopSRet(const SmallVectorImpl<T> &Args,
3668 const X86Subtarget &Subtarget) {
3669 // Not C++20 (yet), so no concepts available.
3670 static_assert(std::is_same_v<T, ISD::OutputArg> ||
3671 std::is_same_v<T, ISD::InputArg>,
3672 "requires ISD::OutputArg or ISD::InputArg");
3673
3674 // Only 32-bit pops the sret. It's a 64-bit world these days, so early-out
3675 // for most compilations.
3676 if (!Subtarget.is32Bit())
3677 return false;
3678
3679 if (Args.empty())
3680 return false;
3681
3682 // Most calls do not have an sret argument, check the arg next.
3683 const ISD::ArgFlagsTy &Flags = Args[0].Flags;
3684 if (!Flags.isSRet() || Flags.isInReg())
3685 return false;
3686
3687  // The MSVC ABI does not pop the sret.
3688 if (Subtarget.getTargetTriple().isOSMSVCRT())
3689 return false;
3690
3691 // MCUs don't pop the sret
3692 if (Subtarget.isTargetMCU())
3693 return false;
3694
3695 // Callee pops argument
3696 return true;
3697}
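// Illustrative consequence (background, not code from this file): when this
// returns true for a 32-bit callee that takes a hidden sret pointer, the
// callee pops that 4-byte pointer itself (e.g. with `ret $4`), which is why
// the argument lowering below uses a 4-byte callee-pop amount in that case.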
3698
3699/// Make a copy of an aggregate at address specified by "Src" to address
3700/// "Dst" with size and alignment information specified by the specific
3701/// parameter attribute. The copy will be passed as a byval function parameter.
3702static SDValue CreateCopyOfByValArgument(SDValue Src, SDValue Dst,
3703 SDValue Chain, ISD::ArgFlagsTy Flags,
3704 SelectionDAG &DAG, const SDLoc &dl) {
3705 SDValue SizeNode = DAG.getIntPtrConstant(Flags.getByValSize(), dl);
3706
3707 return DAG.getMemcpy(
3708 Chain, dl, Dst, Src, SizeNode, Flags.getNonZeroByValAlign(),
3709 /*isVolatile*/ false, /*AlwaysInline=*/true,
3710 /*isTailCall*/ false, MachinePointerInfo(), MachinePointerInfo());
3711}
3712
3713/// Return true if the calling convention is one that we can guarantee TCO for.
3714static bool canGuaranteeTCO(CallingConv::ID CC) {
3715 return (CC == CallingConv::Fast || CC == CallingConv::GHC ||
3716 CC == CallingConv::X86_RegCall || CC == CallingConv::HiPE ||
3717 CC == CallingConv::Tail || CC == CallingConv::SwiftTail);
3718}
3719
3720/// Return true if we might ever do TCO for calls with this calling convention.
3721static bool mayTailCallThisCC(CallingConv::ID CC) {
3722 switch (CC) {
3723 // C calling conventions:
3724 case CallingConv::C:
3725 case CallingConv::Win64:
3726 case CallingConv::X86_64_SysV:
3727 // Callee pop conventions:
3728 case CallingConv::X86_ThisCall:
3729 case CallingConv::X86_StdCall:
3730 case CallingConv::X86_VectorCall:
3731 case CallingConv::X86_FastCall:
3732 // Swift:
3733 case CallingConv::Swift:
3734 return true;
3735 default:
3736 return canGuaranteeTCO(CC);
3737 }
3738}
3739
3740/// Return true if the function is being made into a tailcall target by
3741/// changing its ABI.
3742static bool shouldGuaranteeTCO(CallingConv::ID CC, bool GuaranteedTailCallOpt) {
3743 return (GuaranteedTailCallOpt && canGuaranteeTCO(CC)) ||
3744 CC == CallingConv::Tail || CC == CallingConv::SwiftTail;
3745}
3746
3747bool X86TargetLowering::mayBeEmittedAsTailCall(const CallInst *CI) const {
3748 if (!CI->isTailCall())
3749 return false;
3750
3751 CallingConv::ID CalleeCC = CI->getCallingConv();
3752 if (!mayTailCallThisCC(CalleeCC))
3753 return false;
3754
3755 return true;
3756}
3757
3758SDValue
3759X86TargetLowering::LowerMemArgument(SDValue Chain, CallingConv::ID CallConv,
3760 const SmallVectorImpl<ISD::InputArg> &Ins,
3761 const SDLoc &dl, SelectionDAG &DAG,
3762 const CCValAssign &VA,
3763 MachineFrameInfo &MFI, unsigned i) const {
3764 // Create the nodes corresponding to a load from this parameter slot.
3765 ISD::ArgFlagsTy Flags = Ins[i].Flags;
3766 bool AlwaysUseMutable = shouldGuaranteeTCO(
3767 CallConv, DAG.getTarget().Options.GuaranteedTailCallOpt);
3768 bool isImmutable = !AlwaysUseMutable && !Flags.isByVal();
3769 EVT ValVT;
3770 MVT PtrVT = getPointerTy(DAG.getDataLayout());
3771
3772  // If the value is passed by pointer, we have the address passed instead of the
3773  // value itself. No need to extend if the mask value and location share the same
3774 // absolute size.
3775 bool ExtendedInMem =
3776 VA.isExtInLoc() && VA.getValVT().getScalarType() == MVT::i1 &&
3777 VA.getValVT().getSizeInBits() != VA.getLocVT().getSizeInBits();
3778
3779 if (VA.getLocInfo() == CCValAssign::Indirect || ExtendedInMem)
3780 ValVT = VA.getLocVT();
3781 else
3782 ValVT = VA.getValVT();
3783
3784 // FIXME: For now, all byval parameter objects are marked mutable. This can be
3785 // changed with more analysis.
3786  // In case of tail call optimization, mark all arguments mutable, since they
3787  // could be overwritten by the lowering of arguments in case of a tail call.
3788 if (Flags.isByVal()) {
3789 unsigned Bytes = Flags.getByValSize();
3790 if (Bytes == 0) Bytes = 1; // Don't create zero-sized stack objects.
3791
3792 // FIXME: For now, all byval parameter objects are marked as aliasing. This
3793 // can be improved with deeper analysis.
3794 int FI = MFI.CreateFixedObject(Bytes, VA.getLocMemOffset(), isImmutable,
3795 /*isAliased=*/true);
3796 return DAG.getFrameIndex(FI, PtrVT);
3797 }
3798
3799 EVT ArgVT = Ins[i].ArgVT;
3800
3801 // If this is a vector that has been split into multiple parts, and the
3802  // scalar size of the parts doesn't match the vector element size, then we can't
3803 // elide the copy. The parts will have padding between them instead of being
3804 // packed like a vector.
3805 bool ScalarizedAndExtendedVector =
3806 ArgVT.isVector() && !VA.getLocVT().isVector() &&
3807 VA.getLocVT().getSizeInBits() != ArgVT.getScalarSizeInBits();
3808
3809 // This is an argument in memory. We might be able to perform copy elision.
3810 // If the argument is passed directly in memory without any extension, then we
3811 // can perform copy elision. Large vector types, for example, may be passed
3812 // indirectly by pointer.
3813 if (Flags.isCopyElisionCandidate() &&
3814 VA.getLocInfo() != CCValAssign::Indirect && !ExtendedInMem &&
3815 !ScalarizedAndExtendedVector) {
3816 SDValue PartAddr;
3817 if (Ins[i].PartOffset == 0) {
3818 // If this is a one-part value or the first part of a multi-part value,
3819 // create a stack object for the entire argument value type and return a
3820 // load from our portion of it. This assumes that if the first part of an
3821 // argument is in memory, the rest will also be in memory.
3822 int FI = MFI.CreateFixedObject(ArgVT.getStoreSize(), VA.getLocMemOffset(),
3823 /*IsImmutable=*/false);
3824 PartAddr = DAG.getFrameIndex(FI, PtrVT);
3825 return DAG.getLoad(
3826 ValVT, dl, Chain, PartAddr,
3827 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI));
3828 } else {
3829 // This is not the first piece of an argument in memory. See if there is
3830 // already a fixed stack object including this offset. If so, assume it
3831 // was created by the PartOffset == 0 branch above and create a load from
3832 // the appropriate offset into it.
3833 int64_t PartBegin = VA.getLocMemOffset();
3834 int64_t PartEnd = PartBegin + ValVT.getSizeInBits() / 8;
3835 int FI = MFI.getObjectIndexBegin();
3836 for (; MFI.isFixedObjectIndex(FI); ++FI) {
3837 int64_t ObjBegin = MFI.getObjectOffset(FI);
3838 int64_t ObjEnd = ObjBegin + MFI.getObjectSize(FI);
3839 if (ObjBegin <= PartBegin && PartEnd <= ObjEnd)
3840 break;
3841 }
3842 if (MFI.isFixedObjectIndex(FI)) {
3843 SDValue Addr =
3844 DAG.getNode(ISD::ADD, dl, PtrVT, DAG.getFrameIndex(FI, PtrVT),
3845 DAG.getIntPtrConstant(Ins[i].PartOffset, dl));
3846 return DAG.getLoad(
3847 ValVT, dl, Chain, Addr,
3848 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI,
3849 Ins[i].PartOffset));
3850 }
3851 }
3852 }
3853
3854 int FI = MFI.CreateFixedObject(ValVT.getSizeInBits() / 8,
3855 VA.getLocMemOffset(), isImmutable);
3856
3857 // Set SExt or ZExt flag.
3858 if (VA.getLocInfo() == CCValAssign::ZExt) {
3859 MFI.setObjectZExt(FI, true);
3860 } else if (VA.getLocInfo() == CCValAssign::SExt) {
3861 MFI.setObjectSExt(FI, true);
3862 }
3863
3864 MaybeAlign Alignment;
3865 if (Subtarget.isTargetWindowsMSVC() && !Subtarget.is64Bit() &&
3866 ValVT != MVT::f80)
3867 Alignment = MaybeAlign(4);
3868 SDValue FIN = DAG.getFrameIndex(FI, PtrVT);
3869 SDValue Val = DAG.getLoad(
3870 ValVT, dl, Chain, FIN,
3871 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI),
3872 Alignment);
3873 return ExtendedInMem
3874 ? (VA.getValVT().isVector()
3875 ? DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VA.getValVT(), Val)
3876 : DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), Val))
3877 : Val;
3878}
3879
3880// FIXME: Get this from tablegen.
3881static ArrayRef<MCPhysReg> get64BitArgumentGPRs(CallingConv::ID CallConv,
3882 const X86Subtarget &Subtarget) {
3883  assert(Subtarget.is64Bit());
3884
3885 if (Subtarget.isCallingConvWin64(CallConv)) {
3886 static const MCPhysReg GPR64ArgRegsWin64[] = {
3887 X86::RCX, X86::RDX, X86::R8, X86::R9
3888 };
3889 return ArrayRef(std::begin(GPR64ArgRegsWin64), std::end(GPR64ArgRegsWin64));
3890 }
3891
3892 static const MCPhysReg GPR64ArgRegs64Bit[] = {
3893 X86::RDI, X86::RSI, X86::RDX, X86::RCX, X86::R8, X86::R9
3894 };
3895 return ArrayRef(std::begin(GPR64ArgRegs64Bit), std::end(GPR64ArgRegs64Bit));
3896}
3897
3898// FIXME: Get this from tablegen.
3899static ArrayRef<MCPhysReg> get64BitArgumentXMMs(MachineFunction &MF,
3900 CallingConv::ID CallConv,
3901 const X86Subtarget &Subtarget) {
3902  assert(Subtarget.is64Bit());
3903 if (Subtarget.isCallingConvWin64(CallConv)) {
3904 // The XMM registers which might contain var arg parameters are shadowed
3905    // by their paired GPRs, so we only need to save the GPRs to their home
3906 // slots.
3907 // TODO: __vectorcall will change this.
3908 return std::nullopt;
3909 }
3910
3911 bool isSoftFloat = Subtarget.useSoftFloat();
3912 if (isSoftFloat || !Subtarget.hasSSE1())
3913 // Kernel mode asks for SSE to be disabled, so there are no XMM argument
3914 // registers.
3915 return std::nullopt;
3916
3917 static const MCPhysReg XMMArgRegs64Bit[] = {
3918 X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3,
3919 X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7
3920 };
3921 return ArrayRef(std::begin(XMMArgRegs64Bit), std::end(XMMArgRegs64Bit));
3922}
3923
3924#ifndef NDEBUG
3925static bool isSortedByValueNo(ArrayRef<CCValAssign> ArgLocs) {
3926 return llvm::is_sorted(
3927 ArgLocs, [](const CCValAssign &A, const CCValAssign &B) -> bool {
3928 return A.getValNo() < B.getValNo();
3929 });
3930}
3931#endif
3932
3933namespace {
3934/// This is a helper class for lowering variable arguments parameters.
3935class VarArgsLoweringHelper {
3936public:
3937 VarArgsLoweringHelper(X86MachineFunctionInfo *FuncInfo, const SDLoc &Loc,
3938 SelectionDAG &DAG, const X86Subtarget &Subtarget,
3939 CallingConv::ID CallConv, CCState &CCInfo)
3940 : FuncInfo(FuncInfo), DL(Loc), DAG(DAG), Subtarget(Subtarget),
3941 TheMachineFunction(DAG.getMachineFunction()),
3942 TheFunction(TheMachineFunction.getFunction()),
3943 FrameInfo(TheMachineFunction.getFrameInfo()),
3944 FrameLowering(*Subtarget.getFrameLowering()),
3945 TargLowering(DAG.getTargetLoweringInfo()), CallConv(CallConv),
3946 CCInfo(CCInfo) {}
3947
3948 // Lower variable arguments parameters.
3949 void lowerVarArgsParameters(SDValue &Chain, unsigned StackSize);
3950
3951private:
3952 void createVarArgAreaAndStoreRegisters(SDValue &Chain, unsigned StackSize);
3953
3954 void forwardMustTailParameters(SDValue &Chain);
3955
3956 bool is64Bit() const { return Subtarget.is64Bit(); }
3957 bool isWin64() const { return Subtarget.isCallingConvWin64(CallConv); }
3958
3959 X86MachineFunctionInfo *FuncInfo;
3960 const SDLoc &DL;
3961 SelectionDAG &DAG;
3962 const X86Subtarget &Subtarget;
3963 MachineFunction &TheMachineFunction;
3964 const Function &TheFunction;
3965 MachineFrameInfo &FrameInfo;
3966 const TargetFrameLowering &FrameLowering;
3967 const TargetLowering &TargLowering;
3968 CallingConv::ID CallConv;
3969 CCState &CCInfo;
3970};
3971} // namespace
3972
3973void VarArgsLoweringHelper::createVarArgAreaAndStoreRegisters(
3974 SDValue &Chain, unsigned StackSize) {
3975  // If the function takes a variable number of arguments, make a frame index for
3976 // the start of the first vararg value... for expansion of llvm.va_start. We
3977 // can skip this if there are no va_start calls.
3978 if (is64Bit() || (CallConv != CallingConv::X86_FastCall &&
3979 CallConv != CallingConv::X86_ThisCall)) {
3980 FuncInfo->setVarArgsFrameIndex(
3981 FrameInfo.CreateFixedObject(1, StackSize, true));
3982 }
3983
3984 // 64-bit calling conventions support varargs and register parameters, so we
3985 // have to do extra work to spill them in the prologue.
3986 if (is64Bit()) {
3987 // Find the first unallocated argument registers.
3988 ArrayRef<MCPhysReg> ArgGPRs = get64BitArgumentGPRs(CallConv, Subtarget);
3989 ArrayRef<MCPhysReg> ArgXMMs =
3990 get64BitArgumentXMMs(TheMachineFunction, CallConv, Subtarget);
3991 unsigned NumIntRegs = CCInfo.getFirstUnallocated(ArgGPRs);
3992 unsigned NumXMMRegs = CCInfo.getFirstUnallocated(ArgXMMs);
3993
3994    assert(!(NumXMMRegs && !Subtarget.hasSSE1()) &&
3995           "SSE register cannot be used when SSE is disabled!");
3996
3997 if (isWin64()) {
3998 // Get to the caller-allocated home save location. Add 8 to account
3999 // for the return address.
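      // Well-known Win64 layout this relies on (background note): at function
      // entry [rsp] holds the return address and the caller-allocated 32-byte
      // home area for RCX/RDX/R8/R9 starts at [rsp + 8], so register N's home
      // slot sits at HomeOffset + N * 8.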
4000 int HomeOffset = FrameLowering.getOffsetOfLocalArea() + 8;
4001 FuncInfo->setRegSaveFrameIndex(
4002 FrameInfo.CreateFixedObject(1, NumIntRegs * 8 + HomeOffset, false));
4003 // Fixup to set vararg frame on shadow area (4 x i64).
4004 if (NumIntRegs < 4)
4005 FuncInfo->setVarArgsFrameIndex(FuncInfo->getRegSaveFrameIndex());
4006 } else {
4007 // For X86-64, if there are vararg parameters that are passed via
4008 // registers, then we must store them to their spots on the stack so
4009 // they may be loaded by dereferencing the result of va_next.
4010 FuncInfo->setVarArgsGPOffset(NumIntRegs * 8);
4011 FuncInfo->setVarArgsFPOffset(ArgGPRs.size() * 8 + NumXMMRegs * 16);
4012 FuncInfo->setRegSaveFrameIndex(FrameInfo.CreateStackObject(
4013 ArgGPRs.size() * 8 + ArgXMMs.size() * 16, Align(16), false));
4014 }
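    // Background sketch of the register save area created above (SysV AMD64
    // ABI): 6 GPR slots (48 bytes) followed by 8 XMM slots (128 bytes), with
    // va_list's gp_offset advancing in 8-byte steps from 0 and fp_offset
    // starting at 48; for an integer argument, va_arg roughly does
    //   if (gp_offset < 48) { arg = *(reg_save_area + gp_offset); gp_offset += 8; }
    //   else                { arg = *overflow_arg_area; overflow_arg_area += 8; }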
4015
4016 SmallVector<SDValue, 6>
4017 LiveGPRs; // list of SDValue for GPR registers keeping live input value
4018 SmallVector<SDValue, 8> LiveXMMRegs; // list of SDValue for XMM registers
4019 // keeping live input value
4020 SDValue ALVal; // if applicable keeps SDValue for %al register
4021
4022 // Gather all the live in physical registers.
4023 for (MCPhysReg Reg : ArgGPRs.slice(NumIntRegs)) {
4024 Register GPR = TheMachineFunction.addLiveIn(Reg, &X86::GR64RegClass);
4025 LiveGPRs.push_back(DAG.getCopyFromReg(Chain, DL, GPR, MVT::i64));
4026 }
4027 const auto &AvailableXmms = ArgXMMs.slice(NumXMMRegs);
4028 if (!AvailableXmms.empty()) {
4029 Register AL = TheMachineFunction.addLiveIn(X86::AL, &X86::GR8RegClass);
4030 ALVal = DAG.getCopyFromReg(Chain, DL, AL, MVT::i8);
4031 for (MCPhysReg Reg : AvailableXmms) {
4032        // FastRegisterAllocator spills virtual registers at basic
4033        // block boundaries. That leads to uses of XMM registers
4034        // outside of the check for %al. Pass physical registers to
4035        // VASTART_SAVE_XMM_REGS to avoid unnecessary spilling.
4036 TheMachineFunction.getRegInfo().addLiveIn(Reg);
4037 LiveXMMRegs.push_back(DAG.getRegister(Reg, MVT::v4f32));
4038 }
4039 }
4040
4041 // Store the integer parameter registers.
4042 SmallVector<SDValue, 8> MemOps;
4043 SDValue RSFIN =
4044 DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(),
4045 TargLowering.getPointerTy(DAG.getDataLayout()));
4046 unsigned Offset = FuncInfo->getVarArgsGPOffset();
4047 for (SDValue Val : LiveGPRs) {
4048 SDValue FIN = DAG.getNode(ISD::ADD, DL,
4049 TargLowering.getPointerTy(DAG.getDataLayout()),
4050 RSFIN, DAG.getIntPtrConstant(Offset, DL));
4051 SDValue Store =
4052 DAG.getStore(Val.getValue(1), DL, Val, FIN,
4053 MachinePointerInfo::getFixedStack(
4054 DAG.getMachineFunction(),
4055 FuncInfo->getRegSaveFrameIndex(), Offset));
4056 MemOps.push_back(Store);
4057 Offset += 8;
4058 }
4059
4060 // Now store the XMM (fp + vector) parameter registers.
4061 if (!LiveXMMRegs.empty()) {
4062 SmallVector<SDValue, 12> SaveXMMOps;
4063 SaveXMMOps.push_back(Chain);
4064 SaveXMMOps.push_back(ALVal);
4065 SaveXMMOps.push_back(RSFIN);
4066 SaveXMMOps.push_back(
4067 DAG.getTargetConstant(FuncInfo->getVarArgsFPOffset(), DL, MVT::i32));
4068 llvm::append_range(SaveXMMOps, LiveXMMRegs);
4069 MachineMemOperand *StoreMMO =
4070 DAG.getMachineFunction().getMachineMemOperand(
4071 MachinePointerInfo::getFixedStack(
4072 DAG.getMachineFunction(), FuncInfo->getRegSaveFrameIndex(),
4073 Offset),
4074 MachineMemOperand::MOStore, 128, Align(16));
4075 MemOps.push_back(DAG.getMemIntrinsicNode(X86ISD::VASTART_SAVE_XMM_REGS,
4076 DL, DAG.getVTList(MVT::Other),
4077 SaveXMMOps, MVT::i8, StoreMMO));
4078 }
4079
4080 if (!MemOps.empty())
4081 Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOps);
4082 }
4083}
4084
4085void VarArgsLoweringHelper::forwardMustTailParameters(SDValue &Chain) {
4086 // Find the largest legal vector type.
4087 MVT VecVT = MVT::Other;
4088 // FIXME: Only some x86_32 calling conventions support AVX512.
4089 if (Subtarget.useAVX512Regs() &&
4090 (is64Bit() || (CallConv == CallingConv::X86_VectorCall ||
4091 CallConv == CallingConv::Intel_OCL_BI)))
4092 VecVT = MVT::v16f32;
4093 else if (Subtarget.hasAVX())
4094 VecVT = MVT::v8f32;
4095 else if (Subtarget.hasSSE2())
4096 VecVT = MVT::v4f32;
4097
4098 // We forward some GPRs and some vector types.
4099 SmallVector<MVT, 2> RegParmTypes;
4100 MVT IntVT = is64Bit() ? MVT::i64 : MVT::i32;
4101 RegParmTypes.push_back(IntVT);
4102 if (VecVT != MVT::Other)
4103 RegParmTypes.push_back(VecVT);
4104
4105 // Compute the set of forwarded registers. The rest are scratch.
4106 SmallVectorImpl<ForwardedRegister> &Forwards =
4107 FuncInfo->getForwardedMustTailRegParms();
4108 CCInfo.analyzeMustTailForwardedRegisters(Forwards, RegParmTypes, CC_X86);
4109
4110 // Forward AL for SysV x86_64 targets, since it is used for varargs.
4111 if (is64Bit() && !isWin64() && !CCInfo.isAllocated(X86::AL)) {
4112 Register ALVReg = TheMachineFunction.addLiveIn(X86::AL, &X86::GR8RegClass);
4113 Forwards.push_back(ForwardedRegister(ALVReg, X86::AL, MVT::i8));
4114 }
4115
4116 // Copy all forwards from physical to virtual registers.
4117 for (ForwardedRegister &FR : Forwards) {
4118 // FIXME: Can we use a less constrained schedule?
4119 SDValue RegVal = DAG.getCopyFromReg(Chain, DL, FR.VReg, FR.VT);
4120 FR.VReg = TheMachineFunction.getRegInfo().createVirtualRegister(
4121 TargLowering.getRegClassFor(FR.VT));
4122 Chain = DAG.getCopyToReg(Chain, DL, FR.VReg, RegVal);
4123 }
4124}
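// Illustrative use (background note): a musttail call inside a variadic
// function forwards the caller's "..." arguments, so every register that may
// carry a variadic argument (including %al on SysV x86-64) must still hold
// its incoming value at the tail call; the forwarded-register copies above
// provide exactly that.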
4125
4126void VarArgsLoweringHelper::lowerVarArgsParameters(SDValue &Chain,
4127 unsigned StackSize) {
4128  // Set FrameIndex to the 0xAAAAAAA value to mark the unset state.
4129  // If necessary, it will be set to the correct value later.
4130 FuncInfo->setVarArgsFrameIndex(0xAAAAAAA);
4131 FuncInfo->setRegSaveFrameIndex(0xAAAAAAA);
4132
4133 if (FrameInfo.hasVAStart())
4134 createVarArgAreaAndStoreRegisters(Chain, StackSize);
4135
4136 if (FrameInfo.hasMustTailInVarArgFunc())
4137 forwardMustTailParameters(Chain);
4138}
4139
4140SDValue X86TargetLowering::LowerFormalArguments(
4141 SDValue Chain, CallingConv::ID CallConv, bool IsVarArg,
4142 const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
4143 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
4144 MachineFunction &MF = DAG.getMachineFunction();
4145 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
4146
4147 const Function &F = MF.getFunction();
4148 if (F.hasExternalLinkage() && Subtarget.isTargetCygMing() &&
4149 F.getName() == "main")
4150 FuncInfo->setForceFramePointer(true);
4151
4152 MachineFrameInfo &MFI = MF.getFrameInfo();
4153 bool Is64Bit = Subtarget.is64Bit();
4154 bool IsWin64 = Subtarget.isCallingConvWin64(CallConv);
4155
4156  assert(
4157      !(IsVarArg && canGuaranteeTCO(CallConv)) &&
4158      "Var args not supported with calling conv' regcall, fastcc, ghc or hipe");
4159
4160 // Assign locations to all of the incoming arguments.
4161 SmallVector<CCValAssign, 16> ArgLocs;
4162 CCState CCInfo(CallConv, IsVarArg, MF, ArgLocs, *DAG.getContext());
4163
4164 // Allocate shadow area for Win64.
4165 if (IsWin64)
4166 CCInfo.AllocateStack(32, Align(8));
4167
4168 CCInfo.AnalyzeArguments(Ins, CC_X86);
4169
4170 // In vectorcall calling convention a second pass is required for the HVA
4171 // types.
4172 if (CallingConv::X86_VectorCall == CallConv) {
4173 CCInfo.AnalyzeArgumentsSecondPass(Ins, CC_X86);
4174 }
4175
4176  // The next loop assumes that the locations are in the same order as the
4177  // input arguments.
4178  assert(isSortedByValueNo(ArgLocs) &&
4179         "Argument Location list must be sorted before lowering");
4180
4181 SDValue ArgValue;
4182 for (unsigned I = 0, InsIndex = 0, E = ArgLocs.size(); I != E;
4183 ++I, ++InsIndex) {
4184    assert(InsIndex < Ins.size() && "Invalid Ins index");
4185 CCValAssign &VA = ArgLocs[I];
4186
4187 if (VA.isRegLoc()) {
4188 EVT RegVT = VA.getLocVT();
4189 if (VA.needsCustom()) {
4190        assert(
4191            VA.getValVT() == MVT::v64i1 &&
4192            "Currently the only custom case is when we split v64i1 to 2 regs");
4193
4194 // v64i1 values, in regcall calling convention, that are
4195 // compiled to 32 bit arch, are split up into two registers.
4196 ArgValue =
4197 getv64i1Argument(VA, ArgLocs[++I], Chain, DAG, dl, Subtarget);
4198 } else {
4199 const TargetRegisterClass *RC;
4200 if (RegVT == MVT::i8)
4201 RC = &X86::GR8RegClass;
4202 else if (RegVT == MVT::i16)
4203 RC = &X86::GR16RegClass;
4204 else if (RegVT == MVT::i32)
4205 RC = &X86::GR32RegClass;
4206 else if (Is64Bit && RegVT == MVT::i64)
4207 RC = &X86::GR64RegClass;
4208 else if (RegVT == MVT::f16)
4209 RC = Subtarget.hasAVX512() ? &X86::FR16XRegClass : &X86::FR16RegClass;
4210 else if (RegVT == MVT::f32)
4211 RC = Subtarget.hasAVX512() ? &X86::FR32XRegClass : &X86::FR32RegClass;
4212 else if (RegVT == MVT::f64)
4213 RC = Subtarget.hasAVX512() ? &X86::FR64XRegClass : &X86::FR64RegClass;
4214 else if (RegVT == MVT::f80)
4215 RC = &X86::RFP80RegClass;
4216 else if (RegVT == MVT::f128)
4217 RC = &X86::VR128RegClass;
4218 else if (RegVT.is512BitVector())
4219 RC = &X86::VR512RegClass;
4220 else if (RegVT.is256BitVector())
4221 RC = Subtarget.hasVLX() ? &X86::VR256XRegClass : &X86::VR256RegClass;
4222 else if (RegVT.is128BitVector())
4223 RC = Subtarget.hasVLX() ? &X86::VR128XRegClass : &X86::VR128RegClass;
4224 else if (RegVT == MVT::x86mmx)
4225 RC = &X86::VR64RegClass;
4226 else if (RegVT == MVT::v1i1)
4227 RC = &X86::VK1RegClass;
4228 else if (RegVT == MVT::v8i1)
4229 RC = &X86::VK8RegClass;
4230 else if (RegVT == MVT::v16i1)
4231 RC = &X86::VK16RegClass;
4232 else if (RegVT == MVT::v32i1)
4233 RC = &X86::VK32RegClass;
4234 else if (RegVT == MVT::v64i1)
4235 RC = &X86::VK64RegClass;
4236 else
4237        llvm_unreachable("Unknown argument type!");
4238
4239 Register Reg = MF.addLiveIn(VA.getLocReg(), RC);
4240 ArgValue = DAG.getCopyFromReg(Chain, dl, Reg, RegVT);
4241 }
4242
4243 // If this is an 8 or 16-bit value, it is really passed promoted to 32
4244 // bits. Insert an assert[sz]ext to capture this, then truncate to the
4245 // right size.
4246 if (VA.getLocInfo() == CCValAssign::SExt)
4247 ArgValue = DAG.getNode(ISD::AssertSext, dl, RegVT, ArgValue,
4248 DAG.getValueType(VA.getValVT()));
4249 else if (VA.getLocInfo() == CCValAssign::ZExt)
4250 ArgValue = DAG.getNode(ISD::AssertZext, dl, RegVT, ArgValue,
4251 DAG.getValueType(VA.getValVT()));
4252 else if (VA.getLocInfo() == CCValAssign::BCvt)
4253 ArgValue = DAG.getBitcast(VA.getValVT(), ArgValue);
4254
4255 if (VA.isExtInLoc()) {
4256 // Handle MMX values passed in XMM regs.
4257 if (RegVT.isVector() && VA.getValVT().getScalarType() != MVT::i1)
4258 ArgValue = DAG.getNode(X86ISD::MOVDQ2Q, dl, VA.getValVT(), ArgValue);
4259 else if (VA.getValVT().isVector() &&
4260 VA.getValVT().getScalarType() == MVT::i1 &&
4261 ((VA.getLocVT() == MVT::i64) || (VA.getLocVT() == MVT::i32) ||
4262 (VA.getLocVT() == MVT::i16) || (VA.getLocVT() == MVT::i8))) {
4263 // Promoting a mask type (v*i1) into a register of type i64/i32/i16/i8
4264 ArgValue = lowerRegToMasks(ArgValue, VA.getValVT(), RegVT, dl, DAG);
4265 } else
4266 ArgValue = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), ArgValue);
4267 }
4268 } else {
4269      assert(VA.isMemLoc());
4270 ArgValue =
4271 LowerMemArgument(Chain, CallConv, Ins, dl, DAG, VA, MFI, InsIndex);
4272 }
4273
4274    // If the value is passed via a pointer, do a load.
4275 if (VA.getLocInfo() == CCValAssign::Indirect && !Ins[I].Flags.isByVal())
4276 ArgValue =
4277 DAG.getLoad(VA.getValVT(), dl, Chain, ArgValue, MachinePointerInfo());
4278
4279 InVals.push_back(ArgValue);
4280 }
4281
4282 for (unsigned I = 0, E = Ins.size(); I != E; ++I) {
4283 if (Ins[I].Flags.isSwiftAsync()) {
4284 auto X86FI = MF.getInfo<X86MachineFunctionInfo>();
4285 if (Subtarget.is64Bit())
4286 X86FI->setHasSwiftAsyncContext(true);
4287 else {
4288 int FI = MF.getFrameInfo().CreateStackObject(4, Align(4), false);
4289 X86FI->setSwiftAsyncContextFrameIdx(FI);
4290 SDValue St = DAG.getStore(DAG.getEntryNode(), dl, InVals[I],
4291 DAG.getFrameIndex(FI, MVT::i32),
4292 MachinePointerInfo::getFixedStack(MF, FI));
4293 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, St, Chain);
4294 }
4295 }
4296
4297 // Swift calling convention does not require we copy the sret argument
4298 // into %rax/%eax for the return. We don't set SRetReturnReg for Swift.
4299 if (CallConv == CallingConv::Swift || CallConv == CallingConv::SwiftTail)
4300 continue;
4301
4302 // All x86 ABIs require that for returning structs by value we copy the
4303 // sret argument into %rax/%eax (depending on ABI) for the return. Save
4304 // the argument into a virtual register so that we can access it from the
4305 // return points.
4306 if (Ins[I].Flags.isSRet()) {
4307      assert(!FuncInfo->getSRetReturnReg() &&
4308             "SRet return has already been set");
4309 MVT PtrTy = getPointerTy(DAG.getDataLayout());
4310 Register Reg =
4311 MF.getRegInfo().createVirtualRegister(getRegClassFor(PtrTy));
4312 FuncInfo->setSRetReturnReg(Reg);
4313 SDValue Copy = DAG.getCopyToReg(DAG.getEntryNode(), dl, Reg, InVals[I]);
4314 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Copy, Chain);
4315 break;
4316 }
4317 }
4318
4319 unsigned StackSize = CCInfo.getNextStackOffset();
4320 // Align stack specially for tail calls.
4321 if (shouldGuaranteeTCO(CallConv,
4322 MF.getTarget().Options.GuaranteedTailCallOpt))
4323 StackSize = GetAlignedArgumentStackSize(StackSize, DAG);
4324
4325 if (IsVarArg)
4326 VarArgsLoweringHelper(FuncInfo, dl, DAG, Subtarget, CallConv, CCInfo)
4327 .lowerVarArgsParameters(Chain, StackSize);
4328
4329 // Some CCs need callee pop.
4330 if (X86::isCalleePop(CallConv, Is64Bit, IsVarArg,
4331 MF.getTarget().Options.GuaranteedTailCallOpt)) {
4332 FuncInfo->setBytesToPopOnReturn(StackSize); // Callee pops everything.
4333 } else if (CallConv == CallingConv::X86_INTR && Ins.size() == 2) {
4334 // X86 interrupts must pop the error code (and the alignment padding) if
4335 // present.
4336 FuncInfo->setBytesToPopOnReturn(Is64Bit ? 16 : 4);
4337 } else {
4338 FuncInfo->setBytesToPopOnReturn(0); // Callee pops nothing.
4339 // If this is an sret function, the return should pop the hidden pointer.
4340 if (!canGuaranteeTCO(CallConv) && hasCalleePopSRet(Ins, Subtarget))
4341 FuncInfo->setBytesToPopOnReturn(4);
4342 }
4343
4344 if (!Is64Bit) {
4345 // RegSaveFrameIndex is X86-64 only.
4346 FuncInfo->setRegSaveFrameIndex(0xAAAAAAA);
4347 }
4348
4349 FuncInfo->setArgumentStackSize(StackSize);
4350
4351 if (WinEHFuncInfo *EHInfo = MF.getWinEHFuncInfo()) {
4352 EHPersonality Personality = classifyEHPersonality(F.getPersonalityFn());
4353 if (Personality == EHPersonality::CoreCLR) {
4354      assert(Is64Bit);
4355 // TODO: Add a mechanism to frame lowering that will allow us to indicate
4356 // that we'd prefer this slot be allocated towards the bottom of the frame
4357 // (i.e. near the stack pointer after allocating the frame). Every
4358 // funclet needs a copy of this slot in its (mostly empty) frame, and the
4359 // offset from the bottom of this and each funclet's frame must be the
4360 // same, so the size of funclets' (mostly empty) frames is dictated by
4361 // how far this slot is from the bottom (since they allocate just enough
4362 // space to accommodate holding this slot at the correct offset).
4363 int PSPSymFI = MFI.CreateStackObject(8, Align(8), /*isSpillSlot=*/false);
4364 EHInfo->PSPSymFrameIdx = PSPSymFI;
4365 }
4366 }
4367
4368 if (shouldDisableArgRegFromCSR(CallConv) ||
4369 F.hasFnAttribute("no_caller_saved_registers")) {
4370 MachineRegisterInfo &MRI = MF.getRegInfo();
4371 for (std::pair<Register, Register> Pair : MRI.liveins())
4372 MRI.disableCalleeSavedRegister(Pair.first);
4373 }
4374
4375 return Chain;
4376}
4377
4378SDValue X86TargetLowering::LowerMemOpCallTo(SDValue Chain, SDValue StackPtr,
4379 SDValue Arg, const SDLoc &dl,
4380 SelectionDAG &DAG,
4381 const CCValAssign &VA,
4382 ISD::ArgFlagsTy Flags,
4383 bool isByVal) const {
4384 unsigned LocMemOffset = VA.getLocMemOffset();
4385 SDValue PtrOff = DAG.getIntPtrConstant(LocMemOffset, dl);
4386 PtrOff = DAG.getNode(ISD::ADD, dl, getPointerTy(DAG.getDataLayout()),
4387 StackPtr, PtrOff);
4388 if (isByVal)
4389 return CreateCopyOfByValArgument(Arg, PtrOff, Chain, Flags, DAG, dl);
4390
4391 MaybeAlign Alignment;
4392 if (Subtarget.isTargetWindowsMSVC() && !Subtarget.is64Bit() &&
4393 Arg.getSimpleValueType() != MVT::f80)
4394 Alignment = MaybeAlign(4);
4395 return DAG.getStore(
4396 Chain, dl, Arg, PtrOff,
4397 MachinePointerInfo::getStack(DAG.getMachineFunction(), LocMemOffset),
4398 Alignment);
4399}
4400
4401/// Emit a load of the return address if tail call
4402/// optimization is performed and it is required.
4403SDValue X86TargetLowering::EmitTailCallLoadRetAddr(
4404 SelectionDAG &DAG, SDValue &OutRetAddr, SDValue Chain, bool IsTailCall,
4405 bool Is64Bit, int FPDiff, const SDLoc &dl) const {
4406 // Adjust the Return address stack slot.
4407 EVT VT = getPointerTy(DAG.getDataLayout());
4408 OutRetAddr = getReturnAddressFrameIndex(DAG);
4409
4410 // Load the "old" Return address.
4411 OutRetAddr = DAG.getLoad(VT, dl, Chain, OutRetAddr, MachinePointerInfo());
4412 return SDValue(OutRetAddr.getNode(), 1);
4413}
4414
4415/// Emit a store of the return address if tail call
4416/// optimization is performed and it is required (FPDiff!=0).
4417static SDValue EmitTailCallStoreRetAddr(SelectionDAG &DAG, MachineFunction &MF,
4418 SDValue Chain, SDValue RetAddrFrIdx,
4419 EVT PtrVT, unsigned SlotSize,
4420 int FPDiff, const SDLoc &dl) {
4421 // Store the return address to the appropriate stack slot.
4422 if (!FPDiff) return Chain;
4423 // Calculate the new stack slot for the return address.
4424 int NewReturnAddrFI =
4425 MF.getFrameInfo().CreateFixedObject(SlotSize, (int64_t)FPDiff - SlotSize,
4426 false);
4427 SDValue NewRetAddrFrIdx = DAG.getFrameIndex(NewReturnAddrFI, PtrVT);
4428 Chain = DAG.getStore(Chain, dl, RetAddrFrIdx, NewRetAddrFrIdx,
4429 MachinePointerInfo::getFixedStack(
4430 DAG.getMachineFunction(), NewReturnAddrFI));
4431 return Chain;
4432}
4433
4434/// Returns a vector_shuffle mask for a movs{s|d} or movd
4435/// operation of the specified width.
4436static SDValue getMOVL(SelectionDAG &DAG, const SDLoc &dl, MVT VT, SDValue V1,
4437 SDValue V2) {
4438 unsigned NumElems = VT.getVectorNumElements();
4439 SmallVector<int, 8> Mask;
4440 Mask.push_back(NumElems);
4441 for (unsigned i = 1; i != NumElems; ++i)
4442 Mask.push_back(i);
4443 return DAG.getVectorShuffle(VT, dl, V1, V2, Mask);
4444}
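// For example, with VT = v4f32 the mask built above is <4, 1, 2, 3>: lane 0
// is taken from V2 and lanes 1-3 from V1, matching the movss-style
// "insert low element" behaviour callers of getMOVL expect.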
4445
4446SDValue
4447X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
4448 SmallVectorImpl<SDValue> &InVals) const {
4449 SelectionDAG &DAG = CLI.DAG;
4450 SDLoc &dl = CLI.DL;
4451 SmallVectorImpl<ISD::OutputArg> &Outs = CLI.Outs;
4452 SmallVectorImpl<SDValue> &OutVals = CLI.OutVals;
4453 SmallVectorImpl<ISD::InputArg> &Ins = CLI.Ins;
4454 SDValue Chain = CLI.Chain;
4455 SDValue Callee = CLI.Callee;
4456 CallingConv::ID CallConv = CLI.CallConv;
4457 bool &isTailCall = CLI.IsTailCall;
4458 bool isVarArg = CLI.IsVarArg;
4459 const auto *CB = CLI.CB;
4460
4461 MachineFunction &MF = DAG.getMachineFunction();
4462 bool Is64Bit = Subtarget.is64Bit();
4463 bool IsWin64 = Subtarget.isCallingConvWin64(CallConv);
4464 bool IsSibcall = false;
4465 bool IsGuaranteeTCO = MF.getTarget().Options.GuaranteedTailCallOpt ||
4466 CallConv == CallingConv::Tail || CallConv == CallingConv::SwiftTail;
4467 bool IsCalleePopSRet = !IsGuaranteeTCO && hasCalleePopSRet(Outs, Subtarget);
4468 X86MachineFunctionInfo *X86Info = MF.getInfo<X86MachineFunctionInfo>();
4469 bool HasNCSR = (CB && isa<CallInst>(CB) &&
4470 CB->hasFnAttr("no_caller_saved_registers"));
4471 bool HasNoCfCheck = (CB && CB->doesNoCfCheck());
4472 bool IsIndirectCall = (CB && isa<CallInst>(CB) && CB->isIndirectCall());
4473 bool IsCFICall = IsIndirectCall && CLI.CFIType;
4474 const Module *M = MF.getMMI().getModule();
4475 Metadata *IsCFProtectionSupported = M->getModuleFlag("cf-protection-branch");
4476
4477 MachineFunction::CallSiteInfo CSInfo;
4478 if (CallConv == CallingConv::X86_INTR)
4479 report_fatal_error("X86 interrupts may not be called directly");
4480
4481 bool IsMustTail = CLI.CB && CLI.CB->isMustTailCall();
4482 if (Subtarget.isPICStyleGOT() && !IsGuaranteeTCO && !IsMustTail) {
4483 // If we are using a GOT, disable tail calls to external symbols with
4484 // default visibility. Tail calling such a symbol requires using a GOT
4485 // relocation, which forces early binding of the symbol. This breaks code
4486    // that requires lazy function symbol resolution. Using musttail or
4487 // GuaranteedTailCallOpt will override this.
4488 GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee);
4489 if (!G || (!G->getGlobal()->hasLocalLinkage() &&
4490 G->getGlobal()->hasDefaultVisibility()))
4491 isTailCall = false;
4492 }
4493
4494 if (isTailCall && !IsMustTail) {
4495 // Check if it's really possible to do a tail call.
4496 isTailCall = IsEligibleForTailCallOptimization(
4497 Callee, CallConv, IsCalleePopSRet, isVarArg, CLI.RetTy, Outs, OutVals,
4498 Ins, DAG);
4499
4500 // Sibcalls are automatically detected tailcalls which do not require
4501 // ABI changes.
4502 if (!IsGuaranteeTCO && isTailCall)
4503 IsSibcall = true;
4504
4505 if (isTailCall)
4506 ++NumTailCalls;
4507 }
4508
4509 if (IsMustTail && !isTailCall)
4510 report_fatal_error("failed to perform tail call elimination on a call "
4511 "site marked musttail");
4512
4513  assert(!(isVarArg && canGuaranteeTCO(CallConv)) &&
4514         "Var args not supported with calling convention fastcc, ghc or hipe");
4516 // Analyze operands of the call, assigning locations to each operand.
4517 SmallVector<CCValAssign, 16> ArgLocs;
4518 CCState CCInfo(CallConv, isVarArg, MF, ArgLocs, *DAG.getContext());
4519
4520 // Allocate shadow area for Win64.
4521 if (IsWin64)
4522 CCInfo.AllocateStack(32, Align(8));
4523
4524 CCInfo.AnalyzeArguments(Outs, CC_X86);
4525
4526 // In vectorcall calling convention a second pass is required for the HVA
4527 // types.
4528 if (CallingConv::X86_VectorCall == CallConv) {
4529 CCInfo.AnalyzeArgumentsSecondPass(Outs, CC_X86);
4530 }
4531
4532 // Get a count of how many bytes are to be pushed on the stack.
4533 unsigned NumBytes = CCInfo.getAlignedCallFrameSize();
4534 if (IsSibcall)
4535 // This is a sibcall. The memory operands are available in caller's
4536 // own caller's stack.
4537 NumBytes = 0;
4538 else if (IsGuaranteeTCO && canGuaranteeTCO(CallConv))
4539 NumBytes = GetAlignedArgumentStackSize(NumBytes, DAG);
4540
4541 int FPDiff = 0;
4542 if (isTailCall &&
4543 shouldGuaranteeTCO(CallConv,
4544 MF.getTarget().Options.GuaranteedTailCallOpt)) {
4545 // Lower arguments at fp - stackoffset + fpdiff.
4546 unsigned NumBytesCallerPushed = X86Info->getBytesToPopOnReturn();
4547
4548 FPDiff = NumBytesCallerPushed - NumBytes;
4549
4550 // Set the delta of movement of the returnaddr stackslot.
4551 // But only set if delta is greater than previous delta.
4552 if (FPDiff < X86Info->getTCReturnAddrDelta())
4553 X86Info->setTCReturnAddrDelta(FPDiff);
4554 }
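  // Worked example with illustrative numbers: if the caller pops 16 bytes of
  // its own incoming arguments but this tail call needs 32 bytes, FPDiff is
  // -16; the callee's argument area extends past the caller's, so the return
  // address slot is re-stored at the adjusted offset by
  // EmitTailCallStoreRetAddr further below.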
4555
4556 unsigned NumBytesToPush = NumBytes;
4557 unsigned NumBytesToPop = NumBytes;
4558
4559 // If we have an inalloca argument, all stack space has already been allocated
4560  // for us and is right at the top of the stack. We don't support multiple
4561 // arguments passed in memory when using inalloca.
4562 if (!Outs.empty() && Outs.back().Flags.isInAlloca()) {
4563 NumBytesToPush = 0;
4564 if (!ArgLocs.back().isMemLoc())
4565 report_fatal_error("cannot use inalloca attribute on a register "
4566 "parameter");
4567 if (ArgLocs.back().getLocMemOffset() != 0)
4568 report_fatal_error("any parameter with the inalloca attribute must be "
4569 "the only memory argument");
4570 } else if (CLI.IsPreallocated) {
4571    assert(ArgLocs.back().isMemLoc() &&
4572           "cannot use preallocated attribute on a register "
4573           "parameter");
4574 SmallVector<size_t, 4> PreallocatedOffsets;
4575 for (size_t i = 0; i < CLI.OutVals.size(); ++i) {
4576 if (CLI.CB->paramHasAttr(i, Attribute::Preallocated)) {
4577 PreallocatedOffsets.push_back(ArgLocs[i].getLocMemOffset());
4578 }
4579 }
4580 auto *MFI = DAG.getMachineFunction().getInfo<X86MachineFunctionInfo>();
4581 size_t PreallocatedId = MFI->getPreallocatedIdForCallSite(CLI.CB);
4582 MFI->setPreallocatedStackSize(PreallocatedId, NumBytes);
4583 MFI->setPreallocatedArgOffsets(PreallocatedId, PreallocatedOffsets);
4584 NumBytesToPush = 0;
4585 }
4586
4587 if (!IsSibcall && !IsMustTail)
4588 Chain = DAG.getCALLSEQ_START(Chain, NumBytesToPush,
4589 NumBytes - NumBytesToPush, dl);
4590
4591 SDValue RetAddrFrIdx;
4592 // Load return address for tail calls.
4593 if (isTailCall && FPDiff)
4594 Chain = EmitTailCallLoadRetAddr(DAG, RetAddrFrIdx, Chain, isTailCall,
4595 Is64Bit, FPDiff, dl);
4596
4597 SmallVector<std::pair<Register, SDValue>, 8> RegsToPass;
4598 SmallVector<SDValue, 8> MemOpChains;
4599 SDValue StackPtr;
4600
4601  // The next loop assumes that the locations are in the same order as the
4602  // input arguments.
4603  assert(isSortedByValueNo(ArgLocs) &&
4604         "Argument Location list must be sorted before lowering");
4605
4606 // Walk the register/memloc assignments, inserting copies/loads. In the case
4607  // of tail call optimization, arguments are handled later.
4608 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
4609 for (unsigned I = 0, OutIndex = 0, E = ArgLocs.size(); I != E;
4610 ++I, ++OutIndex) {
4611    assert(OutIndex < Outs.size() && "Invalid Out index");
4612 // Skip inalloca/preallocated arguments, they have already been written.
4613 ISD::ArgFlagsTy Flags = Outs[OutIndex].Flags;
4614 if (Flags.isInAlloca() || Flags.isPreallocated())
4615 continue;
4616
4617 CCValAssign &VA = ArgLocs[I];
4618 EVT RegVT = VA.getLocVT();
4619 SDValue Arg = OutVals[OutIndex];
4620 bool isByVal = Flags.isByVal();
4621
4622 // Promote the value if needed.
4623 switch (VA.getLocInfo()) {
4624    default: llvm_unreachable("Unknown loc info!");
4625 case CCValAssign::Full: break;
4626 case CCValAssign::SExt:
4627 Arg = DAG.getNode(ISD::SIGN_EXTEND, dl, RegVT, Arg);
4628 break;
4629 case CCValAssign::ZExt:
4630 Arg = DAG.getNode(ISD::ZERO_EXTEND, dl, RegVT, Arg);
4631 break;
4632 case CCValAssign::AExt:
4633 if (Arg.getValueType().isVector() &&
4634 Arg.getValueType().getVectorElementType() == MVT::i1)
4635 Arg = lowerMasksToReg(Arg, RegVT, dl, DAG);
4636 else if (RegVT.is128BitVector()) {
4637 // Special case: passing MMX values in XMM registers.
4638 Arg = DAG.getBitcast(MVT::i64, Arg);
4639 Arg = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Arg);
4640 Arg = getMOVL(DAG, dl, MVT::v2i64, DAG.getUNDEF(MVT::v2i64), Arg);
4641 } else
4642 Arg = DAG.getNode(ISD::ANY_EXTEND, dl, RegVT, Arg);
4643 break;
4644 case CCValAssign::BCvt:
4645 Arg = DAG.getBitcast(RegVT, Arg);
4646 break;
4647 case CCValAssign::Indirect: {
4648 if (isByVal) {
4649 // Memcpy the argument to a temporary stack slot to prevent
4650 // the caller from seeing any modifications the callee may make
4651 // as guaranteed by the `byval` attribute.
4652 int FrameIdx = MF.getFrameInfo().CreateStackObject(
4653 Flags.getByValSize(),
4654 std::max(Align(16), Flags.getNonZeroByValAlign()), false);
4655 SDValue StackSlot =
4656 DAG.getFrameIndex(FrameIdx, getPointerTy(DAG.getDataLayout()));
4657 Chain =
4658 CreateCopyOfByValArgument(Arg, StackSlot, Chain, Flags, DAG, dl);
4659 // From now on treat this as a regular pointer
4660 Arg = StackSlot;
4661 isByVal = false;
4662 } else {
4663 // Store the argument.
4664 SDValue SpillSlot = DAG.CreateStackTemporary(VA.getValVT());
4665 int FI = cast<FrameIndexSDNode>(SpillSlot)->getIndex();
4666 Chain = DAG.getStore(
4667 Chain, dl, Arg, SpillSlot,
4668 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI));
4669 Arg = SpillSlot;
4670 }
4671 break;
4672 }
4673 }
4674
4675 if (VA.needsCustom()) {
4676      assert(VA.getValVT() == MVT::v64i1 &&
4677             "Currently the only custom case is when we split v64i1 to 2 regs");
4678 // Split v64i1 value into two registers
4679 Passv64i1ArgInRegs(dl, DAG, Arg, RegsToPass, VA, ArgLocs[++I], Subtarget);
4680 } else if (VA.isRegLoc()) {
4681 RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
4682 const TargetOptions &Options = DAG.getTarget().Options;
4683 if (Options.EmitCallSiteInfo)
4684 CSInfo.emplace_back(VA.getLocReg(), I);
4685 if (isVarArg && IsWin64) {
4686        // The Win64 ABI requires an argument XMM reg to be copied to the
4687        // corresponding shadow reg if the callee is a varargs function.
4688 Register ShadowReg;
4689 switch (VA.getLocReg()) {
4690 case X86::XMM0: ShadowReg = X86::RCX; break;
4691 case X86::XMM1: ShadowReg = X86::RDX; break;
4692 case X86::XMM2: ShadowReg = X86::R8; break;
4693 case X86::XMM3: ShadowReg = X86::R9; break;
4694 }
4695 if (ShadowReg)
4696 RegsToPass.push_back(std::make_pair(ShadowReg, Arg));
4697 }
4698 } else if (!IsSibcall && (!isTailCall || isByVal)) {
4699      assert(VA.isMemLoc());
4700 if (!StackPtr.getNode())
4701 StackPtr = DAG.getCopyFromReg(Chain, dl, RegInfo->getStackRegister(),
4702 getPointerTy(DAG.getDataLayout()));
4703 MemOpChains.push_back(LowerMemOpCallTo(Chain, StackPtr, Arg,
4704 dl, DAG, VA, Flags, isByVal));
4705 }
4706 }
4707
4708 if (!MemOpChains.empty())
4709 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains);
4710
4711 if (Subtarget.isPICStyleGOT()) {
4712    // ELF / PIC requires the GOT to be in the EBX register before function
4713    // calls via the PLT GOT pointer (except for regcall).
4714 if (!isTailCall) {
4715      // An indirect call with the RegCall calling convention may use up all the
4716      // general registers, so it is not suitable to bind the EBX register for the
4717      // GOT address; just let the register allocator handle it.
4718 if (CallConv != CallingConv::X86_RegCall)
4719 RegsToPass.push_back(std::make_pair(
4720 Register(X86::EBX), DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(),
4721 getPointerTy(DAG.getDataLayout()))));
4722 } else {
4723 // If we are tail calling and generating PIC/GOT style code load the
4724 // address of the callee into ECX. The value in ecx is used as target of
4725 // the tail jump. This is done to circumvent the ebx/callee-saved problem
4726 // for tail calls on PIC/GOT architectures. Normally we would just put the
4727 // address of GOT into ebx and then call target@PLT. But for tail calls
4728 // ebx would be restored (since ebx is callee saved) before jumping to the
4729 // target@PLT.
4730
4731 // Note: The actual moving to ECX is done further down.
4732 GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee);
4733 if (G && !G->getGlobal()->hasLocalLinkage() &&
4734 G->getGlobal()->hasDefaultVisibility())
4735 Callee = LowerGlobalAddress(Callee, DAG);
4736 else if (isa<ExternalSymbolSDNode>(Callee))
4737 Callee = LowerExternalSymbol(Callee, DAG);
4738 }
4739 }
4740
4741 if (Is64Bit && isVarArg && !IsWin64 && !IsMustTail &&
4742 (Subtarget.hasSSE1() || !M->getModuleFlag("SkipRaxSetup"))) {
4743 // From AMD64 ABI document:
4744 // For calls that may call functions that use varargs or stdargs
4745 // (prototype-less calls or calls to functions containing ellipsis (...) in
4746    // the declaration), %al is used as a hidden argument to specify the number
4747    // of SSE registers used. The contents of %al do not need to match exactly
4748    // the number of registers, but must be an upper bound on the number of SSE
4749    // registers used and must be in the range 0 - 8 inclusive.
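    // Illustrative caller-side effect (background note): for a call like
    // printf("%f\n", x) this lowering ends up emitting something like
    // `movb $1, %al` before the call, since exactly one SSE register (xmm0)
    // carries a variadic argument.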
4750
4751 // Count the number of XMM registers allocated.
4752 static const MCPhysReg XMMArgRegs[] = {
4753 X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3,
4754 X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7
4755 };
4756 unsigned NumXMMRegs = CCInfo.getFirstUnallocated(XMMArgRegs);
4757    assert((Subtarget.hasSSE1() || !NumXMMRegs)
4758           && "SSE registers cannot be used when SSE is disabled");
4759 RegsToPass.push_back(std::make_pair(Register(X86::AL),
4760 DAG.getConstant(NumXMMRegs, dl,
4761 MVT::i8)));
4762 }
4763
4764 if (isVarArg && IsMustTail) {
4765 const auto &Forwards = X86Info->getForwardedMustTailRegParms();
4766 for (const auto &F : Forwards) {
4767 SDValue Val = DAG.getCopyFromReg(Chain, dl, F.VReg, F.VT);
4768 RegsToPass.push_back(std::make_pair(F.PReg, Val));
4769 }
4770 }
4771
4772 // For tail calls lower the arguments to the 'real' stack slots. Sibcalls
4773 // don't need this because the eligibility check rejects calls that require
4774 // shuffling arguments passed in memory.
4775 if (!IsSibcall && isTailCall) {
4776 // Force all the incoming stack arguments to be loaded from the stack
4777 // before any new outgoing arguments are stored to the stack, because the
4778 // outgoing stack slots may alias the incoming argument stack slots, and
4779 // the alias isn't otherwise explicit. This is slightly more conservative
4780 // than necessary, because it means that each store effectively depends
4781 // on every argument instead of just those arguments it would clobber.
4782 SDValue ArgChain = DAG.getStackArgumentTokenFactor(Chain);
4783
4784 SmallVector<SDValue, 8> MemOpChains2;
4785 SDValue FIN;
4786 int FI = 0;
4787 for (unsigned I = 0, OutsIndex = 0, E = ArgLocs.size(); I != E;
4788 ++I, ++OutsIndex) {
4789 CCValAssign &VA = ArgLocs[I];
4790
4791 if (VA.isRegLoc()) {
4792 if (VA.needsCustom()) {
4793          assert((CallConv == CallingConv::X86_RegCall) &&
4794                 "Expecting custom case only in regcall calling convention");
4795 // This means that we are in the special case where one argument was
4796 // passed through two register locations - skip the next location.
4797 ++I;
4798 }
4799
4800 continue;
4801 }
4802
4803 assert(VA.isMemLoc());
4804 SDValue Arg = OutVals[OutsIndex];
4805 ISD::ArgFlagsTy Flags = Outs[OutsIndex].Flags;
4806 // Skip inalloca/preallocated arguments. They don't require any work.
4807 if (Flags.isInAlloca() || Flags.isPreallocated())
4808 continue;
4809 // Create frame index.
4810 int32_t Offset = VA.getLocMemOffset()+FPDiff;
4811 uint32_t OpSize = (VA.getLocVT().getSizeInBits()+7)/8;
4812 FI = MF.getFrameInfo().CreateFixedObject(OpSize, Offset, true);
4813 FIN = DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout()));
4814
4815 if (Flags.isByVal()) {
4816 // Copy relative to framepointer.
4817 SDValue Source = DAG.getIntPtrConstant(VA.getLocMemOffset(), dl);
4818 if (!StackPtr.getNode())
4819 StackPtr = DAG.getCopyFromReg(Chain, dl, RegInfo->getStackRegister(),
4820 getPointerTy(DAG.getDataLayout()));
4821 Source = DAG.getNode(ISD::ADD, dl, getPointerTy(DAG.getDataLayout()),
4822 StackPtr, Source);
4823
4824 MemOpChains2.push_back(CreateCopyOfByValArgument(Source, FIN,
4825 ArgChain,
4826 Flags, DAG, dl));
4827 } else {
4828 // Store relative to framepointer.
4829 MemOpChains2.push_back(DAG.getStore(
4830 ArgChain, dl, Arg, FIN,
4831 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI)));
4832 }
4833 }
4834
4835 if (!MemOpChains2.empty())
4836 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains2);
4837
4838 // Store the return address to the appropriate stack slot.
4839 Chain = EmitTailCallStoreRetAddr(DAG, MF, Chain, RetAddrFrIdx,
4840 getPointerTy(DAG.getDataLayout()),
4841 RegInfo->getSlotSize(), FPDiff, dl);
4842 }
4843
4844 // Build a sequence of copy-to-reg nodes chained together with token chain
4845 // and glue operands which copy the outgoing args into registers.
4846 SDValue InGlue;
4847 for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
4848 Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first,
4849 RegsToPass[i].second, InGlue);
4850 InGlue = Chain.getValue(1);
4851 }
4852
4853 if (DAG.getTarget().getCodeModel() == CodeModel::Large) {
4854 assert(Is64Bit && "Large code model is only legal in 64-bit mode.")(static_cast <bool> (Is64Bit && "Large code model is only legal in 64-bit mode."
) ? void (0) : __assert_fail ("Is64Bit && \"Large code model is only legal in 64-bit mode.\""
, "llvm/lib/Target/X86/X86ISelLowering.cpp", 4854, __extension__
__PRETTY_FUNCTION__))
;
4855 // In the 64-bit large code model, we have to make all calls
4856 // through a register, since the call instruction's 32-bit
4857 // pc-relative offset may not be large enough to hold the whole
4858 // address.
4859 } else if (Callee->getOpcode() == ISD::GlobalAddress ||
4860 Callee->getOpcode() == ISD::ExternalSymbol) {
4861 // Lower direct calls to global addresses and external symbols. Setting
4862 // ForCall to true here has the effect of removing WrapperRIP when possible
4863 // to allow direct calls to be selected without first materializing the
4864 // address into a register.
4865 Callee = LowerGlobalOrExternal(Callee, DAG, /*ForCall=*/true);
4866 } else if (Subtarget.isTarget64BitILP32() &&
4867 Callee.getValueType() == MVT::i32) {
4868 // Zero-extend the 32-bit Callee address into a 64-bit according to x32 ABI
4869 Callee = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, Callee);
4870 }
4871
4872 // Returns a chain & a glue for retval copy to use.
4873 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
4874 SmallVector<SDValue, 8> Ops;
4875
4876 if (!IsSibcall && isTailCall && !IsMustTail) {
4877 Chain = DAG.getCALLSEQ_END(Chain, NumBytesToPop, 0, InGlue, dl);
4878 InGlue = Chain.getValue(1);
4879 }
4880
4881 Ops.push_back(Chain);
4882 Ops.push_back(Callee);
4883
4884 if (isTailCall)
4885 Ops.push_back(DAG.getTargetConstant(FPDiff, dl, MVT::i32));
4886
4887 // Add argument registers to the end of the list so that they are known live
4888 // into the call.
4889 for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i)
4890 Ops.push_back(DAG.getRegister(RegsToPass[i].first,
4891 RegsToPass[i].second.getValueType()));
4892
4893 // Add a register mask operand representing the call-preserved registers.
4894 const uint32_t *Mask = [&]() {
4895 auto AdaptedCC = CallConv;
4896 // If HasNCSR is asserted (attribute NoCallerSavedRegisters exists),
4897 // use X86_INTR calling convention because it has the same CSR mask
4898 // (same preserved registers).
4899 if (HasNCSR)
4900 AdaptedCC = (CallingConv::ID)CallingConv::X86_INTR;
4901 // If NoCalleeSavedRegisters is requested, then use GHC since it happens
4902 // to use the CSR_NoRegs_RegMask.
4903 if (CB && CB->hasFnAttr("no_callee_saved_registers"))
4904 AdaptedCC = (CallingConv::ID)CallingConv::GHC;
4905 return RegInfo->getCallPreservedMask(MF, AdaptedCC);
4906 }();
4907 assert(Mask && "Missing call preserved mask for calling convention")(static_cast <bool> (Mask && "Missing call preserved mask for calling convention"
) ? void (0) : __assert_fail ("Mask && \"Missing call preserved mask for calling convention\""
, "llvm/lib/Target/X86/X86ISelLowering.cpp", 4907, __extension__
__PRETTY_FUNCTION__))
;
4908
4909 // If this is an invoke in a 32-bit function using a funclet-based
4910 // personality, assume the function clobbers all registers. If an exception
4911 // is thrown, the runtime will not restore CSRs.
4912 // FIXME: Model this more precisely so that we can register allocate across
4913 // the normal edge and spill and fill across the exceptional edge.
4914 if (!Is64Bit && CLI.CB && isa<InvokeInst>(CLI.CB)) {
4915 const Function &CallerFn = MF.getFunction();
4916 EHPersonality Pers =
4917 CallerFn.hasPersonalityFn()
4918 ? classifyEHPersonality(CallerFn.getPersonalityFn())
4919 : EHPersonality::Unknown;
4920 if (isFuncletEHPersonality(Pers))
4921 Mask = RegInfo->getNoPreservedMask();
4922 }
4923
4924 // Define a new register mask from the existing mask.
4925 uint32_t *RegMask = nullptr;
4926
4927 // In some calling conventions we need to remove the used physical registers
4928 // from the reg mask. Create a new RegMask for such calling conventions.
4929 // RegMask for calling conventions that disable only return registers (e.g.
4930 // preserve_most) will be modified later in LowerCallResult.
4931 bool ShouldDisableArgRegs = shouldDisableArgRegFromCSR(CallConv) || HasNCSR;
4932 if (ShouldDisableArgRegs || shouldDisableRetRegFromCSR(CallConv)) {
4933 const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
4934
4935 // Allocate a new Reg Mask and copy Mask.
4936 RegMask = MF.allocateRegMask();
4937 unsigned RegMaskSize = MachineOperand::getRegMaskSize(TRI->getNumRegs());
4938 memcpy(RegMask, Mask, sizeof(RegMask[0]) * RegMaskSize);
4939
4940 // Make sure all sub registers of the argument registers are reset
4941 // in the RegMask.
4942 if (ShouldDisableArgRegs) {
4943 for (auto const &RegPair : RegsToPass)
4944 for (MCSubRegIterator SubRegs(RegPair.first, TRI, /*IncludeSelf=*/true);
4945 SubRegs.isValid(); ++SubRegs)
4946 RegMask[*SubRegs / 32] &= ~(1u << (*SubRegs % 32));
4947 }
4948
4949 // Create the RegMask Operand according to our updated mask.
4950 Ops.push_back(DAG.getRegisterMask(RegMask));
4951 } else {
4952 // Create the RegMask Operand according to the static mask.
4953 Ops.push_back(DAG.getRegisterMask(Mask));
4954 }
4955
4956 if (InGlue.getNode())
4957 Ops.push_back(InGlue);
4958
4959 if (isTailCall) {
4960 // We used to do:
4961 //// If this is the first return lowered for this function, add the regs
4962 //// to the liveout set for the function.
4963 // This isn't right, although it's probably harmless on x86; liveouts
4964 // should be computed from returns not tail calls. Consider a void
4965 // function making a tail call to a function returning int.
4966 MF.getFrameInfo().setHasTailCall();
4967 SDValue Ret = DAG.getNode(X86ISD::TC_RETURN, dl, NodeTys, Ops);
4968
4969 if (IsCFICall)
4970 Ret.getNode()->setCFIType(CLI.CFIType->getZExtValue());
4971
4972 DAG.addCallSiteInfo(Ret.getNode(), std::move(CSInfo));
4973 return Ret;
4974 }
4975
4976 if (HasNoCfCheck && IsCFProtectionSupported && IsIndirectCall) {
4977 Chain = DAG.getNode(X86ISD::NT_CALL, dl, NodeTys, Ops);
4978 } else if (CLI.CB && objcarc::hasAttachedCallOpBundle(CLI.CB)) {
4979 // Calls with a "clang.arc.attachedcall" bundle are special. They should be
4980 // expanded to the call, directly followed by a special marker sequence and
4981 // a call to a ObjC library function. Use the CALL_RVMARKER to do that.
4982 assert(!isTailCall &&
4983 "tail calls cannot be marked with clang.arc.attachedcall");
4984 assert(Is64Bit && "clang.arc.attachedcall is only supported in 64bit mode")(static_cast <bool> (Is64Bit && "clang.arc.attachedcall is only supported in 64bit mode"
) ? void (0) : __assert_fail ("Is64Bit && \"clang.arc.attachedcall is only supported in 64bit mode\""
, "llvm/lib/Target/X86/X86ISelLowering.cpp", 4984, __extension__
__PRETTY_FUNCTION__))
;
4985
4986 // Add a target global address for the retainRV/claimRV runtime function
4987 // just before the call target.
4988 Function *ARCFn = *objcarc::getAttachedARCFunction(CLI.CB);
4989 auto PtrVT = getPointerTy(DAG.getDataLayout());
4990 auto GA = DAG.getTargetGlobalAddress(ARCFn, dl, PtrVT);
4991 Ops.insert(Ops.begin() + 1, GA);
4992 Chain = DAG.getNode(X86ISD::CALL_RVMARKER, dl, NodeTys, Ops);
4993 } else {
4994 Chain = DAG.getNode(X86ISD::CALL, dl, NodeTys, Ops);
4995 }
4996
4997 if (IsCFICall)
4998 Chain.getNode()->setCFIType(CLI.CFIType->getZExtValue());
4999
5000 InGlue = Chain.getValue(1);
5001 DAG.addNoMergeSiteInfo(Chain.getNode(), CLI.NoMerge);
5002 DAG.addCallSiteInfo(Chain.getNode(), std::move(CSInfo));
5003
5004 // Save heapallocsite metadata.
5005 if (CLI.CB)
5006 if (MDNode *HeapAlloc = CLI.CB->getMetadata("heapallocsite"))
5007 DAG.addHeapAllocSite(Chain.getNode(), HeapAlloc);
5008
5009 // Create the CALLSEQ_END node.
5010 unsigned NumBytesForCalleeToPop = 0; // Callee pops nothing.
5011 if (X86::isCalleePop(CallConv, Is64Bit, isVarArg,
5012 DAG.getTarget().Options.GuaranteedTailCallOpt))
5013 NumBytesForCalleeToPop = NumBytes; // Callee pops everything
5014 else if (!canGuaranteeTCO(CallConv) && IsCalleePopSRet)
5015 // If this call passes a struct-return pointer, the callee
5016 // pops that struct pointer.
5017 NumBytesForCalleeToPop = 4;
5018
5019 // Returns a glue for retval copy to use.
5020 if (!IsSibcall) {
5021 Chain = DAG.getCALLSEQ_END(Chain, NumBytesToPop, NumBytesForCalleeToPop,
5022 InGlue, dl);
5023 InGlue = Chain.getValue(1);
5024 }
5025
5026 // Handle result values, copying them out of physregs into vregs that we
5027 // return.
5028 return LowerCallResult(Chain, InGlue, CallConv, isVarArg, Ins, dl, DAG,
5029 InVals, RegMask);
5030}
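// --- Illustrative sketch, not part of X86ISelLowering.cpp ---
// The NumBytesForCalleeToPop == 4 case above corresponds to a 32-bit call that
// returns a large aggregate through a hidden sret pointer; on targets where
// the callee pops that pointer, the caller must account for those 4 bytes
// (assumptions: i386 SysV-style sret handling, illustrative C++ only).
struct BigResult { int v[8]; };
BigResult produce();                       // receives a hidden sret pointer argument
int consume() { return produce().v[0]; }   // caller leaves the 4-byte pop to the callee
// --- end sketch ---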
5031
5032//===----------------------------------------------------------------------===//
5033// Fast Calling Convention (tail call) implementation
5034//===----------------------------------------------------------------------===//
5035
5036 // Like stdcall (the callee cleans up the arguments), except that ECX is
5037 // reserved for storing the tail-called function address. Only 2 registers
5038 // are free for argument passing (inreg). Tail call optimization is performed
5039// provided:
5040// * tailcallopt is enabled
5041// * caller/callee are fastcc
5042// On X86_64 architecture with GOT-style position independent code only local
5043// (within module) calls are supported at the moment.
5044 // To keep the stack aligned according to the platform ABI, the function
5045 // GetAlignedArgumentStackSize ensures that the argument delta is always a
5046 // multiple of the stack alignment. (Dynamic linkers need this - Darwin's dyld for example)
5047// If a tail called function callee has more arguments than the caller the
5048// caller needs to make sure that there is room to move the RETADDR to. This is
5049// achieved by reserving an area the size of the argument delta right after the
5050// original RETADDR, but before the saved framepointer or the spilled registers
5051// e.g. caller(arg1, arg2) calls callee(arg1, arg2,arg3,arg4)
5052// stack layout:
5053// arg1
5054// arg2
5055// RETADDR
5056// [ new RETADDR
5057// move area ]
5058// (possible EBP)
5059// ESI
5060// EDI
5061// local1 ..
5062
5063 /// Align the stack size, e.g. to 16n + 12, to satisfy a 16-byte alignment
5064 /// requirement.
5065unsigned
5066X86TargetLowering::GetAlignedArgumentStackSize(const unsigned StackSize,
5067 SelectionDAG &DAG) const {
5068 const Align StackAlignment = Subtarget.getFrameLowering()->getStackAlign();
5069 const uint64_t SlotSize = Subtarget.getRegisterInfo()->getSlotSize();
5070 assert(StackSize % SlotSize == 0 &&
5071 "StackSize must be a multiple of SlotSize");
5072 return alignTo(StackSize + SlotSize, StackAlignment) - SlotSize;
5073}
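// --- Illustrative sketch, not part of X86ISelLowering.cpp ---
// Standalone rendering of the formula above, assuming a power-of-two
// 16-byte stack alignment; SlotSize is 4 on 32-bit and 8 on 64-bit targets.
#include <cstdint>
static uint64_t alignedArgStackSize(uint64_t StackSize, uint64_t SlotSize,
                                    uint64_t StackAlignment) {
  // alignTo(StackSize + SlotSize, StackAlignment) - SlotSize
  uint64_t Padded = StackSize + SlotSize;
  uint64_t RoundedUp = (Padded + StackAlignment - 1) & ~(StackAlignment - 1);
  return RoundedUp - SlotSize;
}
// e.g. with SlotSize == 4: alignedArgStackSize(8, 4, 16) == 12 and
// alignedArgStackSize(20, 4, 16) == 28, i.e. always 16n + 12 as the doc
// comment above describes.
// --- end sketch ---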
5074
5075/// Return true if the given stack call argument is already available in the
5076/// same position (relatively) of the caller's incoming argument stack.
5077static
5078bool MatchingStackOffset(SDValue Arg, unsigned Offset, ISD::ArgFlagsTy Flags,
5079 MachineFrameInfo &MFI, const MachineRegisterInfo *MRI,
5080 const X86InstrInfo *TII, const CCValAssign &VA) {
5081 unsigned Bytes = Arg.getValueSizeInBits() / 8;
5082
5083 for (;;) {
5084 // Look through nodes that don't alter the bits of the incoming value.
5085 unsigned Op = Arg.getOpcode();
5086 if (Op == ISD::ZERO_EXTEND || Op == ISD::ANY_EXTEND || Op == ISD::BITCAST) {
5087 Arg = Arg.getOperand(0);
5088 continue;
5089 }
5090 if (Op == ISD::TRUNCATE) {
5091 const SDValue &TruncInput = Arg.getOperand(0);
5092 if (TruncInput.getOpcode() == ISD::AssertZext &&
5093 cast<VTSDNode>(TruncInput.getOperand(1))->getVT() ==
5094 Arg.getValueType()) {
5095 Arg = TruncInput.getOperand(0);
5096 continue;
5097 }
5098 }
5099 break;
5100 }
5101
5102 int FI = INT_MAX;
5103 if (Arg.getOpcode() == ISD::CopyFromReg) {
5104 Register VR = cast<RegisterSDNode>(Arg.getOperand(1))->getReg();
5105 if (!VR.isVirtual())
5106 return false;
5107 MachineInstr *Def = MRI->getVRegDef(VR);
5108 if (!Def)
5109 return false;
5110 if (!Flags.isByVal()) {
5111 if (!TII->isLoadFromStackSlot(*Def, FI))
5112 return false;
5113 } else {
5114 unsigned Opcode = Def->getOpcode();
5115 if ((Opcode == X86::LEA32r || Opcode == X86::LEA64r ||
5116 Opcode == X86::LEA64_32r) &&
5117 Def->getOperand(1).isFI()) {
5118 FI = Def->getOperand(1).getIndex();
5119 Bytes = Flags.getByValSize();
5120 } else
5121 return false;
5122 }
5123 } else if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(Arg)) {
5124 if (Flags.isByVal())
5125 // ByVal argument is passed in as a pointer but it's now being
5126 // dereferenced. e.g.
5127 // define @foo(%struct.X* %A) {
5128 // tail call @bar(%struct.X* byval %A)
5129 // }
5130 return false;
5131 SDValue Ptr = Ld->getBasePtr();
5132 FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr);
5133 if (!FINode)
5134 return false;
5135 FI = FINode->getIndex();
5136 } else if (Arg.getOpcode() == ISD::FrameIndex && Flags.isByVal()) {
5137 FrameIndexSDNode *FINode = cast<FrameIndexSDNode>(Arg);
5138 FI = FINode->getIndex();
5139 Bytes = Flags.getByValSize();
5140 } else
5141 return false;
5142
5143 assert(FI != INT_MAX);
5144 if (!MFI.isFixedObjectIndex(FI))
5145 return false;
5146
5147 if (Offset != MFI.getObjectOffset(FI))
5148 return false;
5149
5150 // If this is not byval, check that the argument stack object is immutable.
5151 // inalloca and argument copy elision can create mutable argument stack
5152 // objects. Byval objects can be mutated, but a byval call intends to pass the
5153 // mutated memory.
5154 if (!Flags.isByVal() && !MFI.isImmutableObjectIndex(FI))
5155 return false;
5156
5157 if (VA.getLocVT().getFixedSizeInBits() >
5158 Arg.getValueSizeInBits().getFixedValue()) {
5159 // If the argument location is wider than the argument type, check that any
5160 // extension flags match.
5161 if (Flags.isZExt() != MFI.isObjectZExt(FI) ||
5162 Flags.isSExt() != MFI.isObjectSExt(FI)) {
5163 return false;
5164 }
5165 }
5166
5167 return Bytes == MFI.getObjectSize(FI);
5168}
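// --- Illustrative sketch, not part of X86ISelLowering.cpp ---
// The situation MatchingStackOffset detects, in C++ terms (assumes the SysV
// AMD64 ABI, where the 7th integer argument lives on the stack):
long sink(long, long, long, long, long, long, long g);
long forward(long a, long b, long c, long d, long e, long f, long g) {
  // 'g' already sits at the matching fixed stack slot of the caller, so a
  // sibcall can reuse it in place instead of copying it.
  return sink(a, b, c, d, e, f, g);
}
// --- end sketch ---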
5169
5170/// Check whether the call is eligible for tail call optimization. Targets
5171/// that want to do tail call optimization should implement this function.
5172bool X86TargetLowering::IsEligibleForTailCallOptimization(
5173 SDValue Callee, CallingConv::ID CalleeCC, bool IsCalleePopSRet,
5174 bool isVarArg, Type *RetTy, const SmallVectorImpl<ISD::OutputArg> &Outs,
5175 const SmallVectorImpl<SDValue> &OutVals,
5176 const SmallVectorImpl<ISD::InputArg> &Ins, SelectionDAG &DAG) const {
5177 if (!mayTailCallThisCC(CalleeCC))
5178 return false;
5179
5180 // If -tailcallopt is specified, make fastcc functions tail-callable.
5181 MachineFunction &MF = DAG.getMachineFunction();
5182 const Function &CallerF = MF.getFunction();
5183
5184 // If the function return type is x86_fp80 and the callee return type is not,
5185 // then the FP_EXTEND of the call result is not a nop. It's not safe to
5186 // perform a tailcall optimization here.
5187 if (CallerF.getReturnType()->isX86_FP80Ty() && !RetTy->isX86_FP80Ty())
5188 return false;
5189
5190 CallingConv::ID CallerCC = CallerF.getCallingConv();
5191 bool CCMatch = CallerCC == CalleeCC;
5192 bool IsCalleeWin64 = Subtarget.isCallingConvWin64(CalleeCC);
5193 bool IsCallerWin64 = Subtarget.isCallingConvWin64(CallerCC);
5194 bool IsGuaranteeTCO = DAG.getTarget().Options.GuaranteedTailCallOpt ||
5195 CalleeCC == CallingConv::Tail || CalleeCC == CallingConv::SwiftTail;
5196
5197 // Win64 functions have extra shadow space for argument homing. Don't do the
5198 // sibcall if the caller and callee have mismatched expectations for this
5199 // space.
5200 if (IsCalleeWin64 != IsCallerWin64)
5201 return false;
5202
5203 if (IsGuaranteeTCO) {
5204 if (canGuaranteeTCO(CalleeCC) && CCMatch)
5205 return true;
5206 return false;
5207 }
5208
5209 // Look for obvious safe cases to perform tail call optimization that do not
5210 // require ABI changes. This is what gcc calls sibcall.
5211
5212 // Can't do sibcall if stack needs to be dynamically re-aligned. PEI needs to
5213 // emit a special epilogue.
5214 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
5215 if (RegInfo->hasStackRealignment(MF))
5216 return false;
5217
5218 // Also avoid sibcall optimization if we're an sret return fn and the callee
5219 // is incompatible. See comment in LowerReturn about why hasStructRetAttr is
5220 // insufficient.
5221 if (MF.getInfo<X86MachineFunctionInfo>()->getSRetReturnReg()) {
5222 // For a compatible tail call the callee must return our sret pointer. So it
5223 // needs to be (a) an sret function itself and (b) we pass our sret as its
5224 // sret. Condition #b is harder to determine.
5225 return false;
5226 } else if (IsCalleePopSRet)
5227 // The callee pops an sret, so we cannot tail-call, as our caller doesn't
5228 // expect that.
5229 return false;
5230
5231 // Do not sibcall optimize vararg calls unless all arguments are passed via
5232 // registers.
5233 LLVMContext &C = *DAG.getContext();
5234 if (isVarArg && !Outs.empty()) {
5235 // Optimizing for varargs on Win64 is unlikely to be safe without
5236 // additional testing.
5237 if (IsCalleeWin64 || IsCallerWin64)
5238 return false;
5239
5240 SmallVector<CCValAssign, 16> ArgLocs;
5241 CCState CCInfo(CalleeCC, isVarArg, MF, ArgLocs, C);
5242
5243 CCInfo.AnalyzeCallOperands(Outs, CC_X86);
5244 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i)
5245 if (!ArgLocs[i].isRegLoc())
5246 return false;
5247 }
5248
5249 // If the call result is in ST0 / ST1, it needs to be popped off the x87
5250 // stack. Therefore, if it's not used by the call it is not safe to optimize
5251 // this into a sibcall.
5252 bool Unused = false;
5253 for (unsigned i = 0, e = Ins.size(); i != e; ++i) {
5254 if (!Ins[i].Used) {
5255 Unused = true;
5256 break;
5257 }
5258 }
5259 if (Unused) {
5260 SmallVector<CCValAssign, 16> RVLocs;
5261 CCState CCInfo(CalleeCC, false, MF, RVLocs, C);
5262 CCInfo.AnalyzeCallResult(Ins, RetCC_X86);
5263 for (unsigned i = 0, e = RVLocs.size(); i != e; ++i) {
5264 CCValAssign &VA = RVLocs[i];
5265 if (VA.getLocReg() == X86::FP0 || VA.getLocReg() == X86::FP1)
5266 return false;
5267 }
5268 }
5269
5270 // Check that the call results are passed in the same way.
5271 if (!CCState::resultsCompatible(CalleeCC, CallerCC, MF, C, Ins,
5272 RetCC_X86, RetCC_X86))
5273 return false;
5274 // The callee has to preserve all registers the caller needs to preserve.
5275 const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
5276 const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC);
5277 if (!CCMatch) {
5278 const uint32_t *CalleePreserved = TRI->getCallPreservedMask(MF, CalleeCC);
5279 if (!TRI->regmaskSubsetEqual(CallerPreserved, CalleePreserved))
5280 return false;
5281 }
5282
5283 unsigned StackArgsSize = 0;
5284
5285 // If the callee takes no arguments then go on to check the results of the
5286 // call.
5287 if (!Outs.empty()) {
5288 // Check if stack adjustment is needed. For now, do not do this if any
5289 // argument is passed on the stack.
5290 SmallVector<CCValAssign, 16> ArgLocs;
5291 CCState CCInfo(CalleeCC, isVarArg, MF, ArgLocs, C);
5292
5293 // Allocate shadow area for Win64
5294 if (IsCalleeWin64)
5295 CCInfo.AllocateStack(32, Align(8));
5296
5297 CCInfo.AnalyzeCallOperands(Outs, CC_X86);
5298 StackArgsSize = CCInfo.getNextStackOffset();
5299
5300 if (CCInfo.getNextStackOffset()) {
5301 // Check if the arguments are already laid out in the right way as
5302 // the caller's fixed stack objects.
5303 MachineFrameInfo &MFI = MF.getFrameInfo();
5304 const MachineRegisterInfo *MRI = &MF.getRegInfo();
5305 const X86InstrInfo *TII = Subtarget.getInstrInfo();
5306 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
5307 CCValAssign &VA = ArgLocs[i];
5308 SDValue Arg = OutVals[i];
5309 ISD::ArgFlagsTy Flags = Outs[i].Flags;
5310 if (VA.getLocInfo() == CCValAssign::Indirect)
5311 return false;
5312 if (!VA.isRegLoc()) {
5313 if (!MatchingStackOffset(Arg, VA.getLocMemOffset(), Flags,
5314 MFI, MRI, TII, VA))
5315 return false;
5316 }
5317 }
5318 }
5319
5320 bool PositionIndependent = isPositionIndependent();
5321 // If the tailcall address may be in a register, then make sure it's
5322 // possible to register allocate for it. In 32-bit, the call address can
5323 // only target EAX, EDX, or ECX since the tail call must be scheduled after
5324 // callee-saved registers are restored. These happen to be the same
5325 // registers used to pass 'inreg' arguments so watch out for those.
5326 if (!Subtarget.is64Bit() && ((!isa<GlobalAddressSDNode>(Callee) &&
5327 !isa<ExternalSymbolSDNode>(Callee)) ||
5328 PositionIndependent)) {
5329 unsigned NumInRegs = 0;
5330 // In PIC we need an extra register to formulate the address computation
5331 // for the callee.
5332 unsigned MaxInRegs = PositionIndependent ? 2 : 3;
5333
5334 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
5335 CCValAssign &VA = ArgLocs[i];
5336 if (!VA.isRegLoc())
5337 continue;
5338 Register Reg = VA.getLocReg();
5339 switch (Reg) {
5340 default: break;
5341 case X86::EAX: case X86::EDX: case X86::ECX:
5342 if (++NumInRegs == MaxInRegs)
5343 return false;
5344 break;
5345 }
5346 }
5347 }
5348
5349 const MachineRegisterInfo &MRI = MF.getRegInfo();
5350 if (!parametersInCSRMatch(MRI, CallerPreserved, ArgLocs, OutVals))
5351 return false;
5352 }
5353
5354 bool CalleeWillPop =
5355 X86::isCalleePop(CalleeCC, Subtarget.is64Bit(), isVarArg,
5356 MF.getTarget().Options.GuaranteedTailCallOpt);
5357
5358 if (unsigned BytesToPop =
5359 MF.getInfo<X86MachineFunctionInfo>()->getBytesToPopOnReturn()) {
5360 // If we have bytes to pop, the callee must pop them.
5361 bool CalleePopMatches = CalleeWillPop && BytesToPop == StackArgsSize;
5362 if (!CalleePopMatches)
5363 return false;
5364 } else if (CalleeWillPop && StackArgsSize > 0) {
5365 // If we don't have bytes to pop, make sure the callee doesn't pop any.
5366 return false;
5367 }
5368
5369 return true;
5370}
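// --- Illustrative sketch, not part of X86ISelLowering.cpp ---
// One of the rejections above, seen from the source level (assumes a 32-bit
// target that honours these attributes; illustrative only): the stdcall callee
// would pop 8 bytes the cdecl caller still owns, so CalleeWillPop with
// StackArgsSize > 0 and no matching BytesToPop forces a non-tail call.
int __attribute__((stdcall)) stdcall_callee(int a, int b);
int cdecl_caller(int a, int b) { return stdcall_callee(a, b); }
// --- end sketch ---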
5371
5372FastISel *
5373X86TargetLowering::createFastISel(FunctionLoweringInfo &funcInfo,
5374 const TargetLibraryInfo *libInfo) const {
5375 return X86::createFastISel(funcInfo, libInfo);
5376}
5377
5378//===----------------------------------------------------------------------===//
5379// Other Lowering Hooks
5380//===----------------------------------------------------------------------===//
5381
5382bool X86::mayFoldLoad(SDValue Op, const X86Subtarget &Subtarget,
5383 bool AssumeSingleUse) {
5384 if (!AssumeSingleUse && !Op.hasOneUse())
5385 return false;
5386 if (!ISD::isNormalLoad(Op.getNode()))
5387 return false;
5388
5389 // If this is an unaligned vector, make sure the target supports folding it.
5390 auto *Ld = cast<LoadSDNode>(Op.getNode());
5391 if (!Subtarget.hasAVX() && !Subtarget.hasSSEUnalignedMem() &&
5392 Ld->getValueSizeInBits(0) == 128 && Ld->getAlign() < Align(16))
5393 return false;
5394
5395 // TODO: If this is a non-temporal load and the target has an instruction
5396 // for it, it should not be folded. See "useNonTemporalLoad()".
5397
5398 return true;
5399}
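// --- Illustrative sketch, not part of X86ISelLowering.cpp ---
// The folding opportunity this hook guards, written with SSE intrinsics
// (assumes <xmmintrin.h>): a single-use, sufficiently aligned load can be
// folded into the arithmetic instruction as a memory operand, e.g.
// "addps (%rdi), %xmm0" instead of a separate movaps + addps.
#include <xmmintrin.h>
static __m128 add_from_mem(__m128 Acc, const float *P) {
  return _mm_add_ps(Acc, _mm_load_ps(P));  // the load has one use -> foldable
}
// --- end sketch ---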
5400
5401bool X86::mayFoldLoadIntoBroadcastFromMem(SDValue Op, MVT EltVT,
5402 const X86Subtarget &Subtarget,
5403 bool AssumeSingleUse) {
5404 assert(Subtarget.hasAVX() && "Expected AVX for broadcast from memory")(static_cast <bool> (Subtarget.hasAVX() && "Expected AVX for broadcast from memory"
) ? void (0) : __assert_fail ("Subtarget.hasAVX() && \"Expected AVX for broadcast from memory\""
, "llvm/lib/Target/X86/X86ISelLowering.cpp", 5404, __extension__
__PRETTY_FUNCTION__))
;
5405 if (!X86::mayFoldLoad(Op, Subtarget, AssumeSingleUse))
5406 return false;
5407
5408 // We cannot replace a wide volatile load with a broadcast-from-memory,
5409 // because that would narrow the load, which isn't legal for volatiles.
5410 auto *Ld = cast<LoadSDNode>(Op.getNode());
5411 return !Ld->isVolatile() ||
5412 Ld->getValueSizeInBits(0) == EltVT.getScalarSizeInBits();
5413}
5414
5415bool X86::mayFoldIntoStore(SDValue Op) {
5416 return Op.hasOneUse() && ISD::isNormalStore(*Op.getNode()->use_begin());
5417}
5418
5419bool X86::mayFoldIntoZeroExtend(SDValue Op) {
5420 if (Op.hasOneUse()) {
5421 unsigned Opcode = Op.getNode()->use_begin()->getOpcode();
5422 return (ISD::ZERO_EXTEND == Opcode);
5423 }
5424 return false;
5425}
5426
5427static bool isTargetShuffle(unsigned Opcode) {
5428 switch(Opcode) {
5429 default: return false;
5430 case X86ISD::BLENDI:
5431 case X86ISD::PSHUFB:
5432 case X86ISD::PSHUFD:
5433 case X86ISD::PSHUFHW:
5434 case X86ISD::PSHUFLW:
5435 case X86ISD::SHUFP:
5436 case X86ISD::INSERTPS:
5437 case X86ISD::EXTRQI:
5438 case X86ISD::INSERTQI:
5439 case X86ISD::VALIGN:
5440 case X86ISD::PALIGNR:
5441 case X86ISD::VSHLDQ:
5442 case X86ISD::VSRLDQ:
5443 case X86ISD::MOVLHPS:
5444 case X86ISD::MOVHLPS:
5445 case X86ISD::MOVSHDUP:
5446 case X86ISD::MOVSLDUP:
5447 case X86ISD::MOVDDUP:
5448 case X86ISD::MOVSS:
5449 case X86ISD::MOVSD:
5450 case X86ISD::MOVSH:
5451 case X86ISD::UNPCKL:
5452 case X86ISD::UNPCKH:
5453 case X86ISD::VBROADCAST:
5454 case X86ISD::VPERMILPI:
5455 case X86ISD::VPERMILPV:
5456 case X86ISD::VPERM2X128:
5457 case X86ISD::SHUF128:
5458 case X86ISD::VPERMIL2:
5459 case X86ISD::VPERMI:
5460 case X86ISD::VPPERM:
5461 case X86ISD::VPERMV:
5462 case X86ISD::VPERMV3:
5463 case X86ISD::VZEXT_MOVL:
5464 return true;
5465 }
5466}
5467
5468static bool isTargetShuffleVariableMask(unsigned Opcode) {
5469 switch (Opcode) {
5470 default: return false;
5471 // Target Shuffles.
5472 case X86ISD::PSHUFB:
5473 case X86ISD::VPERMILPV:
5474 case X86ISD::VPERMIL2:
5475 case X86ISD::VPPERM:
5476 case X86ISD::VPERMV:
5477 case X86ISD::VPERMV3:
5478 return true;
5479 // 'Faux' Target Shuffles.
5480 case ISD::OR:
5481 case ISD::AND:
5482 case X86ISD::ANDNP:
5483 return true;
5484 }
5485}
5486
5487SDValue X86TargetLowering::getReturnAddressFrameIndex(SelectionDAG &DAG) const {
5488 MachineFunction &MF = DAG.getMachineFunction();
5489 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
5490 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
5491 int ReturnAddrIndex = FuncInfo->getRAIndex();
5492
5493 if (ReturnAddrIndex == 0) {
5494 // Set up a frame object for the return address.
5495 unsigned SlotSize = RegInfo->getSlotSize();
5496 ReturnAddrIndex = MF.getFrameInfo().CreateFixedObject(SlotSize,
5497 -(int64_t)SlotSize,
5498 false);
5499 FuncInfo->setRAIndex(ReturnAddrIndex);
5500 }
5501
5502 return DAG.getFrameIndex(ReturnAddrIndex, getPointerTy(DAG.getDataLayout()));
5503}
5504
5505bool X86::isOffsetSuitableForCodeModel(int64_t Offset, CodeModel::Model M,
5506 bool hasSymbolicDisplacement) {
5507 // Offset should fit into 32 bit immediate field.
5508 if (!isInt<32>(Offset))
5509 return false;
5510
5511 // If we don't have a symbolic displacement - we don't have any extra
5512 // restrictions.
5513 if (!hasSymbolicDisplacement)
5514 return true;
5515
5516 // FIXME: Some tweaks might be needed for medium code model.
5517 if (M != CodeModel::Small && M != CodeModel::Kernel)
5518 return false;
5519
5520 // For the small code model we assume that the last object ends 16MB before the
5521 // 31-bit boundary. We may also accept pretty large negative constants knowing
5522 // that all objects are in the positive half of the address space.
5523 if (M == CodeModel::Small && Offset < 16*1024*1024)
5524 return true;
5525
5526 // For the kernel code model we know that all objects reside in the negative
5527 // half of the 32-bit address space. We must not accept negative offsets, since
5528 // they may push the address out of range, but pretty large positive ones are fine.
5529 if (M == CodeModel::Kernel && Offset >= 0)
5530 return true;
5531
5532 return false;
5533}
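// Worked examples for the checks above (sketch; assumes a symbolic
// displacement is present):
//   Small  code model, Offset ==  8*1024*1024  -> accepted (< 16MB slack)
//   Small  code model, Offset == 64*1024*1024  -> rejected
//   Kernel code model, Offset == 4096          -> accepted (non-negative)
//   Kernel code model, Offset == -8            -> rejected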
5534
5535/// Determines whether the callee is required to pop its own arguments.
5536/// Callee pop is necessary to support tail calls.
5537bool X86::isCalleePop(CallingConv::ID CallingConv,
5538 bool is64Bit, bool IsVarArg, bool GuaranteeTCO) {
5539 // If GuaranteeTCO is true, we force some calls to be callee pop so that we
5540 // can guarantee TCO.
5541 if (!IsVarArg && shouldGuaranteeTCO(CallingConv, GuaranteeTCO))
5542 return true;
5543
5544 switch (CallingConv) {
5545 default:
5546 return false;
5547 case CallingConv::X86_StdCall:
5548 case CallingConv::X86_FastCall:
5549 case CallingConv::X86_ThisCall:
5550 case CallingConv::X86_VectorCall:
5551 return !is64Bit;
5552 }
5553}
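// Worked example (sketch): X86::isCalleePop(CallingConv::X86_StdCall,
// /*is64Bit=*/false, /*IsVarArg=*/false, /*GuaranteeTCO=*/false) is true,
// while the same query with is64Bit == true is false -- the callee-pop
// conventions above only apply to 32-bit targets (absent guaranteed TCO).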
5554
5555 /// Return true if the condition is a signed comparison operation.
5556static bool isX86CCSigned(unsigned X86CC) {
5557 switch (X86CC) {
5558 default:
5559 llvm_unreachable("Invalid integer condition!")::llvm::llvm_unreachable_internal("Invalid integer condition!"
, "llvm/lib/Target/X86/X86ISelLowering.cpp", 5559)
;
5560 case X86::COND_E:
5561 case X86::COND_NE:
5562 case X86::COND_B:
5563 case X86::COND_A:
5564 case X86::COND_BE:
5565 case X86::COND_AE:
5566 return false;
5567 case X86::COND_G:
5568 case X86::COND_GE:
5569 case X86::COND_L:
5570 case X86::COND_LE:
5571 return true;
5572 }
5573}
5574
5575static X86::CondCode TranslateIntegerX86CC(ISD::CondCode SetCCOpcode) {
5576 switch (SetCCOpcode) {
5577 default: llvm_unreachable("Invalid integer condition!")::llvm::llvm_unreachable_internal("Invalid integer condition!"
, "llvm/lib/Target/X86/X86ISelLowering.cpp", 5577)
;
5578 case ISD::SETEQ: return X86::COND_E;
5579 case ISD::SETGT: return X86::COND_G;
5580 case ISD::SETGE: return X86::COND_GE;
5581 case ISD::SETLT: return X86::COND_L;
5582 case ISD::SETLE: return X86::COND_LE;
5583 case ISD::SETNE: return X86::COND_NE;
5584 case ISD::SETULT: return X86::COND_B;
5585 case ISD::SETUGT: return X86::COND_A;
5586 case ISD::SETULE: return X86::COND_BE;
5587 case ISD::SETUGE: return X86::COND_AE;
5588 }
5589}
5590
5591/// Do a one-to-one translation of a ISD::CondCode to the X86-specific
5592/// condition code, returning the condition code and the LHS/RHS of the
5593/// comparison to make.
5594static X86::CondCode TranslateX86CC(ISD::CondCode SetCCOpcode, const SDLoc &DL,
5595 bool isFP, SDValue &LHS, SDValue &RHS,
5596 SelectionDAG &DAG) {
5597 if (!isFP) {
5598 if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS)) {
5599 if (SetCCOpcode == ISD::SETGT && RHSC->isAllOnes()) {
5600 // X > -1 -> X == 0, jump !sign.
5601 RHS = DAG.getConstant(0, DL, RHS.getValueType());
5602 return X86::COND_NS;
5603 }
5604 if (SetCCOpcode == ISD::SETLT && RHSC->isZero()) {
5605 // X < 0 -> X == 0, jump on sign.
5606 return X86::COND_S;
5607 }
5608 if (SetCCOpcode == ISD::SETGE && RHSC->isZero()) {
5609 // X >= 0 -> X == 0, jump on !sign.
5610 return X86::COND_NS;
5611 }
5612 if (SetCCOpcode == ISD::SETLT && RHSC->isOne()) {
5613 // X < 1 -> X <= 0
5614 RHS = DAG.getConstant(0, DL, RHS.getValueType());
5615 return X86::COND_LE;
5616 }
5617 }
5618
5619 return TranslateIntegerX86CC(SetCCOpcode);
5620 }
5621
5622 // First determine if it is required or is profitable to flip the operands.
5623
5624 // If LHS is a foldable load, but RHS is not, flip the condition.
5625 if (ISD::isNON_EXTLoad(LHS.getNode()) &&
5626 !ISD::isNON_EXTLoad(RHS.getNode())) {
5627 SetCCOpcode = getSetCCSwappedOperands(SetCCOpcode);
5628 std::swap(LHS, RHS);
5629 }
5630
5631 switch (SetCCOpcode) {
5632 default: break;
5633 case ISD::SETOLT:
5634 case ISD::SETOLE:
5635 case ISD::SETUGT:
5636 case ISD::SETUGE:
5637 std::swap(LHS, RHS);
5638 break;
5639 }
5640
5641 // On a floating point condition, the flags are set as follows:
5642 // ZF PF CF op
5643 // 0 | 0 | 0 | X > Y
5644 // 0 | 0 | 1 | X < Y
5645 // 1 | 0 | 0 | X == Y
5646 // 1 | 1 | 1 | unordered
5647 switch (SetCCOpcode) {
5648 default: llvm_unreachable("Condcode should be pre-legalized away")::llvm::llvm_unreachable_internal("Condcode should be pre-legalized away"
, "llvm/lib/Target/X86/X86ISelLowering.cpp", 5648)
;
5649 case ISD::SETUEQ:
5650 case ISD::SETEQ: return X86::COND_E;
5651 case ISD::SETOLT: // flipped
5652 case ISD::SETOGT:
5653 case ISD::SETGT: return X86::COND_A;
5654 case ISD::SETOLE: // flipped
5655 case ISD::SETOGE:
5656 case ISD::SETGE: return X86::COND_AE;
5657 case ISD::SETUGT: // flipped
5658 case ISD::SETULT:
5659 case ISD::SETLT: return X86::COND_B;
5660 case ISD::SETUGE: // flipped
5661 case ISD::SETULE:
5662 case ISD::SETLE: return X86::COND_BE;
5663 case ISD::SETONE:
5664 case ISD::SETNE: return X86::COND_NE;
5665 case ISD::SETUO: return X86::COND_P;
5666 case ISD::SETO: return X86::COND_NP;
5667 case ISD::SETOEQ:
5668 case ISD::SETUNE: return X86::COND_INVALID;
5669 }
5670}
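// --- Illustrative sketch, not part of X86ISelLowering.cpp ---
// A standalone emulation of the ZF/PF/CF table above for a ucomisd-style
// compare (assumes IEEE doubles; this is not the DAG lowering itself). It
// shows why SETOLT/SETOLE swap their operands and then reuse COND_A/COND_AE:
// "above" (CF==0 && ZF==0) on the swapped operands is exactly "ordered less
// than" on the originals.
#include <cmath>
struct EFlags { bool ZF, PF, CF; };
static EFlags ucomi(double X, double Y) {
  if (std::isnan(X) || std::isnan(Y)) return {true,  true,  true};  // unordered
  if (X == Y)                         return {true,  false, false}; // equal
  if (X <  Y)                         return {false, false, true};  // below
  return {false, false, false};                                     // above
}
static bool cond_a(EFlags F) { return !F.CF && !F.ZF; }              // COND_A
static bool olt(double X, double Y) { return cond_a(ucomi(Y, X)); }  // swapped operands
// --- end sketch ---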
5671
5672/// Is there a floating point cmov for the specific X86 condition code?
5673/// Current x86 isa includes the following FP cmov instructions:
5674/// fcmovb, fcomvbe, fcomve, fcmovu, fcmovae, fcmova, fcmovne, fcmovnu.
5675static bool hasFPCMov(unsigned X86CC) {
5676 switch (X86CC) {
5677 default:
5678 return false;
5679 case X86::COND_B:
5680 case X86::COND_BE:
5681 case X86::COND_E:
5682 case X86::COND_P:
5683 case X86::COND_A:
5684 case X86::COND_AE:
5685 case X86::COND_NE:
5686 case X86::COND_NP:
5687 return true;
5688 }
5689}
5690
5691static bool useVPTERNLOG(const X86Subtarget &Subtarget, MVT VT) {
5692 return Subtarget.hasVLX() || Subtarget.canExtendTo512DQ() ||
5693 VT.is512BitVector();
5694}
5695
5696bool X86TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
5697 const CallInst &I,
5698 MachineFunction &MF,
5699 unsigned Intrinsic) const {
5700 Info.flags = MachineMemOperand::MONone;
5701 Info.offset = 0;
5702
5703 const IntrinsicData* IntrData = getIntrinsicWithChain(Intrinsic);
5704 if (!IntrData) {
5705 switch (Intrinsic) {
5706 case Intrinsic::x86_aesenc128kl:
5707 case Intrinsic::x86_aesdec128kl:
5708 Info.opc = ISD::INTRINSIC_W_CHAIN;
5709 Info.ptrVal = I.getArgOperand(1);
5710 Info.memVT = EVT::getIntegerVT(I.getType()->getContext(), 48);
5711 Info.align = Align(1);
5712 Info.flags |= MachineMemOperand::MOLoad;
5713 return true;
5714 case Intrinsic::x86_aesenc256kl:
5715 case Intrinsic::x86_aesdec256kl:
5716 Info.opc = ISD::INTRINSIC_W_CHAIN;
5717 Info.ptrVal = I.getArgOperand(1);
5718 Info.memVT = EVT::getIntegerVT(I.getType()->getContext(), 64);
5719 Info.align = Align(1);
5720 Info.flags |= MachineMemOperand::MOLoad;
5721 return true;
5722 case Intrinsic::x86_aesencwide128kl:
5723 case Intrinsic::x86_aesdecwide128kl:
5724 Info.opc = ISD::INTRINSIC_W_CHAIN;
5725 Info.ptrVal = I.getArgOperand(0);
5726 Info.memVT = EVT::getIntegerVT(I.getType()->getContext(), 48);
5727 Info.align = Align(1);
5728 Info.flags |= MachineMemOperand::MOLoad;
5729 return true;
5730 case Intrinsic::x86_aesencwide256kl:
5731 case Intrinsic::x86_aesdecwide256kl:
5732 Info.opc = ISD::INTRINSIC_W_CHAIN;
5733 Info.ptrVal = I.getArgOperand(0);
5734 Info.memVT = EVT::getIntegerVT(I.getType()->getContext(), 64);
5735 Info.align = Align(1);
5736 Info.flags |= MachineMemOperand::MOLoad;
5737 return true;
5738 case Intrinsic::x86_cmpccxadd32:
5739 case Intrinsic::x86_cmpccxadd64:
5740 case Intrinsic::x86_atomic_bts:
5741 case Intrinsic::x86_atomic_btc:
5742 case Intrinsic::x86_atomic_btr: {
5743 Info.opc = ISD::INTRINSIC_W_CHAIN;
5744 Info.ptrVal = I.getArgOperand(0);
5745 unsigned Size = I.getType()->getScalarSizeInBits();
5746 Info.memVT = EVT::getIntegerVT(I.getType()->getContext(), Size);
5747 Info.align = Align(Size);
5748 Info.flags |= MachineMemOperand::MOLoad | MachineMemOperand::MOStore |
5749 MachineMemOperand::MOVolatile;
5750 return true;
5751 }
5752 case Intrinsic::x86_atomic_bts_rm:
5753 case Intrinsic::x86_atomic_btc_rm:
5754 case Intrinsic::x86_atomic_btr_rm: {
5755 Info.opc = ISD::INTRINSIC_W_CHAIN;
5756 Info.ptrVal = I.getArgOperand(0);
5757 unsigned Size = I.getArgOperand(1)->getType()->getScalarSizeInBits();
5758 Info.memVT = EVT::getIntegerVT(I.getType()->getContext(), Size);
5759 Info.align = Align(Size);
5760 Info.flags |= MachineMemOperand::MOLoad | MachineMemOperand::MOStore |
5761 MachineMemOperand::MOVolatile;
5762 return true;
5763 }
5764 case Intrinsic::x86_aadd32:
5765 case Intrinsic::x86_aadd64:
5766 case Intrinsic::x86_aand32:
5767 case Intrinsic::x86_aand64:
5768 case Intrinsic::x86_aor32:
5769 case Intrinsic::x86_aor64:
5770 case Intrinsic::x86_axor32:
5771 case Intrinsic::x86_axor64:
5772 case Intrinsic::x86_atomic_add_cc:
5773 case Intrinsic::x86_atomic_sub_cc:
5774 case Intrinsic::x86_atomic_or_cc:
5775 case Intrinsic::x86_atomic_and_cc:
5776 case Intrinsic::x86_atomic_xor_cc: {
5777 Info.opc = ISD::INTRINSIC_W_CHAIN;
5778 Info.ptrVal = I.getArgOperand(0);
5779 unsigned Size = I.getArgOperand(1)->getType()->getScalarSizeInBits();
5780 Info.memVT = EVT::getIntegerVT(I.getType()->getContext(), Size);
5781 Info.align = Align(Size);
5782 Info.flags |= MachineMemOperand::MOLoad | MachineMemOperand::MOStore |
5783 MachineMemOperand::MOVolatile;
5784 return true;
5785 }
5786 }
5787 return false;
5788 }
5789
5790 switch (IntrData->Type) {
5791 case TRUNCATE_TO_MEM_VI8:
5792 case TRUNCATE_TO_MEM_VI16:
5793 case TRUNCATE_TO_MEM_VI32: {
5794 Info.opc = ISD::INTRINSIC_VOID;
5795 Info.ptrVal = I.getArgOperand(0);
5796 MVT VT = MVT::getVT(I.getArgOperand(1)->getType());
5797 MVT ScalarVT = MVT::INVALID_SIMPLE_VALUE_TYPE;
5798 if (IntrData->Type == TRUNCATE_TO_MEM_VI8)
5799 ScalarVT = MVT::i8;
5800 else if (IntrData->Type == TRUNCATE_TO_MEM_VI16)
5801 ScalarVT = MVT::i16;
5802 else if (IntrData->Type == TRUNCATE_TO_MEM_VI32)
5803 ScalarVT = MVT::i32;
5804
5805 Info.memVT = MVT::getVectorVT(ScalarVT, VT.getVectorNumElements());
5806 Info.align = Align(1);
5807 Info.flags |= MachineMemOperand::MOStore;
5808 break;
5809 }
5810 case GATHER:
5811 case GATHER_AVX2: {
5812 Info.opc = ISD::INTRINSIC_W_CHAIN;
5813 Info.ptrVal = nullptr;
5814 MVT DataVT = MVT::getVT(I.getType());
5815 MVT IndexVT = MVT::getVT(I.getArgOperand(2)->getType());
5816 unsigned NumElts = std::min(DataVT.getVectorNumElements(),
5817 IndexVT.getVectorNumElements());
5818 Info.memVT = MVT::getVectorVT(DataVT.getVectorElementType(), NumElts);
5819 Info.align = Align(1);
5820 Info.flags |= MachineMemOperand::MOLoad;
5821 break;
5822 }
5823 case SCATTER: {
5824 Info.opc = ISD::INTRINSIC_VOID;
5825 Info.ptrVal = nullptr;
5826 MVT DataVT = MVT::getVT(I.getArgOperand(3)->getType());
5827 MVT IndexVT = MVT::getVT(I.getArgOperand(2)->getType());
5828 unsigned NumElts = std::min(DataVT.getVectorNumElements(),
5829 IndexVT.getVectorNumElements());
5830 Info.memVT = MVT::getVectorVT(DataVT.getVectorElementType(), NumElts);
5831 Info.align = Align(1);
5832 Info.flags |= MachineMemOperand::MOStore;
5833 break;
5834 }
5835 default:
5836 return false;
5837 }
5838
5839 return true;
5840}
5841
5842/// Returns true if the target can instruction select the
5843/// specified FP immediate natively. If false, the legalizer will
5844/// materialize the FP immediate as a load from a constant pool.
5845bool X86TargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT,
5846 bool ForCodeSize) const {
5847 for (const APFloat &FPImm : LegalFPImmediates)
5848 if (Imm.bitwiseIsEqual(FPImm))
5849 return true;
5850 return false;
5851}
5852
5853bool X86TargetLowering::shouldReduceLoadWidth(SDNode *Load,
5854 ISD::LoadExtType ExtTy,
5855 EVT NewVT) const {
5856 assert(cast<LoadSDNode>(Load)->isSimple() && "illegal to narrow");
5857
5858 // "ELF Handling for Thread-Local Storage" specifies that R_X86_64_GOTTPOFF
5859 // relocations target a movq or addq instruction: don't let the load shrink.
5860 SDValue BasePtr = cast<LoadSDNode>(Load)->getBasePtr();
5861 if (BasePtr.getOpcode() == X86ISD::WrapperRIP)
5862 if (const auto *GA = dyn_cast<GlobalAddressSDNode>(BasePtr.getOperand(0)))
5863 return GA->getTargetFlags() != X86II::MO_GOTTPOFF;
5864
5865 // If this is an (1) AVX vector load with (2) multiple uses and (3) all of
5866 // those uses are extracted directly into a store, then the extract + store
5867 // can be store-folded. Therefore, it's probably not worth splitting the load.
5868 EVT VT = Load->getValueType(0);
5869 if ((VT.is256BitVector() || VT.is512BitVector()) && !Load->hasOneUse()) {
5870 for (auto UI = Load->use_begin(), UE = Load->use_end(); UI != UE; ++UI) {
5871 // Skip uses of the chain value. Result 0 of the node is the load value.
5872 if (UI.getUse().getResNo() != 0)
5873 continue;
5874
5875 // If this use is not an extract + store, it's probably worth splitting.
5876 if (UI->getOpcode() != ISD::EXTRACT_SUBVECTOR || !UI->hasOneUse() ||
5877 UI->use_begin()->getOpcode() != ISD::STORE)
5878 return true;
5879 }
5880 // All non-chain uses are extract + store.
5881 return false;
5882 }
5883
5884 return true;
5885}
5886
5887/// Returns true if it is beneficial to convert a load of a constant
5888/// to just the constant itself.
5889bool X86TargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm,
5890 Type *Ty) const {
5891 assert(Ty->isIntegerTy());
5892
5893 unsigned BitSize = Ty->getPrimitiveSizeInBits();
5894 if (BitSize == 0 || BitSize > 64)
5895 return false;
5896 return true;
5897}
5898
5899bool X86TargetLowering::reduceSelectOfFPConstantLoads(EVT CmpOpVT) const {
5900 // If we are using XMM registers in the ABI and the condition of the select is
5901 // a floating-point compare and we have blendv or conditional move, then it is
5902 // cheaper to select instead of doing a cross-register move and creating a
5903 // load that depends on the compare result.
5904 bool IsFPSetCC = CmpOpVT.isFloatingPoint() && CmpOpVT != MVT::f128;
5905 return !IsFPSetCC || !Subtarget.isTarget64BitLP64() || !Subtarget.hasAVX();
5906}
5907
5908bool X86TargetLowering::convertSelectOfConstantsToMath(EVT VT) const {
5909 // TODO: It might be a win to ease or lift this restriction, but the generic
5910 // folds in DAGCombiner conflict with vector folds for an AVX512 target.
5911 if (VT.isVector() && Subtarget.hasAVX512())
5912 return false;
5913
5914 return true;
5915}
5916
5917bool X86TargetLowering::decomposeMulByConstant(LLVMContext &Context, EVT VT,
5918 SDValue C) const {
5919 // TODO: We handle scalars using custom code, but generic combining could make
5920 // that unnecessary.
5921 APInt MulC;
5922 if (!ISD::isConstantSplatVector(C.getNode(), MulC))
5923 return false;
5924
5925 // Find the type this will be legalized to. Otherwise we might prematurely
5926 // convert this to shl+add/sub and then still have to type legalize those ops.
5927 // Another choice would be to defer the decision for illegal types until
5928 // after type legalization. But constant splat vectors of i64 can't make it
5929 // through type legalization on 32-bit targets so we would need to special
5930 // case vXi64.
5931 while (getTypeAction(Context, VT) != TypeLegal)
5932 VT = getTypeToTransformTo(Context, VT);
5933
5934 // If vector multiply is legal, assume that's faster than shl + add/sub.
5935 // Multiply is a complex op with higher latency and lower throughput in
5936 // most implementations; sub-vXi32 vector multiplies are always fast,
5937 // vXi32 must not have a SlowMULLD implementation, and anything larger (vXi64)
5938 // is always going to be slow.
5939 unsigned EltSizeInBits = VT.getScalarSizeInBits();
5940 if (isOperationLegal(ISD::MUL, VT) && EltSizeInBits <= 32 &&
5941 (EltSizeInBits != 32 || !Subtarget.isPMULLDSlow()))
5942 return false;
5943
5944 // shl+add, shl+sub, shl+add+neg
5945 return (MulC + 1).isPowerOf2() || (MulC - 1).isPowerOf2() ||
5946 (1 - MulC).isPowerOf2() || (-(MulC + 1)).isPowerOf2();
5947}
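// --- Illustrative sketch, not part of X86ISelLowering.cpp ---
// The decompositions the power-of-two checks above enable (scalar stand-ins,
// not the vector lowering itself):
#include <cstdint>
static int64_t shl(int64_t X, unsigned N) {
  return static_cast<int64_t>(static_cast<uint64_t>(X) << N);  // wraparound shift
}
static int64_t mul9(int64_t X)  { return shl(X, 3) + X; }    // MulC - 1 is a power of 2 -> shl+add
static int64_t mul7(int64_t X)  { return shl(X, 3) - X; }    // MulC + 1 is a power of 2 -> shl+sub
static int64_t mulm7(int64_t X) { return X - shl(X, 3); }    // 1 - MulC is a power of 2 -> shl+sub
static int64_t mulm9(int64_t X) { return -(shl(X, 3) + X); } // -(MulC + 1) is a power of 2 -> shl+add+neg
// --- end sketch ---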
5948
5949bool X86TargetLowering::isExtractSubvectorCheap(EVT ResVT, EVT SrcVT,
5950 unsigned Index) const {
5951 if (!isOperationLegalOrCustom(ISD::EXTRACT_SUBVECTOR, ResVT))
5952 return false;
5953
5954 // Mask vectors support all subregister combinations and operations that
5955 // extract half of vector.
5956 if (ResVT.getVectorElementType() == MVT::i1)
5957 return Index == 0 || ((ResVT.getSizeInBits() == SrcVT.getSizeInBits()*2) &&
5958 (Index == ResVT.getVectorNumElements()));
5959
5960 return (Index % ResVT.getVectorNumElements()) == 0;
5961}
5962
5963bool X86TargetLowering::shouldScalarizeBinop(SDValue VecOp) const {
5964 unsigned Opc = VecOp.getOpcode();
5965
5966 // Assume target opcodes can't be scalarized.
5967 // TODO - do we have any exceptions?
5968 if (Opc >= ISD::BUILTIN_OP_END)
5969 return false;
5970
5971 // If the vector op is not supported, try to convert to scalar.
5972 EVT VecVT = VecOp.getValueType();
5973 if (!isOperationLegalOrCustomOrPromote(Opc, VecVT))
5974 return true;
5975
5976 // If the vector op is supported, but the scalar op is not, the transform may
5977 // not be worthwhile.
5978 EVT ScalarVT = VecVT.getScalarType();
5979 return isOperationLegalOrCustomOrPromote(Opc, ScalarVT);
5980}
5981
5982bool X86TargetLowering::shouldFormOverflowOp(unsigned Opcode, EVT VT,
5983 bool) const {
5984 // TODO: Allow vectors?
5985 if (VT.isVector())
5986 return false;
5987 return VT.isSimple() || !isOperationExpand(Opcode, VT);
5988}
5989
5990bool X86TargetLowering::isCheapToSpeculateCttz(Type *Ty) const {
5991 // Speculate cttz only if we can directly use TZCNT or can promote to i32.
5992 return Subtarget.hasBMI() ||
5993 (!Ty->isVectorTy() && Ty->getScalarSizeInBits() < 32);
5994}
5995
5996bool X86TargetLowering::isCheapToSpeculateCtlz(Type *Ty) const {
5997 // Speculate ctlz only if we can directly use LZCNT.
5998 return Subtarget.hasLZCNT();
5999}
6000
6001bool X86TargetLowering::ShouldShrinkFPConstant(EVT VT) const {
6002 // Don't shrink FP constpool if SSE2 is available since cvtss2sd is more
6003 // expensive than a straight movsd. On the other hand, it's important to
6004 // shrink long double fp constant since fldt is very slow.
6005 return !Subtarget.hasSSE2() || VT == MVT::f80;
6006}
6007
6008bool X86TargetLowering::isScalarFPTypeInSSEReg(EVT VT) const {
6009 return (VT == MVT::f64 && Subtarget.hasSSE2()) ||
6010 (VT == MVT::f32 && Subtarget.hasSSE1()) || VT == MVT::f16;
6011}
6012
6013bool X86TargetLowering::isLoadBitCastBeneficial(EVT LoadVT, EVT BitcastVT,
6014 const SelectionDAG &DAG,
6015 const MachineMemOperand &MMO) const {
6016 if (!Subtarget.hasAVX512() && !LoadVT.isVector() && BitcastVT.isVector() &&
6017 BitcastVT.getVectorElementType() == MVT::i1)
6018 return false;
6019
6020 if (!Subtarget.hasDQI() && BitcastVT == MVT::v8i1 && LoadVT == MVT::i8)
6021 return false;
6022
6023 // If both types are legal vectors, it's always ok to convert them.
6024 if (LoadVT.isVector() && BitcastVT.isVector() &&
6025 isTypeLegal(LoadVT) && isTypeLegal(BitcastVT))
6026 return true;
6027
6028 return TargetLowering::isLoadBitCastBeneficial(LoadVT, BitcastVT, DAG, MMO);
6029}
6030
6031bool X86TargetLowering::canMergeStoresTo(unsigned AddressSpace, EVT MemVT,
6032 const MachineFunction &MF) const {
6033 // Do not merge to float value size (128 bits) if no implicit
6034 // float attribute is set.
6035 bool NoFloat = MF.getFunction().hasFnAttribute(Attribute::NoImplicitFloat);
6036
6037 if (NoFloat) {
6038 unsigned MaxIntSize = Subtarget.is64Bit() ? 64 : 32;
6039 return (MemVT.getSizeInBits() <= MaxIntSize);
6040 }
6041 // Make sure we don't merge greater than our preferred vector
6042 // width.
6043 if (MemVT.getSizeInBits() > Subtarget.getPreferVectorWidth())
6044 return false;
6045
6046 return true;
6047}
6048
6049bool X86TargetLowering::isCtlzFast() const {
6050 return Subtarget.hasFastLZCNT();
6051}
6052
6053bool X86TargetLowering::isMaskAndCmp0FoldingBeneficial(
6054 const Instruction &AndI) const {
6055 return true;
6056}
6057
6058bool X86TargetLowering::hasAndNotCompare(SDValue Y) const {
6059 EVT VT = Y.getValueType();
6060
6061 if (VT.isVector())
6062 return false;
6063
6064 if (!Subtarget.hasBMI())
6065 return false;
6066
6067 // There are only 32-bit and 64-bit forms for 'andn'.
6068 if (VT != MVT::i32 && VT != MVT::i64)
6069 return false;
6070
6071 return !isa<ConstantSDNode>(Y);
6072}
6073
6074bool X86TargetLowering::hasAndNot(SDValue Y) const {
6075 EVT VT = Y.getValueType();
6076
6077 if (!VT.isVector())
6078 return hasAndNotCompare(Y);
6079
6080 // Vector.
6081
6082 if (!Subtarget.hasSSE1() || VT.getSizeInBits() < 128)
6083 return false;
6084
6085 if (VT == MVT::v4i32)
6086 return true;
6087
6088 return Subtarget.hasSSE2();
6089}
6090
6091bool X86TargetLowering::hasBitTest(SDValue X, SDValue Y) const {
6092 return X.getValueType().isScalarInteger(); // 'bt'
6093}
6094
6095bool X86TargetLowering::
6096 shouldProduceAndByConstByHoistingConstFromShiftsLHSOfAnd(
6097 SDValue X, ConstantSDNode *XC, ConstantSDNode *CC, SDValue Y,
6098 unsigned OldShiftOpcode, unsigned NewShiftOpcode,
6099 SelectionDAG &DAG) const {
6100 // Does baseline recommend not to perform the fold by default?
6101 if (!TargetLowering::shouldProduceAndByConstByHoistingConstFromShiftsLHSOfAnd(
6102 X, XC, CC, Y, OldShiftOpcode, NewShiftOpcode, DAG))
6103 return false;
6104 // For scalars this transform is always beneficial.
6105 if (X.getValueType().isScalarInteger())
6106 return true;
6107 // If all the shift amounts are identical, then transform is beneficial even
6108 // with rudimentary SSE2 shifts.
6109 if (DAG.isSplatValue(Y, /*AllowUndefs=*/true))
6110 return true;
6112 // If we have AVX2 with its powerful shift operations, then it's also good.
6112 if (Subtarget.hasAVX2())
6113 return true;
6114 // Pre-AVX2 vector codegen for this pattern is best for variant with 'shl'.
6115 return NewShiftOpcode == ISD::SHL;
6116}
6117
6118bool X86TargetLowering::preferScalarizeSplat(SDNode *N) const {
6119 return N->getOpcode() != ISD::FP_EXTEND;
6120}
6121
6122bool X86TargetLowering::shouldFoldConstantShiftPairToMask(
6123 const SDNode *N, CombineLevel Level) const {
6124 assert(((N->getOpcode() == ISD::SHL &&
6125 N->getOperand(0).getOpcode() == ISD::SRL) ||
6126 (N->getOpcode() == ISD::SRL &&
6127 N->getOperand(0).getOpcode() == ISD::SHL)) &&
6128 "Expected shift-shift mask");
6129 // TODO: Should we always create i64 masks? Or only folded immediates?
6130 EVT VT = N->getValueType(0);
6131 if ((Subtarget.hasFastVectorShiftMasks() && VT.isVector()) ||
6132 (Subtarget.hasFastScalarShiftMasks() && !VT.isVector())) {
6133 // Only fold if the shift values are equal - so it folds to AND.
6134 // TODO - we should fold if either is a non-uniform vector but we don't do
6135 // the fold for non-splats yet.
6136 return N->getOperand(1) == N->getOperand(0).getOperand(1);
6137 }
6138 return TargetLoweringBase::shouldFoldConstantShiftPairToMask(N, Level);
6139}
6140
6141bool X86TargetLowering::shouldFoldMaskToVariableShiftPair(SDValue Y) const {
6142 EVT VT = Y.getValueType();
6143
6144 // For vectors, we don't have a preference, but we probably want a mask.
6145 if (VT.isVector())
6146 return false;
6147
6148 // 64-bit shifts on 32-bit targets produce really bad bloated code.
6149 if (VT == MVT::i64 && !Subtarget.is64Bit())
6150 return false;
6151
6152 return true;
6153}
6154
6155TargetLowering::ShiftLegalizationStrategy
6156X86TargetLowering::preferredShiftLegalizationStrategy(
6157 SelectionDAG &DAG, SDNode *N, unsigned ExpansionFactor) const {
6158 if (DAG.getMachineFunction().getFunction().hasMinSize() &&
6159 !Subtarget.isOSWindows())
6160 return ShiftLegalizationStrategy::LowerToLibcall;
6161 return TargetLowering::preferredShiftLegalizationStrategy(DAG, N,
6162 ExpansionFactor);
6163}
6164
6165bool X86TargetLowering::shouldSplatInsEltVarIndex(EVT VT) const {
6166 // Any legal vector type can be splatted more efficiently than
6167 // loading/spilling from memory.
6168 return isTypeLegal(VT);
6169}
6170
6171MVT X86TargetLowering::hasFastEqualityCompare(unsigned NumBits) const {
6172 MVT VT = MVT::getIntegerVT(NumBits);
6173 if (isTypeLegal(VT))
6174 return VT;
6175
6176 // PMOVMSKB can handle this.
6177 if (NumBits == 128 && isTypeLegal(MVT::v16i8))
6178 return MVT::v16i8;
6179
6180 // VPMOVMSKB can handle this.
6181 if (NumBits == 256 && isTypeLegal(MVT::v32i8))
6182 return MVT::v32i8;
6183
6184 // TODO: Allow 64-bit type for 32-bit target.
6185 // TODO: 512-bit types should be allowed, but make sure that those
6186 // cases are handled in combineVectorSizedSetCCEquality().
6187
6188 return MVT::INVALID_SIMPLE_VALUE_TYPE;
6189}
6190
6191/// Val is the undef sentinel value or equal to the specified value.
6192static bool isUndefOrEqual(int Val, int CmpVal) {
6193 return ((Val == SM_SentinelUndef) || (Val == CmpVal));
6194}
6195
6196/// Return true if every element in Mask is the undef sentinel value or equal to
6197/// the specified value.
6198static bool isUndefOrEqual(ArrayRef<int> Mask, int CmpVal) {
6199 return llvm::all_of(Mask, [CmpVal](int M) {
6200 return (M == SM_SentinelUndef) || (M == CmpVal);
6201 });
6202}
6203
6204/// Val is either the undef or zero sentinel value.
6205static bool isUndefOrZero(int Val) {
6206 return ((Val == SM_SentinelUndef) || (Val == SM_SentinelZero));
6207}
6208
6209/// Return true if every element in Mask, beginning from position Pos and ending
6210/// in Pos+Size is the undef sentinel value.
6211static bool isUndefInRange(ArrayRef<int> Mask, unsigned Pos, unsigned Size) {
6212 return llvm::all_of(Mask.slice(Pos, Size),
6213 [](int M) { return M == SM_SentinelUndef; });
6214}
6215
6216/// Return true if the mask creates a vector whose lower half is undefined.
6217static bool isUndefLowerHalf(ArrayRef<int> Mask) {
6218 unsigned NumElts = Mask.size();
6219 return isUndefInRange(Mask, 0, NumElts / 2);
6220}
6221
6222/// Return true if the mask creates a vector whose upper half is undefined.
6223static bool isUndefUpperHalf(ArrayRef<int> Mask) {
6224 unsigned NumElts = Mask.size();
6225 return isUndefInRange(Mask, NumElts / 2, NumElts / 2);
6226}
6227
6228/// Return true if Val falls within the specified half-open range [Low, Hi).
6229static bool isInRange(int Val, int Low, int Hi) {
6230 return (Val >= Low && Val < Hi);
6231}
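
The check above is a half-open interval test, and the range helpers that follow all route through it. A minimal standalone sketch (an editorial illustration with hypothetical names, not part of the analyzed file) makes the boundary behaviour explicit:

#include <cassert>

// Half-open range test equivalent to isInRange: Low is included, Hi is not.
static bool inRangeSketch(int Val, int Low, int Hi) {
  return Val >= Low && Val < Hi;
}

int main() {
  assert(inRangeSketch(0, 0, 4));   // lower bound is included
  assert(!inRangeSketch(4, 0, 4));  // upper bound is excluded
  assert(!inRangeSketch(-1, 0, 4)); // negative sentinels fall outside
  return 0;
}
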
6232
6233/// Return true if the value of any element in Mask falls within the specified
6234/// half-open range [Low, Hi).
6235static bool isAnyInRange(ArrayRef<int> Mask, int Low, int Hi) {
6236 return llvm::any_of(Mask, [Low, Hi](int M) { return isInRange(M, Low, Hi); });
6237}
6238
6239/// Return true if the value of any element in Mask is the zero sentinel value.
6240static bool isAnyZero(ArrayRef<int> Mask) {
6241 return llvm::any_of(Mask, [](int M) { return M == SM_SentinelZero; });
6242}
6243
6244/// Return true if the value of any element in Mask is the zero or undef
6245/// sentinel values.
6246static bool isAnyZeroOrUndef(ArrayRef<int> Mask) {
6247 return llvm::any_of(Mask, [](int M) {
6248 return M == SM_SentinelZero || M == SM_SentinelUndef;
6249 });
6250}
6251
6252/// Return true if Val is undef or if its value falls within the
6253/// specified half-open range [Low, Hi).
6254static bool isUndefOrInRange(int Val, int Low, int Hi) {
6255 return (Val == SM_SentinelUndef) || isInRange(Val, Low, Hi);
6256}
6257
6258/// Return true if every element in Mask is undef or if its value
6259/// falls within the specified half-open range [Low, Hi).
6260static bool isUndefOrInRange(ArrayRef<int> Mask, int Low, int Hi) {
6261 return llvm::all_of(
6262 Mask, [Low, Hi](int M) { return isUndefOrInRange(M, Low, Hi); });
6263}
6264
6265/// Return true if Val is undef, zero or if its value falls within the
6266/// specified half-open range [Low, Hi).
6267static bool isUndefOrZeroOrInRange(int Val, int Low, int Hi) {
6268 return isUndefOrZero(Val) || isInRange(Val, Low, Hi);
6269}
6270
6271/// Return true if every element in Mask is undef, zero or if its value
6272/// falls within the specified half-open range [Low, Hi).
6273static bool isUndefOrZeroOrInRange(ArrayRef<int> Mask, int Low, int Hi) {
6274 return llvm::all_of(
6275 Mask, [Low, Hi](int M) { return isUndefOrZeroOrInRange(M, Low, Hi); });
6276}
6277
6278/// Return true if every element in Mask, beginning
6279/// from position Pos and ending in Pos + Size, falls within the specified
6280/// sequence (Low, Low + Step, ..., Low + (Size - 1) * Step) or is undef.
6281static bool isSequentialOrUndefInRange(ArrayRef<int> Mask, unsigned Pos,
6282 unsigned Size, int Low, int Step = 1) {
6283 for (unsigned i = Pos, e = Pos + Size; i != e; ++i, Low += Step)
6284 if (!isUndefOrEqual(Mask[i], Low))
6285 return false;
6286 return true;
6287}
6288
6289/// Return true if every element in Mask, beginning
6290/// from position Pos and ending in Pos+Size, falls within the specified
6291/// sequence (Low, Low + Step, ..., Low + (Size - 1) * Step), or is undef or is zero.
6292static bool isSequentialOrUndefOrZeroInRange(ArrayRef<int> Mask, unsigned Pos,
6293 unsigned Size, int Low,
6294 int Step = 1) {
6295 for (unsigned i = Pos, e = Pos + Size; i != e; ++i, Low += Step)
6296 if (!isUndefOrZero(Mask[i]) && Mask[i] != Low)
6297 return false;
6298 return true;
6299}
6300
6301/// Return true if every element in Mask, beginning
6302/// from position Pos and ending in Pos+Size is undef or is zero.
6303static bool isUndefOrZeroInRange(ArrayRef<int> Mask, unsigned Pos,
6304 unsigned Size) {
6305 return llvm::all_of(Mask.slice(Pos, Size), isUndefOrZero);
6306}
6307
6308/// Helper function to test whether a shuffle mask could be
6309/// simplified by widening the elements being shuffled.
6310///
6311/// Appends the mask for wider elements in WidenedMask if valid. Otherwise
6312/// leaves it in an unspecified state.
6313///
6314/// NOTE: This must handle normal vector shuffle masks and *target* vector
6315/// shuffle masks. The latter have the special property of a '-2' representing
6316/// a zeroed lane of a vector.
6317static bool canWidenShuffleElements(ArrayRef<int> Mask,
6318 SmallVectorImpl<int> &WidenedMask) {
6319 WidenedMask.assign(Mask.size() / 2, 0);
6320 for (int i = 0, Size = Mask.size(); i < Size; i += 2) {
6321 int M0 = Mask[i];
6322 int M1 = Mask[i + 1];
6323
6324 // If both elements are undef, it's trivial.
6325 if (M0 == SM_SentinelUndef && M1 == SM_SentinelUndef) {
6326 WidenedMask[i / 2] = SM_SentinelUndef;
6327 continue;
6328 }
6329
6330 // Check for an undef mask and a mask value properly aligned to fit with
6331 // a pair of values. If we find such a case, use the non-undef mask's value.
6332 if (M0 == SM_SentinelUndef && M1 >= 0 && (M1 % 2) == 1) {
6333 WidenedMask[i / 2] = M1 / 2;
6334 continue;
6335 }
6336 if (M1 == SM_SentinelUndef && M0 >= 0 && (M0 % 2) == 0) {
6337 WidenedMask[i / 2] = M0 / 2;
6338 continue;
6339 }
6340
6341 // When zeroing, we need to spread the zeroing across both lanes to widen.
6342 if (M0 == SM_SentinelZero || M1 == SM_SentinelZero) {
6343 if ((M0 == SM_SentinelZero || M0 == SM_SentinelUndef) &&
6344 (M1 == SM_SentinelZero || M1 == SM_SentinelUndef)) {
6345 WidenedMask[i / 2] = SM_SentinelZero;
6346 continue;
6347 }
6348 return false;
6349 }
6350
6351 // Finally check if the two mask values are adjacent and aligned with
6352 // a pair.
6353 if (M0 != SM_SentinelUndef && (M0 % 2) == 0 && (M0 + 1) == M1) {
6354 WidenedMask[i / 2] = M0 / 2;
6355 continue;
6356 }
6357
6358 // Otherwise we can't safely widen the elements used in this shuffle.
6359 return false;
6360 }
6361 assert(WidenedMask.size() == Mask.size() / 2 &&
6362 "Incorrect size of mask after widening the elements!");
6363
6364 return true;
6365}
6366
6367static bool canWidenShuffleElements(ArrayRef<int> Mask,
6368 const APInt &Zeroable,
6369 bool V2IsZero,
6370 SmallVectorImpl<int> &WidenedMask) {
6371 // Create an alternative mask with info about zeroable elements.
6372 // Here we do not set undef elements as zeroable.
6373 SmallVector<int, 64> ZeroableMask(Mask);
6374 if (V2IsZero) {
6375 assert(!Zeroable.isZero() && "V2's non-undef elements are used?!");
6376 for (int i = 0, Size = Mask.size(); i != Size; ++i)
6377 if (Mask[i] != SM_SentinelUndef && Zeroable[i])
6378 ZeroableMask[i] = SM_SentinelZero;
6379 }
6380 return canWidenShuffleElements(ZeroableMask, WidenedMask);
6381}
6382
6383static bool canWidenShuffleElements(ArrayRef<int> Mask) {
6384 SmallVector<int, 32> WidenedMask;
6385 return canWidenShuffleElements(Mask, WidenedMask);
6386}
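
The widening helpers above merge adjacent mask elements into wider lanes. The following standalone sketch (an editorial illustration, not LLVM code; the names and the reduced rule set are assumptions) shows the core pair-merging idea on plain integer masks, using -1 for the undef sentinel and omitting the zeroable cases:

#include <cassert>
#include <vector>

// Simplified widening rule: two adjacent, even-aligned mask elements (or two
// undefs, written as -1) collapse into one wide element. The zero-sentinel
// and mixed undef cases handled by canWidenShuffleElements are omitted here.
static bool widenSketch(const std::vector<int> &Mask, std::vector<int> &Out) {
  Out.assign(Mask.size() / 2, 0);
  for (size_t i = 0; i + 1 < Mask.size(); i += 2) {
    int M0 = Mask[i], M1 = Mask[i + 1];
    if (M0 == -1 && M1 == -1) {
      Out[i / 2] = -1;                              // both undef
      continue;
    }
    if (M0 >= 0 && (M0 % 2) == 0 && M1 == M0 + 1) { // aligned adjacent pair
      Out[i / 2] = M0 / 2;
      continue;
    }
    return false;                                   // cannot widen safely
  }
  return true;
}

int main() {
  std::vector<int> Wide;
  assert(widenSketch({0, 1, 6, 7}, Wide) && Wide[0] == 0 && Wide[1] == 3);
  assert(!widenSketch({1, 2, 4, 5}, Wide)); // misaligned pair cannot widen
  return 0;
}
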
6387
6388// Attempt to narrow/widen shuffle mask until it matches the target number of
6389// elements.
6390static bool scaleShuffleElements(ArrayRef<int> Mask, unsigned NumDstElts,
6391 SmallVectorImpl<int> &ScaledMask) {
6392 unsigned NumSrcElts = Mask.size();
6393 assert(((NumSrcElts % NumDstElts) == 0 || (NumDstElts % NumSrcElts) == 0) &&
6394 "Illegal shuffle scale factor");
6395
6396 // Narrowing is guaranteed to work.
6397 if (NumDstElts >= NumSrcElts) {
6398 int Scale = NumDstElts / NumSrcElts;
6399 llvm::narrowShuffleMaskElts(Scale, Mask, ScaledMask);
6400 return true;
6401 }
6402
6403 // We have to repeat the widening until we reach the target size, but we can
6404 // split out the first widening as it sets up ScaledMask for us.
6405 if (canWidenShuffleElements(Mask, ScaledMask)) {
6406 while (ScaledMask.size() > NumDstElts) {
6407 SmallVector<int, 16> WidenedMask;
6408 if (!canWidenShuffleElements(ScaledMask, WidenedMask))
6409 return false;
6410 ScaledMask = std::move(WidenedMask);
6411 }
6412 return true;
6413 }
6414
6415 return false;
6416}
6417
6418/// Returns true if Elt is a constant zero or a floating point constant +0.0.
6419bool X86::isZeroNode(SDValue Elt) {
6420 return isNullConstant(Elt) || isNullFPConstant(Elt);
6421}
6422
6423// Build a vector of constants.
6424// Use an UNDEF node if MaskElt == -1.
6425// Split 64-bit constants when in 32-bit mode.
6426static SDValue getConstVector(ArrayRef<int> Values, MVT VT, SelectionDAG &DAG,
6427 const SDLoc &dl, bool IsMask = false) {
6428
6429 SmallVector<SDValue, 32> Ops;
6430 bool Split = false;
6431
6432 MVT ConstVecVT = VT;
6433 unsigned NumElts = VT.getVectorNumElements();
6434 bool In64BitMode = DAG.getTargetLoweringInfo().isTypeLegal(MVT::i64);
6435 if (!In64BitMode && VT.getVectorElementType() == MVT::i64) {
6436 ConstVecVT = MVT::getVectorVT(MVT::i32, NumElts * 2);
6437 Split = true;
6438 }
6439
6440 MVT EltVT = ConstVecVT.getVectorElementType();
6441 for (unsigned i = 0; i < NumElts; ++i) {
6442 bool IsUndef = Values[i] < 0 && IsMask;
6443 SDValue OpNode = IsUndef ? DAG.getUNDEF(EltVT) :
6444 DAG.getConstant(Values[i], dl, EltVT);
6445 Ops.push_back(OpNode);
6446 if (Split)
6447 Ops.push_back(IsUndef ? DAG.getUNDEF(EltVT) :
6448 DAG.getConstant(0, dl, EltVT));
6449 }
6450 SDValue ConstsNode = DAG.getBuildVector(ConstVecVT, dl, Ops);
6451 if (Split)
6452 ConstsNode = DAG.getBitcast(VT, ConstsNode);
6453 return ConstsNode;
6454}
6455
6456static SDValue getConstVector(ArrayRef<APInt> Bits, APInt &Undefs,
6457 MVT VT, SelectionDAG &DAG, const SDLoc &dl) {
6458 assert(Bits.size() == Undefs.getBitWidth() &&
6459 "Unequal constant and undef arrays");
6460 SmallVector<SDValue, 32> Ops;
6461 bool Split = false;
6462
6463 MVT ConstVecVT = VT;
6464 unsigned NumElts = VT.getVectorNumElements();
6465 bool In64BitMode = DAG.getTargetLoweringInfo().isTypeLegal(MVT::i64);
6466 if (!In64BitMode && VT.getVectorElementType() == MVT::i64) {
6467 ConstVecVT = MVT::getVectorVT(MVT::i32, NumElts * 2);
6468 Split = true;
6469 }
6470
6471 MVT EltVT = ConstVecVT.getVectorElementType();
6472 for (unsigned i = 0, e = Bits.size(); i != e; ++i) {
6473 if (Undefs[i]) {
6474 Ops.append(Split ? 2 : 1, DAG.getUNDEF(EltVT));
6475 continue;
6476 }
6477 const APInt &V = Bits[i];
6478 assert(V.getBitWidth() == VT.getScalarSizeInBits() && "Unexpected sizes");
6479 if (Split) {
6480 Ops.push_back(DAG.getConstant(V.trunc(32), dl, EltVT));
6481 Ops.push_back(DAG.getConstant(V.lshr(32).trunc(32), dl, EltVT));
6482 } else if (EltVT == MVT::f32) {
6483 APFloat FV(APFloat::IEEEsingle(), V);
6484 Ops.push_back(DAG.getConstantFP(FV, dl, EltVT));
6485 } else if (EltVT == MVT::f64) {
6486 APFloat FV(APFloat::IEEEdouble(), V);
6487 Ops.push_back(DAG.getConstantFP(FV, dl, EltVT));
6488 } else {
6489 Ops.push_back(DAG.getConstant(V, dl, EltVT));
6490 }
6491 }
6492
6493 SDValue ConstsNode = DAG.getBuildVector(ConstVecVT, dl, Ops);
6494 return DAG.getBitcast(VT, ConstsNode);
6495}
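
Both getConstVector builders above split each 64-bit constant into low and high 32-bit elements when i64 is not legal. A standalone sketch of that split (editorial illustration; the names are hypothetical), mirroring V.trunc(32) and V.lshr(32).trunc(32):

#include <cassert>
#include <cstdint>
#include <utility>

// Split a 64-bit constant into (low, high) 32-bit halves, as done when an
// i64 element type is not legal on the target.
static std::pair<uint32_t, uint32_t> splitConst64(uint64_t V) {
  uint32_t Lo = static_cast<uint32_t>(V);       // V.trunc(32)
  uint32_t Hi = static_cast<uint32_t>(V >> 32); // V.lshr(32).trunc(32)
  return {Lo, Hi};
}

int main() {
  auto [Lo, Hi] = splitConst64(0x0123456789ABCDEFULL);
  assert(Lo == 0x89ABCDEFu && Hi == 0x01234567u);
  return 0;
}
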
6496
6497/// Returns a vector of specified type with all zero elements.
6498static SDValue getZeroVector(MVT VT, const X86Subtarget &Subtarget,
6499 SelectionDAG &DAG, const SDLoc &dl) {
6500 assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector() ||
6501 VT.getVectorElementType() == MVT::i1) &&
6502 "Unexpected vector type");
6503
6504 // Try to build SSE/AVX zero vectors as <N x i32> bitcasted to their dest
6505 // type. This ensures they get CSE'd. But if the integer type is not
6506 // available, use a floating-point +0.0 instead.
6507 SDValue Vec;
6508 if (!Subtarget.hasSSE2() && VT.is128BitVector()) {
6509 Vec = DAG.getConstantFP(+0.0, dl, MVT::v4f32);
6510 } else if (VT.isFloatingPoint()) {
6511 Vec = DAG.getConstantFP(+0.0, dl, VT);
6512 } else if (VT.getVectorElementType() == MVT::i1) {
6513 assert((Subtarget.hasBWI() || VT.getVectorNumElements() <= 16) &&
6514 "Unexpected vector type");
6515 Vec = DAG.getConstant(0, dl, VT);
6516 } else {
6517 unsigned Num32BitElts = VT.getSizeInBits() / 32;
6518 Vec = DAG.getConstant(0, dl, MVT::getVectorVT(MVT::i32, Num32BitElts));
6519 }
6520 return DAG.getBitcast(VT, Vec);
6521}
6522
6523// Helper to determine if the ops are both subvectors extracted from the same
6524// single source. If commuting is allowed they don't have to be in order (Lo/Hi).
6525static SDValue getSplitVectorSrc(SDValue LHS, SDValue RHS, bool AllowCommute) {
6526 if (LHS.getOpcode() != ISD::EXTRACT_SUBVECTOR ||
6527 RHS.getOpcode() != ISD::EXTRACT_SUBVECTOR ||
6528 LHS.getValueType() != RHS.getValueType() ||
6529 LHS.getOperand(0) != RHS.getOperand(0))
6530 return SDValue();
6531
6532 SDValue Src = LHS.getOperand(0);
6533 if (Src.getValueSizeInBits() != (LHS.getValueSizeInBits() * 2))
6534 return SDValue();
6535
6536 unsigned NumElts = LHS.getValueType().getVectorNumElements();
6537 if ((LHS.getConstantOperandAPInt(1) == 0 &&
6538 RHS.getConstantOperandAPInt(1) == NumElts) ||
6539 (AllowCommute && RHS.getConstantOperandAPInt(1) == 0 &&
6540 LHS.getConstantOperandAPInt(1) == NumElts))
6541 return Src;
6542
6543 return SDValue();
6544}
6545
6546static SDValue extractSubVector(SDValue Vec, unsigned IdxVal, SelectionDAG &DAG,
6547 const SDLoc &dl, unsigned vectorWidth) {
6548 EVT VT = Vec.getValueType();
6549 EVT ElVT = VT.getVectorElementType();
6550 unsigned Factor = VT.getSizeInBits() / vectorWidth;
6551 EVT ResultVT = EVT::getVectorVT(*DAG.getContext(), ElVT,
6552 VT.getVectorNumElements() / Factor);
6553
6554 // Extract the relevant vectorWidth bits. Generate an EXTRACT_SUBVECTOR
6555 unsigned ElemsPerChunk = vectorWidth / ElVT.getSizeInBits();
6556 assert(isPowerOf2_32(ElemsPerChunk) && "Elements per chunk not power of 2");
6557
6558 // This is the index of the first element of the vectorWidth-bit chunk
6559 // we want. Since ElemsPerChunk is a power of 2 just need to clear bits.
6560 IdxVal &= ~(ElemsPerChunk - 1);
6561
6562 // If the input is a buildvector just emit a smaller one.
6563 if (Vec.getOpcode() == ISD::BUILD_VECTOR)
6564 return DAG.getBuildVector(ResultVT, dl,
6565 Vec->ops().slice(IdxVal, ElemsPerChunk));
6566
6567 SDValue VecIdx = DAG.getIntPtrConstant(IdxVal, dl);
6568 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, ResultVT, Vec, VecIdx);
6569}
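
extractSubVector (and insertSubVector below) round the element index down to a chunk boundary with a bitmask, which is valid because ElemsPerChunk is asserted to be a power of two. A standalone sketch of that rounding (editorial illustration; the names are hypothetical):

#include <cassert>

// Round an element index down to the start of its chunk. Valid only when
// ElemsPerChunk is a power of two, matching the assertion in the code above.
static unsigned alignToChunk(unsigned IdxVal, unsigned ElemsPerChunk) {
  return IdxVal & ~(ElemsPerChunk - 1);
}

int main() {
  assert(alignToChunk(5, 4) == 4); // element 5 lives in the chunk starting at 4
  assert(alignToChunk(8, 4) == 8); // already chunk-aligned
  assert(alignToChunk(3, 8) == 0); // falls into the first 8-element chunk
  return 0;
}
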
6570
6571/// Generate a DAG to grab 128-bits from a vector > 128 bits. This
6572/// sets things up to match to an AVX VEXTRACTF128 / VEXTRACTI128
6573/// or AVX-512 VEXTRACTF32x4 / VEXTRACTI32x4
6574/// instructions or a simple subregister reference. Idx is an index in the
6575/// 128 bits we want. It need not be aligned to a 128-bit boundary. That makes
6576/// lowering EXTRACT_VECTOR_ELT operations easier.
6577static SDValue extract128BitVector(SDValue Vec, unsigned IdxVal,
6578 SelectionDAG &DAG, const SDLoc &dl) {
6579 assert((Vec.getValueType().is256BitVector() ||
6580 Vec.getValueType().is512BitVector()) && "Unexpected vector size!");
6581 return extractSubVector(Vec, IdxVal, DAG, dl, 128);
6582}
6583
6584/// Generate a DAG to grab 256-bits from a 512-bit vector.
6585static SDValue extract256BitVector(SDValue Vec, unsigned IdxVal,
6586 SelectionDAG &DAG, const SDLoc &dl) {
6587 assert(Vec.getValueType().is512BitVector() && "Unexpected vector size!");
6588 return extractSubVector(Vec, IdxVal, DAG, dl, 256);
6589}
6590
6591static SDValue insertSubVector(SDValue Result, SDValue Vec, unsigned IdxVal,
6592 SelectionDAG &DAG, const SDLoc &dl,
6593 unsigned vectorWidth) {
6594 assert((vectorWidth == 128 || vectorWidth == 256) &&
6595 "Unsupported vector width");
6596 // Inserting an UNDEF subvector just returns Result.
6597 if (Vec.isUndef())
6598 return Result;
6599 EVT VT = Vec.getValueType();
6600 EVT ElVT = VT.getVectorElementType();
6601 EVT ResultVT = Result.getValueType();
6602
6603 // Insert the relevant vectorWidth bits.
6604 unsigned ElemsPerChunk = vectorWidth/ElVT.getSizeInBits();
6605 assert(isPowerOf2_32(ElemsPerChunk) && "Elements per chunk not power of 2");
6606
6607 // This is the index of the first element of the vectorWidth-bit chunk
6608 // we want. Since ElemsPerChunk is a power of 2 just need to clear bits.
6609 IdxVal &= ~(ElemsPerChunk - 1);
6610
6611 SDValue VecIdx = DAG.getIntPtrConstant(IdxVal, dl);
6612 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResultVT, Result, Vec, VecIdx);
6613}
6614
6615/// Generate a DAG to put 128-bits into a vector > 128 bits. This
6616/// sets things up to match to an AVX VINSERTF128/VINSERTI128 or
6617/// AVX-512 VINSERTF32x4/VINSERTI32x4 instructions or a
6618/// simple superregister reference. Idx is an index in the 128 bits
6619/// we want. It need not be aligned to a 128-bit boundary. That makes
6620/// lowering INSERT_VECTOR_ELT operations easier.
6621static SDValue insert128BitVector(SDValue Result, SDValue Vec, unsigned IdxVal,
6622 SelectionDAG &DAG, const SDLoc &dl) {
6623 assert(Vec.getValueType().is128BitVector() && "Unexpected vector size!");
6624 return insertSubVector(Result, Vec, IdxVal, DAG, dl, 128);
6625}
6626
6627/// Widen a vector to a larger size with the same scalar type, with the new
6628/// elements either zero or undef.
6629static SDValue widenSubVector(MVT VT, SDValue Vec, bool ZeroNewElements,
6630 const X86Subtarget &Subtarget, SelectionDAG &DAG,
6631 const SDLoc &dl) {
6632 assert(Vec.getValueSizeInBits().getFixedValue() < VT.getFixedSizeInBits() &&
6633 Vec.getValueType().getScalarType() == VT.getScalarType() &&
6634 "Unsupported vector widening type");
6635 SDValue Res = ZeroNewElements ? getZeroVector(VT, Subtarget, DAG, dl)
6636 : DAG.getUNDEF(VT);
6637 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, VT, Res, Vec,
6638 DAG.getIntPtrConstant(0, dl));
6639}
6640
6641/// Widen a vector to a larger size with the same scalar type, with the new
6642/// elements either zero or undef.
6643static SDValue widenSubVector(SDValue Vec, bool ZeroNewElements,
6644 const X86Subtarget &Subtarget, SelectionDAG &DAG,
6645 const SDLoc &dl, unsigned WideSizeInBits) {
6646 assert(Vec.getValueSizeInBits() < WideSizeInBits &&
6647 (WideSizeInBits % Vec.getScalarValueSizeInBits()) == 0 &&
6648 "Unsupported vector widening type");
6649 unsigned WideNumElts = WideSizeInBits / Vec.getScalarValueSizeInBits();
6650 MVT SVT = Vec.getSimpleValueType().getScalarType();
6651 MVT VT = MVT::getVectorVT(SVT, WideNumElts);
6652 return widenSubVector(VT, Vec, ZeroNewElements, Subtarget, DAG, dl);
6653}
6654
6655// Helper function to collect subvector ops that are concatenated together,
6656// either by ISD::CONCAT_VECTORS or a ISD::INSERT_SUBVECTOR series.
6657// The subvectors in Ops are guaranteed to be the same type.
6658static bool collectConcatOps(SDNode *N, SmallVectorImpl<SDValue> &Ops,
6659 SelectionDAG &DAG) {
6660 assert(Ops.empty() && "Expected an empty ops vector");
6661
6662 if (N->getOpcode() == ISD::CONCAT_VECTORS) {
6663 Ops.append(N->op_begin(), N->op_end());
6664 return true;
6665 }
6666
6667 if (N->getOpcode() == ISD::INSERT_SUBVECTOR) {
6668 SDValue Src = N->getOperand(0);
6669 SDValue Sub = N->getOperand(1);
6670 const APInt &Idx = N->getConstantOperandAPInt(2);
6671 EVT VT = Src.getValueType();
6672 EVT SubVT = Sub.getValueType();
6673
6674 // TODO - Handle more general insert_subvector chains.
6675 if (VT.getSizeInBits() == (SubVT.getSizeInBits() * 2)) {
6676 // insert_subvector(undef, x, lo)
6677 if (Idx == 0 && Src.isUndef()) {
6678 Ops.push_back(Sub);
6679 Ops.push_back(DAG.getUNDEF(SubVT));
6680 return true;
6681 }
6682 if (Idx == (VT.getVectorNumElements() / 2)) {
6683 // insert_subvector(insert_subvector(undef, x, lo), y, hi)
6684 if (Src.getOpcode() == ISD::INSERT_SUBVECTOR &&
6685 Src.getOperand(1).getValueType() == SubVT &&
6686 isNullConstant(Src.getOperand(2))) {
6687 Ops.push_back(Src.getOperand(1));
6688 Ops.push_back(Sub);
6689 return true;
6690 }
6691 // insert_subvector(x, extract_subvector(x, lo), hi)
6692 if (Sub.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
6693 Sub.getOperand(0) == Src && isNullConstant(Sub.getOperand(1))) {
6694 Ops.append(2, Sub);
6695 return true;
6696 }
6697 // insert_subvector(undef, x, hi)
6698 if (Src.isUndef()) {
6699 Ops.push_back(DAG.getUNDEF(SubVT));
6700 Ops.push_back(Sub);
6701 return true;
6702 }
6703 }
6704 }
6705 }
6706
6707 return false;
6708}
6709
6710static std::pair<SDValue, SDValue> splitVector(SDValue Op, SelectionDAG &DAG,
6711 const SDLoc &dl) {
6712 EVT VT = Op.getValueType();
6713 unsigned NumElems = VT.getVectorNumElements();
6714 unsigned SizeInBits = VT.getSizeInBits();
6715 assert((NumElems % 2) == 0 && (SizeInBits % 2) == 0 &&
6716 "Can't split odd sized vector");
6717
6718 // If this is a splat value (with no-undefs) then use the lower subvector,
6719 // which should be a free extraction.
6720 SDValue Lo = extractSubVector(Op, 0, DAG, dl, SizeInBits / 2);
6721 if (DAG.isSplatValue(Op, /*AllowUndefs*/ false))
6722 return std::make_pair(Lo, Lo);
6723
6724 SDValue Hi = extractSubVector(Op, NumElems / 2, DAG, dl, SizeInBits / 2);
6725 return std::make_pair(Lo, Hi);
6726}
6727
6728/// Break an operation into 2 half sized ops and then concatenate the results.
6729static SDValue splitVectorOp(SDValue Op, SelectionDAG &DAG) {
6730 unsigned NumOps = Op.getNumOperands();
6731 EVT VT = Op.getValueType();
6732 SDLoc dl(Op);
6733
6734 // Extract the LHS Lo/Hi vectors
6735 SmallVector<SDValue> LoOps(NumOps, SDValue());
6736 SmallVector<SDValue> HiOps(NumOps, SDValue());
6737 for (unsigned I = 0; I != NumOps; ++I) {
6738 SDValue SrcOp = Op.getOperand(I);
6739 if (!SrcOp.getValueType().isVector()) {
6740 LoOps[I] = HiOps[I] = SrcOp;
6741 continue;
6742 }
6743 std::tie(LoOps[I], HiOps[I]) = splitVector(SrcOp, DAG, dl);
6744 }
6745
6746 EVT LoVT, HiVT;
6747 std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
6748 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
6749 DAG.getNode(Op.getOpcode(), dl, LoVT, LoOps),
6750 DAG.getNode(Op.getOpcode(), dl, HiVT, HiOps));
6751}
6752
6753/// Break an unary integer operation into 2 half sized ops and then
6754/// concatenate the result back.
6755static SDValue splitVectorIntUnary(SDValue Op, SelectionDAG &DAG) {
6756 // Make sure we only try to split 256/512-bit types to avoid creating
6757 // narrow vectors.
6758 EVT VT = Op.getValueType();
6759 (void)VT;
6760 assert((Op.getOperand(0).getValueType().is256BitVector() ||
6761 Op.getOperand(0).getValueType().is512BitVector()) &&
6762 (VT.is256BitVector() || VT.is512BitVector()) && "Unsupported VT!");
6763 assert(Op.getOperand(0).getValueType().getVectorNumElements() ==
6764 VT.getVectorNumElements() &&
6765 "Unexpected VTs!");
6766 return splitVectorOp(Op, DAG);
6767}
6768
6769/// Break a binary integer operation into 2 half sized ops and then
6770/// concatenate the result back.
6771static SDValue splitVectorIntBinary(SDValue Op, SelectionDAG &DAG) {
6772 // Assert that all the types match.
6773 EVT VT = Op.getValueType();
6774 (void)VT;
6775 assert(Op.getOperand(0).getValueType() == VT &&
6776 Op.getOperand(1).getValueType() == VT && "Unexpected VTs!");
6777 assert((VT.is256BitVector() || VT.is512BitVector()) && "Unsupported VT!");
6778 return splitVectorOp(Op, DAG);
6779}
6780
6781// Helper for splitting operands of an operation to legal target size and
6782// apply a function on each part.
6783// Useful for operations that are available on SSE2 in 128-bit, on AVX2 in
6784// 256-bit and on AVX512BW in 512-bit. The argument VT is the type used for
6785// deciding if/how to split Ops. Ops elements do *not* have to be of type VT.
6786// The argument Builder is a function that will be applied on each split part:
6787// SDValue Builder(SelectionDAG&G, SDLoc, ArrayRef<SDValue>)
6788template <typename F>
6789SDValue SplitOpsAndApply(SelectionDAG &DAG, const X86Subtarget &Subtarget,
6790 const SDLoc &DL, EVT VT, ArrayRef<SDValue> Ops,
6791 F Builder, bool CheckBWI = true) {
6792 assert(Subtarget.hasSSE2() && "Target assumed to support at least SSE2");
6793 unsigned NumSubs = 1;
6794 if ((CheckBWI && Subtarget.useBWIRegs()) ||
6795 (!CheckBWI && Subtarget.useAVX512Regs())) {
6796 if (VT.getSizeInBits() > 512) {
6797 NumSubs = VT.getSizeInBits() / 512;
6798 assert((VT.getSizeInBits() % 512) == 0 && "Illegal vector size");
6799 }
6800 } else if (Subtarget.hasAVX2()) {
6801 if (VT.getSizeInBits() > 256) {
6802 NumSubs = VT.getSizeInBits() / 256;
6803 assert((VT.getSizeInBits() % 256) == 0 && "Illegal vector size");
6804 }
6805 } else {
6806 if (VT.getSizeInBits() > 128) {
6807 NumSubs = VT.getSizeInBits() / 128;
6808 assert((VT.getSizeInBits() % 128) == 0 && "Illegal vector size");
6809 }
6810 }
6811
6812 if (NumSubs == 1)
6813 return Builder(DAG, DL, Ops);
6814
6815 SmallVector<SDValue, 4> Subs;
6816 for (unsigned i = 0; i != NumSubs; ++i) {
6817 SmallVector<SDValue, 2> SubOps;
6818 for (SDValue Op : Ops) {
6819 EVT OpVT = Op.getValueType();
6820 unsigned NumSubElts = OpVT.getVectorNumElements() / NumSubs;
6821 unsigned SizeSub = OpVT.getSizeInBits() / NumSubs;
6822 SubOps.push_back(extractSubVector(Op, i * NumSubElts, DAG, DL, SizeSub));
6823 }
6824 Subs.push_back(Builder(DAG, DL, SubOps));
6825 }
6826 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Subs);
6827}
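
SplitOpsAndApply splits the operands to the legal register width, runs the Builder callback on each piece, and concatenates the results. A much-reduced standalone sketch of that split-and-apply pattern (editorial illustration; the names are hypothetical, and it assumes the input divides evenly into parts):

#include <cassert>
#include <functional>
#include <vector>

// Chop Ops into PartSize-sized pieces, apply Builder to each piece, and
// concatenate the results. Assumes Ops.size() is a multiple of PartSize.
static std::vector<int>
splitAndApply(const std::vector<int> &Ops, size_t PartSize,
              const std::function<std::vector<int>(const std::vector<int> &)> &Builder) {
  std::vector<int> Result;
  for (size_t I = 0; I < Ops.size(); I += PartSize) {
    std::vector<int> Part(Ops.begin() + I, Ops.begin() + I + PartSize);
    std::vector<int> Built = Builder(Part);
    Result.insert(Result.end(), Built.begin(), Built.end());
  }
  return Result;
}

int main() {
  auto Doubler = [](const std::vector<int> &P) {
    std::vector<int> Out;
    for (int V : P)
      Out.push_back(V * 2);
    return Out;
  };
  std::vector<int> Out = splitAndApply({1, 2, 3, 4, 5, 6, 7, 8}, 4, Doubler);
  assert(Out.size() == 8 && Out[0] == 2 && Out[7] == 16);
  return 0;
}
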
6828
6829// Helper function that extends a non-512-bit vector op to 512-bits on non-VLX
6830// targets.
6831static SDValue getAVX512Node(unsigned Opcode, const SDLoc &DL, MVT VT,
6832 ArrayRef<SDValue> Ops, SelectionDAG &DAG,
6833 const X86Subtarget &Subtarget) {
6834 assert(Subtarget.hasAVX512() && "AVX512 target expected");
6835 MVT SVT = VT.getScalarType();
6836
6837 // If we have a 32/64 splatted constant, splat it to DstTy to
6838 // encourage a foldable broadcast'd operand.
6839 auto MakeBroadcastOp = [&](SDValue Op, MVT OpVT, MVT DstVT) {
6840 unsigned OpEltSizeInBits = OpVT.getScalarSizeInBits();
6841 // AVX512 broadcasts 32/64-bit operands.
6842 // TODO: Support float once getAVX512Node is used by fp-ops.
6843 if (!OpVT.isInteger() || OpEltSizeInBits < 32 ||
6844 !DAG.getTargetLoweringInfo().isTypeLegal(SVT))
6845 return SDValue();
6846 // If we're not widening, don't bother if we're not bitcasting.
6847 if (OpVT == DstVT && Op.getOpcode() != ISD::BITCAST)
6848 return SDValue();
6849 if (auto *BV = dyn_cast<BuildVectorSDNode>(peekThroughBitcasts(Op))) {
6850 APInt SplatValue, SplatUndef;
6851 unsigned SplatBitSize;
6852 bool HasAnyUndefs;
6853 if (BV->isConstantSplat(SplatValue, SplatUndef, SplatBitSize,
6854 HasAnyUndefs, OpEltSizeInBits) &&
6855 !HasAnyUndefs && SplatValue.getBitWidth() == OpEltSizeInBits)
6856 return DAG.getConstant(SplatValue, DL, DstVT);
6857 }
6858 return SDValue();
6859 };
6860
6861 bool Widen = !(Subtarget.hasVLX() || VT.is512BitVector());
6862
6863 MVT DstVT = VT;
6864 if (Widen)
6865 DstVT = MVT::getVectorVT(SVT, 512 / SVT.getSizeInBits());
6866
6867 // Canonicalize src operands.
6868 SmallVector<SDValue> SrcOps(Ops.begin(), Ops.end());
6869 for (SDValue &Op : SrcOps) {
6870 MVT OpVT = Op.getSimpleValueType();
6871 // Just pass through scalar operands.
6872 if (!OpVT.isVector())
6873 continue;
6874 assert(OpVT == VT && "Vector type mismatch");
6875
6876 if (SDValue BroadcastOp = MakeBroadcastOp(Op, OpVT, DstVT)) {
6877 Op = BroadcastOp;
6878 continue;
6879 }
6880
6881 // Just widen the subvector by inserting into an undef wide vector.
6882 if (Widen)
6883 Op = widenSubVector(Op, false, Subtarget, DAG, DL, 512);
6884 }
6885
6886 SDValue Res = DAG.getNode(Opcode, DL, DstVT, SrcOps);
6887
6888 // Perform the 512-bit op then extract the bottom subvector.
6889 if (Widen)
6890 Res = extractSubVector(Res, 0, DAG, DL, VT.getSizeInBits());
6891 return Res;
6892}
6893
6894/// Insert i1-subvector to i1-vector.
6895static SDValue insert1BitVector(SDValue Op, SelectionDAG &DAG,
6896 const X86Subtarget &Subtarget) {
6897
6898 SDLoc dl(Op);
6899 SDValue Vec = Op.getOperand(0);
6900 SDValue SubVec = Op.getOperand(1);
6901 SDValue Idx = Op.getOperand(2);
6902 unsigned IdxVal = Op.getConstantOperandVal(2);
6903
6904 // Inserting undef is a nop. We can just return the original vector.
6905 if (SubVec.isUndef())
6906 return Vec;
6907
6908 if (IdxVal == 0 && Vec.isUndef()) // the operation is legal
6909 return Op;
6910
6911 MVT OpVT = Op.getSimpleValueType();
6912 unsigned NumElems = OpVT.getVectorNumElements();
6913 SDValue ZeroIdx = DAG.getIntPtrConstant(0, dl);
6914
6915 // Extend to natively supported kshift.
6916 MVT WideOpVT = OpVT;
6917 if ((!Subtarget.hasDQI() && NumElems == 8) || NumElems < 8)
6918 WideOpVT = Subtarget.hasDQI() ? MVT::v8i1 : MVT::v16i1;
6919
6920 // Inserting into the lsbs of a zero vector is legal. ISel will insert shifts
6921 // if necessary.
6922 if (IdxVal == 0 && ISD::isBuildVectorAllZeros(Vec.getNode())) {
6923 // May need to promote to a legal type.
6924 Op = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
6925 DAG.getConstant(0, dl, WideOpVT),
6926 SubVec, Idx);
6927 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, Op, ZeroIdx);
6928 }
6929
6930 MVT SubVecVT = SubVec.getSimpleValueType();
6931 unsigned SubVecNumElems = SubVecVT.getVectorNumElements();
6932 assert(IdxVal + SubVecNumElems <= NumElems &&
6933 IdxVal % SubVecVT.getSizeInBits() == 0 &&
6934 "Unexpected index value in INSERT_SUBVECTOR");
6935
6936 SDValue Undef = DAG.getUNDEF(WideOpVT);
6937
6938 if (IdxVal == 0) {
6939 // Zero lower bits of the Vec
6940 SDValue ShiftBits = DAG.getTargetConstant(SubVecNumElems, dl, MVT::i8);
6941 Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT, Undef, Vec,
6942 ZeroIdx);
6943 Vec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Vec, ShiftBits);
6944 Vec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, Vec, ShiftBits);
6945 // Merge them together, SubVec should be zero extended.
6946 SubVec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
6947 DAG.getConstant(0, dl, WideOpVT),
6948 SubVec, ZeroIdx);
6949 Op = DAG.getNode(ISD::OR, dl, WideOpVT, Vec, SubVec);
6950 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, Op, ZeroIdx);
6951 }
6952
6953 SubVec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
6954 Undef, SubVec, ZeroIdx);
6955
6956 if (Vec.isUndef()) {
6957 assert(IdxVal != 0 && "Unexpected index");
6958 SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
6959 DAG.getTargetConstant(IdxVal, dl, MVT::i8));
6960 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, SubVec, ZeroIdx);
6961 }
6962
6963 if (ISD::isBuildVectorAllZeros(Vec.getNode())) {
6964 assert(IdxVal != 0 && "Unexpected index");
6965 // If upper elements of Vec are known undef, then just shift into place.
6966 if (llvm::all_of(Vec->ops().slice(IdxVal + SubVecNumElems),
6967 [](SDValue V) { return V.isUndef(); })) {
6968 SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
6969 DAG.getTargetConstant(IdxVal, dl, MVT::i8));
6970 } else {
6971 NumElems = WideOpVT.getVectorNumElements();
6972 unsigned ShiftLeft = NumElems - SubVecNumElems;
6973 unsigned ShiftRight = NumElems - SubVecNumElems - IdxVal;
6974 SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
6975 DAG.getTargetConstant(ShiftLeft, dl, MVT::i8));
6976 if (ShiftRight != 0)
6977 SubVec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, SubVec,
6978 DAG.getTargetConstant(ShiftRight, dl, MVT::i8));
6979 }
6980 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, SubVec, ZeroIdx);
6981 }
6982
6983 // Simple case when we put subvector in the upper part
6984 if (IdxVal + SubVecNumElems == NumElems) {
6985 SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
6986 DAG.getTargetConstant(IdxVal, dl, MVT::i8));
6987 if (SubVecNumElems * 2 == NumElems) {
6988 // Special case, use legal zero extending insert_subvector. This allows
6989 // isel to optimize when bits are known zero.
6990 Vec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, SubVecVT, Vec, ZeroIdx);
6991 Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
6992 DAG.getConstant(0, dl, WideOpVT),
6993 Vec, ZeroIdx);
6994 } else {
6995 // Otherwise use explicit shifts to zero the bits.
6996 Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
6997 Undef, Vec, ZeroIdx);
6998 NumElems = WideOpVT.getVectorNumElements();
6999 SDValue ShiftBits = DAG.getTargetConstant(NumElems - IdxVal, dl, MVT::i8);
7000 Vec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, Vec, ShiftBits);
7001 Vec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Vec, ShiftBits);
7002 }
7003 Op = DAG.getNode(ISD::OR, dl, WideOpVT, Vec, SubVec);
7004 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, Op, ZeroIdx);
7005 }
7006
7007 // Inserting into the middle is more complicated.
7008
7009 NumElems = WideOpVT.getVectorNumElements();
7010
7011 // Widen the vector if needed.
7012 Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT, Undef, Vec, ZeroIdx);
7013
7014 unsigned ShiftLeft = NumElems - SubVecNumElems;
7015 unsigned ShiftRight = NumElems - SubVecNumElems - IdxVal;
7016
7017 // Do an optimization for the most frequently used types.
7018 if (WideOpVT != MVT::v64i1 || Subtarget.is64Bit()) {
7019 APInt Mask0 = APInt::getBitsSet(NumElems, IdxVal, IdxVal + SubVecNumElems);
7020 Mask0.flipAllBits();
7021 SDValue CMask0 = DAG.getConstant(Mask0, dl, MVT::getIntegerVT(NumElems));
7022 SDValue VMask0 = DAG.getNode(ISD::BITCAST, dl, WideOpVT, CMask0);
7023 Vec = DAG.getNode(ISD::AND, dl, WideOpVT, Vec, VMask0);
7024 SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
7025 DAG.getTargetConstant(ShiftLeft, dl, MVT::i8));
7026 SubVec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, SubVec,
7027 DAG.getTargetConstant(ShiftRight, dl, MVT::i8));
7028 Op = DAG.getNode(ISD::OR, dl, WideOpVT, Vec, SubVec);
7029
7030 // Reduce to original width if needed.
7031 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, Op, ZeroIdx);
7032 }
7033
7034 // Clear the upper bits of the subvector and move it to its insert position.
7035 SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
7036 DAG.getTargetConstant(ShiftLeft, dl, MVT::i8));
7037 SubVec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, SubVec,
7038 DAG.getTargetConstant(ShiftRight, dl, MVT::i8));
7039
7040 // Isolate the bits below the insertion point.
7041 unsigned LowShift = NumElems - IdxVal;
7042 SDValue Low = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, Vec,
7043 DAG.getTargetConstant(LowShift, dl, MVT::i8));
7044 Low = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Low,
7045 DAG.getTargetConstant(LowShift, dl, MVT::i8));
7046
7047 // Isolate the bits after the last inserted bit.
7048 unsigned HighShift = IdxVal + SubVecNumElems;
7049 SDValue High = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Vec,
7050 DAG.getTargetConstant(HighShift, dl, MVT::i8));
7051 High = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, High,
7052 DAG.getTargetConstant(HighShift, dl, MVT::i8));
7053
7054 // Now OR all 3 pieces together.
7055 Vec = DAG.getNode(ISD::OR, dl, WideOpVT, Low, High);
7056 SubVec = DAG.getNode(ISD::OR, dl, WideOpVT, SubVec, Vec);
7057
7058 // Reduce to original width if needed.
7059 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, SubVec, ZeroIdx);
7060}
7061
7062static SDValue concatSubVectors(SDValue V1, SDValue V2, SelectionDAG &DAG,
7063 const SDLoc &dl) {
7064 assert(V1.getValueType() == V2.getValueType() && "subvector type mismatch");
7065 EVT SubVT = V1.getValueType();
7066 EVT SubSVT = SubVT.getScalarType();
7067 unsigned SubNumElts = SubVT.getVectorNumElements();
7068 unsigned SubVectorWidth = SubVT.getSizeInBits();
7069 EVT VT = EVT::getVectorVT(*DAG.getContext(), SubSVT, 2 * SubNumElts);
7070 SDValue V = insertSubVector(DAG.getUNDEF(VT), V1, 0, DAG, dl, SubVectorWidth);
7071 return insertSubVector(V, V2, SubNumElts, DAG, dl, SubVectorWidth);
7072}
7073
7074/// Returns a vector of specified type with all bits set.
7075/// Always build ones vectors as <4 x i32>, <8 x i32> or <16 x i32>.
7076/// Then bitcast to their original type, ensuring they get CSE'd.
7077static SDValue getOnesVector(EVT VT, SelectionDAG &DAG, const SDLoc &dl) {
7078 assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()) &&
7079 "Expected a 128/256/512-bit vector type");
7080
7081 APInt Ones = APInt::getAllOnes(32);
7082 unsigned NumElts = VT.getSizeInBits() / 32;
7083 SDValue Vec = DAG.getConstant(Ones, dl, MVT::getVectorVT(MVT::i32, NumElts));
7084 return DAG.getBitcast(VT, Vec);
7085}
7086
7087static SDValue getEXTEND_VECTOR_INREG(unsigned Opcode, const SDLoc &DL, EVT VT,
7088 SDValue In, SelectionDAG &DAG) {
7089 EVT InVT = In.getValueType();
7090 assert(VT.isVector() && InVT.isVector() && "Expected vector VTs.");
7091 assert((ISD::ANY_EXTEND == Opcode || ISD::SIGN_EXTEND == Opcode ||
7092 ISD::ZERO_EXTEND == Opcode) &&
7093 "Unknown extension opcode");
7094
7095 // For 256-bit vectors, we only need the lower (128-bit) input half.
7096 // For 512-bit vectors, we only need the lower input half or quarter.
7097 if (InVT.getSizeInBits() > 128) {
7098 assert(VT.getSizeInBits() == InVT.getSizeInBits() &&
7099 "Expected VTs to be the same size!");
7100 unsigned Scale = VT.getScalarSizeInBits() / InVT.getScalarSizeInBits();
7101 In = extractSubVector(In, 0, DAG, DL,
7102 std::max(128U, (unsigned)VT.getSizeInBits() / Scale));
7103 InVT = In.getValueType();
7104 }
7105
7106 if (VT.getVectorNumElements() != InVT.getVectorNumElements())
7107 Opcode = DAG.getOpcode_EXTEND_VECTOR_INREG(Opcode);
7108
7109 return DAG.getNode(Opcode, DL, VT, In);
7110}
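// Example (a sketch, assuming Opcode = ISD::ZERO_EXTEND, VT = MVT::v8i32 and
// In of type v32i8, i.e. both 256 bits wide): Scale = 32 / 8 = 4, so only
// max(128, 256 / 4) = 128 bits of the input are kept and the lower v16i8 half
// is extracted. The element counts now differ (8 vs 16), so the opcode is
// switched to ZERO_EXTEND_VECTOR_INREG, which extends just the low 8 bytes
// into the v8i32 result.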
7111
7112// Match (xor X, -1) -> X.
7113// Match extract_subvector(xor X, -1) -> extract_subvector(X).
7114// Match concat_vectors(xor X, -1, xor Y, -1) -> concat_vectors(X, Y).
7115static SDValue IsNOT(SDValue V, SelectionDAG &DAG) {
7116 V = peekThroughBitcasts(V);
7117 if (V.getOpcode() == ISD::XOR &&
7118 (ISD::isBuildVectorAllOnes(V.getOperand(1).getNode()) ||
7119 isAllOnesConstant(V.getOperand(1))))
7120 return V.getOperand(0);
7121 if (V.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
7122 (isNullConstant(V.getOperand(1)) || V.getOperand(0).hasOneUse())) {
7123 if (SDValue Not = IsNOT(V.getOperand(0), DAG)) {
7124 Not = DAG.getBitcast(V.getOperand(0).getValueType(), Not);
7125 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, SDLoc(Not), V.getValueType(),
7126 Not, V.getOperand(1));
7127 }
7128 }
7129 SmallVector<SDValue, 2> CatOps;
7130 if (collectConcatOps(V.getNode(), CatOps, DAG)) {
7131 for (SDValue &CatOp : CatOps) {
7132 SDValue NotCat = IsNOT(CatOp, DAG);
7133 if (!NotCat) return SDValue();
7134 CatOp = DAG.getBitcast(CatOp.getValueType(), NotCat);
7135 }
7136 return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(V), V.getValueType(), CatOps);
7137 }
7138 return SDValue();
7139}
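// Note: because the match peeks through bitcasts, the returned value may have
// a different type than the operand that was passed in; the recursive
// EXTRACT_SUBVECTOR and concat cases above bitcast the result back before
// reusing it, and callers generally need to do the same.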
7140
7141void llvm::createUnpackShuffleMask(EVT VT, SmallVectorImpl<int> &Mask,
7142 bool Lo, bool Unary) {
7143 assert(VT.getScalarType().isSimple() && (VT.getSizeInBits() % 128) == 0 &&
7144 "Illegal vector type to unpack");
7145 assert(Mask.empty() && "Expected an empty shuffle mask vector");
7146 int NumElts = VT.getVectorNumElements();
7147 int NumEltsInLane = 128 / VT.getScalarSizeInBits();
7148 for (int i = 0; i < NumElts; ++i) {
7149 unsigned LaneStart = (i / NumEltsInLane) * NumEltsInLane;
7150 int Pos = (i % NumEltsInLane) / 2 + LaneStart;
7151 Pos += (Unary ? 0 : NumElts * (i % 2));
7152 Pos += (Lo ? 0 : NumEltsInLane / 2);
7153 Mask.push_back(Pos);
7154 }
7155}
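// Example (a sketch, assuming VT = MVT::v8i32, Lo = true, Unary = false): two
// 128-bit lanes of 4 elements give the interleaving mask
//   <0, 8, 1, 9, 4, 12, 5, 13>
// i.e. the per-lane unpacklo pattern of VUNPCKLPS; Lo = false instead yields
//   <2, 10, 3, 11, 6, 14, 7, 15>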
7156
7157/// Similar to unpacklo/unpackhi, but without the 128-bit lane limitation
7158/// imposed by AVX and specific to the unary pattern. Example:
7159/// v8iX Lo --> <0, 0, 1, 1, 2, 2, 3, 3>
7160/// v8iX Hi --> <4, 4, 5, 5, 6, 6, 7, 7>
7161void llvm::createSplat2ShuffleMask(MVT VT, SmallVectorImpl<int> &Mask,
7162 bool Lo) {
7163 assert(Mask.empty() && "Expected an empty shuffle mask vector");
7164 int NumElts = VT.getVectorNumElements();
7165 for (int i = 0; i < NumElts; ++i) {
7166 int Pos = i / 2;
7167 Pos += (Lo ? 0 : NumElts / 2);
7168 Mask.push_back(Pos);
7169 }
7170}
7171
7172// Attempt to constant fold, else just create a VECTOR_SHUFFLE.
7173static SDValue getVectorShuffle(SelectionDAG &DAG, EVT VT, const SDLoc &dl,
7174 SDValue V1, SDValue V2, ArrayRef<int> Mask) {
7175 if ((ISD::isBuildVectorOfConstantSDNodes(V1.getNode()) || V1.isUndef()) &&
7176 (ISD::isBuildVectorOfConstantSDNodes(V2.getNode()) || V2.isUndef())) {
7177 SmallVector<SDValue> Ops(Mask.size(), DAG.getUNDEF(VT.getScalarType()));
7178 for (int I = 0, NumElts = Mask.size(); I != NumElts; ++I) {
7179 int M = Mask[I];
7180 if (M < 0)
7181 continue;
7182 SDValue V = (M < NumElts) ? V1 : V2;
7183 if (V.isUndef())
7184 continue;
7185 Ops[I] = V.getOperand(M % NumElts);
7186 }
7187 return DAG.getBuildVector(VT, dl, Ops);
7188 }
7189
7190 return DAG.getVectorShuffle(VT, dl, V1, V2, Mask);
7191}
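// Example (a sketch): with constant v4i32 inputs V1 = <C0, C1, C2, C3> and
// V2 = <C4, C5, C6, C7> and Mask = <0, 5, 2, 7>, the fold above returns the
// build_vector <C0, C5, C2, C7> directly instead of creating a
// VECTOR_SHUFFLE node.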
7192
7193/// Returns a vector_shuffle node for an unpackl operation.
7194static SDValue getUnpackl(SelectionDAG &DAG, const SDLoc &dl, EVT VT,
7195 SDValue V1, SDValue V2) {
7196 SmallVector<int, 8> Mask;
7197 createUnpackShuffleMask(VT, Mask, /* Lo = */ true, /* Unary = */ false);
7198 return getVectorShuffle(DAG, VT, dl, V1, V2, Mask);
7199}
7200
7201/// Returns a vector_shuffle node for an unpackh operation.
7202static SDValue getUnpackh(SelectionDAG &DAG, const SDLoc &dl, EVT VT,
7203 SDValue V1, SDValue V2) {
7204 SmallVector<int, 8> Mask;
7205 createUnpackShuffleMask(VT, Mask, /* Lo = */ false, /* Unary = */ false);
7206 return getVectorShuffle(DAG, VT, dl, V1, V2, Mask);
7207}
7208
7209/// Returns a node that packs the LHS + RHS nodes together at half width.
7210/// May return X86ISD::PACKSS/PACKUS, packing the top/bottom half.
7211/// TODO: Add subvector splitting if/when we have a need for it.
7212static SDValue getPack(SelectionDAG &DAG, const X86Subtarget &Subtarget,
7213 const SDLoc &dl, MVT VT, SDValue LHS, SDValue RHS,
7214 bool PackHiHalf = false) {
7215 MVT OpVT = LHS.getSimpleValueType();
7216 unsigned EltSizeInBits = VT.getScalarSizeInBits();
7217 bool UsePackUS = Subtarget.hasSSE41() || EltSizeInBits == 8;
7218 assert(OpVT == RHS.getSimpleValueType() &&
7219 VT.getSizeInBits() == OpVT.getSizeInBits() &&
7220 (EltSizeInBits * 2) == OpVT.getScalarSizeInBits() &&
7221 "Unexpected PACK operand types");
7222 assert((EltSizeInBits == 8 || EltSizeInBits == 16 || EltSizeInBits == 32) &&
7223 "Unexpected PACK result type");
7224
7225 // Rely on vector shuffles for vXi64 -> vXi32 packing.
7226 if (EltSizeInBits == 32) {
7227 SmallVector<int> PackMask;
7228 int Offset = PackHiHalf ? 1 : 0;
7229 int NumElts = VT.getVectorNumElements();
7230 for (int I = 0; I != NumElts; I += 4) {
7231 PackMask.push_back(I + Offset);
7232 PackMask.push_back(I + Offset + 2);
7233 PackMask.push_back(I + Offset + NumElts);
7234 PackMask.push_back(I + Offset + NumElts + 2);
7235 }
7236 return DAG.getVectorShuffle(VT, dl, DAG.getBitcast(VT, LHS),
7237 DAG.getBitcast(VT, RHS), PackMask);
7238 }
7239
7240 // See if we already have sufficient leading bits for PACKSS/PACKUS.
7241 if (!PackHiHalf) {
7242 if (UsePackUS &&
7243 DAG.computeKnownBits(LHS).countMaxActiveBits() <= EltSizeInBits &&
7244 DAG.computeKnownBits(RHS).countMaxActiveBits() <= EltSizeInBits)
7245 return DAG.getNode(X86ISD::PACKUS, dl, VT, LHS, RHS);
7246
7247 if (DAG.ComputeMaxSignificantBits(LHS) <= EltSizeInBits &&
7248 DAG.ComputeMaxSignificantBits(RHS) <= EltSizeInBits)
7249 return DAG.getNode(X86ISD::PACKSS, dl, VT, LHS, RHS);
7250 }
7251
7252 // Fallback to sign/zero extending the requested half and pack.
7253 SDValue Amt = DAG.getTargetConstant(EltSizeInBits, dl, MVT::i8);
7254 if (UsePackUS) {
7255 if (PackHiHalf) {
7256 LHS = DAG.getNode(X86ISD::VSRLI, dl, OpVT, LHS, Amt);
7257 RHS = DAG.getNode(X86ISD::VSRLI, dl, OpVT, RHS, Amt);
7258 } else {
7259 SDValue Mask = DAG.getConstant((1ULL << EltSizeInBits) - 1, dl, OpVT);
7260 LHS = DAG.getNode(ISD::AND, dl, OpVT, LHS, Mask);
7261 RHS = DAG.getNode(ISD::AND, dl, OpVT, RHS, Mask);
7262 };
7263 return DAG.getNode(X86ISD::PACKUS, dl, VT, LHS, RHS);
7264 };
7265
7266 if (!PackHiHalf) {
7267 LHS = DAG.getNode(X86ISD::VSHLI, dl, OpVT, LHS, Amt);
7268 RHS = DAG.getNode(X86ISD::VSHLI, dl, OpVT, RHS, Amt);
7269 }
7270 LHS = DAG.getNode(X86ISD::VSRAI, dl, OpVT, LHS, Amt);
7271 RHS = DAG.getNode(X86ISD::VSRAI, dl, OpVT, RHS, Amt);
7272 return DAG.getNode(X86ISD::PACKSS, dl, VT, LHS, RHS);
7273}
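// Example of the fallback path (a sketch, assuming a vXi32 -> vXi16 pack,
// i.e. EltSizeInBits = 16, with no useful known-bits information):
//   PackHiHalf = false, SSE4.1: AND each i32 lane with 0xFFFF, then PACKUS.
//   PackHiHalf = true,  SSE4.1: logical shift each lane right by 16, then PACKUS.
//   no SSE4.1: shift left by 16 (low half only), arithmetic shift right by 16
//              to sign-fill the lane, then PACKSS.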
7274
7275/// Return a vector_shuffle of the specified vector of zero or undef vector.
7276/// This produces a shuffle where the low element of V2 is swizzled into the
7277/// zero/undef vector, landing at element Idx.
7278/// This produces a shuffle mask like 4,1,2,3 (idx=0) or 0,1,2,4 (idx=3).
7279static SDValue getShuffleVectorZeroOrUndef(SDValue V2, int Idx,
7280 bool IsZero,
7281 const X86Subtarget &Subtarget,
7282 SelectionDAG &DAG) {
7283 MVT VT = V2.getSimpleValueType();
7284 SDValue V1 = IsZero
7285 ? getZeroVector(VT, Subtarget, DAG, SDLoc(V2)) : DAG.getUNDEF(VT);
7286 int NumElems = VT.getVectorNumElements();
7287 SmallVector<int, 16> MaskVec(NumElems);
7288 for (int i = 0; i != NumElems; ++i)
7289 // If this is the insertion idx, put the low elt of V2 here.
7290 MaskVec[i] = (i == Idx) ? NumElems : i;
7291 return DAG.getVectorShuffle(VT, SDLoc(V2), V1, V2, MaskVec);
7292}
7293
7294static const Constant *getTargetConstantFromBasePtr(SDValue Ptr) {
7295 if (Ptr.getOpcode() == X86ISD::Wrapper ||
7296 Ptr.getOpcode() == X86ISD::WrapperRIP)
7297 Ptr = Ptr.getOperand(0);
7298
7299 auto *CNode = dyn_cast<ConstantPoolSDNode>(Ptr);
7300 if (!CNode || CNode->isMachineConstantPoolEntry() || CNode->getOffset() != 0)
7301 return nullptr;
7302
7303 return CNode->getConstVal();
7304}
7305
7306static const Constant *getTargetConstantFromNode(LoadSDNode *Load) {
7307 if (!Load || !ISD::isNormalLoad(Load))
7308 return nullptr;
7309 return getTargetConstantFromBasePtr(Load->getBasePtr());
7310}
7311
7312static const Constant *getTargetConstantFromNode(SDValue Op) {
7313 Op = peekThroughBitcasts(Op);
7314 return getTargetConstantFromNode(dyn_cast<LoadSDNode>(Op));
7315}
7316
7317const Constant *
7318X86TargetLowering::getTargetConstantFromLoad(LoadSDNode *LD) const {
7319 assert(LD && "Unexpected null LoadSDNode");
7320 return getTargetConstantFromNode(LD);
7321}
7322
7323// Extract raw constant bits from constant pools.
7324static bool getTargetConstantBitsFromNode(SDValue Op, unsigned EltSizeInBits,
7325 APInt &UndefElts,
7326 SmallVectorImpl<APInt> &EltBits,
7327 bool AllowWholeUndefs = true,
7328 bool AllowPartialUndefs = true) {
7329 assert(EltBits.empty() && "Expected an empty EltBits vector");
7330
7331 Op = peekThroughBitcasts(Op);
7332
7333 EVT VT = Op.getValueType();
7334 unsigned SizeInBits = VT.getSizeInBits();
7335 assert((SizeInBits % EltSizeInBits) == 0 && "Can't split constant!");
7336 unsigned NumElts = SizeInBits / EltSizeInBits;
7337
7338 // Bitcast a source array of element bits to the target size.
7339 auto CastBitData = [&](APInt &UndefSrcElts, ArrayRef<APInt> SrcEltBits) {
7340 unsigned NumSrcElts = UndefSrcElts.getBitWidth();
7341 unsigned SrcEltSizeInBits = SrcEltBits[0].getBitWidth();
7342 assert((NumSrcElts * SrcEltSizeInBits) == SizeInBits &&
7343 "Constant bit sizes don't match");
7344
7345 // Don't split if we don't allow undef bits.
7346 bool AllowUndefs = AllowWholeUndefs || AllowPartialUndefs;
7347 if (UndefSrcElts.getBoolValue() && !AllowUndefs)
7348 return false;
7349
7350 // If we're already the right size, don't bother bitcasting.
7351 if (NumSrcElts == NumElts) {
7352 UndefElts = UndefSrcElts;
7353 EltBits.assign(SrcEltBits.begin(), SrcEltBits.end());
7354 return true;
7355 }
7356
7357 // Extract all the undef/constant element data and pack into single bitsets.
7358 APInt UndefBits(SizeInBits, 0);
7359 APInt MaskBits(SizeInBits, 0);
7360
7361 for (unsigned i = 0; i != NumSrcElts; ++i) {
7362 unsigned BitOffset = i * SrcEltSizeInBits;
7363 if (UndefSrcElts[i])
7364 UndefBits.setBits(BitOffset, BitOffset + SrcEltSizeInBits);
7365 MaskBits.insertBits(SrcEltBits[i], BitOffset);
7366 }
7367
7368 // Split the undef/constant single bitset data into the target elements.
7369 UndefElts = APInt(NumElts, 0);
7370 EltBits.resize(NumElts, APInt(EltSizeInBits, 0));
7371
7372 for (unsigned i = 0; i != NumElts; ++i) {
7373 unsigned BitOffset = i * EltSizeInBits;
7374 APInt UndefEltBits = UndefBits.extractBits(EltSizeInBits, BitOffset);
7375
7376 // Only treat an element as UNDEF if all bits are UNDEF.
7377 if (UndefEltBits.isAllOnes()) {
7378 if (!AllowWholeUndefs)
7379 return false;
7380 UndefElts.setBit(i);
7381 continue;
7382 }
7383
7384 // If only some bits are UNDEF then treat them as zero (or bail if not
7385 // supported).
7386 if (UndefEltBits.getBoolValue() && !AllowPartialUndefs)
7387 return false;
7388
7389 EltBits[i] = MaskBits.extractBits(EltSizeInBits, BitOffset);
7390 }
7391 return true;
7392 };
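// Worked example for the recast above (a sketch): splitting a 128-bit source
// of two i64 elements into EltSizeInBits = 32 gives NumElts = 4; a source
// element 0x00000001FFFFFFFF at index 0 becomes the i32 elements 0xFFFFFFFF
// (bit offset 0) and 0x00000001 (bit offset 32), and a target element is only
// marked UNDEF when all 32 of its bits came from undef source elements.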
7393
7394 // Collect constant bits and insert into mask/undef bit masks.
7395 auto CollectConstantBits = [](const Constant *Cst, APInt &Mask, APInt &Undefs,
7396 unsigned UndefBitIndex) {
7397 if (!Cst)
7398 return false;
7399 if (isa<UndefValue>(Cst)) {
7400 Undefs.setBit(UndefBitIndex);
7401 return true;
7402 }
7403 if (auto *CInt = dyn_cast<ConstantInt>(Cst)) {
7404 Mask = CInt->getValue();
7405 return true;
7406 }
7407 if (auto *CFP = dyn_cast<ConstantFP>(Cst)) {
7408 Mask = CFP->getValueAPF().bitcastToAPInt();
7409 return true;
7410 }
7411 return false;
7412 };
7413
7414 // Handle UNDEFs.
7415 if (Op.isUndef()) {
7416 APInt UndefSrcElts = APInt::getAllOnes(NumElts);
7417 SmallVector<APInt, 64> SrcEltBits(NumElts, APInt(EltSizeInBits, 0));
7418 return CastBitData(UndefSrcElts, SrcEltBits);
7419 }
7420
7421 // Extract scalar constant bits.
7422 if (auto *Cst = dyn_cast<ConstantSDNode>(Op)) {
7423 APInt UndefSrcElts = APInt::getZero(1);
7424 SmallVector<APInt, 64> SrcEltBits(1, Cst->getAPIntValue());
7425 return CastBitData(UndefSrcElts, SrcEltBits);
7426 }
7427 if (auto *Cst = dyn_cast<ConstantFPSDNode>(Op)) {
7428 APInt UndefSrcElts = APInt::getZero(1);
7429 APInt RawBits = Cst->getValueAPF().bitcastToAPInt();
7430 SmallVector<APInt, 64> SrcEltBits(1, RawBits);
7431 return CastBitData(UndefSrcElts, SrcEltBits);
7432 }
7433
7434 // Extract constant bits from build vector.
7435 if (auto *BV = dyn_cast<BuildVectorSDNode>(Op)) {
7436 BitVector Undefs;
7437 SmallVector<APInt> SrcEltBits;
7438 unsigned SrcEltSizeInBits = VT.getScalarSizeInBits();
7439 if (BV->getConstantRawBits(true, SrcEltSizeInBits, SrcEltBits, Undefs)) {
7440 APInt UndefSrcElts = APInt::getZero(SrcEltBits.size());
7441 for (unsigned I = 0, E = SrcEltBits.size(); I != E; ++I)
7442 if (Undefs[I])
7443 UndefSrcElts.setBit(I);
7444 return CastBitData(UndefSrcElts, SrcEltBits);
7445 }
7446 }
7447
7448 // Extract constant bits from constant pool vector.
7449 if (auto *Cst = getTargetConstantFromNode(Op)) {
7450 Type *CstTy = Cst->getType();
7451 unsigned CstSizeInBits = CstTy->getPrimitiveSizeInBits();
7452 if (!CstTy->isVectorTy() || (CstSizeInBits % SizeInBits) != 0)
7453 return false;
7454
7455 unsigned SrcEltSizeInBits = CstTy->getScalarSizeInBits();
7456 unsigned NumSrcElts = SizeInBits / SrcEltSizeInBits;
7457
7458 APInt UndefSrcElts(NumSrcElts, 0);
7459 SmallVector<APInt, 64> SrcEltBits(NumSrcElts, APInt(SrcEltSizeInBits, 0));
7460 for (unsigned i = 0; i != NumSrcElts; ++i)
7461 if (!CollectConstantBits(Cst->getAggregateElement(i), SrcEltBits[i],
7462 UndefSrcElts, i))
7463 return false;
7464
7465 return CastBitData(UndefSrcElts, SrcEltBits);
7466 }
7467
7468 // Extract constant bits from a broadcasted constant pool scalar.
7469 if (Op.getOpcode() == X86ISD::VBROADCAST_LOAD &&
7470 EltSizeInBits <= VT.getScalarSizeInBits()) {
7471 auto *MemIntr = cast<MemIntrinsicSDNode>(Op);
7472 if (MemIntr->getMemoryVT().getScalarSizeInBits() != VT.getScalarSizeInBits())
7473 return false;
7474
7475 SDValue Ptr = MemIntr->getBasePtr();
7476 if (const Constant *C = getTargetConstantFromBasePtr(Ptr)) {
7477 unsigned SrcEltSizeInBits = C->getType()->getScalarSizeInBits();
7478 unsigned NumSrcElts = SizeInBits / SrcEltSizeInBits;
7479
7480 APInt UndefSrcElts(NumSrcElts, 0);
7481 SmallVector<APInt, 64> SrcEltBits(1, APInt(SrcEltSizeInBits, 0));
7482 if (CollectConstantBits(C, SrcEltBits[0], UndefSrcElts, 0)) {
7483 if (UndefSrcElts[0])
7484 UndefSrcElts.setBits(0, NumSrcElts);
7485 SrcEltBits.append(NumSrcElts - 1, SrcEltBits[0]);
7486 return CastBitData(UndefSrcElts, SrcEltBits);
7487 }
7488 }
7489 }
7490
7491 // Extract constant bits from a subvector broadcast.
7492 if (Op.getOpcode() == X86ISD::SUBV_BROADCAST_LOAD) {
7493 auto *MemIntr = cast<MemIntrinsicSDNode>(Op);
7494 SDValue Ptr = MemIntr->getBasePtr();
7495 // The source constant may be larger than the subvector broadcast,
7496 // ensure we extract the correct subvector constants.
7497 if (const Constant *Cst = getTargetConstantFromBasePtr(Ptr)) {
7498 Type *CstTy = Cst->getType();
7499 unsigned CstSizeInBits = CstTy->getPrimitiveSizeInBits();
7500 unsigned SubVecSizeInBits = MemIntr->getMemoryVT().getStoreSizeInBits();
7501 if (!CstTy->isVectorTy() || (CstSizeInBits % SubVecSizeInBits) != 0 ||
7502 (SizeInBits % SubVecSizeInBits) != 0)
7503 return false;
7504 unsigned CstEltSizeInBits = CstTy->getScalarSizeInBits();
7505 unsigned NumSubElts = SubVecSizeInBits / CstEltSizeInBits;
7506 unsigned NumSubVecs = SizeInBits / SubVecSizeInBits;
7507 APInt UndefSubElts(NumSubElts, 0);
7508 SmallVector<APInt, 64> SubEltBits(NumSubElts * NumSubVecs,
7509 APInt(CstEltSizeInBits, 0));
7510 for (unsigned i = 0; i != NumSubElts; ++i) {
7511 if (!CollectConstantBits(Cst->getAggregateElement(i), SubEltBits[i],
7512 UndefSubElts, i))
7513 return false;
7514 for (unsigned j = 1; j != NumSubVecs; ++j)
7515 SubEltBits[i + (j * NumSubElts)] = SubEltBits[i];
7516 }
7517 UndefSubElts = APInt::getSplat(NumSubVecs * UndefSubElts.getBitWidth(),
7518 UndefSubElts);
7519 return CastBitData(UndefSubElts, SubEltBits);
7520 }
7521 }
7522
7523 // Extract a rematerialized scalar constant insertion.
7524 if (Op.getOpcode() == X86ISD::VZEXT_MOVL &&
7525 Op.getOperand(0).getOpcode() == ISD::SCALAR_TO_VECTOR &&
7526 isa<ConstantSDNode>(Op.getOperand(0).getOperand(0))) {
7527 unsigned SrcEltSizeInBits = VT.getScalarSizeInBits();
7528 unsigned NumSrcElts = SizeInBits / SrcEltSizeInBits;
7529
7530 APInt UndefSrcElts(NumSrcElts, 0);
7531 SmallVector<APInt, 64> SrcEltBits;
7532 auto *CN = cast<ConstantSDNode>(Op.getOperand(0).getOperand(0));
7533 SrcEltBits.push_back(CN->getAPIntValue().zextOrTrunc(SrcEltSizeInBits));
7534 SrcEltBits.append(NumSrcElts - 1, APInt(SrcEltSizeInBits, 0));
7535 return CastBitData(UndefSrcElts, SrcEltBits);
7536 }
7537
7538 // Insert constant bits from a base and sub vector sources.
7539 if (Op.getOpcode() == ISD::INSERT_SUBVECTOR) {
7540 // If bitcasts to larger elements we might lose track of undefs - don't
7541 // allow any to be safe.
7542 unsigned SrcEltSizeInBits = VT.getScalarSizeInBits();
7543 bool AllowUndefs = EltSizeInBits >= SrcEltSizeInBits;
7544
7545 APInt UndefSrcElts, UndefSubElts;
7546 SmallVector<APInt, 32> EltSrcBits, EltSubBits;
7547 if (getTargetConstantBitsFromNode(Op.getOperand(1), SrcEltSizeInBits,
7548 UndefSubElts, EltSubBits,
7549 AllowWholeUndefs && AllowUndefs,
7550 AllowPartialUndefs && AllowUndefs) &&
7551 getTargetConstantBitsFromNode(Op.getOperand(0), SrcEltSizeInBits,
7552 UndefSrcElts, EltSrcBits,
7553 AllowWholeUndefs && AllowUndefs,
7554 AllowPartialUndefs && AllowUndefs)) {
7555 unsigned BaseIdx = Op.getConstantOperandVal(2);
7556 UndefSrcElts.insertBits(UndefSubElts, BaseIdx);
7557 for (unsigned i = 0, e = EltSubBits.size(); i != e; ++i)
7558 EltSrcBits[BaseIdx + i] = EltSubBits[i];
7559 return CastBitData(UndefSrcElts, EltSrcBits);
7560 }
7561 }
7562
7563 // Extract constant bits from a subvector's source.
7564 if (Op.getOpcode() == ISD::EXTRACT_SUBVECTOR) {
7565 // TODO - support extract_subvector through bitcasts.
7566 if (EltSizeInBits != VT.getScalarSizeInBits())
7567 return false;
7568
7569 if (getTargetConstantBitsFromNode(Op.getOperand(0), EltSizeInBits,
7570 UndefElts, EltBits, AllowWholeUndefs,
7571 AllowPartialUndefs)) {
7572 EVT SrcVT = Op.getOperand(0).getValueType();
7573 unsigned NumSrcElts = SrcVT.getVectorNumElements();
7574 unsigned NumSubElts = VT.getVectorNumElements();
7575 unsigned BaseIdx = Op.getConstantOperandVal(1);
7576 UndefElts = UndefElts.extractBits(NumSubElts, BaseIdx);
7577 if ((BaseIdx + NumSubElts) != NumSrcElts)
7578 EltBits.erase(EltBits.begin() + BaseIdx + NumSubElts, EltBits.end());
7579 if (BaseIdx != 0)
7580 EltBits.erase(EltBits.begin(), EltBits.begin() + BaseIdx);
7581 return true;
7582 }
7583 }
7584
7585 // Extract constant bits from shuffle node sources.
7586 if (auto *SVN = dyn_cast<ShuffleVectorSDNode>(Op)) {
7587 // TODO - support shuffle through bitcasts.
7588 if (EltSizeInBits != VT.getScalarSizeInBits())
7589 return false;
7590
7591 ArrayRef<int> Mask = SVN->getMask();
7592 if ((!AllowWholeUndefs || !AllowPartialUndefs) &&
7593 llvm::any_of(Mask, [](int M) { return M < 0; }))
7594 return false;
7595
7596 APInt UndefElts0, UndefElts1;
7597 SmallVector<APInt, 32> EltBits0, EltBits1;
7598 if (isAnyInRange(Mask, 0, NumElts) &&
7599 !getTargetConstantBitsFromNode(Op.getOperand(0), EltSizeInBits,
7600 UndefElts0, EltBits0, AllowWholeUndefs,
7601 AllowPartialUndefs))
7602 return false;
7603 if (isAnyInRange(Mask, NumElts, 2 * NumElts) &&
7604 !getTargetConstantBitsFromNode(Op.getOperand(1), EltSizeInBits,
7605 UndefElts1, EltBits1, AllowWholeUndefs,
7606 AllowPartialUndefs))
7607 return false;
7608
7609 UndefElts = APInt::getZero(NumElts);
7610 for (int i = 0; i != (int)NumElts; ++i) {
7611 int M = Mask[i];
7612 if (M < 0) {
7613 UndefElts.setBit(i);
7614 EltBits.push_back(APInt::getZero(EltSizeInBits));
7615 } else if (M < (int)NumElts) {
7616 if (UndefElts0[M])
7617 UndefElts.setBit(i);
7618 EltBits.push_back(EltBits0[M]);
7619 } else {
7620 if (UndefElts1[M - NumElts])
7621 UndefElts.setBit(i);
7622 EltBits.push_back(EltBits1[M - NumElts]);
7623 }
7624 }
7625 return true;
7626 }
7627
7628 return false;
7629}
7630
7631namespace llvm {
7632namespace X86 {
7633bool isConstantSplat(SDValue Op, APInt &SplatVal, bool AllowPartialUndefs) {
7634 APInt UndefElts;
7635 SmallVector<APInt, 16> EltBits;
7636 if (getTargetConstantBitsFromNode(Op, Op.getScalarValueSizeInBits(),
7637 UndefElts, EltBits, true,
7638 AllowPartialUndefs)) {
7639 int SplatIndex = -1;
7640 for (int i = 0, e = EltBits.size(); i != e; ++i) {
7641 if (UndefElts[i])
7642 continue;
7643 if (0 <= SplatIndex && EltBits[i] != EltBits[SplatIndex]) {
7644 SplatIndex = -1;
7645 break;
7646 }
7647 SplatIndex = i;
7648 }
7649 if (0 <= SplatIndex) {
7650 SplatVal = EltBits[SplatIndex];
7651 return true;
7652 }
7653 }
7654
7655 return false;
7656}
7657} // namespace X86
7658} // namespace llvm
7659
7660static bool getTargetShuffleMaskIndices(SDValue MaskNode,
7661 unsigned MaskEltSizeInBits,
7662 SmallVectorImpl<uint64_t> &RawMask,
7663 APInt &UndefElts) {
7664 // Extract the raw target constant bits.
7665 SmallVector<APInt, 64> EltBits;
7666 if (!getTargetConstantBitsFromNode(MaskNode, MaskEltSizeInBits, UndefElts,
7667 EltBits, /* AllowWholeUndefs */ true,
7668 /* AllowPartialUndefs */ false))
7669 return false;
7670
7671 // Insert the extracted elements into the mask.
7672 for (const APInt &Elt : EltBits)
7673 RawMask.push_back(Elt.getZExtValue());
7674
7675 return true;
7676}
7677
7678/// Create a shuffle mask that matches the PACKSS/PACKUS truncation.
7679/// A multi-stage pack shuffle mask is created by specifying NumStages > 1.
7680/// Note: This ignores saturation, so inputs must be checked first.
7681static void createPackShuffleMask(MVT VT, SmallVectorImpl<int> &Mask,
7682 bool Unary, unsigned NumStages = 1) {
7683 assert(Mask.empty() && "Expected an empty shuffle mask vector");
7684 unsigned NumElts = VT.getVectorNumElements();
7685 unsigned NumLanes = VT.getSizeInBits() / 128;
7686 unsigned NumEltsPerLane = 128 / VT.getScalarSizeInBits();
7687 unsigned Offset = Unary ? 0 : NumElts;
7688 unsigned Repetitions = 1u << (NumStages - 1);
7689 unsigned Increment = 1u << NumStages;
7690 assert((NumEltsPerLane >> NumStages) > 0 && "Illegal packing compaction");
7691
7692 for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
7693 for (unsigned Stage = 0; Stage != Repetitions; ++Stage) {
7694 for (unsigned Elt = 0; Elt != NumEltsPerLane; Elt += Increment)
7695 Mask.push_back(Elt + (Lane * NumEltsPerLane));
7696 for (unsigned Elt = 0; Elt != NumEltsPerLane; Elt += Increment)
7697 Mask.push_back(Elt + (Lane * NumEltsPerLane) + Offset);
7698 }
7699 }
7700}
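// Example (a sketch, assuming MVT::v16i8, Unary = false, NumStages = 1): one
// 128-bit lane with Increment = 2 and Offset = 16 gives the mask
//   <0, 2, 4, ..., 14, 16, 18, ..., 30>
// i.e. the even bytes of each operand, which is what a single PACKSSWB or
// PACKUSWB produces once saturation is ignored.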
7701
7702// Split the demanded elts of a PACKSS/PACKUS node between its operands.
7703static void getPackDemandedElts(EVT VT, const APInt &DemandedElts,
7704 APInt &DemandedLHS, APInt &DemandedRHS) {
7705 int NumLanes = VT.getSizeInBits() / 128;
7706 int NumElts = DemandedElts.getBitWidth();
7707 int NumInnerElts = NumElts / 2;
7708 int NumEltsPerLane = NumElts / NumLanes;
7709 int NumInnerEltsPerLane = NumInnerElts / NumLanes;
7710
7711 DemandedLHS = APInt::getZero(NumInnerElts);
7712 DemandedRHS = APInt::getZero(NumInnerElts);
7713
7714 // Map DemandedElts to the packed operands.
7715 for (int Lane = 0; Lane != NumLanes; ++Lane) {
7716 for (int Elt = 0; Elt != NumInnerEltsPerLane; ++Elt) {
7717 int OuterIdx = (Lane * NumEltsPerLane) + Elt;
7718 int InnerIdx = (Lane * NumInnerEltsPerLane) + Elt;
7719 if (DemandedElts[OuterIdx])
7720 DemandedLHS.setBit(InnerIdx);
7721 if (DemandedElts[OuterIdx + NumInnerEltsPerLane])
7722 DemandedRHS.setBit(InnerIdx);
7723 }
7724 }
7725}
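// Example (a sketch, assuming a 128-bit pack with a v16i8 result): the low 8
// result bytes come from the LHS words and the high 8 from the RHS words, so
// demanding result element 3 sets DemandedLHS[3] while demanding result
// element 9 sets DemandedRHS[1].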
7726
7727// Split the demanded elts of a HADD/HSUB node between its operands.
7728static void getHorizDemandedElts(EVT VT, const APInt &DemandedElts,
7729 APInt &DemandedLHS, APInt &DemandedRHS) {
7730 int NumLanes = VT.getSizeInBits() / 128;
7731 int NumElts = DemandedElts.getBitWidth();
7732 int NumEltsPerLane = NumElts / NumLanes;
7733 int HalfEltsPerLane = NumEltsPerLane / 2;
7734
7735 DemandedLHS = APInt::getZero(NumElts);
7736 DemandedRHS = APInt::getZero(NumElts);
7737
7738 // Map DemandedElts to the horizontal operands.
7739 for (int Idx = 0; Idx != NumElts; ++Idx) {
7740 if (!DemandedElts[Idx])
7741 continue;
7742 int LaneIdx = (Idx / NumEltsPerLane) * NumEltsPerLane;
7743 int LocalIdx = Idx % NumEltsPerLane;
7744 if (LocalIdx < HalfEltsPerLane) {
7745 DemandedLHS.setBit(LaneIdx + 2 * LocalIdx + 0);
7746 DemandedLHS.setBit(LaneIdx + 2 * LocalIdx + 1);
7747 } else {
7748 LocalIdx -= HalfEltsPerLane;
7749 DemandedRHS.setBit(LaneIdx + 2 * LocalIdx + 0);
7750 DemandedRHS.setBit(LaneIdx + 2 * LocalIdx + 1);
7751 }
7752 }
7753}
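// Example (a sketch, assuming a 256-bit v8i32 HADD): each result element adds
// an adjacent pair from one operand, so demanding result element 1 sets
// DemandedLHS[2] and DemandedLHS[3], demanding element 3 sets DemandedRHS[2]
// and DemandedRHS[3], and demanding element 6 (second lane) sets
// DemandedRHS[4] and DemandedRHS[5].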
7754
7755/// Calculates the shuffle mask corresponding to the target-specific opcode.
7756/// If the mask could be calculated, returns it in \p Mask, returns the shuffle
7757/// operands in \p Ops, and returns true.
7758/// Sets \p IsUnary to true if only one source is used. Note that this will set
7759/// IsUnary for shuffles which use a single input multiple times, and in those
7760/// cases it will adjust the mask to only have indices within that single input.
7761/// It is an error to call this with non-empty Mask/Ops vectors.
7762static bool getTargetShuffleMask(SDNode *N, MVT VT, bool AllowSentinelZero,
7763 SmallVectorImpl<SDValue> &Ops,
7764 SmallVectorImpl<int> &Mask, bool &IsUnary) {
7765 unsigned NumElems = VT.getVectorNumElements();
7766 unsigned MaskEltSize = VT.getScalarSizeInBits();
7767 SmallVector<uint64_t, 32> RawMask;
7768 APInt RawUndefs;
7769 uint64_t ImmN;
7770
7771 assert(Mask.empty() && "getTargetShuffleMask expects an empty Mask vector");
7772 assert(Ops.empty() && "getTargetShuffleMask expects an empty Ops vector");
7773
7774 IsUnary = false;
7775 bool IsFakeUnary = false;
7776 switch (N->getOpcode()) {
7777 case X86ISD::BLENDI:
7778 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7779 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
7780 ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
7781 DecodeBLENDMask(NumElems, ImmN, Mask);
7782 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
7783 break;
7784 case X86ISD::SHUFP:
7785 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7786 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
7787 ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
7788 DecodeSHUFPMask(NumElems, MaskEltSize, ImmN, Mask);
7789 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
7790 break;
7791 case X86ISD::INSERTPS:
7792 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7793 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
7794 ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
7795 DecodeINSERTPSMask(ImmN, Mask);
7796 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
7797 break;
7798 case X86ISD::EXTRQI:
7799 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7800 if (isa<ConstantSDNode>(N->getOperand(1)) &&
7801 isa<ConstantSDNode>(N->getOperand(2))) {
7802 int BitLen = N->getConstantOperandVal(1);
7803 int BitIdx = N->getConstantOperandVal(2);
7804 DecodeEXTRQIMask(NumElems, MaskEltSize, BitLen, BitIdx, Mask);
7805 IsUnary = true;
7806 }
7807 break;
7808 case X86ISD::INSERTQI:
7809 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7810 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
7811 if (isa<ConstantSDNode>(N->getOperand(2)) &&
7812 isa<ConstantSDNode>(N->getOperand(3))) {
7813 int BitLen = N->getConstantOperandVal(2);
7814 int BitIdx = N->getConstantOperandVal(3);
7815 DecodeINSERTQIMask(NumElems, MaskEltSize, BitLen, BitIdx, Mask);
7816 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
7817 }
7818 break;
7819 case X86ISD::UNPCKH:
7820 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7821 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
7822 DecodeUNPCKHMask(NumElems, MaskEltSize, Mask);
7823 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
7824 break;
7825 case X86ISD::UNPCKL:
7826 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7827 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
7828 DecodeUNPCKLMask(NumElems, MaskEltSize, Mask);
7829 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
7830 break;
7831 case X86ISD::MOVHLPS:
7832 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7833 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
7834 DecodeMOVHLPSMask(NumElems, Mask);
7835 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
7836 break;
7837 case X86ISD::MOVLHPS:
7838 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7839 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
7840 DecodeMOVLHPSMask(NumElems, Mask);
7841 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
7842 break;
7843 case X86ISD::VALIGN:
7844 assert((VT.getScalarType() == MVT::i32 || VT.getScalarType() == MVT::i64) &&
7845 "Only 32-bit and 64-bit elements are supported!");
7846 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7847 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
7848 ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
7849 DecodeVALIGNMask(NumElems, ImmN, Mask);
7850 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
7851 Ops.push_back(N->getOperand(1));
7852 Ops.push_back(N->getOperand(0));
7853 break;
7854 case X86ISD::PALIGNR:
7855 assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
7856 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7857 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
7858 ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
7859 DecodePALIGNRMask(NumElems, ImmN, Mask);
7860 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
7861 Ops.push_back(N->getOperand(1));
7862 Ops.push_back(N->getOperand(0));
7863 break;
7864 case X86ISD::VSHLDQ:
7865 assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
7866 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7867 ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
7868 DecodePSLLDQMask(NumElems, ImmN, Mask);
7869 IsUnary = true;
7870 break;
7871 case X86ISD::VSRLDQ:
7872 assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
7873 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7874 ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
7875 DecodePSRLDQMask(NumElems, ImmN, Mask);
7876 IsUnary = true;
7877 break;
7878 case X86ISD::PSHUFD:
7879 case X86ISD::VPERMILPI:
7880 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7881 ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
7882 DecodePSHUFMask(NumElems, MaskEltSize, ImmN, Mask);
7883 IsUnary = true;
7884 break;
7885 case X86ISD::PSHUFHW:
7886 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7887 ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
7888 DecodePSHUFHWMask(NumElems, ImmN, Mask);
7889 IsUnary = true;
7890 break;
7891 case X86ISD::PSHUFLW:
7892 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7893 ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
7894 DecodePSHUFLWMask(NumElems, ImmN, Mask);
7895 IsUnary = true;
7896 break;
7897 case X86ISD::VZEXT_MOVL:
7898 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7899 DecodeZeroMoveLowMask(NumElems, Mask);
7900 IsUnary = true;
7901 break;
7902 case X86ISD::VBROADCAST:
7903 // We only decode broadcasts of same-sized vectors, peeking through to
7904 // extracted subvectors is likely to cause hasOneUse issues with
7905 // SimplifyDemandedBits etc.
7906 if (N->getOperand(0).getValueType() == VT) {
7907 DecodeVectorBroadcast(NumElems, Mask);
7908 IsUnary = true;
7909 break;
7910 }
7911 return false;
7912 case X86ISD::VPERMILPV: {
7913 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7914 IsUnary = true;
7915 SDValue MaskNode = N->getOperand(1);
7916 if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask,
7917 RawUndefs)) {
7918 DecodeVPERMILPMask(NumElems, MaskEltSize, RawMask, RawUndefs, Mask);
7919 break;
7920 }
7921 return false;
7922 }
7923 case X86ISD::PSHUFB: {
7924 assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
7925 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7926 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
7927 IsUnary = true;
7928 SDValue MaskNode = N->getOperand(1);
7929 if (getTargetShuffleMaskIndices(MaskNode, 8, RawMask, RawUndefs)) {
7930 DecodePSHUFBMask(RawMask, RawUndefs, Mask);
7931 break;
7932 }
7933 return false;
7934 }
7935 case X86ISD::VPERMI:
7936 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7937 ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
7938 DecodeVPERMMask(NumElems, ImmN, Mask);
7939 IsUnary = true;
7940 break;
7941 case X86ISD::MOVSS:
7942 case X86ISD::MOVSD:
7943 case X86ISD::MOVSH:
7944 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7945 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
7946 DecodeScalarMoveMask(NumElems, /* IsLoad */ false, Mask);
7947 break;
7948 case X86ISD::VPERM2X128:
7949 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7950 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
7951 ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
7952 DecodeVPERM2X128Mask(NumElems, ImmN, Mask);
7953 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
7954 break;
7955 case X86ISD::SHUF128:
7956 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7957 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
7958 ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
7959 decodeVSHUF64x2FamilyMask(NumElems, MaskEltSize, ImmN, Mask);
7960 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
7961 break;
7962 case X86ISD::MOVSLDUP:
7963 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7964 DecodeMOVSLDUPMask(NumElems, Mask);
7965 IsUnary = true;
7966 break;
7967 case X86ISD::MOVSHDUP:
7968 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7969 DecodeMOVSHDUPMask(NumElems, Mask);
7970 IsUnary = true;
7971 break;
7972 case X86ISD::MOVDDUP:
7973 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7974 DecodeMOVDDUPMask(NumElems, Mask);
7975 IsUnary = true;
7976 break;
7977 case X86ISD::VPERMIL2: {
7978    assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7979    assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
7980 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
7981 SDValue MaskNode = N->getOperand(2);
7982 SDValue CtrlNode = N->getOperand(3);
7983 if (ConstantSDNode *CtrlOp = dyn_cast<ConstantSDNode>(CtrlNode)) {
7984 unsigned CtrlImm = CtrlOp->getZExtValue();
7985 if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask,
7986 RawUndefs)) {
7987 DecodeVPERMIL2PMask(NumElems, MaskEltSize, CtrlImm, RawMask, RawUndefs,
7988 Mask);
7989 break;
7990 }
7991 }
7992 return false;
7993 }
7994 case X86ISD::VPPERM: {
7995    assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7996    assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
7997 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
7998 SDValue MaskNode = N->getOperand(2);
7999 if (getTargetShuffleMaskIndices(MaskNode, 8, RawMask, RawUndefs)) {
8000 DecodeVPPERMMask(RawMask, RawUndefs, Mask);
8001 break;
8002 }
8003 return false;
8004 }
8005 case X86ISD::VPERMV: {
8006    assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
8007 IsUnary = true;
8008 // Unlike most shuffle nodes, VPERMV's mask operand is operand 0.
8009 Ops.push_back(N->getOperand(1));
8010 SDValue MaskNode = N->getOperand(0);
8011 if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask,
8012 RawUndefs)) {
8013 DecodeVPERMVMask(RawMask, RawUndefs, Mask);
8014 break;
8015 }
8016 return false;
8017 }
8018 case X86ISD::VPERMV3: {
8019    assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
8020    assert(N->getOperand(2).getValueType() == VT && "Unexpected value type");
8021 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(2);
8022 // Unlike most shuffle nodes, VPERMV3's mask operand is the middle one.
8023 Ops.push_back(N->getOperand(0));
8024 Ops.push_back(N->getOperand(2));
8025 SDValue MaskNode = N->getOperand(1);
8026 if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask,
8027 RawUndefs)) {
8028 DecodeVPERMV3Mask(RawMask, RawUndefs, Mask);
8029 break;
8030 }
8031 return false;
8032 }
8033  default: llvm_unreachable("unknown target shuffle node");
8034 }
8035
8036 // Empty mask indicates the decode failed.
8037 if (Mask.empty())
8038 return false;
8039
8040 // Check if we're getting a shuffle mask with zero'd elements.
8041 if (!AllowSentinelZero && isAnyZero(Mask))
8042 return false;
8043
8044 // If we have a fake unary shuffle, the shuffle mask is spread across two
8045 // inputs that are actually the same node. Re-map the mask to always point
8046 // into the first input.
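  // Illustrative example (not from the original comment): for a 4-element
  // fake unary shuffle, a mask of <0,5,2,7> is remapped to <0,1,2,3>, since
  // both inputs are the same node.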
8047 if (IsFakeUnary)
8048 for (int &M : Mask)
8049 if (M >= (int)Mask.size())
8050 M -= Mask.size();
8051
8052 // If we didn't already add operands in the opcode-specific code, default to
8053 // adding 1 or 2 operands starting at 0.
8054 if (Ops.empty()) {
8055 Ops.push_back(N->getOperand(0));
8056 if (!IsUnary || IsFakeUnary)
8057 Ops.push_back(N->getOperand(1));
8058 }
8059
8060 return true;
8061}
8062
8063// Wrapper for getTargetShuffleMask with IsUnary.
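// Illustrative usage (a sketch, not code from this file; N and VT are assumed
// to be in scope):
//   SmallVector<SDValue, 2> ShuffleOps;
//   SmallVector<int, 16> ShuffleMask;
//   if (getTargetShuffleMask(N, VT, /*AllowSentinelZero=*/false, ShuffleOps,
//                            ShuffleMask)) {
//     // Mask entries < ShuffleMask.size() index into ShuffleOps[0]; larger
//     // entries index into ShuffleOps[1].
//   }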
8064static bool getTargetShuffleMask(SDNode *N, MVT VT, bool AllowSentinelZero,
8065 SmallVectorImpl<SDValue> &Ops,
8066 SmallVectorImpl<int> &Mask) {
8067 bool IsUnary;
8068 return getTargetShuffleMask(N, VT, AllowSentinelZero, Ops, Mask, IsUnary);
8069}
8070
8071/// Compute whether each element of a shuffle is zeroable.
8072///
8073/// A "zeroable" vector shuffle element is one which can be lowered to zero.
8074/// Either it is an undef element in the shuffle mask, the element of the input
8075/// referenced is undef, or the element of the input referenced is known to be
8076/// zero. Many x86 shuffles can zero lanes cheaply and we often want to handle
8077/// as many lanes with this technique as possible to simplify the remaining
8078/// shuffle.
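// Worked example (illustrative): for Mask = <0,4,1,5> where V2 is a build
// vector of all zeros, elements 1 and 3 reference V2 and are reported in
// KnownZero; any negative (undef) mask entry would be reported in KnownUndef.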
8079static void computeZeroableShuffleElements(ArrayRef<int> Mask,
8080 SDValue V1, SDValue V2,
8081 APInt &KnownUndef, APInt &KnownZero) {
8082 int Size = Mask.size();
8083 KnownUndef = KnownZero = APInt::getZero(Size);
8084
8085 V1 = peekThroughBitcasts(V1);
8086 V2 = peekThroughBitcasts(V2);
8087
8088 bool V1IsZero = ISD::isBuildVectorAllZeros(V1.getNode());
8089 bool V2IsZero = ISD::isBuildVectorAllZeros(V2.getNode());
8090
8091 int VectorSizeInBits = V1.getValueSizeInBits();
8092 int ScalarSizeInBits = VectorSizeInBits / Size;
8093  assert(!(VectorSizeInBits % ScalarSizeInBits) && "Illegal shuffle mask size");
8094
8095 for (int i = 0; i < Size; ++i) {
8096 int M = Mask[i];
8097 // Handle the easy cases.
8098 if (M < 0) {
8099 KnownUndef.setBit(i);
8100 continue;
8101 }
8102 if ((M >= 0 && M < Size && V1IsZero) || (M >= Size && V2IsZero)) {
8103 KnownZero.setBit(i);
8104 continue;
8105 }
8106
8107 // Determine shuffle input and normalize the mask.
8108 SDValue V = M < Size ? V1 : V2;
8109 M %= Size;
8110
8111 // Currently we can only search BUILD_VECTOR for UNDEF/ZERO elements.
8112 if (V.getOpcode() != ISD::BUILD_VECTOR)
8113 continue;
8114
8115    // If the BUILD_VECTOR has fewer elements, then the bitcasted portion of
8116    // the (larger) source element must be UNDEF/ZERO.
8117 if ((Size % V.getNumOperands()) == 0) {
8118 int Scale = Size / V->getNumOperands();
8119 SDValue Op = V.getOperand(M / Scale);
8120 if (Op.isUndef())
8121 KnownUndef.setBit(i);
8122 if (X86::isZeroNode(Op))
8123 KnownZero.setBit(i);
8124 else if (ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(Op)) {
8125 APInt Val = Cst->getAPIntValue();
8126 Val = Val.extractBits(ScalarSizeInBits, (M % Scale) * ScalarSizeInBits);
8127 if (Val == 0)
8128 KnownZero.setBit(i);
8129 } else if (ConstantFPSDNode *Cst = dyn_cast<ConstantFPSDNode>(Op)) {
8130 APInt Val = Cst->getValueAPF().bitcastToAPInt();
8131 Val = Val.extractBits(ScalarSizeInBits, (M % Scale) * ScalarSizeInBits);
8132 if (Val == 0)
8133 KnownZero.setBit(i);
8134 }
8135 continue;
8136 }
8137
8138    // If the BUILD_VECTOR has more elements, then all the (smaller) source
8139    // elements must be UNDEF or ZERO.
8140 if ((V.getNumOperands() % Size) == 0) {
8141 int Scale = V->getNumOperands() / Size;
8142 bool AllUndef = true;
8143 bool AllZero = true;
8144 for (int j = 0; j < Scale; ++j) {
8145 SDValue Op = V.getOperand((M * Scale) + j);
8146 AllUndef &= Op.isUndef();
8147 AllZero &= X86::isZeroNode(Op);
8148 }
8149 if (AllUndef)
8150 KnownUndef.setBit(i);
8151 if (AllZero)
8152 KnownZero.setBit(i);
8153 continue;
8154 }
8155 }
8156}
8157
8158/// Decode a target shuffle mask and inputs and see if any values are
8159/// known to be undef or zero from their inputs.
8160/// Returns true if the target shuffle mask was decoded.
8161/// FIXME: Merge this with computeZeroableShuffleElements?
8162static bool getTargetShuffleAndZeroables(SDValue N, SmallVectorImpl<int> &Mask,
8163 SmallVectorImpl<SDValue> &Ops,
8164 APInt &KnownUndef, APInt &KnownZero) {
8165 bool IsUnary;
8166 if (!isTargetShuffle(N.getOpcode()))
8167 return false;
8168
8169 MVT VT = N.getSimpleValueType();
8170 if (!getTargetShuffleMask(N.getNode(), VT, true, Ops, Mask, IsUnary))
8171 return false;
8172
8173 int Size = Mask.size();
8174 SDValue V1 = Ops[0];
8175 SDValue V2 = IsUnary ? V1 : Ops[1];
8176 KnownUndef = KnownZero = APInt::getZero(Size);
8177
8178 V1 = peekThroughBitcasts(V1);
8179 V2 = peekThroughBitcasts(V2);
8180
8181  assert((VT.getSizeInBits() % Size) == 0 &&
8182         "Illegal split of shuffle value type");
8183 unsigned EltSizeInBits = VT.getSizeInBits() / Size;
8184
8185 // Extract known constant input data.
8186 APInt UndefSrcElts[2];
8187 SmallVector<APInt, 32> SrcEltBits[2];
8188 bool IsSrcConstant[2] = {
8189 getTargetConstantBitsFromNode(V1, EltSizeInBits, UndefSrcElts[0],
8190 SrcEltBits[0], true, false),
8191 getTargetConstantBitsFromNode(V2, EltSizeInBits, UndefSrcElts[1],
8192 SrcEltBits[1], true, false)};
8193
8194 for (int i = 0; i < Size; ++i) {
8195 int M = Mask[i];
8196
8197 // Already decoded as SM_SentinelZero / SM_SentinelUndef.
8198 if (M < 0) {
8199      assert(isUndefOrZero(M) && "Unknown shuffle sentinel value!");
8200 if (SM_SentinelUndef == M)
8201 KnownUndef.setBit(i);
8202 if (SM_SentinelZero == M)
8203 KnownZero.setBit(i);
8204 continue;
8205 }
8206
8207 // Determine shuffle input and normalize the mask.
8208 unsigned SrcIdx = M / Size;
8209 SDValue V = M < Size ? V1 : V2;
8210 M %= Size;
8211
8212 // We are referencing an UNDEF input.
8213 if (V.isUndef()) {
8214 KnownUndef.setBit(i);
8215 continue;
8216 }
8217
8218 // SCALAR_TO_VECTOR - only the first element is defined, and the rest UNDEF.
8219 // TODO: We currently only set UNDEF for integer types - floats use the same
8220 // registers as vectors and many of the scalar folded loads rely on the
8221 // SCALAR_TO_VECTOR pattern.
8222 if (V.getOpcode() == ISD::SCALAR_TO_VECTOR &&
8223 (Size % V.getValueType().getVectorNumElements()) == 0) {
8224 int Scale = Size / V.getValueType().getVectorNumElements();
8225 int Idx = M / Scale;
8226 if (Idx != 0 && !VT.isFloatingPoint())
8227 KnownUndef.setBit(i);
8228 else if (Idx == 0 && X86::isZeroNode(V.getOperand(0)))
8229 KnownZero.setBit(i);
8230 continue;
8231 }
8232
8233 // INSERT_SUBVECTOR - to widen vectors we often insert them into UNDEF
8234 // base vectors.
8235 if (V.getOpcode() == ISD::INSERT_SUBVECTOR) {
8236 SDValue Vec = V.getOperand(0);
8237 int NumVecElts = Vec.getValueType().getVectorNumElements();
8238 if (Vec.isUndef() && Size == NumVecElts) {
8239 int Idx = V.getConstantOperandVal(2);
8240 int NumSubElts = V.getOperand(1).getValueType().getVectorNumElements();
8241 if (M < Idx || (Idx + NumSubElts) <= M)
8242 KnownUndef.setBit(i);
8243 }
8244 continue;
8245 }
8246
8247 // Attempt to extract from the source's constant bits.
8248 if (IsSrcConstant[SrcIdx]) {
8249 if (UndefSrcElts[SrcIdx][M])
8250 KnownUndef.setBit(i);
8251 else if (SrcEltBits[SrcIdx][M] == 0)
8252 KnownZero.setBit(i);
8253 }
8254 }
8255
8256  assert(VT.getVectorNumElements() == (unsigned)Size &&
8257         "Different mask size from vector size!");
8258 return true;
8259}
8260
8261// Replace target shuffle mask elements with known undef/zero sentinels.
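// For example (illustrative): with Mask = <0,1,2,3>, bit 1 set in KnownUndef
// and bit 3 set in KnownZero, the mask becomes
// <0, SM_SentinelUndef, 2, SM_SentinelZero> (the zero substitution is skipped
// when ResolveKnownZeros is false).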
8262static void resolveTargetShuffleFromZeroables(SmallVectorImpl<int> &Mask,
8263 const APInt &KnownUndef,
8264 const APInt &KnownZero,
8265 bool ResolveKnownZeros= true) {
8266 unsigned NumElts = Mask.size();
8267  assert(KnownUndef.getBitWidth() == NumElts &&
8268         KnownZero.getBitWidth() == NumElts && "Shuffle mask size mismatch");
8269
8270 for (unsigned i = 0; i != NumElts; ++i) {
8271 if (KnownUndef[i])
8272 Mask[i] = SM_SentinelUndef;
8273 else if (ResolveKnownZeros && KnownZero[i])
8274 Mask[i] = SM_SentinelZero;
8275 }
8276}
8277
8278// Extract target shuffle mask sentinel elements to known undef/zero bitmasks.
8279static void resolveZeroablesFromTargetShuffle(const SmallVectorImpl<int> &Mask,
8280 APInt &KnownUndef,
8281 APInt &KnownZero) {
8282 unsigned NumElts = Mask.size();
8283 KnownUndef = KnownZero = APInt::getZero(NumElts);
8284
8285 for (unsigned i = 0; i != NumElts; ++i) {
8286 int M = Mask[i];
8287 if (SM_SentinelUndef == M)
8288 KnownUndef.setBit(i);
8289 if (SM_SentinelZero == M)
8290 KnownZero.setBit(i);
8291 }
8292}
8293
8294// Attempt to create a shuffle mask from a VSELECT/BLENDV condition mask.
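// For example (illustrative): a v4i32 VSELECT whose condition is the constant
// vector <-1,0,-1,0> yields Mask = <0,5,2,7>, i.e. lanes 0 and 2 are taken
// from the "true" operand and lanes 1 and 3 from the "false" operand.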
8295static bool createShuffleMaskFromVSELECT(SmallVectorImpl<int> &Mask,
8296 SDValue Cond, bool IsBLENDV = false) {
8297 EVT CondVT = Cond.getValueType();
8298 unsigned EltSizeInBits = CondVT.getScalarSizeInBits();
8299 unsigned NumElts = CondVT.getVectorNumElements();
8300
8301 APInt UndefElts;
8302 SmallVector<APInt, 32> EltBits;
8303 if (!getTargetConstantBitsFromNode(Cond, EltSizeInBits, UndefElts, EltBits,
8304 true, false))
8305 return false;
8306
8307 Mask.resize(NumElts, SM_SentinelUndef);
8308
8309 for (int i = 0; i != (int)NumElts; ++i) {
8310 Mask[i] = i;
8311 // Arbitrarily choose from the 2nd operand if the select condition element
8312 // is undef.
8313 // TODO: Can we do better by matching patterns such as even/odd?
8314 if (UndefElts[i] || (!IsBLENDV && EltBits[i].isZero()) ||
8315 (IsBLENDV && EltBits[i].isNonNegative()))
8316 Mask[i] += NumElts;
8317 }
8318
8319 return true;
8320}
8321
8322// Forward declaration (for getFauxShuffleMask recursive check).
8323static bool getTargetShuffleInputs(SDValue Op, const APInt &DemandedElts,
8324 SmallVectorImpl<SDValue> &Inputs,
8325 SmallVectorImpl<int> &Mask,
8326 const SelectionDAG &DAG, unsigned Depth,
8327 bool ResolveKnownElts);
8328
8329// Attempt to decode ops that could be represented as a shuffle mask.
8330// The decoded shuffle mask may contain a different number of elements than the
8331// destination value type.
8332// TODO: Merge into getTargetShuffleInputs()
8333static bool getFauxShuffleMask(SDValue N, const APInt &DemandedElts,
8334 SmallVectorImpl<int> &Mask,
8335 SmallVectorImpl<SDValue> &Ops,
8336 const SelectionDAG &DAG, unsigned Depth,
8337 bool ResolveKnownElts) {
8338 Mask.clear();
8339 Ops.clear();
8340
8341 MVT VT = N.getSimpleValueType();
8342 unsigned NumElts = VT.getVectorNumElements();
8343 unsigned NumSizeInBits = VT.getSizeInBits();
8344 unsigned NumBitsPerElt = VT.getScalarSizeInBits();
8345 if ((NumBitsPerElt % 8) != 0 || (NumSizeInBits % 8) != 0)
8346 return false;
8347  assert(NumElts == DemandedElts.getBitWidth() && "Unexpected vector size");
8348 unsigned NumSizeInBytes = NumSizeInBits / 8;
8349 unsigned NumBytesPerElt = NumBitsPerElt / 8;
8350
8351 unsigned Opcode = N.getOpcode();
8352 switch (Opcode) {
8353 case ISD::VECTOR_SHUFFLE: {
8354 // Don't treat ISD::VECTOR_SHUFFLE as a target shuffle so decode it here.
8355 ArrayRef<int> ShuffleMask = cast<ShuffleVectorSDNode>(N)->getMask();
8356 if (isUndefOrInRange(ShuffleMask, 0, 2 * NumElts)) {
8357 Mask.append(ShuffleMask.begin(), ShuffleMask.end());
8358 Ops.push_back(N.getOperand(0));
8359 Ops.push_back(N.getOperand(1));
8360 return true;
8361 }
8362 return false;
8363 }
8364 case ISD::AND:
8365 case X86ISD::ANDNP: {
8366 // Attempt to decode as a per-byte mask.
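    // For example (a sketch, assuming little-endian byte order within each
    // lane): AND of a v4i32 value with a splatted 0x000000FF constant decodes
    // at byte granularity to <0,Z,Z,Z,4,Z,Z,Z,8,Z,Z,Z,12,Z,Z,Z> with
    // Z = SM_SentinelZero, i.e. only the low byte of each lane survives.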
8367 APInt UndefElts;
8368 SmallVector<APInt, 32> EltBits;
8369 SDValue N0 = N.getOperand(0);
8370 SDValue N1 = N.getOperand(1);
8371 bool IsAndN = (X86ISD::ANDNP == Opcode);
8372 uint64_t ZeroMask = IsAndN ? 255 : 0;
8373 if (!getTargetConstantBitsFromNode(IsAndN ? N0 : N1, 8, UndefElts, EltBits))
8374 return false;
8375 // We can't assume an undef src element gives an undef dst - the other src
8376 // might be zero.
8377 if (!UndefElts.isZero())
8378 return false;
8379 for (int i = 0, e = (int)EltBits.size(); i != e; ++i) {
8380 const APInt &ByteBits = EltBits[i];
8381 if (ByteBits != 0 && ByteBits != 255)
8382 return false;
8383 Mask.push_back(ByteBits == ZeroMask ? SM_SentinelZero : i);
8384 }
8385 Ops.push_back(IsAndN ? N1 : N0);
8386 return true;
8387 }
8388 case ISD::OR: {
8389 // Handle OR(SHUFFLE,SHUFFLE) case where one source is zero and the other
8390 // is a valid shuffle index.
8391 SDValue N0 = peekThroughBitcasts(N.getOperand(0));
8392 SDValue N1 = peekThroughBitcasts(N.getOperand(1));
8393 if (!N0.getValueType().isVector() || !N1.getValueType().isVector())
8394 return false;
8395
8396 SmallVector<int, 64> SrcMask0, SrcMask1;
8397 SmallVector<SDValue, 2> SrcInputs0, SrcInputs1;
8398 APInt Demand0 = APInt::getAllOnes(N0.getValueType().getVectorNumElements());
8399 APInt Demand1 = APInt::getAllOnes(N1.getValueType().getVectorNumElements());
8400 if (!getTargetShuffleInputs(N0, Demand0, SrcInputs0, SrcMask0, DAG,
8401 Depth + 1, true) ||
8402 !getTargetShuffleInputs(N1, Demand1, SrcInputs1, SrcMask1, DAG,
8403 Depth + 1, true))
8404 return false;
8405
8406 size_t MaskSize = std::max(SrcMask0.size(), SrcMask1.size());
8407 SmallVector<int, 64> Mask0, Mask1;
8408 narrowShuffleMaskElts(MaskSize / SrcMask0.size(), SrcMask0, Mask0);
8409 narrowShuffleMaskElts(MaskSize / SrcMask1.size(), SrcMask1, Mask1);
8410 for (int i = 0; i != (int)MaskSize; ++i) {
8411 // NOTE: Don't handle SM_SentinelUndef, as we can end up in infinite
8412 // loops converting between OR and BLEND shuffles due to
8413 // canWidenShuffleElements merging away undef elements, meaning we
8414 // fail to recognise the OR as the undef element isn't known zero.
8415 if (Mask0[i] == SM_SentinelZero && Mask1[i] == SM_SentinelZero)
8416 Mask.push_back(SM_SentinelZero);
8417 else if (Mask1[i] == SM_SentinelZero)
8418 Mask.push_back(i);
8419 else if (Mask0[i] == SM_SentinelZero)
8420 Mask.push_back(i + MaskSize);
8421 else
8422 return false;
8423 }
8424 Ops.push_back(N0);
8425 Ops.push_back(N1);
8426 return true;
8427 }
8428 case ISD::INSERT_SUBVECTOR: {
8429 SDValue Src = N.getOperand(0);
8430 SDValue Sub = N.getOperand(1);
8431 EVT SubVT = Sub.getValueType();
8432 unsigned NumSubElts = SubVT.getVectorNumElements();
8433 if (!N->isOnlyUserOf(Sub.getNode()))
8434 return false;
8435 uint64_t InsertIdx = N.getConstantOperandVal(2);
8436 // Handle INSERT_SUBVECTOR(SRC0, EXTRACT_SUBVECTOR(SRC1)).
8437 if (Sub.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
8438 Sub.getOperand(0).getValueType() == VT) {
8439 uint64_t ExtractIdx = Sub.getConstantOperandVal(1);
8440 for (int i = 0; i != (int)NumElts; ++i)
8441 Mask.push_back(i);
8442 for (int i = 0; i != (int)NumSubElts; ++i)
8443 Mask[InsertIdx + i] = NumElts + ExtractIdx + i;
8444 Ops.push_back(Src);
8445 Ops.push_back(Sub.getOperand(0));
8446 return true;
8447 }
8448 // Handle INSERT_SUBVECTOR(SRC0, SHUFFLE(SRC1)).
8449 SmallVector<int, 64> SubMask;
8450 SmallVector<SDValue, 2> SubInputs;
8451 SDValue SubSrc = peekThroughOneUseBitcasts(Sub);
8452 EVT SubSrcVT = SubSrc.getValueType();
8453 if (!SubSrcVT.isVector())
8454 return false;
8455
8456 APInt SubDemand = APInt::getAllOnes(SubSrcVT.getVectorNumElements());
8457 if (!getTargetShuffleInputs(SubSrc, SubDemand, SubInputs, SubMask, DAG,
8458 Depth + 1, ResolveKnownElts))
8459 return false;
8460
8461 // Subvector shuffle inputs must not be larger than the subvector.
8462 if (llvm::any_of(SubInputs, [SubVT](SDValue SubInput) {
8463 return SubVT.getFixedSizeInBits() <
8464 SubInput.getValueSizeInBits().getFixedValue();
8465 }))
8466 return false;
8467
8468 if (SubMask.size() != NumSubElts) {
8469      assert(((SubMask.size() % NumSubElts) == 0 ||
8470              (NumSubElts % SubMask.size()) == 0) && "Illegal submask scale");
8471 if ((NumSubElts % SubMask.size()) == 0) {
8472 int Scale = NumSubElts / SubMask.size();
8473 SmallVector<int,64> ScaledSubMask;
8474 narrowShuffleMaskElts(Scale, SubMask, ScaledSubMask);
8475 SubMask = ScaledSubMask;
8476 } else {
8477 int Scale = SubMask.size() / NumSubElts;
8478 NumSubElts = SubMask.size();
8479 NumElts *= Scale;
8480 InsertIdx *= Scale;
8481 }
8482 }
8483 Ops.push_back(Src);
8484 Ops.append(SubInputs.begin(), SubInputs.end());
8485 if (ISD::isBuildVectorAllZeros(Src.getNode()))
8486 Mask.append(NumElts, SM_SentinelZero);
8487 else
8488 for (int i = 0; i != (int)NumElts; ++i)
8489 Mask.push_back(i);
8490 for (int i = 0; i != (int)NumSubElts; ++i) {
8491 int M = SubMask[i];
8492 if (0 <= M) {
8493 int InputIdx = M / NumSubElts;
8494 M = (NumElts * (1 + InputIdx)) + (M % NumSubElts);
8495 }
8496 Mask[i + InsertIdx] = M;
8497 }
8498 return true;
8499 }
8500 case X86ISD::PINSRB:
8501 case X86ISD::PINSRW:
8502 case ISD::SCALAR_TO_VECTOR:
8503 case ISD::INSERT_VECTOR_ELT: {
8504    // Match against an insert_vector_elt/scalar_to_vector of an extract from a
8505    // vector, for matching src/dst vector types.
8506 SDValue Scl = N.getOperand(Opcode == ISD::SCALAR_TO_VECTOR ? 0 : 1);
8507
8508 unsigned DstIdx = 0;
8509 if (Opcode != ISD::SCALAR_TO_VECTOR) {
8510 // Check we have an in-range constant insertion index.
8511 if (!isa<ConstantSDNode>(N.getOperand(2)) ||
8512 N.getConstantOperandAPInt(2).uge(NumElts))
8513 return false;
8514 DstIdx = N.getConstantOperandVal(2);
8515
8516 // Attempt to recognise an INSERT*(VEC, 0, DstIdx) shuffle pattern.
8517 if (X86::isZeroNode(Scl)) {
8518 Ops.push_back(N.getOperand(0));
8519 for (unsigned i = 0; i != NumElts; ++i)
8520 Mask.push_back(i == DstIdx ? SM_SentinelZero : (int)i);
8521 return true;
8522 }
8523 }
8524
8525 // Peek through trunc/aext/zext.
8526 // TODO: aext shouldn't require SM_SentinelZero padding.
8527 // TODO: handle shift of scalars.
8528 unsigned MinBitsPerElt = Scl.getScalarValueSizeInBits();
8529 while (Scl.getOpcode() == ISD::TRUNCATE ||
8530 Scl.getOpcode() == ISD::ANY_EXTEND ||
8531 Scl.getOpcode() == ISD::ZERO_EXTEND) {
8532 Scl = Scl.getOperand(0);
8533 MinBitsPerElt =
8534 std::min<unsigned>(MinBitsPerElt, Scl.getScalarValueSizeInBits());
8535 }
8536 if ((MinBitsPerElt % 8) != 0)
8537 return false;
8538
8539 // Attempt to find the source vector the scalar was extracted from.
8540 SDValue SrcExtract;
8541 if ((Scl.getOpcode() == ISD::EXTRACT_VECTOR_ELT ||
8542 Scl.getOpcode() == X86ISD::PEXTRW ||
8543 Scl.getOpcode() == X86ISD::PEXTRB) &&
8544 Scl.getOperand(0).getValueSizeInBits() == NumSizeInBits) {
8545 SrcExtract = Scl;
8546 }
8547 if (!SrcExtract || !isa<ConstantSDNode>(SrcExtract.getOperand(1)))
8548 return false;
8549
8550 SDValue SrcVec = SrcExtract.getOperand(0);
8551 EVT SrcVT = SrcVec.getValueType();
8552 if (!SrcVT.getScalarType().isByteSized())
8553 return false;
8554 unsigned SrcIdx = SrcExtract.getConstantOperandVal(1);
8555 unsigned SrcByte = SrcIdx * (SrcVT.getScalarSizeInBits() / 8);
8556 unsigned DstByte = DstIdx * NumBytesPerElt;
8557 MinBitsPerElt =
8558 std::min<unsigned>(MinBitsPerElt, SrcVT.getScalarSizeInBits());
8559
8560 // Create 'identity' byte level shuffle mask and then add inserted bytes.
8561 if (Opcode == ISD::SCALAR_TO_VECTOR) {
8562 Ops.push_back(SrcVec);
8563 Mask.append(NumSizeInBytes, SM_SentinelUndef);
8564 } else {
8565 Ops.push_back(SrcVec);
8566 Ops.push_back(N.getOperand(0));
8567 for (int i = 0; i != (int)NumSizeInBytes; ++i)
8568 Mask.push_back(NumSizeInBytes + i);
8569 }
8570
8571 unsigned MinBytesPerElts = MinBitsPerElt / 8;
8572 MinBytesPerElts = std::min(MinBytesPerElts, NumBytesPerElt);
8573 for (unsigned i = 0; i != MinBytesPerElts; ++i)
8574 Mask[DstByte + i] = SrcByte + i;
8575 for (unsigned i = MinBytesPerElts; i < NumBytesPerElt; ++i)
8576 Mask[DstByte + i] = SM_SentinelZero;
8577 return true;
8578 }
8579 case X86ISD::PACKSS:
8580 case X86ISD::PACKUS: {
8581 SDValue N0 = N.getOperand(0);
8582 SDValue N1 = N.getOperand(1);
8583    assert(N0.getValueType().getVectorNumElements() == (NumElts / 2) &&
8584           N1.getValueType().getVectorNumElements() == (NumElts / 2) &&
8585           "Unexpected input value type");
8586
8587 APInt EltsLHS, EltsRHS;
8588 getPackDemandedElts(VT, DemandedElts, EltsLHS, EltsRHS);
8589
8590    // If we know input saturation won't happen (or we don't care about
8591    // particular lanes), we can treat this as a truncation shuffle.
8592 bool Offset0 = false, Offset1 = false;
8593 if (Opcode == X86ISD::PACKSS) {
8594 if ((!(N0.isUndef() || EltsLHS.isZero()) &&
8595 DAG.ComputeNumSignBits(N0, EltsLHS, Depth + 1) <= NumBitsPerElt) ||
8596 (!(N1.isUndef() || EltsRHS.isZero()) &&
8597 DAG.ComputeNumSignBits(N1, EltsRHS, Depth + 1) <= NumBitsPerElt))
8598 return false;
8599 // We can't easily fold ASHR into a shuffle, but if it was feeding a
8600 // PACKSS then it was likely being used for sign-extension for a
8601 // truncation, so just peek through and adjust the mask accordingly.
8602 if (N0.getOpcode() == X86ISD::VSRAI && N->isOnlyUserOf(N0.getNode()) &&
8603 N0.getConstantOperandAPInt(1) == NumBitsPerElt) {
8604 Offset0 = true;
8605 N0 = N0.getOperand(0);
8606 }
8607 if (N1.getOpcode() == X86ISD::VSRAI && N->isOnlyUserOf(N1.getNode()) &&
8608 N1.getConstantOperandAPInt(1) == NumBitsPerElt) {
8609 Offset1 = true;
8610 N1 = N1.getOperand(0);
8611 }
8612 } else {
8613 APInt ZeroMask = APInt::getHighBitsSet(2 * NumBitsPerElt, NumBitsPerElt);
8614 if ((!(N0.isUndef() || EltsLHS.isZero()) &&
8615 !DAG.MaskedValueIsZero(N0, ZeroMask, EltsLHS, Depth + 1)) ||
8616 (!(N1.isUndef() || EltsRHS.isZero()) &&
8617 !DAG.MaskedValueIsZero(N1, ZeroMask, EltsRHS, Depth + 1)))
8618 return false;
8619 }
8620
8621 bool IsUnary = (N0 == N1);
8622
8623 Ops.push_back(N0);
8624 if (!IsUnary)
8625 Ops.push_back(N1);
8626
8627 createPackShuffleMask(VT, Mask, IsUnary);
8628
8629 if (Offset0 || Offset1) {
8630 for (int &M : Mask)
8631 if ((Offset0 && isInRange(M, 0, NumElts)) ||
8632 (Offset1 && isInRange(M, NumElts, 2 * NumElts)))
8633 ++M;
8634 }
8635 return true;
8636 }
8637 case ISD::VSELECT:
8638 case X86ISD::BLENDV: {
8639 SDValue Cond = N.getOperand(0);
8640 if (createShuffleMaskFromVSELECT(Mask, Cond, Opcode == X86ISD::BLENDV)) {
8641 Ops.push_back(N.getOperand(1));
8642 Ops.push_back(N.getOperand(2));
8643 return true;
8644 }
8645 return false;
8646 }
8647 case X86ISD::VTRUNC: {
8648 SDValue Src = N.getOperand(0);
8649 EVT SrcVT = Src.getValueType();
8650 // Truncated source must be a simple vector.
8651 if (!SrcVT.isSimple() || (SrcVT.getSizeInBits() % 128) != 0 ||
8652 (SrcVT.getScalarSizeInBits() % 8) != 0)
8653 return false;
8654 unsigned NumSrcElts = SrcVT.getVectorNumElements();
8655 unsigned NumBitsPerSrcElt = SrcVT.getScalarSizeInBits();
8656 unsigned Scale = NumBitsPerSrcElt / NumBitsPerElt;
8657    assert((NumBitsPerSrcElt % NumBitsPerElt) == 0 && "Illegal truncation");
8658 for (unsigned i = 0; i != NumSrcElts; ++i)
8659 Mask.push_back(i * Scale);
8660 Mask.append(NumElts - NumSrcElts, SM_SentinelZero);
8661 Ops.push_back(Src);
8662 return true;
8663 }
8664 case X86ISD::VSHLI:
8665 case X86ISD::VSRLI: {
8666 uint64_t ShiftVal = N.getConstantOperandVal(1);
8667 // Out of range bit shifts are guaranteed to be zero.
8668 if (NumBitsPerElt <= ShiftVal) {
8669 Mask.append(NumElts, SM_SentinelZero);
8670 return true;
8671 }
8672
8673 // We can only decode 'whole byte' bit shifts as shuffles.
8674 if ((ShiftVal % 8) != 0)
8675 break;
8676
8677 uint64_t ByteShift = ShiftVal / 8;
8678 Ops.push_back(N.getOperand(0));
8679
8680 // Clear mask to all zeros and insert the shifted byte indices.
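    // For example (illustrative): a v2i64 VSHLI by 8 bits shifts by one byte
    // per lane, giving the byte mask <Z,0,1,2,3,4,5,6,Z,8,9,10,11,12,13,14>
    // with Z = SM_SentinelZero.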
8681 Mask.append(NumSizeInBytes, SM_SentinelZero);
8682
8683 if (X86ISD::VSHLI == Opcode) {
8684 for (unsigned i = 0; i != NumSizeInBytes; i += NumBytesPerElt)
8685 for (unsigned j = ByteShift; j != NumBytesPerElt; ++j)
8686 Mask[i + j] = i + j - ByteShift;
8687 } else {
8688 for (unsigned i = 0; i != NumSizeInBytes; i += NumBytesPerElt)
8689 for (unsigned j = ByteShift; j != NumBytesPerElt; ++j)
8690 Mask[i + j - ByteShift] = i + j;
8691 }
8692 return true;
8693 }
8694 case X86ISD::VROTLI:
8695 case X86ISD::VROTRI: {
8696 // We can only decode 'whole byte' bit rotates as shuffles.
8697 uint64_t RotateVal = N.getConstantOperandAPInt(1).urem(NumBitsPerElt);
8698 if ((RotateVal % 8) != 0)
8699 return false;
8700 Ops.push_back(N.getOperand(0));
8701 int Offset = RotateVal / 8;
8702 Offset = (X86ISD::VROTLI == Opcode ? NumBytesPerElt - Offset : Offset);
8703 for (int i = 0; i != (int)NumElts; ++i) {
8704 int BaseIdx = i * NumBytesPerElt;
8705 for (int j = 0; j != (int)NumBytesPerElt; ++j) {
8706 Mask.push_back(BaseIdx + ((Offset + j) % NumBytesPerElt));
8707 }
8708 }
8709 return true;
8710 }
8711 case X86ISD::VBROADCAST: {
8712 SDValue Src = N.getOperand(0);
8713 if (!Src.getSimpleValueType().isVector()) {
8714 if (Src.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
8715 !isNullConstant(Src.getOperand(1)) ||
8716 Src.getOperand(0).getValueType().getScalarType() !=
8717 VT.getScalarType())
8718 return false;
8719 Src = Src.getOperand(0);
8720 }
8721 Ops.push_back(Src);
8722 Mask.append(NumElts, 0);
8723 return true;
8724 }
8725 case ISD::ZERO_EXTEND:
8726 case ISD::ANY_EXTEND:
8727 case ISD::ZERO_EXTEND_VECTOR_INREG:
8728 case ISD::ANY_EXTEND_VECTOR_INREG: {
8729 SDValue Src = N.getOperand(0);
8730 EVT SrcVT = Src.getValueType();
8731
8732 // Extended source must be a simple vector.
8733 if (!SrcVT.isSimple() || (SrcVT.getSizeInBits() % 128) != 0 ||
8734 (SrcVT.getScalarSizeInBits() % 8) != 0)
8735 return false;
8736
8737 bool IsAnyExtend =
8738 (ISD::ANY_EXTEND == Opcode || ISD::ANY_EXTEND_VECTOR_INREG == Opcode);
8739 DecodeZeroExtendMask(SrcVT.getScalarSizeInBits(), NumBitsPerElt, NumElts,
8740 IsAnyExtend, Mask);
8741 Ops.push_back(Src);
8742 return true;
8743 }
8744 }
8745
8746 return false;
8747}
8748
8749/// Removes unused/repeated shuffle source inputs and adjusts the shuffle mask.
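// For example (illustrative): with Inputs = {A, A} and the 4-element mask
// <0,5,2,7>, the repeated input is dropped and the mask is rewritten to
// <0,1,2,3> so every entry indexes the single remaining input.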
8750static void resolveTargetShuffleInputsAndMask(SmallVectorImpl<SDValue> &Inputs,
8751 SmallVectorImpl<int> &Mask) {
8752 int MaskWidth = Mask.size();
8753 SmallVector<SDValue, 16> UsedInputs;
8754 for (int i = 0, e = Inputs.size(); i < e; ++i) {
8755 int lo = UsedInputs.size() * MaskWidth;
8756 int hi = lo + MaskWidth;
8757
8758 // Strip UNDEF input usage.
8759 if (Inputs[i].isUndef())
8760 for (int &M : Mask)
8761 if ((lo <= M) && (M < hi))
8762 M = SM_SentinelUndef;
8763
8764 // Check for unused inputs.
8765 if (none_of(Mask, [lo, hi](int i) { return (lo <= i) && (i < hi); })) {
8766 for (int &M : Mask)
8767 if (lo <= M)
8768 M -= MaskWidth;
8769 continue;
8770 }
8771
8772 // Check for repeated inputs.
8773 bool IsRepeat = false;
8774 for (int j = 0, ue = UsedInputs.size(); j != ue; ++j) {
8775 if (UsedInputs[j] != Inputs[i])
8776 continue;
8777 for (int &M : Mask)
8778 if (lo <= M)
8779 M = (M < hi) ? ((M - lo) + (j * MaskWidth)) : (M - MaskWidth);
8780 IsRepeat = true;
8781 break;
8782 }
8783 if (IsRepeat)
8784 continue;
8785
8786 UsedInputs.push_back(Inputs[i]);
8787 }
8788 Inputs = UsedInputs;
8789}
8790
8791/// Calls getTargetShuffleAndZeroables to resolve a target shuffle mask's inputs
8792/// and then sets the SM_SentinelUndef and SM_SentinelZero values.
8793/// Returns true if the target shuffle mask was decoded.
8794static bool getTargetShuffleInputs(SDValue Op, const APInt &DemandedElts,
8795 SmallVectorImpl<SDValue> &Inputs,
8796 SmallVectorImpl<int> &Mask,
8797 APInt &KnownUndef, APInt &KnownZero,
8798 const SelectionDAG &DAG, unsigned Depth,
8799 bool ResolveKnownElts) {
8800 if (Depth >= SelectionDAG::MaxRecursionDepth)
8801 return false; // Limit search depth.
8802
8803 EVT VT = Op.getValueType();
8804 if (!VT.isSimple() || !VT.isVector())
8805 return false;
8806
8807 if (getTargetShuffleAndZeroables(Op, Mask, Inputs, KnownUndef, KnownZero)) {
8808 if (ResolveKnownElts)
8809 resolveTargetShuffleFromZeroables(Mask, KnownUndef, KnownZero);
8810 return true;
8811 }
8812 if (getFauxShuffleMask(Op, DemandedElts, Mask, Inputs, DAG, Depth,
8813 ResolveKnownElts)) {
8814 resolveZeroablesFromTargetShuffle(Mask, KnownUndef, KnownZero);
8815 return true;
8816 }
8817 return false;
8818}
8819
8820static bool getTargetShuffleInputs(SDValue Op, const APInt &DemandedElts,
8821 SmallVectorImpl<SDValue> &Inputs,
8822 SmallVectorImpl<int> &Mask,
8823 const SelectionDAG &DAG, unsigned Depth,
8824 bool ResolveKnownElts) {
8825 APInt KnownUndef, KnownZero;
8826 return getTargetShuffleInputs(Op, DemandedElts, Inputs, Mask, KnownUndef,
8827 KnownZero, DAG, Depth, ResolveKnownElts);
8828}
8829
8830static bool getTargetShuffleInputs(SDValue Op, SmallVectorImpl<SDValue> &Inputs,
8831 SmallVectorImpl<int> &Mask,
8832 const SelectionDAG &DAG, unsigned Depth = 0,
8833 bool ResolveKnownElts = true) {
8834 EVT VT = Op.getValueType();
8835 if (!VT.isSimple() || !VT.isVector())
8836 return false;
8837
8838 unsigned NumElts = Op.getValueType().getVectorNumElements();
8839 APInt DemandedElts = APInt::getAllOnes(NumElts);
8840 return getTargetShuffleInputs(Op, DemandedElts, Inputs, Mask, DAG, Depth,
8841 ResolveKnownElts);
8842}
8843
8844// Attempt to create a scalar/subvector broadcast from the base MemSDNode.
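// Illustrative usage (a sketch, not code from this file; Ld, DL and DAG are
// assumed to be in scope):
//   if (SDValue Bcst = getBROADCAST_LOAD(X86ISD::VBROADCAST_LOAD, DL,
//                                        MVT::v4f32, MVT::f32,
//                                        cast<MemSDNode>(Ld), /*Offset=*/4,
//                                        DAG))
//     return Bcst; // f32 at byte offset 4 of Ld broadcast to all v4f32 lanes.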
8845static SDValue getBROADCAST_LOAD(unsigned Opcode, const SDLoc &DL, EVT VT,
8846 EVT MemVT, MemSDNode *Mem, unsigned Offset,
8847 SelectionDAG &DAG) {
8848  assert((Opcode == X86ISD::VBROADCAST_LOAD ||
8849          Opcode == X86ISD::SUBV_BROADCAST_LOAD) &&
8850         "Unknown broadcast load type");
8851
8852  // Ensure this is a simple (non-atomic, non-volatile), temporal read memop.
8853 if (!Mem || !Mem->readMem() || !Mem->isSimple() || Mem->isNonTemporal())
8854 return SDValue();
8855
8856 SDValue Ptr =
8857 DAG.getMemBasePlusOffset(Mem->getBasePtr(), TypeSize::Fixed(Offset), DL);
8858 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
8859 SDValue Ops[] = {Mem->getChain(), Ptr};
8860 SDValue BcstLd = DAG.getMemIntrinsicNode(
8861 Opcode, DL, Tys, Ops, MemVT,
8862 DAG.getMachineFunction().getMachineMemOperand(
8863 Mem->getMemOperand(), Offset, MemVT.getStoreSize()));
8864 DAG.makeEquivalentMemoryOrdering(SDValue(Mem, 1), BcstLd.getValue(1));
8865 return BcstLd;
8866}
8867
8868/// Returns the scalar element that will make up the i'th
8869/// element of the result of the vector shuffle.
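// For example (illustrative): querying index 0 of
// shufflevector(<4 x i32> %a, <4 x i32> %b) whose mask element 0 is 6 recurses
// into %b and returns whatever scalar forms element 2 of %b (or an empty
// SDValue if it cannot be determined).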
8870static SDValue getShuffleScalarElt(SDValue Op, unsigned Index,
8871 SelectionDAG &DAG, unsigned Depth) {
8872 if (Depth >= SelectionDAG::MaxRecursionDepth)
8873 return SDValue(); // Limit search depth.
8874
8875 EVT VT = Op.getValueType();
8876 unsigned Opcode = Op.getOpcode();
8877 unsigned NumElems = VT.getVectorNumElements();
8878
8879 // Recurse into ISD::VECTOR_SHUFFLE node to find scalars.
8880 if (auto *SV = dyn_cast<ShuffleVectorSDNode>(Op)) {
8881 int Elt = SV->getMaskElt(Index);
8882
8883 if (Elt < 0)
8884 return DAG.getUNDEF(VT.getVectorElementType());
8885
8886 SDValue Src = (Elt < (int)NumElems) ? SV->getOperand(0) : SV->getOperand(1);
8887 return getShuffleScalarElt(Src, Elt % NumElems, DAG, Depth + 1);
8888 }
8889
8890 // Recurse into target specific vector shuffles to find scalars.
8891 if (isTargetShuffle(Opcode)) {
8892 MVT ShufVT = VT.getSimpleVT();
8893 MVT ShufSVT = ShufVT.getVectorElementType();
8894 int NumElems = (int)ShufVT.getVectorNumElements();
8895 SmallVector<int, 16> ShuffleMask;
8896 SmallVector<SDValue, 16> ShuffleOps;
8897 if (!getTargetShuffleMask(Op.getNode(), ShufVT, true, ShuffleOps,
8898 ShuffleMask))
8899 return SDValue();
8900
8901 int Elt = ShuffleMask[Index];
8902 if (Elt == SM_SentinelZero)
8903 return ShufSVT.isInteger() ? DAG.getConstant(0, SDLoc(Op), ShufSVT)
8904 : DAG.getConstantFP(+0.0, SDLoc(Op), ShufSVT);
8905 if (Elt == SM_SentinelUndef)
8906 return DAG.getUNDEF(ShufSVT);
8907
8908    assert(0 <= Elt && Elt < (2 * NumElems) && "Shuffle index out of range");
8909 SDValue Src = (Elt < NumElems) ? ShuffleOps[0] : ShuffleOps[1];
8910 return getShuffleScalarElt(Src, Elt % NumElems, DAG, Depth + 1);
8911 }
8912
8913 // Recurse into insert_subvector base/sub vector to find scalars.
8914 if (Opcode == ISD::INSERT_SUBVECTOR) {
8915 SDValue Vec = Op.getOperand(0);
8916 SDValue Sub = Op.getOperand(1);
8917 uint64_t SubIdx = Op.getConstantOperandVal(2);
8918 unsigned NumSubElts = Sub.getValueType().getVectorNumElements();
8919
8920 if (SubIdx <= Index && Index < (SubIdx + NumSubElts))
8921 return getShuffleScalarElt(Sub, Index - SubIdx, DAG, Depth + 1);
8922 return getShuffleScalarElt(Vec, Index, DAG, Depth + 1);
8923 }
8924
8925 // Recurse into concat_vectors sub vector to find scalars.
8926 if (Opcode == ISD::CONCAT_VECTORS) {
8927 EVT SubVT = Op.getOperand(0).getValueType();
8928 unsigned NumSubElts = SubVT.getVectorNumElements();
8929 uint64_t SubIdx = Index / NumSubElts;
8930 uint64_t SubElt = Index % NumSubElts;
8931 return getShuffleScalarElt(Op.getOperand(SubIdx), SubElt, DAG, Depth + 1);
8932 }
8933
8934 // Recurse into extract_subvector src vector to find scalars.
8935 if (Opcode == ISD::EXTRACT_SUBVECTOR) {
8936 SDValue Src = Op.getOperand(0);
8937 uint64_t SrcIdx = Op.getConstantOperandVal(1);
8938 return getShuffleScalarElt(Src, Index + SrcIdx, DAG, Depth + 1);
8939 }
8940
8941 // We only peek through bitcasts of the same vector width.
8942 if (Opcode == ISD::BITCAST) {
8943 SDValue Src = Op.getOperand(0);
8944 EVT SrcVT = Src.getValueType();
8945 if (SrcVT.isVector() && SrcVT.getVectorNumElements() == NumElems)
8946 return getShuffleScalarElt(Src, Index, DAG, Depth + 1);
8947 return SDValue();
8948 }
8949
8950 // Actual nodes that may contain scalar elements
8951
8952 // For insert_vector_elt - either return the index matching scalar or recurse
8953 // into the base vector.
8954 if (Opcode == ISD::INSERT_VECTOR_ELT &&
8955 isa<ConstantSDNode>(Op.getOperand(2))) {
8956 if (Op.getConstantOperandAPInt(2) == Index)
8957 return Op.getOperand(1);
8958 return getShuffleScalarElt(Op.getOperand(0), Index, DAG, Depth + 1);
8959 }
8960
8961 if (Opcode == ISD::SCALAR_TO_VECTOR)
8962 return (Index == 0) ? Op.getOperand(0)
8963 : DAG.getUNDEF(VT.getVectorElementType());
8964
8965 if (Opcode == ISD::BUILD_VECTOR)
8966 return Op.getOperand(Index);
8967
8968 return SDValue();
8969}
8970
8971// Use PINSRB/PINSRW/PINSRD to create a build vector.
8972static SDValue LowerBuildVectorAsInsert(SDValue Op, const APInt &NonZeroMask,
8973 unsigned NumNonZero, unsigned NumZero,
8974 SelectionDAG &DAG,
8975 const X86Subtarget &Subtarget) {
8976 MVT VT = Op.getSimpleValueType();
8977 unsigned NumElts = VT.getVectorNumElements();
8978  assert(((VT == MVT::v8i16 && Subtarget.hasSSE2()) ||
8979          ((VT == MVT::v16i8 || VT == MVT::v4i32) && Subtarget.hasSSE41())) &&
8980         "Illegal vector insertion");
8981
8982 SDLoc dl(Op);
8983 SDValue V;
8984 bool First = true;
8985
8986 for (unsigned i = 0; i < NumElts; ++i) {
8987 bool IsNonZero = NonZeroMask[i];
8988 if (!IsNonZero)
8989 continue;
8990
8991    // If the build vector contains zeros or our first insertion is not the
8992    // first index, then insert into a zero vector to break any register
8993    // dependency; else use SCALAR_TO_VECTOR.
8994 if (First) {
8995 First = false;
8996 if (NumZero || 0 != i)
8997 V = getZeroVector(VT, Subtarget, DAG, dl);
8998 else {
8999        assert(0 == i && "Expected insertion into zero-index");
9000 V = DAG.getAnyExtOrTrunc(Op.getOperand(i), dl, MVT::i32);
9001 V = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, V);
9002 V = DAG.getBitcast(VT, V);
9003 continue;
9004 }
9005 }
9006 V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, V, Op.getOperand(i),
9007 DAG.getIntPtrConstant(i, dl));
9008 }
9009
9010 return V;
9011}
9012
9013/// Custom lower build_vector of v16i8.
9014static SDValue LowerBuildVectorv16i8(SDValue Op, const APInt &NonZeroMask,
9015 unsigned NumNonZero, unsigned NumZero,
9016 SelectionDAG &DAG,
9017 const X86Subtarget &Subtarget) {
9018 if (NumNonZero > 8 && !Subtarget.hasSSE41())
9019 return SDValue();
9020
9021 // SSE4.1 - use PINSRB to insert each byte directly.
9022 if (Subtarget.hasSSE41())
9023 return LowerBuildVectorAsInsert(Op, NonZeroMask, NumNonZero, NumZero, DAG,
9024 Subtarget);
9025
9026 SDLoc dl(Op);
9027 SDValue V;
9028
9029 // Pre-SSE4.1 - merge byte pairs and insert with PINSRW.
9030 for (unsigned i = 0; i < 16; i += 2) {
9031 bool ThisIsNonZero = NonZeroMask[i];
9032 bool NextIsNonZero = NonZeroMask[i + 1];
9033 if (!ThisIsNonZero && !NextIsNonZero)
9034 continue;
9035
9036 // FIXME: Investigate combining the first 4 bytes as a i32 instead.
9037 SDValue Elt;
9038 if (ThisIsNonZero) {
9039 if (NumZero || NextIsNonZero)
9040 Elt = DAG.getZExtOrTrunc(Op.getOperand(i), dl, MVT::i32);
9041 else
9042 Elt = DAG.getAnyExtOrTrunc(Op.getOperand(i), dl, MVT::i32);
9043 }
9044
9045 if (NextIsNonZero) {
9046 SDValue NextElt = Op.getOperand(i + 1);
9047 if (i == 0 && NumZero)
9048 NextElt = DAG.getZExtOrTrunc(NextElt, dl, MVT::i32);
9049 else
9050 NextElt = DAG.getAnyExtOrTrunc(NextElt, dl, MVT::i32);
9051 NextElt = DAG.getNode(ISD::SHL, dl, MVT::i32, NextElt,
9052 DAG.getConstant(8, dl, MVT::i8));
9053 if (ThisIsNonZero)
9054 Elt = DAG.getNode(ISD::OR, dl, MVT::i32, NextElt, Elt);
9055 else
9056 Elt = NextElt;
9057 }
9058
9059 // If our first insertion is not the first index or zeros are needed, then
9060 // insert into zero vector. Otherwise, use SCALAR_TO_VECTOR (leaves high
9061 // elements undefined).
9062 if (!V) {
9063 if (i != 0 || NumZero)
9064 V = getZeroVector(MVT::v8i16, Subtarget, DAG, dl);
9065 else {
9066 V = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, Elt);
9067 V = DAG.getBitcast(MVT::v8i16, V);
9068 continue;
9069 }
9070 }
9071 Elt = DAG.getNode(ISD::TRUNCATE, dl, MVT::i16, Elt);
9072 V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, V, Elt,
9073 DAG.getIntPtrConstant(i / 2, dl));
9074 }
9075
9076 return DAG.getBitcast(MVT::v16i8, V);
9077}
9078
9079/// Custom lower build_vector of v8i16.
9080static SDValue LowerBuildVectorv8i16(SDValue Op, const APInt &NonZeroMask,
9081 unsigned NumNonZero, unsigned NumZero,
9082 SelectionDAG &DAG,
9083 const X86Subtarget &Subtarget) {
9084 if (NumNonZero > 4 && !Subtarget.hasSSE41())
9085 return SDValue();
9086
9087 // Use PINSRW to insert each byte directly.
9088 return LowerBuildVectorAsInsert(Op, NonZeroMask, NumNonZero, NumZero, DAG,
9089 Subtarget);
9090}
9091
9092/// Custom lower build_vector of v4i32 or v4f32.
9093static SDValue LowerBuildVectorv4x32(SDValue Op, SelectionDAG &DAG,
9094 const X86Subtarget &Subtarget) {
9095 // If this is a splat of a pair of elements, use MOVDDUP (unless the target
9096 // has XOP; in that case defer lowering to potentially use VPERMIL2PS).
9097 // Because we're creating a less complicated build vector here, we may enable
9098 // further folding of the MOVDDUP via shuffle transforms.
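  // For example (illustrative): build_vector <a,b,a,b> with a != b becomes the
  // build vector <a,b,undef,undef>, bitcast to v2f64, duplicated with MOVDDUP,
  // and bitcast back to the original type.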
9099 if (Subtarget.hasSSE3() && !Subtarget.hasXOP() &&
9100 Op.getOperand(0) == Op.getOperand(2) &&
9101 Op.getOperand(1) == Op.getOperand(3) &&
9102 Op.getOperand(0) != Op.getOperand(1)) {
9103 SDLoc DL(Op);
9104 MVT VT = Op.getSimpleValueType();
9105 MVT EltVT = VT.getVectorElementType();
9106 // Create a new build vector with the first 2 elements followed by undef
9107 // padding, bitcast to v2f64, duplicate, and bitcast back.
9108 SDValue Ops[4] = { Op.getOperand(0), Op.getOperand(1),
9109 DAG.getUNDEF(EltVT), DAG.getUNDEF(EltVT) };
9110 SDValue NewBV = DAG.getBitcast(MVT::v2f64, DAG.getBuildVector(VT, DL, Ops));
9111 SDValue Dup = DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v2f64, NewBV);
9112 return DAG.getBitcast(VT, Dup);
9113 }
9114
9115 // Find all zeroable elements.
9116 std::bitset<4> Zeroable, Undefs;
9117 for (int i = 0; i < 4; ++i) {
9118 SDValue Elt = Op.getOperand(i);
9119 Undefs[i] = Elt.isUndef();
9120 Zeroable[i] = (Elt.isUndef() || X86::isZeroNode(Elt));
9121 }
9122  assert(Zeroable.size() - Zeroable.count() > 1 &&
9123         "We expect at least two non-zero elements!");
9124
9125 // We only know how to deal with build_vector nodes where elements are either
9126 // zeroable or extract_vector_elt with constant index.
9127 SDValue FirstNonZero;
9128 unsigned FirstNonZeroIdx;
9129 for (unsigned i = 0; i < 4; ++i) {
9130 if (Zeroable[i])
9131 continue;
9132 SDValue Elt = Op.getOperand(i);
9133 if (Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
9134 !isa<ConstantSDNode>(Elt.getOperand(1)))
9135 return SDValue();
9136 // Make sure that this node is extracting from a 128-bit vector.
9137 MVT VT = Elt.getOperand(0).getSimpleValueType();
9138 if (!VT.is128BitVector())
9139 return SDValue();
9140 if (!FirstNonZero.getNode()) {
9141 FirstNonZero = Elt;
9142 FirstNonZeroIdx = i;
9143 }
9144 }
9145
9146  assert(FirstNonZero.getNode() && "Unexpected build vector of all zeros!");
9147 SDValue V1 = FirstNonZero.getOperand(0);
9148 MVT VT = V1.getSimpleValueType();
9149
9150 // See if this build_vector can be lowered as a blend with zero.
9151 SDValue Elt;
9152 unsigned EltMaskIdx, EltIdx;
9153 int Mask[4];
9154 for (EltIdx = 0; EltIdx < 4; ++EltIdx) {
9155 if (Zeroable[EltIdx]) {
9156 // The zero vector will be on the right hand side.
9157 Mask[EltIdx] = EltIdx+4;
9158 continue;
9159 }
9160
9161 Elt = Op->getOperand(EltIdx);
9162 // By construction, Elt is a EXTRACT_VECTOR_ELT with constant index.
9163 EltMaskIdx = Elt.getConstantOperandVal(1);
9164 if (Elt.getOperand(0) != V1 || EltMaskIdx != EltIdx)
9165 break;
9166 Mask[EltIdx] = EltIdx;
9167 }
9168
9169 if (EltIdx == 4) {
9170 // Let the shuffle legalizer deal with blend operations.
9171 SDValue VZeroOrUndef = (Zeroable == Undefs)
9172 ? DAG.getUNDEF(VT)
9173 : getZeroVector(VT, Subtarget, DAG, SDLoc(Op));
9174 if (V1.getSimpleValueType() != VT)
9175 V1 = DAG.getBitcast(VT, V1);
9176 return DAG.getVectorShuffle(VT, SDLoc(V1), V1, VZeroOrUndef, Mask);
9177 }
9178
9179 // See if we can lower this build_vector to a INSERTPS.
9180 if (!Subtarget.hasSSE41())
9181 return SDValue();
9182
9183 SDValue V2 = Elt.getOperand(0);
9184 if (Elt == FirstNonZero && EltIdx == FirstNonZeroIdx)
9185 V1 = SDValue();
9186
9187 bool CanFold = true;
9188 for (unsigned i = EltIdx + 1; i < 4 && CanFold; ++i) {
9189 if (Zeroable[i])
9190 continue;
9191
9192 SDValue Current = Op->getOperand(i);
9193 SDValue SrcVector = Current->getOperand(0);
9194 if (!V1.getNode())
9195 V1 = SrcVector;
9196 CanFold = (SrcVector == V1) && (Current.getConstantOperandAPInt(1) == i);
9197 }
9198
9199 if (!CanFold)
9200 return SDValue();
9201
9202  assert(V1.getNode() && "Expected at least two non-zero elements!");
9203 if (V1.getSimpleValueType() != MVT::v4f32)
9204 V1 = DAG.getBitcast(MVT::v4f32, V1);
9205 if (V2.getSimpleValueType() != MVT::v4f32)
9206 V2 = DAG.getBitcast(MVT::v4f32, V2);
9207
9208 // Ok, we can emit an INSERTPS instruction.
9209 unsigned ZMask = Zeroable.to_ulong();
9210
9211 unsigned InsertPSMask = EltMaskIdx << 6 | EltIdx << 4 | ZMask;
9212  assert((InsertPSMask & ~0xFFu) == 0 && "Invalid mask!");
9213 SDLoc DL(Op);
9214 SDValue Result = DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32, V1, V2,
9215 DAG.getIntPtrConstant(InsertPSMask, DL, true));
9216 return DAG.getBitcast(VT, Result);
9217}
9218
9219/// Return a vector logical shift node.
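/// For example, shifting a v2i64 value left by 64 bits is emitted as a byte
/// shift: the source is bitcast to v16i8, an X86ISD::VSHLDQ node with an
/// immediate of 8 bytes is created, and the result is bitcast back to v2i64.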
9220static SDValue getVShift(bool isLeft, EVT VT, SDValue SrcOp, unsigned NumBits,
9221 SelectionDAG &DAG, const TargetLowering &TLI,
9222 const SDLoc &dl) {
9223 assert(VT.is128BitVector() && "Unknown type for VShift");
9224 MVT ShVT = MVT::v16i8;
9225 unsigned Opc = isLeft ? X86ISD::VSHLDQ : X86ISD::VSRLDQ;
9226 SrcOp = DAG.getBitcast(ShVT, SrcOp);
9227 assert(NumBits % 8 == 0 && "Only support byte sized shifts");
9228 SDValue ShiftVal = DAG.getTargetConstant(NumBits / 8, dl, MVT::i8);
9229 return DAG.getBitcast(VT, DAG.getNode(Opc, dl, ShVT, SrcOp, ShiftVal));
9230}
9231
9232static SDValue LowerAsSplatVectorLoad(SDValue SrcOp, MVT VT, const SDLoc &dl,
9233 SelectionDAG &DAG) {
9234
9235 // Check if the scalar load can be widened into a vector load, and if
9236 // the address is "base + cst", see if the cst can be "absorbed" into
9237 // the shuffle mask.
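// For example, a 32-bit scalar load at byte offset 8 from a 16-byte aligned
// stack slot can be rewritten as a v4i32 load of the enclosing 16-byte chunk
// whose shuffle mask splats element 2 (offset 8 / 4).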
9238 if (LoadSDNode *LD = dyn_cast<LoadSDNode>(SrcOp)) {
9239 SDValue Ptr = LD->getBasePtr();
9240 if (!ISD::isNormalLoad(LD) || !LD->isSimple())
9241 return SDValue();
9242 EVT PVT = LD->getValueType(0);
9243 if (PVT != MVT::i32 && PVT != MVT::f32)
9244 return SDValue();
9245
9246 int FI = -1;
9247 int64_t Offset = 0;
9248 if (FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr)) {
9249 FI = FINode->getIndex();
9250 Offset = 0;
9251 } else if (DAG.isBaseWithConstantOffset(Ptr) &&
9252 isa<FrameIndexSDNode>(Ptr.getOperand(0))) {
9253 FI = cast<FrameIndexSDNode>(Ptr.getOperand(0))->getIndex();
9254 Offset = Ptr.getConstantOperandVal(1);
9255 Ptr = Ptr.getOperand(0);
9256 } else {
9257 return SDValue();
9258 }
9259
9260 // FIXME: 256-bit vector instructions don't require a strict alignment;
9261 // improve this code to support them better.
9262 Align RequiredAlign(VT.getSizeInBits() / 8);
9263 SDValue Chain = LD->getChain();
9264 // Make sure the stack object alignment is at least 16 or 32.
9265 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
9266 MaybeAlign InferredAlign = DAG.InferPtrAlign(Ptr);
9267 if (!InferredAlign || *InferredAlign < RequiredAlign) {
9268 if (MFI.isFixedObjectIndex(FI)) {
9269 // Can't change the alignment. FIXME: It's possible to compute the exact
9270 // stack offset and reference FI + adjusted offset instead; that's the way
9271 // to implement it if someone *really* cares about this.
9272 return SDValue();
9273 } else {
9274 MFI.setObjectAlignment(FI, RequiredAlign);
9275 }
9276 }
9277
9278 // (Offset % 16 or 32) must be a multiple of 4. The address is then
9279 // Ptr + (Offset & ~15).
9280 if (Offset < 0)
9281 return SDValue();
9282 if ((Offset % RequiredAlign.value()) & 3)
9283 return SDValue();
9284 int64_t StartOffset = Offset & ~int64_t(RequiredAlign.value() - 1);
9285 if (StartOffset) {
9286 SDLoc DL(Ptr);
9287 Ptr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr,
9288 DAG.getConstant(StartOffset, DL, Ptr.getValueType()));
9289 }
9290
9291 int EltNo = (Offset - StartOffset) >> 2;
9292 unsigned NumElems = VT.getVectorNumElements();
9293
9294 EVT NVT = EVT::getVectorVT(*DAG.getContext(), PVT, NumElems);
9295 SDValue V1 = DAG.getLoad(NVT, dl, Chain, Ptr,
9296 LD->getPointerInfo().getWithOffset(StartOffset));
9297
9298 SmallVector<int, 8> Mask(NumElems, EltNo);
9299
9300 return DAG.getVectorShuffle(NVT, dl, V1, DAG.getUNDEF(NVT), Mask);
9301 }
9302
9303 return SDValue();
9304}
9305
9306 // Recurse to find a LoadSDNode source and the accumulated ByteOffset.
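// For example, (srl (i64 load), 32) resolves to that load with a ByteOffset
// of 4, and (extract_vector_elt (v4i32 load), 2) resolves to that load with a
// ByteOffset of 8.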
9307static bool findEltLoadSrc(SDValue Elt, LoadSDNode *&Ld, int64_t &ByteOffset) {
9308 if (ISD::isNON_EXTLoad(Elt.getNode())) {
9309 auto *BaseLd = cast<LoadSDNode>(Elt);
9310 if (!BaseLd->isSimple())
9311 return false;
9312 Ld = BaseLd;
9313 ByteOffset = 0;
9314 return true;
9315 }
9316
9317 switch (Elt.getOpcode()) {
9318 case ISD::BITCAST:
9319 case ISD::TRUNCATE:
9320 case ISD::SCALAR_TO_VECTOR:
9321 return findEltLoadSrc(Elt.getOperand(0), Ld, ByteOffset);
9322 case ISD::SRL:
9323 if (auto *AmtC = dyn_cast<ConstantSDNode>(Elt.getOperand(1))) {
9324 uint64_t Amt = AmtC->getZExtValue();
9325 if ((Amt % 8) == 0 && findEltLoadSrc(Elt.getOperand(0), Ld, ByteOffset)) {
9326 ByteOffset += Amt / 8;
9327 return true;
9328 }
9329 }
9330 break;
9331 case ISD::EXTRACT_VECTOR_ELT:
9332 if (auto *IdxC = dyn_cast<ConstantSDNode>(Elt.getOperand(1))) {
9333 SDValue Src = Elt.getOperand(0);
9334 unsigned SrcSizeInBits = Src.getScalarValueSizeInBits();
9335 unsigned DstSizeInBits = Elt.getScalarValueSizeInBits();
9336 if (DstSizeInBits == SrcSizeInBits && (SrcSizeInBits % 8) == 0 &&
9337 findEltLoadSrc(Src, Ld, ByteOffset)) {
9338 uint64_t Idx = IdxC->getZExtValue();
9339 ByteOffset += Idx * (SrcSizeInBits / 8);
9340 return true;
9341 }
9342 }
9343 break;
9344 }
9345
9346 return false;
9347}
9348
9349/// Given the initializing elements 'Elts' of a vector of type 'VT', see if the
9350/// elements can be replaced by a single large load which has the same value as
9351/// a build_vector or insert_subvector whose loaded operands are 'Elts'.
9352///
9353/// Example: <load i32 *a, load i32 *a+4, zero, undef> -> zextload a
9354static SDValue EltsFromConsecutiveLoads(EVT VT, ArrayRef<SDValue> Elts,
9355 const SDLoc &DL, SelectionDAG &DAG,
9356 const X86Subtarget &Subtarget,
9357 bool IsAfterLegalize) {
9358 if ((VT.getScalarSizeInBits() % 8) != 0)
9359 return SDValue();
9360
9361 unsigned NumElems = Elts.size();
9362
9363 int LastLoadedElt = -1;
9364 APInt LoadMask = APInt::getZero(NumElems);
9365 APInt ZeroMask = APInt::getZero(NumElems);
9366 APInt UndefMask = APInt::getZero(NumElems);
9367
9368 SmallVector<LoadSDNode*, 8> Loads(NumElems, nullptr);
9369 SmallVector<int64_t, 8> ByteOffsets(NumElems, 0);
9370
9371 // For each element in the initializer, see if we've found a load, zero or an
9372 // undef.
9373 for (unsigned i = 0; i < NumElems; ++i) {
9374 SDValue Elt = peekThroughBitcasts(Elts[i]);
9375 if (!Elt.getNode())
9376 return SDValue();
9377 if (Elt.isUndef()) {
9378 UndefMask.setBit(i);
9379 continue;
9380 }
9381 if (X86::isZeroNode(Elt) || ISD::isBuildVectorAllZeros(Elt.getNode())) {
9382 ZeroMask.setBit(i);
9383 continue;
9384 }
9385
9386 // Each loaded element must be the correct fractional portion of the
9387 // requested vector load.
9388 unsigned EltSizeInBits = Elt.getValueSizeInBits();
9389 if ((NumElems * EltSizeInBits) != VT.getSizeInBits())
9390 return SDValue();
9391
9392 if (!findEltLoadSrc(Elt, Loads[i], ByteOffsets[i]) || ByteOffsets[i] < 0)
9393 return SDValue();
9394 unsigned LoadSizeInBits = Loads[i]->getValueSizeInBits(0);
9395 if (((ByteOffsets[i] * 8) + EltSizeInBits) > LoadSizeInBits)
9396 return SDValue();
9397
9398 LoadMask.setBit(i);
9399 LastLoadedElt = i;
9400 }
9401 assert((ZeroMask.popcount() + UndefMask.popcount() + LoadMask.popcount()) ==
9402 NumElems &&
9403 "Incomplete element masks");
9404
9405 // Handle Special Cases - all undef or undef/zero.
9406 if (UndefMask.popcount() == NumElems)
9407 return DAG.getUNDEF(VT);
9408 if ((ZeroMask.popcount() + UndefMask.popcount()) == NumElems)
9409 return VT.isInteger() ? DAG.getConstant(0, DL, VT)
9410 : DAG.getConstantFP(0.0, DL, VT);
9411
9412 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
9413 int FirstLoadedElt = LoadMask.countr_zero();
9414 SDValue EltBase = peekThroughBitcasts(Elts[FirstLoadedElt]);
9415 EVT EltBaseVT = EltBase.getValueType();
9416 assert(EltBaseVT.getSizeInBits() == EltBaseVT.getStoreSizeInBits() &&
9417 "Register/Memory size mismatch");
9418 LoadSDNode *LDBase = Loads[FirstLoadedElt];
9419 assert(LDBase && "Did not find base load for merging consecutive loads");
9420 unsigned BaseSizeInBits = EltBaseVT.getStoreSizeInBits();
9421 unsigned BaseSizeInBytes = BaseSizeInBits / 8;
9422 int NumLoadedElts = (1 + LastLoadedElt - FirstLoadedElt);
9423 int LoadSizeInBits = NumLoadedElts * BaseSizeInBits;
9424 assert((BaseSizeInBits % 8) == 0 && "Sub-byte element loads detected");
9425
9426 // TODO: Support offsetting the base load.
9427 if (ByteOffsets[FirstLoadedElt] != 0)
9428 return SDValue();
9429
9430 // Check to see if the element's load is consecutive to the base load
9431 // or offset from a previous (already checked) load.
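// For example, with 32-bit base elements, an element at index 2 that comes
// from the same load as element 0 but at byte offset 8 maps back to base
// index 0 and is accepted without a separate consecutiveness check.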
9432 auto CheckConsecutiveLoad = [&](LoadSDNode *Base, int EltIdx) {
9433 LoadSDNode *Ld = Loads[EltIdx];
9434 int64_t ByteOffset = ByteOffsets[EltIdx];
9435 if (ByteOffset && (ByteOffset % BaseSizeInBytes) == 0) {
9436 int64_t BaseIdx = EltIdx - (ByteOffset / BaseSizeInBytes);
9437 return (0 <= BaseIdx && BaseIdx < (int)NumElems && LoadMask[BaseIdx] &&
9438 Loads[BaseIdx] == Ld && ByteOffsets[BaseIdx] == 0);
9439 }
9440 return DAG.areNonVolatileConsecutiveLoads(Ld, Base, BaseSizeInBytes,
9441 EltIdx - FirstLoadedElt);
9442 };
9443
9444 // Consecutive loads can contain UNDEFs but not ZERO elements.
9445 // Consecutive loads with UNDEF and ZERO elements require an
9446 // additional shuffle stage to clear the ZERO elements.
9447 bool IsConsecutiveLoad = true;
9448 bool IsConsecutiveLoadWithZeros = true;
9449 for (int i = FirstLoadedElt + 1; i <= LastLoadedElt; ++i) {
9450 if (LoadMask[i]) {
9451 if (!CheckConsecutiveLoad(LDBase, i)) {
9452 IsConsecutiveLoad = false;
9453 IsConsecutiveLoadWithZeros = false;
9454 break;
9455 }
9456 } else if (ZeroMask[i]) {
9457 IsConsecutiveLoad = false;
9458 }
9459 }
9460
9461 auto CreateLoad = [&DAG, &DL, &Loads](EVT VT, LoadSDNode *LDBase) {
9462 auto MMOFlags = LDBase->getMemOperand()->getFlags();
9463 assert(LDBase->isSimple() &&
9464 "Cannot merge volatile or atomic loads.");
9465 SDValue NewLd =
9466 DAG.getLoad(VT, DL, LDBase->getChain(), LDBase->getBasePtr(),
9467 LDBase->getPointerInfo(), LDBase->getOriginalAlign(),
9468 MMOFlags);
9469 for (auto *LD : Loads)
9470 if (LD)
9471 DAG.makeEquivalentMemoryOrdering(LD, NewLd);
9472 return NewLd;
9473 };
9474
9475 // Check if the base load is entirely dereferenceable.
9476 bool IsDereferenceable = LDBase->getPointerInfo().isDereferenceable(
9477 VT.getSizeInBits() / 8, *DAG.getContext(), DAG.getDataLayout());
9478
9479 // LOAD - all consecutive load/undefs (must start/end with a load or be
9480 // entirely dereferenceable). If we have found an entire vector of loads and
9481 // undefs, then return a large load of the entire vector width starting at the
9482 // base pointer. If the vector contains zeros, then attempt to shuffle those
9483 // elements.
9484 if (FirstLoadedElt == 0 &&
9485 (NumLoadedElts == (int)NumElems || IsDereferenceable) &&
9486 (IsConsecutiveLoad || IsConsecutiveLoadWithZeros)) {
9487 if (IsAfterLegalize && !TLI.isOperationLegal(ISD::LOAD, VT))
9488 return SDValue();
9489
9490 // Don't create 256-bit non-temporal aligned loads without AVX2 as these
9491 // will lower to regular temporal loads and use the cache.
9492 if (LDBase->isNonTemporal() && LDBase->getAlign() >= Align(32) &&
9493 VT.is256BitVector() && !Subtarget.hasInt256())
9494 return SDValue();
9495
9496 if (NumElems == 1)
9497 return DAG.getBitcast(VT, Elts[FirstLoadedElt]);
9498
9499 if (!ZeroMask)
9500 return CreateLoad(VT, LDBase);
9501
9502 // IsConsecutiveLoadWithZeros - we need to create a shuffle of the loaded
9503 // vector and a zero vector to clear out the zero elements.
9504 if (!IsAfterLegalize && VT.isVector()) {
9505 unsigned NumMaskElts = VT.getVectorNumElements();
9506 if ((NumMaskElts % NumElems) == 0) {
9507 unsigned Scale = NumMaskElts / NumElems;
9508 SmallVector<int, 4> ClearMask(NumMaskElts, -1);
9509 for (unsigned i = 0; i < NumElems; ++i) {
9510 if (UndefMask[i])
9511 continue;
9512 int Offset = ZeroMask[i] ? NumMaskElts : 0;
9513 for (unsigned j = 0; j != Scale; ++j)
9514 ClearMask[(i * Scale) + j] = (i * Scale) + j + Offset;
9515 }
9516 SDValue V = CreateLoad(VT, LDBase);
9517 SDValue Z = VT.isInteger() ? DAG.getConstant(0, DL, VT)
9518 : DAG.getConstantFP(0.0, DL, VT);
9519 return DAG.getVectorShuffle(VT, DL, V, Z, ClearMask);
9520 }
9521 }
9522 }
9523
9524 // If the upper half of a ymm/zmm load is undef then just load the lower half.
9525 if (VT.is256BitVector() || VT.is512BitVector()) {
9526 unsigned HalfNumElems = NumElems / 2;
9527 if (UndefMask.extractBits(HalfNumElems, HalfNumElems).isAllOnes()) {
9528 EVT HalfVT =
9529 EVT::getVectorVT(*DAG.getContext(), VT.getScalarType(), HalfNumElems);
9530 SDValue HalfLD =
9531 EltsFromConsecutiveLoads(HalfVT, Elts.drop_back(HalfNumElems), DL,
9532 DAG, Subtarget, IsAfterLegalize);
9533 if (HalfLD)
9534 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT),
9535 HalfLD, DAG.getIntPtrConstant(0, DL));
9536 }
9537 }
9538
9539 // VZEXT_LOAD - consecutive 32/64-bit load/undefs followed by zeros/undefs.
9540 if (IsConsecutiveLoad && FirstLoadedElt == 0 &&
9541 ((LoadSizeInBits == 16 && Subtarget.hasFP16()) || LoadSizeInBits == 32 ||
9542 LoadSizeInBits == 64) &&
9543 ((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()))) {
9544 MVT VecSVT = VT.isFloatingPoint() ? MVT::getFloatingPointVT(LoadSizeInBits)
9545 : MVT::getIntegerVT(LoadSizeInBits);
9546 MVT VecVT = MVT::getVectorVT(VecSVT, VT.getSizeInBits() / LoadSizeInBits);
9547 // Allow v4f32 on SSE1 only targets.
9548 // FIXME: Add more isel patterns so we can just use VT directly.
9549 if (!Subtarget.hasSSE2() && VT == MVT::v4f32)
9550 VecVT = MVT::v4f32;
9551 if (TLI.isTypeLegal(VecVT)) {
9552 SDVTList Tys = DAG.getVTList(VecVT, MVT::Other);
9553 SDValue Ops[] = { LDBase->getChain(), LDBase->getBasePtr() };
9554 SDValue ResNode = DAG.getMemIntrinsicNode(
9555 X86ISD::VZEXT_LOAD, DL, Tys, Ops, VecSVT, LDBase->getPointerInfo(),
9556 LDBase->getOriginalAlign(), MachineMemOperand::MOLoad);
9557 for (auto *LD : Loads)
9558 if (LD)
9559 DAG.makeEquivalentMemoryOrdering(LD, ResNode);
9560 return DAG.getBitcast(VT, ResNode);
9561 }
9562 }
9563
9564 // BROADCAST - match the smallest possible repetition pattern, load that
9565 // scalar/subvector element and then broadcast to the entire vector.
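// For example, a v8i32 build vector whose loaded elements repeat every two
// elements can be lowered as a single 64-bit load that is broadcast to all
// four 64-bit lanes and then bitcast back to v8i32.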
9566 if (ZeroMask.isZero() && isPowerOf2_32(NumElems) && Subtarget.hasAVX() &&
9567 (VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector())) {
9568 for (unsigned SubElems = 1; SubElems < NumElems; SubElems *= 2) {
9569 unsigned RepeatSize = SubElems * BaseSizeInBits;
9570 unsigned ScalarSize = std::min(RepeatSize, 64u);
9571 if (!Subtarget.hasAVX2() && ScalarSize < 32)
9572 continue;
9573
9574 // Don't attempt a 1:N subvector broadcast - it should be caught by
9575 // combineConcatVectorOps, otherwise it will cause infinite loops.
9576 if (RepeatSize > ScalarSize && SubElems == 1)
9577 continue;
9578
9579 bool Match = true;
9580 SmallVector<SDValue, 8> RepeatedLoads(SubElems, DAG.getUNDEF(EltBaseVT));
9581 for (unsigned i = 0; i != NumElems && Match; ++i) {
9582 if (!LoadMask[i])
9583 continue;
9584 SDValue Elt = peekThroughBitcasts(Elts[i]);
9585 if (RepeatedLoads[i % SubElems].isUndef())
9586 RepeatedLoads[i % SubElems] = Elt;
9587 else
9588 Match &= (RepeatedLoads[i % SubElems] == Elt);
9589 }
9590
9591 // We must have loads at both ends of the repetition.
9592 Match &= !RepeatedLoads.front().isUndef();
9593 Match &= !RepeatedLoads.back().isUndef();
9594 if (!Match)
9595 continue;
9596
9597 EVT RepeatVT =
9598 VT.isInteger() && (RepeatSize != 64 || TLI.isTypeLegal(MVT::i64))
9599 ? EVT::getIntegerVT(*DAG.getContext(), ScalarSize)
9600 : EVT::getFloatingPointVT(ScalarSize);
9601 if (RepeatSize > ScalarSize)
9602 RepeatVT = EVT::getVectorVT(*DAG.getContext(), RepeatVT,
9603 RepeatSize / ScalarSize);
9604 EVT BroadcastVT =
9605 EVT::getVectorVT(*DAG.getContext(), RepeatVT.getScalarType(),
9606 VT.getSizeInBits() / ScalarSize);
9607 if (TLI.isTypeLegal(BroadcastVT)) {
9608 if (SDValue RepeatLoad = EltsFromConsecutiveLoads(
9609 RepeatVT, RepeatedLoads, DL, DAG, Subtarget, IsAfterLegalize)) {
9610 SDValue Broadcast = RepeatLoad;
9611 if (RepeatSize > ScalarSize) {
9612 while (Broadcast.getValueSizeInBits() < VT.getSizeInBits())
9613 Broadcast = concatSubVectors(Broadcast, Broadcast, DAG, DL);
9614 } else {
9615 if (!Subtarget.hasAVX2() &&
9616 !X86::mayFoldLoadIntoBroadcastFromMem(
9617 RepeatLoad, RepeatVT.getScalarType().getSimpleVT(),
9618 Subtarget,
9619 /*AssumeSingleUse=*/true))
9620 return SDValue();
9621 Broadcast =
9622 DAG.getNode(X86ISD::VBROADCAST, DL, BroadcastVT, RepeatLoad);
9623 }
9624 return DAG.getBitcast(VT, Broadcast);
9625 }
9626 }
9627 }
9628 }
9629
9630 return SDValue();
9631}
9632
9633 // Combine a vector op (shuffles etc.) that is equal to build_vector load1,
9634 // load2, load3, load4, <0, 1, 2, 3> into a vector load if the load addresses
9635 // are consecutive, non-overlapping, and in the right order.
9636static SDValue combineToConsecutiveLoads(EVT VT, SDValue Op, const SDLoc &DL,
9637 SelectionDAG &DAG,
9638 const X86Subtarget &Subtarget,
9639 bool IsAfterLegalize) {
9640 SmallVector<SDValue, 64> Elts;
9641 for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; ++i) {
9642 if (SDValue Elt = getShuffleScalarElt(Op, i, DAG, 0)) {
9643 Elts.push_back(Elt);
9644 continue;
9645 }
9646 return SDValue();
9647 }
9648 assert(Elts.size() == VT.getVectorNumElements());
9649 return EltsFromConsecutiveLoads(VT, Elts, DL, DAG, Subtarget,
9650 IsAfterLegalize);
9651}
9652
9653static Constant *getConstantVector(MVT VT, const APInt &SplatValue,
9654 unsigned SplatBitSize, LLVMContext &C) {
9655 unsigned ScalarSize = VT.getScalarSizeInBits();
9656 unsigned NumElm = SplatBitSize / ScalarSize;
9657
9658 SmallVector<Constant *, 32> ConstantVec;
9659 for (unsigned i = 0; i < NumElm; i++) {
9660 APInt Val = SplatValue.extractBits(ScalarSize, ScalarSize * i);
9661 Constant *Const;
9662 if (VT.isFloatingPoint()) {
9663 if (ScalarSize == 16) {
9664 Const = ConstantFP::get(C, APFloat(APFloat::IEEEhalf(), Val));
9665 } else if (ScalarSize == 32) {
9666 Const = ConstantFP::get(C, APFloat(APFloat::IEEEsingle(), Val));
9667 } else {
9668 assert(ScalarSize == 64 && "Unsupported floating point scalar size");
9669 Const = ConstantFP::get(C, APFloat(APFloat::IEEEdouble(), Val));
9670 }
9671 } else
9672 Const = Constant::getIntegerValue(Type::getIntNTy(C, ScalarSize), Val);
9673 ConstantVec.push_back(Const);
9674 }
9675 return ConstantVector::get(ArrayRef<Constant *>(ConstantVec));
9676}
9677
9678static bool isFoldableUseOfShuffle(SDNode *N) {
9679 for (auto *U : N->uses()) {
9680 unsigned Opc = U->getOpcode();
9681 // VPERMV/VPERMV3 shuffles can never fold their index operands.
9682 if (Opc == X86ISD::VPERMV && U->getOperand(0).getNode() == N)
9683 return false;
9684 if (Opc == X86ISD::VPERMV3 && U->getOperand(1).getNode() == N)
9685 return false;
9686 if (isTargetShuffle(Opc))
9687 return true;
9688 if (Opc == ISD::BITCAST) // Ignore bitcasts
9689 return isFoldableUseOfShuffle(U);
9690 if (N->hasOneUse()) {
9691 // TODO: there may be some general way to know if an SDNode can
9692 // be folded. For now we only know whether an MI is foldable.
9693 if (Opc == X86ISD::VPDPBUSD && U->getOperand(2).getNode() != N)
9694 return false;
9695 return true;
9696 }
9697 }
9698 return false;
9699}
9700
9701/// Attempt to use the vbroadcast instruction to generate a splat value
9702/// from a splat BUILD_VECTOR which uses:
9703/// a. A single scalar load, or a constant.
9704/// b. Repeated pattern of constants (e.g. <0,1,0,1> or <0,1,2,3,0,1,2,3>).
9705///
9706/// The VBROADCAST node is returned when a pattern is found,
9707/// or SDValue() otherwise.
9708static SDValue lowerBuildVectorAsBroadcast(BuildVectorSDNode *BVOp,
9709 const X86Subtarget &Subtarget,
9710 SelectionDAG &DAG) {
9711 // VBROADCAST requires AVX.
9712 // TODO: Splats could be generated for non-AVX CPUs using SSE
9713 // instructions, but there's less potential gain for only 128-bit vectors.
9714 if (!Subtarget.hasAVX())
9715 return SDValue();
9716
9717 MVT VT = BVOp->getSimpleValueType(0);
9718 unsigned NumElts = VT.getVectorNumElements();
9719 SDLoc dl(BVOp);
9720
9721 assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()) &&
9722 "Unsupported vector type for broadcast.");
9723
9724 // See if the build vector is a repeating sequence of scalars (inc. splat).
9725 SDValue Ld;
9726 BitVector UndefElements;
9727 SmallVector<SDValue, 16> Sequence;
9728 if (BVOp->getRepeatedSequence(Sequence, &UndefElements)) {
9729 assert((NumElts % Sequence.size()) == 0 && "Sequence doesn't fit.");
9730 if (Sequence.size() == 1)
9731 Ld = Sequence[0];
9732 }
9733
9734 // Attempt to use VBROADCASTM
9735 // From this pattern:
9736 // a. t0 = (zext_i64 (bitcast_i8 v2i1 X))
9737 // b. t1 = (build_vector t0 t0)
9738 //
9739 // Create (VBROADCASTM v2i1 X)
9740 if (!Sequence.empty() && Subtarget.hasCDI()) {
9741 // If not a splat, are the upper sequence values zeroable?
9742 unsigned SeqLen = Sequence.size();
9743 bool UpperZeroOrUndef =
9744 SeqLen == 1 ||
9745 llvm::all_of(ArrayRef(Sequence).drop_front(), [](SDValue V) {
9746 return !V || V.isUndef() || isNullConstant(V);
9747 });
9748 SDValue Op0 = Sequence[0];
9749 if (UpperZeroOrUndef && ((Op0.getOpcode() == ISD::BITCAST) ||
9750 (Op0.getOpcode() == ISD::ZERO_EXTEND &&
9751 Op0.getOperand(0).getOpcode() == ISD::BITCAST))) {
9752 SDValue BOperand = Op0.getOpcode() == ISD::BITCAST
9753 ? Op0.getOperand(0)
9754 : Op0.getOperand(0).getOperand(0);
9755 MVT MaskVT = BOperand.getSimpleValueType();
9756 MVT EltType = MVT::getIntegerVT(VT.getScalarSizeInBits() * SeqLen);
9757 if ((EltType == MVT::i64 && MaskVT == MVT::v8i1) || // for broadcastmb2q
9758 (EltType == MVT::i32 && MaskVT == MVT::v16i1)) { // for broadcastmw2d
9759 MVT BcstVT = MVT::getVectorVT(EltType, NumElts / SeqLen);
9760 if (!VT.is512BitVector() && !Subtarget.hasVLX()) {
9761 unsigned Scale = 512 / VT.getSizeInBits();
9762 BcstVT = MVT::getVectorVT(EltType, Scale * (NumElts / SeqLen));
9763 }
9764 SDValue Bcst = DAG.getNode(X86ISD::VBROADCASTM, dl, BcstVT, BOperand);
9765 if (BcstVT.getSizeInBits() != VT.getSizeInBits())
9766 Bcst = extractSubVector(Bcst, 0, DAG, dl, VT.getSizeInBits());
9767 return DAG.getBitcast(VT, Bcst);
9768 }
9769 }
9770 }
9771
9772 unsigned NumUndefElts = UndefElements.count();
9773 if (!Ld || (NumElts - NumUndefElts) <= 1) {
9774 APInt SplatValue, Undef;
9775 unsigned SplatBitSize;
9776 bool HasUndef;
9777 // Check if this is a repeated constant pattern suitable for broadcasting.
9778 if (BVOp->isConstantSplat(SplatValue, Undef, SplatBitSize, HasUndef) &&
9779 SplatBitSize > VT.getScalarSizeInBits() &&
9780 SplatBitSize < VT.getSizeInBits()) {
9781 // Avoid replacing with broadcast when it's a use of a shuffle
9782 // instruction to preserve the present custom lowering of shuffles.
9783 if (isFoldableUseOfShuffle(BVOp))
9784 return SDValue();
9785 // Replace BUILD_VECTOR with a broadcast of the repeated constants.
9786 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
9787 LLVMContext *Ctx = DAG.getContext();
9788 MVT PVT = TLI.getPointerTy(DAG.getDataLayout());
9789 if (Subtarget.hasAVX()) {
9790 if (SplatBitSize == 32 || SplatBitSize == 64 ||
9791 (SplatBitSize < 32 && Subtarget.hasAVX2())) {
9792 // Splatted value can fit in one INTEGER constant in constant pool.
9793 // Load the constant and broadcast it.
9794 MVT CVT = MVT::getIntegerVT(SplatBitSize);
9795 Type *ScalarTy = Type::getIntNTy(*Ctx, SplatBitSize);
9796 Constant *C = Constant::getIntegerValue(ScalarTy, SplatValue);
9797 SDValue CP = DAG.getConstantPool(C, PVT);
9798 unsigned Repeat = VT.getSizeInBits() / SplatBitSize;
9799
9800 Align Alignment = cast<ConstantPoolSDNode>(CP)->getAlign();
9801 SDVTList Tys =
9802 DAG.getVTList(MVT::getVectorVT(CVT, Repeat), MVT::Other);
9803 SDValue Ops[] = {DAG.getEntryNode(), CP};
9804 MachinePointerInfo MPI =
9805 MachinePointerInfo::getConstantPool(DAG.getMachineFunction());
9806 SDValue Brdcst = DAG.getMemIntrinsicNode(
9807 X86ISD::VBROADCAST_LOAD, dl, Tys, Ops, CVT, MPI, Alignment,
9808 MachineMemOperand::MOLoad);
9809 return DAG.getBitcast(VT, Brdcst);
9810 }
9811 if (SplatBitSize > 64) {
9812 // Load the vector of constants and broadcast it.
9813 Constant *VecC = getConstantVector(VT, SplatValue, SplatBitSize,
9814 *Ctx);
9815 SDValue VCP = DAG.getConstantPool(VecC, PVT);
9816 unsigned NumElm = SplatBitSize / VT.getScalarSizeInBits();
9817 MVT VVT = MVT::getVectorVT(VT.getScalarType(), NumElm);
9818 Align Alignment = cast<ConstantPoolSDNode>(VCP)->getAlign();
9819 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
9820 SDValue Ops[] = {DAG.getEntryNode(), VCP};
9821 MachinePointerInfo MPI =
9822 MachinePointerInfo::getConstantPool(DAG.getMachineFunction());
9823 return DAG.getMemIntrinsicNode(
9824 X86ISD::SUBV_BROADCAST_LOAD, dl, Tys, Ops, VVT, MPI, Alignment,
9825 MachineMemOperand::MOLoad);
9826 }
9827 }
9828 }
9829
9830 // If we are moving a scalar into a vector (Ld must be set and all elements
9831 // but 1 are undef) and that operation is not obviously supported by
9832 // vmovd/vmovq/vmovss/vmovsd, then keep trying to form a broadcast.
9833 // That's better than general shuffling and may eliminate a load to a GPR
9834 // and a move from a scalar to a vector register.
9835 if (!Ld || NumElts - NumUndefElts != 1)
9836 return SDValue();
9837 unsigned ScalarSize = Ld.getValueSizeInBits();
9838 if (!(UndefElements[0] || (ScalarSize != 32 && ScalarSize != 64)))
9839 return SDValue();
9840 }
9841
9842 bool ConstSplatVal =
9843 (Ld.getOpcode() == ISD::Constant || Ld.getOpcode() == ISD::ConstantFP);
9844 bool IsLoad = ISD::isNormalLoad(Ld.getNode());
9845
9846 // TODO: Handle broadcasts of non-constant sequences.
9847
9848 // Make sure that all of the users of a non-constant load are from the
9849 // BUILD_VECTOR node.
9850 // FIXME: Is the use count needed for the non-constant, non-load case?
9851 if (!ConstSplatVal && !IsLoad && !BVOp->isOnlyUserOf(Ld.getNode()))
9852 return SDValue();
9853
9854 unsigned ScalarSize = Ld.getValueSizeInBits();
9855 bool IsGE256 = (VT.getSizeInBits() >= 256);
9856
9857 // When optimizing for size, generate up to 5 extra bytes for a broadcast
9858 // instruction to save 8 or more bytes of constant pool data.
9859 // TODO: If multiple splats are generated to load the same constant,
9860 // it may be detrimental to overall size. There needs to be a way to detect
9861 // that condition to know if this is truly a size win.
9862 bool OptForSize = DAG.shouldOptForSize();
9863
9864 // Handle broadcasting a single constant scalar from the constant pool
9865 // into a vector.
9866 // On Sandybridge (no AVX2), it is still better to load a constant vector
9867 // from the constant pool and not to broadcast it from a scalar.
9868 // But override that restriction when optimizing for size.
9869 // TODO: Check if splatting is recommended for other AVX-capable CPUs.
9870 if (ConstSplatVal && (Subtarget.hasAVX2() || OptForSize)) {
9871 EVT CVT = Ld.getValueType();
9872 assert(!CVT.isVector() && "Must not broadcast a vector type");
9873
9874 // Splat f16, f32, i32, v4f64, v4i64 in all cases with AVX2.
9875 // For size optimization, also splat v2f64 and v2i64, and for size opt
9876 // with AVX2, also splat i8 and i16.
9877 // With pattern matching, the VBROADCAST node may become a VMOVDDUP.
9878 if (ScalarSize == 32 ||
9879 (ScalarSize == 64 && (IsGE256 || Subtarget.hasVLX())) ||
9880 CVT == MVT::f16 ||
9881 (OptForSize && (ScalarSize == 64 || Subtarget.hasAVX2()))) {
9882 const Constant *C = nullptr;
9883 if (ConstantSDNode *CI = dyn_cast<ConstantSDNode>(Ld))
9884 C = CI->getConstantIntValue();
9885 else if (ConstantFPSDNode *CF = dyn_cast<ConstantFPSDNode>(Ld))
9886 C = CF->getConstantFPValue();
9887
9888 assert(C && "Invalid constant type");
9889
9890 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
9891 SDValue CP =
9892 DAG.getConstantPool(C, TLI.getPointerTy(DAG.getDataLayout()));
9893 Align Alignment = cast<ConstantPoolSDNode>(CP)->getAlign();
9894
9895 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
9896 SDValue Ops[] = {DAG.getEntryNode(), CP};
9897 MachinePointerInfo MPI =
9898 MachinePointerInfo::getConstantPool(DAG.getMachineFunction());
9899 return DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, dl, Tys, Ops, CVT,
9900 MPI, Alignment, MachineMemOperand::MOLoad);
9901 }
9902 }
9903
9904 // Handle AVX2 in-register broadcasts.
9905 if (!IsLoad && Subtarget.hasInt256() &&
9906 (ScalarSize == 32 || (IsGE256 && ScalarSize == 64)))
9907 return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
9908
9909 // The scalar source must be a normal load.
9910 if (!IsLoad)
9911 return SDValue();
9912
9913 // Make sure the non-chain result is only used by this build vector.
9914 if (!Ld->hasNUsesOfValue(NumElts - NumUndefElts, 0))
9915 return SDValue();
9916
9917 if (ScalarSize == 32 || (IsGE256 && ScalarSize == 64) ||
9918 (Subtarget.hasVLX() && ScalarSize == 64)) {
9919 auto *LN = cast<LoadSDNode>(Ld);
9920 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
9921 SDValue Ops[] = {LN->getChain(), LN->getBasePtr()};
9922 SDValue BCast =
9923 DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, dl, Tys, Ops,
9924 LN->getMemoryVT(), LN->getMemOperand());
9925 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BCast.getValue(1));
9926 return BCast;
9927 }
9928
9929 // The integer check is needed for the 64-bit into 128-bit broadcast, so it
9930 // doesn't match double since there is no vbroadcastsd xmm instruction.
9931 if (Subtarget.hasInt256() && Ld.getValueType().isInteger() &&
9932 (ScalarSize == 8 || ScalarSize == 16 || ScalarSize == 64)) {
9933 auto *LN = cast<LoadSDNode>(Ld);
9934 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
9935 SDValue Ops[] = {LN->getChain(), LN->getBasePtr()};
9936 SDValue BCast =
9937 DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, dl, Tys, Ops,
9938 LN->getMemoryVT(), LN->getMemOperand());
9939 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BCast.getValue(1));
9940 return BCast;
9941 }
9942
9943 if (ScalarSize == 16 && Subtarget.hasFP16() && IsGE256)
9944 return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
9945
9946 // Unsupported broadcast.
9947 return SDValue();
9948}
9949
9950/// For an EXTRACT_VECTOR_ELT with a constant index return the real
9951/// underlying vector and index.
9952///
9953/// Modifies \p ExtractedFromVec to the real vector and returns the real
9954/// index.
9955static int getUnderlyingExtractedFromVec(SDValue &ExtractedFromVec,
9956 SDValue ExtIdx) {
9957 int Idx = cast<ConstantSDNode>(ExtIdx)->getZExtValue();
9958 if (!isa<ShuffleVectorSDNode>(ExtractedFromVec))
9959 return Idx;
9960
9961 // For 256-bit vectors, LowerEXTRACT_VECTOR_ELT_SSE4 may have already
9962 // lowered this:
9963 // (extract_vector_elt (v8f32 %1), Constant<6>)
9964 // to:
9965 // (extract_vector_elt (vector_shuffle<2,u,u,u>
9966 // (extract_subvector (v8f32 %0), Constant<4>),
9967 // undef)
9968 // Constant<0>)
9969 // In this case the vector is the extract_subvector expression and the index
9970 // is 2, as specified by the shuffle.
9971 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(ExtractedFromVec);
9972 SDValue ShuffleVec = SVOp->getOperand(0);
9973 MVT ShuffleVecVT = ShuffleVec.getSimpleValueType();
9974 assert(ShuffleVecVT.getVectorElementType() ==
9975 ExtractedFromVec.getSimpleValueType().getVectorElementType());
9976
9977 int ShuffleIdx = SVOp->getMaskElt(Idx);
9978 if (isUndefOrInRange(ShuffleIdx, 0, ShuffleVecVT.getVectorNumElements())) {
9979 ExtractedFromVec = ShuffleVec;
9980 return ShuffleIdx;
9981 }
9982 return Idx;
9983}
9984
9985static SDValue buildFromShuffleMostly(SDValue Op, SelectionDAG &DAG) {
9986 MVT VT = Op.getSimpleValueType();
9987
9988 // Skip if insert_vec_elt is not supported.
9989 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
9990 if (!TLI.isOperationLegalOrCustom(ISD::INSERT_VECTOR_ELT, VT))
9991 return SDValue();
9992
9993 SDLoc DL(Op);
9994 unsigned NumElems = Op.getNumOperands();
9995
9996 SDValue VecIn1;
9997 SDValue VecIn2;
9998 SmallVector<unsigned, 4> InsertIndices;
9999 SmallVector<int, 8> Mask(NumElems, -1);
10000
10001 for (unsigned i = 0; i != NumElems; ++i) {
10002 unsigned Opc = Op.getOperand(i).getOpcode();
10003
10004 if (Opc == ISD::UNDEF)
10005 continue;
10006
10007 if (Opc != ISD::EXTRACT_VECTOR_ELT) {
10008 // Quit if more than 1 element needs inserting.
10009 if (InsertIndices.size() > 1)
10010 return SDValue();
10011
10012 InsertIndices.push_back(i);
10013 continue;
10014 }
10015
10016 SDValue ExtractedFromVec = Op.getOperand(i).getOperand(0);
10017 SDValue ExtIdx = Op.getOperand(i).getOperand(1);
10018
10019 // Quit if non-constant index.
10020 if (!isa<ConstantSDNode>(ExtIdx))
10021 return SDValue();
10022 int Idx = getUnderlyingExtractedFromVec(ExtractedFromVec, ExtIdx);
10023
10024 // Quit if extracted from a vector of a different type.
10025 if (ExtractedFromVec.getValueType() != VT)
10026 return SDValue();
10027
10028 if (!VecIn1.getNode())
10029 VecIn1 = ExtractedFromVec;
10030 else if (VecIn1 != ExtractedFromVec) {
10031 if (!VecIn2.getNode())
10032 VecIn2 = ExtractedFromVec;
10033 else if (VecIn2 != ExtractedFromVec)
10034 // Quit if there are more than 2 vectors to shuffle.
10035 return SDValue();
10036 }
10037
10038 if (ExtractedFromVec == VecIn1)
10039 Mask[i] = Idx;
10040 else if (ExtractedFromVec == VecIn2)
10041 Mask[i] = Idx + NumElems;
10042 }
10043
10044 if (!VecIn1.getNode())
10045 return SDValue();
10046
10047 VecIn2 = VecIn2.getNode() ? VecIn2 : DAG.getUNDEF(VT);
10048 SDValue NV = DAG.getVectorShuffle(VT, DL, VecIn1, VecIn2, Mask);
10049
10050 for (unsigned Idx : InsertIndices)
10051 NV = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VT, NV, Op.getOperand(Idx),
10052 DAG.getIntPtrConstant(Idx, DL));
10053
10054 return NV;
10055}
10056
10057// Lower BUILD_VECTOR operation for v8bf16, v16bf16 and v32bf16 types.
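// This is done by bitcasting each bf16 operand to i16, building the
// equivalent integer vector, and bitcasting the result back to the original
// bf16 vector type.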
10058static SDValue LowerBUILD_VECTORvXbf16(SDValue Op, SelectionDAG &DAG,
10059 const X86Subtarget &Subtarget) {
10060 MVT VT = Op.getSimpleValueType();
10061 MVT IVT = VT.changeVectorElementTypeToInteger();
10062 SmallVector<SDValue, 16> NewOps;
10063 for (unsigned I = 0, E = Op.getNumOperands(); I != E; ++I)
10064 NewOps.push_back(DAG.getBitcast(MVT::i16, Op.getOperand(I)));
10065 SDValue Res = DAG.getNode(ISD::BUILD_VECTOR, SDLoc(), IVT, NewOps);
10066 return DAG.getBitcast(VT, Res);
10067}
10068
10069// Lower BUILD_VECTOR operation for v8i1 and v16i1 types.
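// Constant elements are packed into an immediate bitmask that is bitcast to
// the mask type, splat inputs are lowered as a scalar select, and any
// remaining non-constant elements are inserted one by one.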
10070static SDValue LowerBUILD_VECTORvXi1(SDValue Op, SelectionDAG &DAG,
10071 const X86Subtarget &Subtarget) {
10072
10073 MVT VT = Op.getSimpleValueType();
10074 assert((VT.getVectorElementType() == MVT::i1) &&
10075 "Unexpected type in LowerBUILD_VECTORvXi1!");
10076
10077 SDLoc dl(Op);
10078 if (ISD::isBuildVectorAllZeros(Op.getNode()) ||
10079 ISD::isBuildVectorAllOnes(Op.getNode()))
10080 return Op;
10081
10082 uint64_t Immediate = 0;
10083 SmallVector<unsigned, 16> NonConstIdx;
10084 bool IsSplat = true;
10085 bool HasConstElts = false;
10086 int SplatIdx = -1;
10087 for (unsigned idx = 0, e = Op.getNumOperands(); idx < e; ++idx) {
10088 SDValue In = Op.getOperand(idx);
10089 if (In.isUndef())
10090 continue;
10091 if (auto *InC = dyn_cast<ConstantSDNode>(In)) {
10092 Immediate |= (InC->getZExtValue() & 0x1) << idx;
10093 HasConstElts = true;
10094 } else {
10095 NonConstIdx.push_back(idx);
10096 }
10097 if (SplatIdx < 0)
10098 SplatIdx = idx;
10099 else if (In != Op.getOperand(SplatIdx))
10100 IsSplat = false;
10101 }
10102
10103 // For a splat, use "(select i1 splat_elt, all-ones, all-zeroes)".
10104 if (IsSplat) {
10105 // The build_vector allows the scalar element to be larger than the vector
10106 // element type. We need to mask it to use as a condition unless we know
10107 // the upper bits are zero.
10108 // FIXME: Use computeKnownBits instead of checking specific opcode?
10109 SDValue Cond = Op.getOperand(SplatIdx);
10110 assert(Cond.getValueType() == MVT::i8 && "Unexpected VT!");
10111 if (Cond.getOpcode() != ISD::SETCC)
10112 Cond = DAG.getNode(ISD::AND, dl, MVT::i8, Cond,
10113 DAG.getConstant(1, dl, MVT::i8));
10114
10115 // Perform the select in the scalar domain so we can use cmov.
10116 if (VT == MVT::v64i1 && !Subtarget.is64Bit()) {
10117 SDValue Select = DAG.getSelect(dl, MVT::i32, Cond,
10118 DAG.getAllOnesConstant(dl, MVT::i32),
10119 DAG.getConstant(0, dl, MVT::i32));
10120 Select = DAG.getBitcast(MVT::v32i1, Select);
10121 return DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v64i1, Select, Select);
10122 } else {
10123 MVT ImmVT = MVT::getIntegerVT(std::max((unsigned)VT.getSizeInBits(), 8U));
10124 SDValue Select = DAG.getSelect(dl, ImmVT, Cond,
10125 DAG.getAllOnesConstant(dl, ImmVT),
10126 DAG.getConstant(0, dl, ImmVT));
10127 MVT VecVT = VT.getSizeInBits() >= 8 ? VT : MVT::v8i1;
10128 Select = DAG.getBitcast(VecVT, Select);
10129 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, Select,
10130 DAG.getIntPtrConstant(0, dl));
10131 }
10132 }
10133
10134 // insert elements one by one
10135 SDValue DstVec;
10136 if (HasConstElts) {
10137 if (VT == MVT::v64i1 && !Subtarget.is64Bit()) {
10138 SDValue ImmL = DAG.getConstant(Lo_32(Immediate), dl, MVT::i32);
10139 SDValue ImmH = DAG.getConstant(Hi_32(Immediate), dl, MVT::i32);
10140 ImmL = DAG.getBitcast(MVT::v32i1, ImmL);
10141 ImmH = DAG.getBitcast(MVT::v32i1, ImmH);
10142 DstVec = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v64i1, ImmL, ImmH);
10143 } else {
10144 MVT ImmVT = MVT::getIntegerVT(std::max((unsigned)VT.getSizeInBits(), 8U));
10145 SDValue Imm = DAG.getConstant(Immediate, dl, ImmVT);
10146 MVT VecVT = VT.getSizeInBits() >= 8 ? VT : MVT::v8i1;
10147 DstVec = DAG.getBitcast(VecVT, Imm);
10148 DstVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, DstVec,
10149 DAG.getIntPtrConstant(0, dl));
10150 }
10151 } else
10152 DstVec = DAG.getUNDEF(VT);
10153
10154 for (unsigned i = 0, e = NonConstIdx.size(); i != e; ++i) {
10155 unsigned InsertIdx = NonConstIdx[i];
10156 DstVec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, DstVec,
10157 Op.getOperand(InsertIdx),
10158 DAG.getIntPtrConstant(InsertIdx, dl));
10159 }
10160 return DstVec;
10161}
10162
10163 LLVM_ATTRIBUTE_UNUSED static bool isHorizOp(unsigned Opcode) {
10164 switch (Opcode) {
10165 case X86ISD::PACKSS:
10166 case X86ISD::PACKUS:
10167 case X86ISD::FHADD:
10168 case X86ISD::FHSUB:
10169 case X86ISD::HADD:
10170 case X86ISD::HSUB:
10171 return true;
10172 }
10173 return false;
10174}
10175
10176/// This is a helper function of LowerToHorizontalOp().
10177/// This function checks that the build_vector \p N in input implements a
10178/// 128-bit partial horizontal operation on a 256-bit vector, but that operation
10179/// may not match the layout of an x86 256-bit horizontal instruction.
10180/// In other words, if this returns true, then some extraction/insertion will
10181/// be required to produce a valid horizontal instruction.
10182///
10183/// Parameter \p Opcode defines the kind of horizontal operation to match.
10184/// For example, if \p Opcode is equal to ISD::ADD, then this function
10185/// checks if \p N implements a horizontal arithmetic add; if instead \p Opcode
10186/// is equal to ISD::SUB, then this function checks if this is a horizontal
10187/// arithmetic sub.
10188///
10189/// This function only analyzes elements of \p N whose indices are
10190/// in range [BaseIdx, LastIdx).
10191///
10192/// TODO: This function was originally used to match both real and fake partial
10193/// horizontal operations, but the index-matching logic is incorrect for that.
10194/// See the corrected implementation in isHopBuildVector(). Can we reduce this
10195/// code because it is only used for partial h-op matching now?
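/// For example, with \p Opcode == ISD::ADD and the index range [0, 4), the
/// expected element pattern is:
///   (add (extract_vector_elt A, 0), (extract_vector_elt A, 1)),
///   (add (extract_vector_elt A, 2), (extract_vector_elt A, 3)), ...
/// with the extracts in each half taken from the same source vector.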
10196static bool isHorizontalBinOpPart(const BuildVectorSDNode *N, unsigned Opcode,
10197 SelectionDAG &DAG,
10198 unsigned BaseIdx, unsigned LastIdx,
10199 SDValue &V0, SDValue &V1) {
10200 EVT VT = N->getValueType(0);
10201 assert(VT.is256BitVector() && "Only use for matching partial 256-bit h-ops");
10202 assert(BaseIdx * 2 <= LastIdx && "Invalid Indices in input!");
10203 assert(VT.isVector() && VT.getVectorNumElements() >= LastIdx &&
10204 "Invalid Vector in input!");
10205
10206 bool IsCommutable = (Opcode == ISD::ADD || Opcode == ISD::FADD);
10207 bool CanFold = true;
10208 unsigned ExpectedVExtractIdx = BaseIdx;
10209 unsigned NumElts = LastIdx - BaseIdx;
10210 V0 = DAG.getUNDEF(VT);
10211 V1 = DAG.getUNDEF(VT);
10212
10213 // Check if N implements a horizontal binop.
10214 for (unsigned i = 0, e = NumElts; i != e && CanFold; ++i) {
10215 SDValue Op = N->getOperand(i + BaseIdx);
10216
10217 // Skip UNDEFs.
10218 if (Op->isUndef()) {
10219 // Update the expected vector extract index.
10220 if (i * 2 == NumElts)
10221 ExpectedVExtractIdx = BaseIdx;
10222 ExpectedVExtractIdx += 2;
10223 continue;
10224 }
10225
10226 CanFold = Op->getOpcode() == Opcode && Op->hasOneUse();
10227
10228 if (!CanFold)
10229 break;
10230
10231 SDValue Op0 = Op.getOperand(0);
10232 SDValue Op1 = Op.getOperand(1);
10233
10234 // Try to match the following pattern:
10235 // (BINOP (extract_vector_elt A, I), (extract_vector_elt A, I+1))
10236 CanFold = (Op0.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
10237 Op1.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
10238 Op0.getOperand(0) == Op1.getOperand(0) &&
10239 isa<ConstantSDNode>(Op0.getOperand(1)) &&
10240 isa<ConstantSDNode>(Op1.getOperand(1)));
10241 if (!CanFold)
10242 break;
10243
10244 unsigned I0 = Op0.getConstantOperandVal(1);
10245 unsigned I1 = Op1.getConstantOperandVal(1);
10246
10247 if (i * 2 < NumElts) {
10248 if (V0.isUndef()) {
10249 V0 = Op0.getOperand(0);
10250 if (V0.getValueType() != VT)
10251 return false;
10252 }
10253 } else {
10254 if (V1.isUndef()) {
10255 V1 = Op0.getOperand(0);
10256 if (V1.getValueType() != VT)
10257 return false;
10258 }
10259 if (i * 2 == NumElts)
10260 ExpectedVExtractIdx = BaseIdx;
10261 }
10262
10263 SDValue Expected = (i * 2 < NumElts) ? V0 : V1;
10264 if (I0 == ExpectedVExtractIdx)
10265 CanFold = I1 == I0 + 1 && Op0.getOperand(0) == Expected;
10266 else if (IsCommutable && I1 == ExpectedVExtractIdx) {
10267 // Try to match the following dag sequence:
10268 // (BINOP (extract_vector_elt A, I+1), (extract_vector_elt A, I))
10269 CanFold = I0 == I1 + 1 && Op1.getOperand(0) == Expected;
10270 } else
10271 CanFold = false;
10272
10273 ExpectedVExtractIdx += 2;
10274 }
10275
10276 return CanFold;
10277}
10278
10279/// Emit a sequence of two 128-bit horizontal add/sub followed by
10280/// a concat_vector.
10281///
10282/// This is a helper function of LowerToHorizontalOp().
10283/// This function expects two 256-bit vectors called V0 and V1.
10284/// At first, each vector is split into two separate 128-bit vectors.
10285/// Then, the resulting 128-bit vectors are used to implement two
10286/// horizontal binary operations.
10287///
10288/// The kind of horizontal binary operation is defined by \p X86Opcode.
10289///
10290 /// \p Mode specifies how the 128-bit parts of V0 and V1 are passed as input to
10291 /// the two new horizontal binops.
10292/// When Mode is set, the first horizontal binop dag node would take as input
10293/// the lower 128-bit of V0 and the upper 128-bit of V0. The second
10294/// horizontal binop dag node would take as input the lower 128-bit of V1
10295/// and the upper 128-bit of V1.
10296/// Example:
10297/// HADD V0_LO, V0_HI
10298/// HADD V1_LO, V1_HI
10299///
10300/// Otherwise, the first horizontal binop dag node takes as input the lower
10301/// 128-bit of V0 and the lower 128-bit of V1, and the second horizontal binop
10302/// dag node takes the upper 128-bit of V0 and the upper 128-bit of V1.
10303/// Example:
10304/// HADD V0_LO, V1_LO
10305/// HADD V0_HI, V1_HI
10306///
10307/// If \p isUndefLO is set, then the algorithm propagates UNDEF to the lower
10308/// 128-bits of the result. If \p isUndefHI is set, then UNDEF is propagated to
10309/// the upper 128-bits of the result.
10310static SDValue ExpandHorizontalBinOp(const SDValue &V0, const SDValue &V1,
10311 const SDLoc &DL, SelectionDAG &DAG,
10312 unsigned X86Opcode, bool Mode,
10313 bool isUndefLO, bool isUndefHI) {
10314 MVT VT = V0.getSimpleValueType();
10315 assert(VT.is256BitVector() && VT == V1.getSimpleValueType() &&
10316 "Invalid nodes in input!");
10317
10318 unsigned NumElts = VT.getVectorNumElements();
10319 SDValue V0_LO = extract128BitVector(V0, 0, DAG, DL);
10320 SDValue V0_HI = extract128BitVector(V0, NumElts/2, DAG, DL);
10321 SDValue V1_LO = extract128BitVector(V1, 0, DAG, DL);
10322 SDValue V1_HI = extract128BitVector(V1, NumElts/2, DAG, DL);
10323 MVT NewVT = V0_LO.getSimpleValueType();
10324
10325 SDValue LO = DAG.getUNDEF(NewVT);
10326 SDValue HI = DAG.getUNDEF(NewVT);
10327
10328 if (Mode) {
10329 // Don't emit a horizontal binop if the result is expected to be UNDEF.
10330 if (!isUndefLO && !V0->isUndef())
10331 LO = DAG.getNode(X86Opcode, DL, NewVT, V0_LO, V0_HI);
10332 if (!isUndefHI && !V1->isUndef())
10333 HI = DAG.getNode(X86Opcode, DL, NewVT, V1_LO, V1_HI);
10334 } else {
10335 // Don't emit a horizontal binop if the result is expected to be UNDEF.
10336 if (!isUndefLO && (!V0_LO->isUndef() || !V1_LO->isUndef()))
10337 LO = DAG.getNode(X86Opcode, DL, NewVT, V0_LO, V1_LO);
10338
10339 if (!isUndefHI && (!V0_HI->isUndef() || !V1_HI->isUndef()))
10340 HI = DAG.getNode(X86Opcode, DL, NewVT, V0_HI, V1_HI);
10341 }
10342
10343 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LO, HI);
10344}
10345
10346 /// Returns true iff \p BV builds a vector with the result equivalent to
10347 /// the result of an ADDSUB/SUBADD operation.
10348/// If true is returned then the operands of ADDSUB = Opnd0 +- Opnd1
10349/// (SUBADD = Opnd0 -+ Opnd1) operation are written to the parameters
10350/// \p Opnd0 and \p Opnd1.
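/// For example, a v4f32 build vector whose even elements are
/// (fsub (extract A, i), (extract B, i)) and whose odd elements are
/// (fadd (extract A, i), (extract B, i)) is recognized as an ADDSUB of A and
/// B; the variant with FADD in the even lanes and FSUB in the odd lanes is
/// recognized as a SUBADD.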
10351static bool isAddSubOrSubAdd(const BuildVectorSDNode *BV,
10352 const X86Subtarget &Subtarget, SelectionDAG &DAG,
10353 SDValue &Opnd0, SDValue &Opnd1,
10354 unsigned &NumExtracts,
10355 bool &IsSubAdd) {
10356
10357 MVT VT = BV->getSimpleValueType(0);
10358 if (!Subtarget.hasSSE3() || !VT.isFloatingPoint())
10359 return false;
10360
10361 unsigned NumElts = VT.getVectorNumElements();
10362 SDValue InVec0 = DAG.getUNDEF(VT);
10363 SDValue InVec1 = DAG.getUNDEF(VT);
10364
10365 NumExtracts = 0;
10366
10367 // Odd-numbered elements in the input build vector are obtained from
10368 // adding/subtracting two integer/float elements.
10369 // Even-numbered elements in the input build vector are obtained from
10370 // subtracting/adding two integer/float elements.
10371 unsigned Opc[2] = {0, 0};
10372 for (unsigned i = 0, e = NumElts; i != e; ++i) {
10373 SDValue Op = BV->getOperand(i);
10374
10375 // Skip 'undef' values.
10376 unsigned Opcode = Op.getOpcode();
10377 if (Opcode == ISD::UNDEF)
10378 continue;
10379
10380 // Early exit if we found an unexpected opcode.
10381 if (Opcode != ISD::FADD && Opcode != ISD::FSUB)
10382 return false;
10383
10384 SDValue Op0 = Op.getOperand(0);
10385 SDValue Op1 = Op.getOperand(1);
10386
10387 // Try to match the following pattern:
10388 // (BINOP (extract_vector_elt A, i), (extract_vector_elt B, i))
10389 // Early exit if we cannot match that sequence.
10390 if (Op0.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
10391 Op1.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
10392 !isa<ConstantSDNode>(Op0.getOperand(1)) ||
10393 Op0.getOperand(1) != Op1.getOperand(1))
10394 return false;
10395
10396 unsigned I0 = Op0.getConstantOperandVal(1);
10397 if (I0 != i)
10398 return false;
10399
10400 // We found a valid add/sub node; make sure it's the same opcode as previous
10401 // elements for this parity.
10402 if (Opc[i % 2] != 0 && Opc[i % 2] != Opcode)
10403 return false;
10404 Opc[i % 2] = Opcode;
10405
10406 // Update InVec0 and InVec1.
10407 if (InVec0.isUndef()) {
10408 InVec0 = Op0.getOperand(0);
10409 if (InVec0.getSimpleValueType() != VT)
10410 return false;
10411 }
10412 if (InVec1.isUndef()) {
10413 InVec1 = Op1.getOperand(0);
10414 if (InVec1.getSimpleValueType() != VT)
10415 return false;
10416 }
10417
10418 // Make sure that the operands of each add/sub node always
10419 // come from the same pair of vectors.
10420 if (InVec0 != Op0.getOperand(0)) {
10421 if (Opcode == ISD::FSUB)
10422 return false;
10423
10424 // FADD is commutable. Try to commute the operands
10425 // and then test again.
10426 std::swap(Op0, Op1);
10427 if (InVec0 != Op0.getOperand(0))
10428 return false;
10429 }
10430
10431 if (InVec1 != Op1.getOperand(0))
10432 return false;
10433
10434 // Increment the number of extractions done.
10435 ++NumExtracts;
10436 }
10437
10438 // Ensure we have found an opcode for both parities and that they are
10439 // different. Don't try to fold this build_vector into an ADDSUB/SUBADD if the
10440 // inputs are undef.
10441 if (!Opc[0] || !Opc[1] || Opc[0] == Opc[1] ||
10442 InVec0.isUndef() || InVec1.isUndef())
10443 return false;
10444
10445 IsSubAdd = Opc[0] == ISD::FADD;
10446
10447 Opnd0 = InVec0;
10448 Opnd1 = InVec1;
10449 return true;
10450}
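// A minimal standalone sketch (plain C++, no LLVM types) of the lane-parity
// rule enforced above: every even lane must use one of FADD/FSUB, every odd
// lane the other, and the two opcodes must differ. LaneOp and
// classifyAddSubLanes are illustrative names, not LLVM APIs.
#include <cstddef>

enum LaneOp { LaneUndef, LaneFAdd, LaneFSub };

// Returns true and sets IsSubAdd when the per-lane opcodes match the ADDSUB
// (sub on even lanes) or SUBADD (add on even lanes) pattern; undef lanes are
// ignored, exactly as in the matcher above.
static bool classifyAddSubLanes(const LaneOp *Lanes, size_t NumLanes,
                                bool &IsSubAdd) {
  LaneOp Opc[2] = {LaneUndef, LaneUndef};
  for (size_t i = 0; i != NumLanes; ++i) {
    if (Lanes[i] == LaneUndef)
      continue;
    if (Opc[i % 2] != LaneUndef && Opc[i % 2] != Lanes[i])
      return false; // mixed opcodes within one parity
    Opc[i % 2] = Lanes[i];
  }
  if (Opc[0] == LaneUndef || Opc[1] == LaneUndef || Opc[0] == Opc[1])
    return false;   // need both parities, and they must differ
  IsSubAdd = Opc[0] == LaneFAdd; // add on even lanes => SUBADD
  return true;
}
// Example: {LaneFSub, LaneFAdd, LaneFSub, LaneFAdd} classifies as ADDSUB
// (returns true with IsSubAdd == false).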
10451
10452/// Returns true if it is possible to fold MUL and an idiom that has already been
10453/// recognized as ADDSUB/SUBADD(\p Opnd0, \p Opnd1) into
10454/// FMADDSUB/FMSUBADD(x, y, \p Opnd1). If (and only if) true is returned, the
10455/// operands of FMADDSUB/FMSUBADD are written to parameters \p Opnd0, \p Opnd1, \p Opnd2.
10456///
10457/// Prior to calling this function it should be known that there is some
10458/// SDNode that potentially can be replaced with an X86ISD::ADDSUB operation
10459/// using \p Opnd0 and \p Opnd1 as operands. Also, this method is called
10460/// before replacement of such SDNode with ADDSUB operation. Thus the number
10461/// of \p Opnd0 uses is expected to be equal to 2.
10462/// For example, this function may be called for the following IR:
10463/// %AB = fmul fast <2 x double> %A, %B
10464/// %Sub = fsub fast <2 x double> %AB, %C
10465/// %Add = fadd fast <2 x double> %AB, %C
10466/// %Addsub = shufflevector <2 x double> %Sub, <2 x double> %Add,
10467/// <2 x i32> <i32 0, i32 3>
10468/// There is a def for %Addsub here, which potentially can be replaced by
10469/// X86ISD::ADDSUB operation:
10470/// %Addsub = X86ISD::ADDSUB %AB, %C
10471/// and such ADDSUB can further be replaced with FMADDSUB:
10472/// %Addsub = FMADDSUB %A, %B, %C.
10473///
10474/// The main reason why this method is called before the replacement of the
10475/// recognized ADDSUB idiom with ADDSUB operation is that such replacement
10476/// is sometimes illegal. E.g. 512-bit ADDSUB is not available, while 512-bit
10477/// FMADDSUB is.
10478static bool isFMAddSubOrFMSubAdd(const X86Subtarget &Subtarget,
10479 SelectionDAG &DAG,
10480 SDValue &Opnd0, SDValue &Opnd1, SDValue &Opnd2,
10481 unsigned ExpectedUses) {
10482 if (Opnd0.getOpcode() != ISD::FMUL ||
10483 !Opnd0->hasNUsesOfValue(ExpectedUses, 0) || !Subtarget.hasAnyFMA())
10484 return false;
10485
10486 // FIXME: These checks must match the similar ones in
10487 // DAGCombiner::visitFADDForFMACombine. It would be good to have one
10488 // function that would answer if it is Ok to fuse MUL + ADD to FMADD
10489 // or MUL + ADDSUB to FMADDSUB.
10490 const TargetOptions &Options = DAG.getTarget().Options;
10491 bool AllowFusion =
10492 (Options.AllowFPOpFusion == FPOpFusion::Fast || Options.UnsafeFPMath);
10493 if (!AllowFusion)
10494 return false;
10495
10496 Opnd2 = Opnd1;
10497 Opnd1 = Opnd0.getOperand(1);
10498 Opnd0 = Opnd0.getOperand(0);
10499
10500 return true;
10501}
10502
10503/// Try to fold a build_vector that performs an 'addsub' or 'fmaddsub' or
10504/// 'fsubadd' operation accordingly to X86ISD::ADDSUB or X86ISD::FMADDSUB or
10505/// X86ISD::FMSUBADD node.
10506static SDValue lowerToAddSubOrFMAddSub(const BuildVectorSDNode *BV,
10507 const X86Subtarget &Subtarget,
10508 SelectionDAG &DAG) {
10509 SDValue Opnd0, Opnd1;
10510 unsigned NumExtracts;
10511 bool IsSubAdd;
10512 if (!isAddSubOrSubAdd(BV, Subtarget, DAG, Opnd0, Opnd1, NumExtracts,
10513 IsSubAdd))
10514 return SDValue();
10515
10516 MVT VT = BV->getSimpleValueType(0);
10517 SDLoc DL(BV);
10518
10519 // Try to generate X86ISD::FMADDSUB node here.
10520 SDValue Opnd2;
10521 if (isFMAddSubOrFMSubAdd(Subtarget, DAG, Opnd0, Opnd1, Opnd2, NumExtracts)) {
10522 unsigned Opc = IsSubAdd ? X86ISD::FMSUBADD : X86ISD::FMADDSUB;
10523 return DAG.getNode(Opc, DL, VT, Opnd0, Opnd1, Opnd2);
10524 }
10525
10526 // We only support ADDSUB.
10527 if (IsSubAdd)
10528 return SDValue();
10529
10530 // There are no known X86 targets with 512-bit ADDSUB instructions!
10531 // Convert to blend(fsub,fadd).
10532 if (VT.is512BitVector()) {
10533 SmallVector<int> Mask;
10534 for (int I = 0, E = VT.getVectorNumElements(); I != E; I += 2) {
10535 Mask.push_back(I);
10536 Mask.push_back(I + E + 1);
10537 }
10538 SDValue Sub = DAG.getNode(ISD::FSUB, DL, VT, Opnd0, Opnd1);
10539 SDValue Add = DAG.getNode(ISD::FADD, DL, VT, Opnd0, Opnd1);
10540 return DAG.getVectorShuffle(VT, DL, Sub, Add, Mask);
10541 }
10542
10543 return DAG.getNode(X86ISD::ADDSUB, DL, VT, Opnd0, Opnd1);
10544}
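// A minimal standalone check (plain C++, no LLVM types) of the 512-bit
// fallback above: the shuffle mask picks even lanes from the FSUB result and
// odd lanes from the FADD result, which is the ADDSUB semantics.
// buildAddSubBlendMask is an illustrative name, not an LLVM API.
#include <vector>

static std::vector<int> buildAddSubBlendMask(int NumElts) {
  std::vector<int> Mask;
  for (int I = 0; I != NumElts; I += 2) {
    Mask.push_back(I);               // even lane <- Sub (shuffle operand 0)
    Mask.push_back(I + NumElts + 1); // odd lane  <- Add (operand 1, offset E)
  }
  return Mask;
}
// For NumElts == 8 (e.g. v8f64) this yields {0, 9, 2, 11, 4, 13, 6, 15}.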
10545
10546static bool isHopBuildVector(const BuildVectorSDNode *BV, SelectionDAG &DAG,
10547 unsigned &HOpcode, SDValue &V0, SDValue &V1) {
10548 // Initialize outputs to known values.
10549 MVT VT = BV->getSimpleValueType(0);
10550 HOpcode = ISD::DELETED_NODE;
10551 V0 = DAG.getUNDEF(VT);
10552 V1 = DAG.getUNDEF(VT);
10553
10554 // x86 256-bit horizontal ops are defined in a non-obvious way. Each 128-bit
10555 // half of the result is calculated independently from the 128-bit halves of
10556 // the inputs, so that makes the index-checking logic below more complicated.
10557 unsigned NumElts = VT.getVectorNumElements();
10558 unsigned GenericOpcode = ISD::DELETED_NODE;
10559 unsigned Num128BitChunks = VT.is256BitVector() ? 2 : 1;
10560 unsigned NumEltsIn128Bits = NumElts / Num128BitChunks;
10561 unsigned NumEltsIn64Bits = NumEltsIn128Bits / 2;
10562 for (unsigned i = 0; i != Num128BitChunks; ++i) {
10563 for (unsigned j = 0; j != NumEltsIn128Bits; ++j) {
10564 // Ignore undef elements.
10565 SDValue Op = BV->getOperand(i * NumEltsIn128Bits + j);
10566 if (Op.isUndef())
10567 continue;
10568
10569 // If there's an opcode mismatch, we're done.
10570 if (HOpcode != ISD::DELETED_NODE && Op.getOpcode() != GenericOpcode)
10571 return false;
10572
10573 // Initialize horizontal opcode.
10574 if (HOpcode == ISD::DELETED_NODE) {
10575 GenericOpcode = Op.getOpcode();
10576 switch (GenericOpcode) {
10577 case ISD::ADD: HOpcode = X86ISD::HADD; break;
10578 case ISD::SUB: HOpcode = X86ISD::HSUB; break;
10579 case ISD::FADD: HOpcode = X86ISD::FHADD; break;
10580 case ISD::FSUB: HOpcode = X86ISD::FHSUB; break;
10581 default: return false;
10582 }
10583 }
10584
10585 SDValue Op0 = Op.getOperand(0);
10586 SDValue Op1 = Op.getOperand(1);
10587 if (Op0.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
10588 Op1.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
10589 Op0.getOperand(0) != Op1.getOperand(0) ||
10590 !isa<ConstantSDNode>(Op0.getOperand(1)) ||
10591 !isa<ConstantSDNode>(Op1.getOperand(1)) || !Op.hasOneUse())
10592 return false;
10593
10594 // The source vector is chosen based on which 64-bit half of the
10595 // destination vector is being calculated.
10596 if (j < NumEltsIn64Bits) {
10597 if (V0.isUndef())
10598 V0 = Op0.getOperand(0);
10599 } else {
10600 if (V1.isUndef())
10601 V1 = Op0.getOperand(0);
10602 }
10603
10604 SDValue SourceVec = (j < NumEltsIn64Bits) ? V0 : V1;
10605 if (SourceVec != Op0.getOperand(0))
10606 return false;
10607
10608 // op (extract_vector_elt A, I), (extract_vector_elt A, I+1)
10609 unsigned ExtIndex0 = Op0.getConstantOperandVal(1);
10610 unsigned ExtIndex1 = Op1.getConstantOperandVal(1);
10611 unsigned ExpectedIndex = i * NumEltsIn128Bits +
10612 (j % NumEltsIn64Bits) * 2;
10613 if (ExpectedIndex == ExtIndex0 && ExtIndex1 == ExtIndex0 + 1)
10614 continue;
10615
10616 // If this is not a commutative op, this does not match.
10617 if (GenericOpcode != ISD::ADD && GenericOpcode != ISD::FADD)
10618 return false;
10619
10620 // Addition is commutative, so try swapping the extract indexes.
10621 // op (extract_vector_elt A, I+1), (extract_vector_elt A, I)
10622 if (ExpectedIndex == ExtIndex1 && ExtIndex0 == ExtIndex1 + 1)
10623 continue;
10624
10625 // Extract indexes do not match horizontal requirement.
10626 return false;
10627 }
10628 }
10629 // We matched. Opcode and operands are returned by reference as arguments.
10630 return true;
10631}
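// A minimal standalone sketch (plain C++, no LLVM types) of the index check
// above, listing which source vector and which extract-index pair each result
// lane of a 256-bit v8i32 horizontal add must use. printHopLaneSources is an
// illustrative name, not an LLVM API.
#include <cstdio>

static void printHopLaneSources() {
  const unsigned NumElts = 8;                                  // v8i32
  const unsigned Num128BitChunks = 2;                          // 256-bit
  const unsigned NumEltsIn128Bits = NumElts / Num128BitChunks; // 4
  const unsigned NumEltsIn64Bits = NumEltsIn128Bits / 2;       // 2
  for (unsigned i = 0; i != Num128BitChunks; ++i)
    for (unsigned j = 0; j != NumEltsIn128Bits; ++j) {
      unsigned Lane = i * NumEltsIn128Bits + j;
      unsigned Expected = i * NumEltsIn128Bits + (j % NumEltsIn64Bits) * 2;
      const char *Src = j < NumEltsIn64Bits ? "V0" : "V1";
      // e.g. "lane 2: V1[0] op V1[1]" -- the upper half of each 64-bit piece
      // of the result is fed from V1, the lower half from V0.
      std::printf("lane %u: %s[%u] op %s[%u]\n", Lane, Src, Expected, Src,
                  Expected + 1);
    }
}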
10632
10633static SDValue getHopForBuildVector(const BuildVectorSDNode *BV,
10634 SelectionDAG &DAG, unsigned HOpcode,
10635 SDValue V0, SDValue V1) {
10636 // If either input vector is not the same size as the build vector,
10637 // extract/insert the low bits to the correct size.
10638 // This is free (examples: zmm --> xmm, xmm --> ymm).
10639 MVT VT = BV->getSimpleValueType(0);
10640 unsigned Width = VT.getSizeInBits();
10641 if (V0.getValueSizeInBits() > Width)
10642 V0 = extractSubVector(V0, 0, DAG, SDLoc(BV), Width);
10643 else if (V0.getValueSizeInBits() < Width)
10644 V0 = insertSubVector(DAG.getUNDEF(VT), V0, 0, DAG, SDLoc(BV), Width);
10645
10646 if (V1.getValueSizeInBits() > Width)
10647 V1 = extractSubVector(V1, 0, DAG, SDLoc(BV), Width);
10648 else if (V1.getValueSizeInBits() < Width)
10649 V1 = insertSubVector(DAG.getUNDEF(VT), V1, 0, DAG, SDLoc(BV), Width);
10650
10651 unsigned NumElts = VT.getVectorNumElements();
10652 APInt DemandedElts = APInt::getAllOnes(NumElts);
10653 for (unsigned i = 0; i != NumElts; ++i)
10654 if (BV->getOperand(i).isUndef())
10655 DemandedElts.clearBit(i);
10656
10657 // If we don't need the upper xmm, then perform as a xmm hop.
10658 unsigned HalfNumElts = NumElts / 2;
10659 if (VT.is256BitVector() && DemandedElts.lshr(HalfNumElts) == 0) {
10660 MVT HalfVT = VT.getHalfNumVectorElementsVT();
10661 V0 = extractSubVector(V0, 0, DAG, SDLoc(BV), 128);
10662 V1 = extractSubVector(V1, 0, DAG, SDLoc(BV), 128);
10663 SDValue Half = DAG.getNode(HOpcode, SDLoc(BV), HalfVT, V0, V1);
10664 return insertSubVector(DAG.getUNDEF(VT), Half, 0, DAG, SDLoc(BV), 256);
10665 }
10666
10667 return DAG.getNode(HOpcode, SDLoc(BV), VT, V0, V1);
10668}
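// A minimal standalone sketch (plain C++, a uint32_t mask standing in for
// APInt) of the demanded-elements test above: if no upper-half lane of a
// 256-bit hop result is demanded, the op can be performed at 128 bits.
// lowerHalfHopSuffices is an illustrative name, not an LLVM API.
#include <cstdint>

static bool lowerHalfHopSuffices(uint32_t DemandedElts, unsigned NumElts) {
  unsigned HalfNumElts = NumElts / 2;
  // Mirrors the DemandedElts.lshr(HalfNumElts) == 0 check in the code above.
  return (DemandedElts >> HalfNumElts) == 0;
}
// Example: for a v8i32 build_vector that only uses lanes 1 and 2,
// lowerHalfHopSuffices(0b00000110, 8) returns true, so an xmm hop is enough.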
10669
10670/// Lower BUILD_VECTOR to a horizontal add/sub operation if possible.
10671static SDValue LowerToHorizontalOp(const BuildVectorSDNode *BV,
10672 const X86Subtarget &Subtarget,
10673 SelectionDAG &DAG) {
10674 // We need at least 2 non-undef elements to make this worthwhile by default.
10675 unsigned NumNonUndefs =
10676 count_if(BV->op_values(), [](SDValue V) { return !V.isUndef(); });
10677 if (NumNonUndefs < 2)
10678 return SDValue();
10679
10680 // There are 4 sets of horizontal math operations distinguished by type:
10681 // int/FP at 128-bit/256-bit. Each type was introduced with a different
10682 // subtarget feature. Try to match those "native" patterns first.
10683 MVT VT = BV->getSimpleValueType(0);
10684 if (((VT == MVT::v4f32 || VT == MVT::v2f64) && Subtarget.hasSSE3()) ||
10685 ((VT == MVT::v8i16 || VT == MVT::v4i32) && Subtarget.hasSSSE3()) ||
10686 ((VT == MVT::v8f32 || VT == MVT::v4f64) && Subtarget.hasAVX()) ||
10687 ((VT == MVT::v16i16 || VT == MVT::v8i32) && Subtarget.hasAVX2())) {
10688 unsigned HOpcode;
10689 SDValue V0, V1;
10690 if (isHopBuildVector(BV, DAG, HOpcode, V0, V1))
10691 return getHopForBuildVector(BV, DAG, HOpcode, V0, V1);
10692 }
10693
10694 // Try harder to match 256-bit ops by using extract/concat.
10695 if (!Subtarget.hasAVX() || !VT.is256BitVector())
10696 return SDValue();
10697
10698  // Count the number of UNDEF operands in the input build_vector.
10699 unsigned NumElts = VT.getVectorNumElements();
10700 unsigned Half = NumElts / 2;
10701 unsigned NumUndefsLO = 0;
10702 unsigned NumUndefsHI = 0;
10703 for (unsigned i = 0, e = Half; i != e; ++i)
10704 if (BV->getOperand(i)->isUndef())
10705 NumUndefsLO++;
10706
10707 for (unsigned i = Half, e = NumElts; i != e; ++i)
10708 if (BV->getOperand(i)->isUndef())
10709 NumUndefsHI++;
10710
10711 SDLoc DL(BV);
10712 SDValue InVec0, InVec1;
10713 if (VT == MVT::v8i32 || VT == MVT::v16i16) {
10714 SDValue InVec2, InVec3;
10715 unsigned X86Opcode;
10716 bool CanFold = true;
10717
10718 if (isHorizontalBinOpPart(BV, ISD::ADD, DAG, 0, Half, InVec0, InVec1) &&
10719 isHorizontalBinOpPart(BV, ISD::ADD, DAG, Half, NumElts, InVec2,
10720 InVec3) &&
10721 ((InVec0.isUndef() || InVec2.isUndef()) || InVec0 == InVec2) &&
10722 ((InVec1.isUndef() || InVec3.isUndef()) || InVec1 == InVec3))
10723 X86Opcode = X86ISD::HADD;
10724 else if (isHorizontalBinOpPart(BV, ISD::SUB, DAG, 0, Half, InVec0,
10725 InVec1) &&
10726 isHorizontalBinOpPart(BV, ISD::SUB, DAG, Half, NumElts, InVec2,
10727 InVec3) &&
10728 ((InVec0.isUndef() || InVec2.isUndef()) || InVec0 == InVec2) &&
10729 ((InVec1.isUndef() || InVec3.isUndef()) || InVec1 == InVec3))
10730 X86Opcode = X86ISD::HSUB;
10731 else
10732 CanFold = false;
10733
10734 if (CanFold) {
10735 // Do not try to expand this build_vector into a pair of horizontal
10736 // add/sub if we can emit a pair of scalar add/sub.
10737 if (NumUndefsLO + 1 == Half || NumUndefsHI + 1 == Half)
10738 return SDValue();
10739
10740 // Convert this build_vector into a pair of horizontal binops followed by
10741 // a concat vector. We must adjust the outputs from the partial horizontal
10742 // matching calls above to account for undefined vector halves.
10743 SDValue V0 = InVec0.isUndef() ? InVec2 : InVec0;
10744 SDValue V1 = InVec1.isUndef() ? InVec3 : InVec1;
10745      assert((!V0.isUndef() || !V1.isUndef()) && "Horizontal-op of undefs?");
10746 bool isUndefLO = NumUndefsLO == Half;
10747 bool isUndefHI = NumUndefsHI == Half;
10748 return ExpandHorizontalBinOp(V0, V1, DL, DAG, X86Opcode, false, isUndefLO,
10749 isUndefHI);
10750 }
10751 }
10752
10753 if (VT == MVT::v8f32 || VT == MVT::v4f64 || VT == MVT::v8i32 ||
10754 VT == MVT::v16i16) {
10755 unsigned X86Opcode;
10756 if (isHorizontalBinOpPart(BV, ISD::ADD, DAG, 0, NumElts, InVec0, InVec1))
10757 X86Opcode = X86ISD::HADD;
10758 else if (isHorizontalBinOpPart(BV, ISD::SUB, DAG, 0, NumElts, InVec0,
10759 InVec1))
10760 X86Opcode = X86ISD::HSUB;
10761 else if (isHorizontalBinOpPart(BV, ISD::FADD, DAG, 0, NumElts, InVec0,
10762 InVec1))
10763 X86Opcode = X86ISD::FHADD;
10764 else if (isHorizontalBinOpPart(BV, ISD::FSUB, DAG, 0, NumElts, InVec0,
10765 InVec1))
10766 X86Opcode = X86ISD::FHSUB;
10767 else
10768 return SDValue();
10769
10770 // Don't try to expand this build_vector into a pair of horizontal add/sub
10771 // if we can simply emit a pair of scalar add/sub.
10772 if (NumUndefsLO + 1 == Half || NumUndefsHI + 1 == Half)
10773 return SDValue();
10774
10775 // Convert this build_vector into two horizontal add/sub followed by
10776 // a concat vector.
10777 bool isUndefLO = NumUndefsLO == Half;
10778 bool isUndefHI = NumUndefsHI == Half;
10779 return ExpandHorizontalBinOp(InVec0, InVec1, DL, DAG, X86Opcode, true,
10780 isUndefLO, isUndefHI);
10781 }
10782
10783 return SDValue();
10784}
10785
10786static SDValue LowerShift(SDValue Op, const X86Subtarget &Subtarget,
10787 SelectionDAG &DAG);
10788
10789/// If a BUILD_VECTOR's source elements all apply the same bit operation and
10790/// one of their operands is constant, lower to a pair of BUILD_VECTOR and
10791/// just apply the bit operation to the vectors.
10792/// NOTE: It's not in our interest to start making a general-purpose vectorizer
10793/// from this, but enough scalar bit operations are created from the later
10794/// legalization + scalarization stages to need basic support.
10795static SDValue lowerBuildVectorToBitOp(BuildVectorSDNode *Op,
10796 const X86Subtarget &Subtarget,
10797 SelectionDAG &DAG) {
10798 SDLoc DL(Op);
10799 MVT VT = Op->getSimpleValueType(0);
10800 unsigned NumElems = VT.getVectorNumElements();
10801 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
10802
10803 // Check that all elements have the same opcode.
10804 // TODO: Should we allow UNDEFS and if so how many?
10805 unsigned Opcode = Op->getOperand(0).getOpcode();
10806 for (unsigned i = 1; i < NumElems; ++i)
10807 if (Opcode != Op->getOperand(i).getOpcode())
10808 return SDValue();
10809
10810 // TODO: We may be able to add support for other Ops (ADD/SUB + shifts).
10811 bool IsShift = false;
10812 switch (Opcode) {
10813 default:
10814 return SDValue();
10815 case ISD::SHL:
10816 case ISD::SRL:
10817 case ISD::SRA:
10818 IsShift = true;
10819 break;
10820 case ISD::AND:
10821 case ISD::XOR:
10822 case ISD::OR:
10823 // Don't do this if the buildvector is a splat - we'd replace one
10824 // constant with an entire vector.
10825 if (Op->getSplatValue())
10826 return SDValue();
10827 if (!TLI.isOperationLegalOrPromote(Opcode, VT))
10828 return SDValue();
10829 break;
10830 }
10831
10832 SmallVector<SDValue, 4> LHSElts, RHSElts;
10833 for (SDValue Elt : Op->ops()) {
10834 SDValue LHS = Elt.getOperand(0);
10835 SDValue RHS = Elt.getOperand(1);
10836
10837 // We expect the canonicalized RHS operand to be the constant.
10838 if (!isa<ConstantSDNode>(RHS))
10839 return SDValue();
10840
10841 // Extend shift amounts.
10842 if (RHS.getValueSizeInBits() != VT.getScalarSizeInBits()) {
10843 if (!IsShift)
10844 return SDValue();
10845 RHS = DAG.getZExtOrTrunc(RHS, DL, VT.getScalarType());
10846 }
10847
10848 LHSElts.push_back(LHS);
10849 RHSElts.push_back(RHS);
10850 }
10851
10852 // Limit to shifts by uniform immediates.
10853 // TODO: Only accept vXi8/vXi64 special cases?
10854 // TODO: Permit non-uniform XOP/AVX2/MULLO cases?
10855 if (IsShift && any_of(RHSElts, [&](SDValue V) { return RHSElts[0] != V; }))
10856 return SDValue();
10857
10858 SDValue LHS = DAG.getBuildVector(VT, DL, LHSElts);
10859 SDValue RHS = DAG.getBuildVector(VT, DL, RHSElts);
10860 SDValue Res = DAG.getNode(Opcode, DL, VT, LHS, RHS);
10861
10862 if (!IsShift)
10863 return Res;
10864
10865 // Immediately lower the shift to ensure the constant build vector doesn't
10866 // get converted to a constant pool before the shift is lowered.
10867 return LowerShift(Res, Subtarget, DAG);
10868}
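// A minimal scalar analogue (plain C++, arrays standing in for vectors) of
// the rewrite above for the shift case: the per-lane operands are split into
// an LHS "build_vector" and an RHS "build_vector", the RHS must be a uniform
// immediate, and a single vector-style shift replaces N scalar shifts.
// buildVectorShiftLanes is an illustrative name, not an LLVM API.
#include <cstdint>

static bool buildVectorShiftLanes(const uint32_t *LHSElts,
                                  const uint32_t *RHSElts, unsigned NumElts,
                                  uint32_t *Res) {
  // Limit to shifts by uniform immediates, as the lowering above does.
  for (unsigned i = 1; i != NumElts; ++i)
    if (RHSElts[i] != RHSElts[0])
      return false;
  // One lane-wise op instead of NumElts scalar ops folded into inserts.
  for (unsigned i = 0; i != NumElts; ++i)
    Res[i] = LHSElts[i] << RHSElts[i];
  return true;
}
// Example: LHS {1, 2, 3, 4} shifted by a uniform {3, 3, 3, 3} yields
// {8, 16, 24, 32} from a single vector shift.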
10869
10870/// Create a vector constant without a load. SSE/AVX provide the bare minimum
10871/// functionality to do this, so it's all zeros, all ones, or some derivation
10872/// that is cheap to calculate.
10873static SDValue materializeVectorConstant(SDValue Op, SelectionDAG &DAG,
10874 const X86Subtarget &Subtarget) {
10875 SDLoc DL(Op);
10876 MVT VT = Op.getSimpleValueType();
10877
10878 // Vectors containing all zeros can be matched by pxor and xorps.
10879 if (ISD::isBuildVectorAllZeros(Op.getNode()))
10880 return Op;
10881
10882 // Vectors containing all ones can be matched by pcmpeqd on 128-bit width
10883 // vectors or broken into v4i32 operations on 256-bit vectors. AVX2 can use
10884 // vpcmpeqd on 256-bit vectors.
10885 if (Subtarget.hasSSE2() && ISD::isBuildVectorAllOnes(Op.getNode())) {
10886 if (VT == MVT::v4i32 || VT == MVT::v8i32 || VT == MVT::v16i32)
10887 return Op;
10888
10889 return getOnesVector(VT, DAG, DL);
10890 }
10891
10892 return SDValue();
10893}
10894
10895/// Look for opportunities to create a VPERMV/VPERMILPV/PSHUFB variable permute
10896/// from a vector of source values and a vector of extraction indices.
10897/// The vectors might be manipulated to match the type of the permute op.
10898static SDValue createVariablePermute(MVT VT, SDValue SrcVec, SDValue IndicesVec,
10899 SDLoc &DL, SelectionDAG &DAG,
10900 const X86Subtarget &Subtarget) {
10901 MVT ShuffleVT = VT;
10902 EVT IndicesVT = EVT(VT).changeVectorElementTypeToInteger();
10903 unsigned NumElts = VT.getVectorNumElements();
10904 unsigned SizeInBits = VT.getSizeInBits();
10905
10906 // Adjust IndicesVec to match VT size.
10907  assert(IndicesVec.getValueType().getVectorNumElements() >= NumElts &&
10908         "Illegal variable permute mask size");
10909 if (IndicesVec.getValueType().getVectorNumElements() > NumElts) {
10910 // Narrow/widen the indices vector to the correct size.
10911 if (IndicesVec.getValueSizeInBits() > SizeInBits)
10912 IndicesVec = extractSubVector(IndicesVec, 0, DAG, SDLoc(IndicesVec),
10913 NumElts * VT.getScalarSizeInBits());
10914 else if (IndicesVec.getValueSizeInBits() < SizeInBits)
10915 IndicesVec = widenSubVector(IndicesVec, false, Subtarget, DAG,
10916 SDLoc(IndicesVec), SizeInBits);
10917 // Zero-extend the index elements within the vector.
10918 if (IndicesVec.getValueType().getVectorNumElements() > NumElts)
10919 IndicesVec = DAG.getNode(ISD::ZERO_EXTEND_VECTOR_INREG, SDLoc(IndicesVec),
10920 IndicesVT, IndicesVec);
10921 }
10922 IndicesVec = DAG.getZExtOrTrunc(IndicesVec, SDLoc(IndicesVec), IndicesVT);
10923
10924  // Handle a SrcVec whose type doesn't match VT.
10925 if (SrcVec.getValueSizeInBits() != SizeInBits) {
10926 if ((SrcVec.getValueSizeInBits() % SizeInBits) == 0) {
10927 // Handle larger SrcVec by treating it as a larger permute.
10928 unsigned Scale = SrcVec.getValueSizeInBits() / SizeInBits;
10929 VT = MVT::getVectorVT(VT.getScalarType(), Scale * NumElts);
10930 IndicesVT = EVT(VT).changeVectorElementTypeToInteger();
10931 IndicesVec = widenSubVector(IndicesVT.getSimpleVT(), IndicesVec, false,
10932 Subtarget, DAG, SDLoc(IndicesVec));
10933 SDValue NewSrcVec =
10934 createVariablePermute(VT, SrcVec, IndicesVec, DL, DAG, Subtarget);
10935 if (NewSrcVec)
10936 return extractSubVector(NewSrcVec, 0, DAG, DL, SizeInBits);
10937 return SDValue();
10938 } else if (SrcVec.getValueSizeInBits() < SizeInBits) {
10939 // Widen smaller SrcVec to match VT.
10940 SrcVec = widenSubVector(VT, SrcVec, false, Subtarget, DAG, SDLoc(SrcVec));
10941 } else
10942 return SDValue();
10943 }
10944
10945 auto ScaleIndices = [&DAG](SDValue Idx, uint64_t Scale) {
10946    assert(isPowerOf2_64(Scale) && "Illegal variable permute shuffle scale");
10947 EVT SrcVT = Idx.getValueType();
10948 unsigned NumDstBits = SrcVT.getScalarSizeInBits() / Scale;
10949 uint64_t IndexScale = 0;
10950 uint64_t IndexOffset = 0;
10951
10952 // If we're scaling a smaller permute op, then we need to repeat the
10953 // indices, scaling and offsetting them as well.
10954 // e.g. v4i32 -> v16i8 (Scale = 4)
10955 // IndexScale = v4i32 Splat(4 << 24 | 4 << 16 | 4 << 8 | 4)
10956 // IndexOffset = v4i32 Splat(3 << 24 | 2 << 16 | 1 << 8 | 0)
10957 for (uint64_t i = 0; i != Scale; ++i) {
10958 IndexScale |= Scale << (i * NumDstBits);
10959 IndexOffset |= i << (i * NumDstBits);
10960 }
10961
10962 Idx = DAG.getNode(ISD::MUL, SDLoc(Idx), SrcVT, Idx,
10963 DAG.getConstant(IndexScale, SDLoc(Idx), SrcVT));
10964 Idx = DAG.getNode(ISD::ADD, SDLoc(Idx), SrcVT, Idx,
10965 DAG.getConstant(IndexOffset, SDLoc(Idx), SrcVT));
10966 return Idx;
10967 };
10968
10969 unsigned Opcode = 0;
10970 switch (VT.SimpleTy) {
10971 default:
10972 break;
10973 case MVT::v16i8:
10974 if (Subtarget.hasSSSE3())
10975 Opcode = X86ISD::PSHUFB;
10976 break;
10977 case MVT::v8i16:
10978 if (Subtarget.hasVLX() && Subtarget.hasBWI())
10979 Opcode = X86ISD::VPERMV;
10980 else if (Subtarget.hasSSSE3()) {
10981 Opcode = X86ISD::PSHUFB;
10982 ShuffleVT = MVT::v16i8;
10983 }
10984 break;
10985 case MVT::v4f32:
10986 case MVT::v4i32:
10987 if (Subtarget.hasAVX()) {
10988 Opcode = X86ISD::VPERMILPV;
10989 ShuffleVT = MVT::v4f32;
10990 } else if (Subtarget.hasSSSE3()) {
10991 Opcode = X86ISD::PSHUFB;
10992 ShuffleVT = MVT::v16i8;
10993 }
10994 break;
10995 case MVT::v2f64:
10996 case MVT::v2i64:
10997 if (Subtarget.hasAVX()) {
10998 // VPERMILPD selects using bit#1 of the index vector, so scale IndicesVec.
10999 IndicesVec = DAG.getNode(ISD::ADD, DL, IndicesVT, IndicesVec, IndicesVec);
11000 Opcode = X86ISD::VPERMILPV;
11001 ShuffleVT = MVT::v2f64;
11002 } else if (Subtarget.hasSSE41()) {
11003 // SSE41 can compare v2i64 - select between indices 0 and 1.
11004 return DAG.getSelectCC(
11005 DL, IndicesVec,
11006 getZeroVector(IndicesVT.getSimpleVT(), Subtarget, DAG, DL),
11007 DAG.getVectorShuffle(VT, DL, SrcVec, SrcVec, {0, 0}),
11008 DAG.getVectorShuffle(VT, DL, SrcVec, SrcVec, {1, 1}),
11009 ISD::CondCode::SETEQ);
11010 }
11011 break;
11012 case MVT::v32i8:
11013 if (Subtarget.hasVLX() && Subtarget.hasVBMI())
11014 Opcode = X86ISD::VPERMV;
11015 else if (Subtarget.hasXOP()) {
11016 SDValue LoSrc = extract128BitVector(SrcVec, 0, DAG, DL);
11017 SDValue HiSrc = extract128BitVector(SrcVec, 16, DAG, DL);
11018 SDValue LoIdx = extract128BitVector(IndicesVec, 0, DAG, DL);
11019 SDValue HiIdx = extract128BitVector(IndicesVec, 16, DAG, DL);
11020 return DAG.getNode(
11021 ISD::CONCAT_VECTORS, DL, VT,
11022 DAG.getNode(X86ISD::VPPERM, DL, MVT::v16i8, LoSrc, HiSrc, LoIdx),
11023 DAG.getNode(X86ISD::VPPERM, DL, MVT::v16i8, LoSrc, HiSrc, HiIdx));
11024 } else if (Subtarget.hasAVX()) {
11025 SDValue Lo = extract128BitVector(SrcVec, 0, DAG, DL);
11026 SDValue Hi = extract128BitVector(SrcVec, 16, DAG, DL);
11027 SDValue LoLo = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Lo);
11028 SDValue HiHi = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Hi, Hi);
11029 auto PSHUFBBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
11030 ArrayRef<SDValue> Ops) {
11031 // Permute Lo and Hi and then select based on index range.
11032 // This works as SHUFB uses bits[3:0] to permute elements and we don't
11033 // care about the bit[7] as its just an index vector.
11034 SDValue Idx = Ops[2];
11035 EVT VT = Idx.getValueType();
11036 return DAG.getSelectCC(DL, Idx, DAG.getConstant(15, DL, VT),
11037 DAG.getNode(X86ISD::PSHUFB, DL, VT, Ops[1], Idx),
11038 DAG.getNode(X86ISD::PSHUFB, DL, VT, Ops[0], Idx),
11039 ISD::CondCode::SETGT);
11040 };
11041 SDValue Ops[] = {LoLo, HiHi, IndicesVec};
11042 return SplitOpsAndApply(DAG, Subtarget, DL, MVT::v32i8, Ops,
11043 PSHUFBBuilder);
11044 }
11045 break;
11046 case MVT::v16i16:
11047 if (Subtarget.hasVLX() && Subtarget.hasBWI())
11048 Opcode = X86ISD::VPERMV;
11049 else if (Subtarget.hasAVX()) {
11050 // Scale to v32i8 and perform as v32i8.
11051 IndicesVec = ScaleIndices(IndicesVec, 2);
11052 return DAG.getBitcast(
11053 VT, createVariablePermute(
11054 MVT::v32i8, DAG.getBitcast(MVT::v32i8, SrcVec),
11055 DAG.getBitcast(MVT::v32i8, IndicesVec), DL, DAG, Subtarget));
11056 }
11057 break;
11058 case MVT::v8f32:
11059 case MVT::v8i32:
11060 if (Subtarget.hasAVX2())
11061 Opcode = X86ISD::VPERMV;
11062 else if (Subtarget.hasAVX()) {
11063 SrcVec = DAG.getBitcast(MVT::v8f32, SrcVec);
11064 SDValue LoLo = DAG.getVectorShuffle(MVT::v8f32, DL, SrcVec, SrcVec,
11065 {0, 1, 2, 3, 0, 1, 2, 3});
11066 SDValue HiHi = DAG.getVectorShuffle(MVT::v8f32, DL, SrcVec, SrcVec,
11067 {4, 5, 6, 7, 4, 5, 6, 7});
11068 if (Subtarget.hasXOP())
11069 return DAG.getBitcast(
11070 VT, DAG.getNode(X86ISD::VPERMIL2, DL, MVT::v8f32, LoLo, HiHi,
11071 IndicesVec, DAG.getTargetConstant(0, DL, MVT::i8)));
11072 // Permute Lo and Hi and then select based on index range.
11073 // This works as VPERMILPS only uses index bits[0:1] to permute elements.
11074 SDValue Res = DAG.getSelectCC(
11075 DL, IndicesVec, DAG.getConstant(3, DL, MVT::v8i32),
11076 DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v8f32, HiHi, IndicesVec),
11077 DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v8f32, LoLo, IndicesVec),
11078 ISD::CondCode::SETGT);
11079 return DAG.getBitcast(VT, Res);
11080 }
11081 break;
11082 case MVT::v4i64:
11083 case MVT::v4f64:
11084 if (Subtarget.hasAVX512()) {
11085 if (!Subtarget.hasVLX()) {
11086 MVT WidenSrcVT = MVT::getVectorVT(VT.getScalarType(), 8);
11087 SrcVec = widenSubVector(WidenSrcVT, SrcVec, false, Subtarget, DAG,
11088 SDLoc(SrcVec));
11089 IndicesVec = widenSubVector(MVT::v8i64, IndicesVec, false, Subtarget,
11090 DAG, SDLoc(IndicesVec));
11091 SDValue Res = createVariablePermute(WidenSrcVT, SrcVec, IndicesVec, DL,
11092 DAG, Subtarget);
11093 return extract256BitVector(Res, 0, DAG, DL);
11094 }
11095 Opcode = X86ISD::VPERMV;
11096 } else if (Subtarget.hasAVX()) {
11097 SrcVec = DAG.getBitcast(MVT::v4f64, SrcVec);
11098 SDValue LoLo =
11099 DAG.getVectorShuffle(MVT::v4f64, DL, SrcVec, SrcVec, {0, 1, 0, 1});
11100 SDValue HiHi =
11101 DAG.getVectorShuffle(MVT::v4f64, DL, SrcVec, SrcVec, {2, 3, 2, 3});
11102 // VPERMIL2PD selects with bit#1 of the index vector, so scale IndicesVec.
11103 IndicesVec = DAG.getNode(ISD::ADD, DL, IndicesVT, IndicesVec, IndicesVec);
11104 if (Subtarget.hasXOP())
11105 return DAG.getBitcast(
11106 VT, DAG.getNode(X86ISD::VPERMIL2, DL, MVT::v4f64, LoLo, HiHi,
11107 IndicesVec, DAG.getTargetConstant(0, DL, MVT::i8)));
11108 // Permute Lo and Hi and then select based on index range.
11109 // This works as VPERMILPD only uses index bit[1] to permute elements.
11110 SDValue Res = DAG.getSelectCC(
11111 DL, IndicesVec, DAG.getConstant(2, DL, MVT::v4i64),
11112 DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v4f64, HiHi, IndicesVec),
11113 DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v4f64, LoLo, IndicesVec),
11114 ISD::CondCode::SETGT);
11115 return DAG.getBitcast(VT, Res);
11116 }
11117 break;
11118 case MVT::v64i8:
11119 if (Subtarget.hasVBMI())
11120 Opcode = X86ISD::VPERMV;
11121 break;
11122 case MVT::v32i16:
11123 if (Subtarget.hasBWI())
11124 Opcode = X86ISD::VPERMV;
11125 break;
11126 case MVT::v16f32:
11127 case MVT::v16i32:
11128 case MVT::v8f64:
11129 case MVT::v8i64:
11130 if (Subtarget.hasAVX512())
11131 Opcode = X86ISD::VPERMV;
11132 break;
11133 }
11134 if (!Opcode)
11135 return SDValue();
11136
11137  assert((VT.getSizeInBits() == ShuffleVT.getSizeInBits()) &&
11138         (VT.getScalarSizeInBits() % ShuffleVT.getScalarSizeInBits()) == 0 &&
11139         "Illegal variable permute shuffle type");
11140
11141 uint64_t Scale = VT.getScalarSizeInBits() / ShuffleVT.getScalarSizeInBits();
11142 if (Scale > 1)
11143 IndicesVec = ScaleIndices(IndicesVec, Scale);
11144
11145 EVT ShuffleIdxVT = EVT(ShuffleVT).changeVectorElementTypeToInteger();
11146 IndicesVec = DAG.getBitcast(ShuffleIdxVT, IndicesVec);
11147
11148 SrcVec = DAG.getBitcast(ShuffleVT, SrcVec);
11149 SDValue Res = Opcode == X86ISD::VPERMV
11150 ? DAG.getNode(Opcode, DL, ShuffleVT, IndicesVec, SrcVec)
11151 : DAG.getNode(Opcode, DL, ShuffleVT, SrcVec, IndicesVec);
11152 return DAG.getBitcast(VT, Res);
11153}
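// A minimal standalone check (plain C++, no LLVM types) of the ScaleIndices
// arithmetic above for the v4i32 -> v16i8 case (Scale = 4): multiplying a
// 32-bit index lane by 0x04040404 and adding 0x03020100 produces the four
// byte indices of that element, ready for PSHUFB. scaleOneIndexLane is an
// illustrative name, not an LLVM API.
#include <cstdint>

static uint32_t scaleOneIndexLane(uint32_t Lane) {
  const uint64_t Scale = 4;               // 32-bit index driving a byte shuffle
  const unsigned NumDstBits = 32 / Scale; // 8
  uint32_t IndexScale = 0, IndexOffset = 0;
  for (uint64_t i = 0; i != Scale; ++i) {
    IndexScale |= (uint32_t)(Scale << (i * NumDstBits)); // 0x04040404
    IndexOffset |= (uint32_t)(i << (i * NumDstBits));    // 0x03020100
  }
  return Lane * IndexScale + IndexOffset; // the per-lane MUL + ADD above
}
// Example: scaleOneIndexLane(2) == 0x0B0A0908, i.e. byte indices 8..11, which
// are exactly the four bytes of 32-bit source element 2.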
11154
11155// Tries to lower a BUILD_VECTOR composed of extract-extract chains that can be
11156// reasoned to be a permutation of a vector by indices in a non-constant vector.
11157// (build_vector (extract_elt V, (extract_elt I, 0)),
11158// (extract_elt V, (extract_elt I, 1)),
11159// ...
11160// ->
11161// (vpermv I, V)
11162//
11163// TODO: Handle undefs
11164// TODO: Utilize pshufb and zero mask blending to support more efficient
11165// construction of vectors with constant-0 elements.
11166static SDValue
11167LowerBUILD_VECTORAsVariablePermute(SDValue V, SelectionDAG &DAG,
11168 const X86Subtarget &Subtarget) {
11169 SDValue SrcVec, IndicesVec;
11170 // Check for a match of the permute source vector and permute index elements.
11171 // This is done by checking that the i-th build_vector operand is of the form:
11172 // (extract_elt SrcVec, (extract_elt IndicesVec, i)).
11173 for (unsigned Idx = 0, E = V.getNumOperands(); Idx != E; ++Idx) {
11174 SDValue Op = V.getOperand(Idx);
11175 if (Op.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
11176 return SDValue();
11177
11178 // If this is the first extract encountered in V, set the source vector,
11179 // otherwise verify the extract is from the previously defined source
11180 // vector.
11181 if (!SrcVec)
11182 SrcVec = Op.getOperand(0);
11183 else if (SrcVec != Op.getOperand(0))
11184 return SDValue();
11185 SDValue ExtractedIndex = Op->getOperand(1);
11186 // Peek through extends.
11187 if (ExtractedIndex.getOpcode() == ISD::ZERO_EXTEND ||
11188 ExtractedIndex.getOpcode() == ISD::SIGN_EXTEND)
11189 ExtractedIndex = ExtractedIndex.getOperand(0);
11190 if (ExtractedIndex.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
11191 return SDValue();
11192
11193 // If this is the first extract from the index vector candidate, set the
11194 // indices vector, otherwise verify the extract is from the previously
11195 // defined indices vector.
11196 if (!IndicesVec)
11197 IndicesVec = ExtractedIndex.getOperand(0);
11198 else if (IndicesVec != ExtractedIndex.getOperand(0))
11199 return SDValue();
11200
11201 auto *PermIdx = dyn_cast<ConstantSDNode>(ExtractedIndex.getOperand(1));
11202 if (!PermIdx || PermIdx->getAPIntValue() != Idx)
11203 return SDValue();
11204 }
11205
11206 SDLoc DL(V);
11207 MVT VT = V.getSimpleValueType();
11208 return createVariablePermute(VT, SrcVec, IndicesVec, DL, DAG, Subtarget);
11209}
11210
11211SDValue
11212X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
11213 SDLoc dl(Op);
11214
11215 MVT VT = Op.getSimpleValueType();
11216 MVT EltVT = VT.getVectorElementType();
11217 MVT OpEltVT = Op.getOperand(0).getSimpleValueType();
11218 unsigned NumElems = Op.getNumOperands();
11219
11220 // Generate vectors for predicate vectors.
11221 if (VT.getVectorElementType() == MVT::i1 && Subtarget.hasAVX512())
11222 return LowerBUILD_VECTORvXi1(Op, DAG, Subtarget);
11223
11224 if (VT.getVectorElementType() == MVT::bf16 && Subtarget.hasBF16())
11225 return LowerBUILD_VECTORvXbf16(Op, DAG, Subtarget);
11226
11227 if (SDValue VectorConstant = materializeVectorConstant(Op, DAG, Subtarget))
11228 return VectorConstant;
11229
11230 unsigned EVTBits = EltVT.getSizeInBits();
11231 APInt UndefMask = APInt::getZero(NumElems);
11232 APInt FrozenUndefMask = APInt::getZero(NumElems);
11233 APInt ZeroMask = APInt::getZero(NumElems);
11234 APInt NonZeroMask = APInt::getZero(NumElems);
11235 bool IsAllConstants = true;
11236 SmallSet<SDValue, 8> Values;
11237 unsigned NumConstants = NumElems;
11238 for (unsigned i = 0; i < NumElems; ++i) {
11239 SDValue Elt = Op.getOperand(i);
11240 if (Elt.isUndef()) {
11241 UndefMask.setBit(i);
11242 continue;
11243 }
11244 if (Elt.getOpcode() == ISD::FREEZE && Elt.getOperand(0).isUndef()) {
11245 FrozenUndefMask.setBit(i);
11246 continue;
11247 }
11248 Values.insert(Elt);
11249 if (!isIntOrFPConstant(Elt)) {
11250 IsAllConstants = false;
11251 NumConstants--;
11252 }
11253 if (X86::isZeroNode(Elt)) {
11254 ZeroMask.setBit(i);
11255 } else {
11256 NonZeroMask.setBit(i);
11257 }
11258 }
11259
11260 // All undef vector. Return an UNDEF.
11261 if (UndefMask.isAllOnes())
11262 return DAG.getUNDEF(VT);
11263
11264 // If we have multiple FREEZE-UNDEF operands, we are likely going to end up
11265 // lowering into a suboptimal insertion sequence. Instead, thaw the UNDEF in
11266 // our source BUILD_VECTOR, create another FREEZE-UNDEF splat BUILD_VECTOR,
11267 // and blend the FREEZE-UNDEF operands back in.
11268 // FIXME: is this worthwhile even for a single FREEZE-UNDEF operand?
11269 if (unsigned NumFrozenUndefElts = FrozenUndefMask.popcount();
11270 NumFrozenUndefElts >= 2 && NumFrozenUndefElts < NumElems) {
11271 SmallVector<int, 16> BlendMask(NumElems, -1);
11272 SmallVector<SDValue, 16> Elts(NumElems, DAG.getUNDEF(OpEltVT));
11273 for (unsigned i = 0; i < NumElems; ++i) {
11274 if (UndefMask[i]) {
11275 BlendMask[i] = -1;
11276 continue;
11277 }
11278 BlendMask[i] = i;
11279 if (!FrozenUndefMask[i])
11280 Elts[i] = Op.getOperand(i);
11281 else
11282 BlendMask[i] += NumElems;
11283 }
11284 SDValue EltsBV = DAG.getBuildVector(VT, dl, Elts);
11285 SDValue FrozenUndefElt = DAG.getFreeze(DAG.getUNDEF(OpEltVT));
11286 SDValue FrozenUndefBV = DAG.getSplatBuildVector(VT, dl, FrozenUndefElt);
11287 return DAG.getVectorShuffle(VT, dl, EltsBV, FrozenUndefBV, BlendMask);
11288 }
11289
11290 BuildVectorSDNode *BV = cast<BuildVectorSDNode>(Op.getNode());
11291
11292 // If the upper elts of a ymm/zmm are undef/zero then we might be better off
11293 // lowering to a smaller build vector and padding with undef/zero.
11294 if ((VT.is256BitVector() || VT.is512BitVector()) &&
11295 !isFoldableUseOfShuffle(BV)) {
11296 unsigned UpperElems = NumElems / 2;
11297 APInt UndefOrZeroMask = UndefMask | ZeroMask;
11298 unsigned NumUpperUndefsOrZeros = UndefOrZeroMask.countl_one();
11299 if (NumUpperUndefsOrZeros >= UpperElems) {
11300 if (VT.is512BitVector() &&
11301 NumUpperUndefsOrZeros >= (NumElems - (NumElems / 4)))
11302 UpperElems = NumElems - (NumElems / 4);
11303 bool UndefUpper = UndefMask.countl_one() >= UpperElems;
11304 MVT LowerVT = MVT::getVectorVT(EltVT, NumElems - UpperElems);
11305 SDValue NewBV =
11306 DAG.getBuildVector(LowerVT, dl, Op->ops().drop_back(UpperElems));
11307 return widenSubVector(VT, NewBV, !UndefUpper, Subtarget, DAG, dl);
11308 }
11309 }
11310
11311 if (SDValue AddSub = lowerToAddSubOrFMAddSub(BV, Subtarget, DAG))
11312 return AddSub;
11313 if (SDValue HorizontalOp = LowerToHorizontalOp(BV, Subtarget, DAG))
11314 return HorizontalOp;
11315 if (SDValue Broadcast = lowerBuildVectorAsBroadcast(BV, Subtarget, DAG))
11316 return Broadcast;
11317 if (SDValue BitOp = lowerBuildVectorToBitOp(BV, Subtarget, DAG))
11318 return BitOp;
11319
11320 unsigned NumZero = ZeroMask.popcount();
11321 unsigned NumNonZero = NonZeroMask.popcount();
11322
11323 // If we are inserting one variable into a vector of non-zero constants, try
11324 // to avoid loading each constant element as a scalar. Load the constants as a
11325 // vector and then insert the variable scalar element. If insertion is not
11326 // supported, fall back to a shuffle to get the scalar blended with the
11327 // constants. Insertion into a zero vector is handled as a special-case
11328 // somewhere below here.
11329 if (NumConstants == NumElems - 1 && NumNonZero != 1 &&
11330 (isOperationLegalOrCustom(ISD::INSERT_VECTOR_ELT, VT) ||
11331 isOperationLegalOrCustom(ISD::VECTOR_SHUFFLE, VT))) {
11332 // Create an all-constant vector. The variable element in the old
11333 // build vector is replaced by undef in the constant vector. Save the
11334 // variable scalar element and its index for use in the insertelement.
11335 LLVMContext &Context = *DAG.getContext();
11336 Type *EltType = Op.getValueType().getScalarType().getTypeForEVT(Context);
11337 SmallVector<Constant *, 16> ConstVecOps(NumElems, UndefValue::get(EltType));
11338 SDValue VarElt;
11339 SDValue InsIndex;
11340 for (unsigned i = 0; i != NumElems; ++i) {
11341 SDValue Elt = Op.getOperand(i);
11342 if (auto *C = dyn_cast<ConstantSDNode>(Elt))
11343 ConstVecOps[i] = ConstantInt::get(Context, C->getAPIntValue());
11344 else if (auto *C = dyn_cast<ConstantFPSDNode>(Elt))
11345 ConstVecOps[i] = ConstantFP::get(Context, C->getValueAPF());
11346 else if (!Elt.isUndef()) {
11347        assert(!VarElt.getNode() && !InsIndex.getNode() &&
11348               "Expected one variable element in this vector");
11349 VarElt = Elt;
11350 InsIndex = DAG.getVectorIdxConstant(i, dl);
11351 }
11352 }
11353 Constant *CV = ConstantVector::get(ConstVecOps);
11354 SDValue DAGConstVec = DAG.getConstantPool(CV, VT);
11355
11356    // The constants we just created may not be legal (e.g., floating point). We
11357    // must lower the vector right here because we cannot guarantee that we'll
11358 // legalize it before loading it. This is also why we could not just create
11359 // a new build vector here. If the build vector contains illegal constants,
11360 // it could get split back up into a series of insert elements.
11361 // TODO: Improve this by using shorter loads with broadcast/VZEXT_LOAD.
11362 SDValue LegalDAGConstVec = LowerConstantPool(DAGConstVec, DAG);
11363 MachineFunction &MF = DAG.getMachineFunction();
11364 MachinePointerInfo MPI = MachinePointerInfo::getConstantPool(MF);
11365 SDValue Ld = DAG.getLoad(VT, dl, DAG.getEntryNode(), LegalDAGConstVec, MPI);
11366 unsigned InsertC = cast<ConstantSDNode>(InsIndex)->getZExtValue();
11367 unsigned NumEltsInLow128Bits = 128 / VT.getScalarSizeInBits();
11368 if (InsertC < NumEltsInLow128Bits)
11369 return DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Ld, VarElt, InsIndex);
11370
11371 // There's no good way to insert into the high elements of a >128-bit
11372 // vector, so use shuffles to avoid an extract/insert sequence.
11373    assert(VT.getSizeInBits() > 128 && "Invalid insertion index?");
11374    assert(Subtarget.hasAVX() && "Must have AVX with >16-byte vector");
11375 SmallVector<int, 8> ShuffleMask;
11376 unsigned NumElts = VT.getVectorNumElements();
11377 for (unsigned i = 0; i != NumElts; ++i)
11378 ShuffleMask.push_back(i == InsertC ? NumElts : i);
11379 SDValue S2V = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, VarElt);
11380 return DAG.getVectorShuffle(VT, dl, Ld, S2V, ShuffleMask);
11381 }
11382
11383 // Special case for single non-zero, non-undef, element.
11384 if (NumNonZero == 1) {
11385 unsigned Idx = NonZeroMask.countr_zero();
11386 SDValue Item = Op.getOperand(Idx);
11387
11388 // If we have a constant or non-constant insertion into the low element of
11389 // a vector, we can do this with SCALAR_TO_VECTOR + shuffle of zero into
11390 // the rest of the elements. This will be matched as movd/movq/movss/movsd
11391 // depending on what the source datatype is.
11392 if (Idx == 0) {
11393 if (NumZero == 0)
11394 return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
11395
11396 if (EltVT == MVT::i32 || EltVT == MVT::f16 || EltVT == MVT::f32 ||
11397 EltVT == MVT::f64 || (EltVT == MVT::i64 && Subtarget.is64Bit()) ||
11398 (EltVT == MVT::i16 && Subtarget.hasFP16())) {
11399        assert((VT.is128BitVector() || VT.is256BitVector() ||
11400                VT.is512BitVector()) &&
11401               "Expected an SSE value type!");
11402 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
11403 // Turn it into a MOVL (i.e. movsh, movss, movsd, movw or movd) to a
11404 // zero vector.
11405 return getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
11406 }
11407
11408 // We can't directly insert an i8 or i16 into a vector, so zero extend
11409 // it to i32 first.
11410 if (EltVT == MVT::i16 || EltVT == MVT::i8) {
11411 Item = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, Item);
11412 MVT ShufVT = MVT::getVectorVT(MVT::i32, VT.getSizeInBits() / 32);
11413 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, ShufVT, Item);
11414 Item = getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
11415 return DAG.getBitcast(VT, Item);
11416 }
11417 }
11418
11419 // Is it a vector logical left shift?
11420 if (NumElems == 2 && Idx == 1 &&
11421 X86::isZeroNode(Op.getOperand(0)) &&
11422 !X86::isZeroNode(Op.getOperand(1))) {
11423 unsigned NumBits = VT.getSizeInBits();
11424 return getVShift(true, VT,
11425 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
11426 VT, Op.getOperand(1)),
11427 NumBits/2, DAG, *this, dl);
11428 }
11429
11430 if (IsAllConstants) // Otherwise, it's better to do a constpool load.
11431 return SDValue();
11432
11433 // Otherwise, if this is a vector with i32 or f32 elements, and the element
11434 // is a non-constant being inserted into an element other than the low one,
11435 // we can't use a constant pool load. Instead, use SCALAR_TO_VECTOR (aka
11436 // movd/movss) to move this into the low element, then shuffle it into
11437 // place.
11438 if (EVTBits == 32) {
11439 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
11440 return getShuffleVectorZeroOrUndef(Item, Idx, NumZero > 0, Subtarget, DAG);
11441 }
11442 }
11443
11444 // Splat is obviously ok. Let legalizer expand it to a shuffle.
11445 if (Values.size() == 1) {
11446 if (EVTBits == 32) {
11447 // Instead of a shuffle like this:
11448 // shuffle (scalar_to_vector (load (ptr + 4))), undef, <0, 0, 0, 0>
11449 // Check if it's possible to issue this instead.
11450 // shuffle (vload ptr)), undef, <1, 1, 1, 1>
11451 unsigned Idx = NonZeroMask.countr_zero();
11452 SDValue Item = Op.getOperand(Idx);
11453 if (Op.getNode()->isOnlyUserOf(Item.getNode()))
11454 return LowerAsSplatVectorLoad(Item, VT, dl, DAG);
11455 }
11456 return SDValue();
11457 }
11458
11459 // A vector full of immediates; various special cases are already
11460 // handled, so this is best done with a single constant-pool load.
11461 if (IsAllConstants)
11462 return SDValue();
11463
11464 if (SDValue V = LowerBUILD_VECTORAsVariablePermute(Op, DAG, Subtarget))
11465 return V;
11466
11467 // See if we can use a vector load to get all of the elements.
11468 {
11469 SmallVector<SDValue, 64> Ops(Op->op_begin(), Op->op_begin() + NumElems);
11470 if (SDValue LD =
11471 EltsFromConsecutiveLoads(VT, Ops, dl, DAG, Subtarget, false))
11472 return LD;
11473 }
11474
11475 // If this is a splat of pairs of 32-bit elements, we can use a narrower
11476 // build_vector and broadcast it.
11477 // TODO: We could probably generalize this more.
11478 if (Subtarget.hasAVX2() && EVTBits == 32 && Values.size() == 2) {
11479 SDValue Ops[4] = { Op.getOperand(0), Op.getOperand(1),
11480 DAG.getUNDEF(EltVT), DAG.getUNDEF(EltVT) };
11481 auto CanSplat = [](SDValue Op, unsigned NumElems, ArrayRef<SDValue> Ops) {
11482 // Make sure all the even/odd operands match.
11483 for (unsigned i = 2; i != NumElems; ++i)
11484 if (Ops[i % 2] != Op.getOperand(i))
11485 return false;
11486 return true;
11487 };
11488 if (CanSplat(Op, NumElems, Ops)) {
11489 MVT WideEltVT = VT.isFloatingPoint() ? MVT::f64 : MVT::i64;
11490 MVT NarrowVT = MVT::getVectorVT(EltVT, 4);
11491 // Create a new build vector and cast to v2i64/v2f64.
11492 SDValue NewBV = DAG.getBitcast(MVT::getVectorVT(WideEltVT, 2),
11493 DAG.getBuildVector(NarrowVT, dl, Ops));
11494 // Broadcast from v2i64/v2f64 and cast to final VT.
11495 MVT BcastVT = MVT::getVectorVT(WideEltVT, NumElems / 2);
11496 return DAG.getBitcast(VT, DAG.getNode(X86ISD::VBROADCAST, dl, BcastVT,
11497 NewBV));
11498 }
11499 }
11500
11501 // For AVX-length vectors, build the individual 128-bit pieces and use
11502 // shuffles to put them in place.
11503 if (VT.getSizeInBits() > 128) {
11504 MVT HVT = MVT::getVectorVT(EltVT, NumElems / 2);
11505
11506 // Build both the lower and upper subvector.
11507 SDValue Lower =
11508 DAG.getBuildVector(HVT, dl, Op->ops().slice(0, NumElems / 2));
11509 SDValue Upper = DAG.getBuildVector(
11510 HVT, dl, Op->ops().slice(NumElems / 2, NumElems /2));
11511
11512 // Recreate the wider vector with the lower and upper part.
11513 return concatSubVectors(Lower, Upper, DAG, dl);
11514 }
11515
11516 // Let legalizer expand 2-wide build_vectors.
11517 if (EVTBits == 64) {
11518 if (NumNonZero == 1) {
11519 // One half is zero or undef.
11520 unsigned Idx = NonZeroMask.countr_zero();
11521 SDValue V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT,
11522 Op.getOperand(Idx));
11523 return getShuffleVectorZeroOrUndef(V2, Idx, true, Subtarget, DAG);
11524 }
11525 return SDValue();
11526 }
11527
11528 // If element VT is < 32 bits, convert it to inserts into a zero vector.
11529 if (EVTBits == 8 && NumElems == 16)
11530 if (SDValue V = LowerBuildVectorv16i8(Op, NonZeroMask, NumNonZero, NumZero,
11531 DAG, Subtarget))
11532 return V;
11533
11534 if (EltVT == MVT::i16 && NumElems == 8)
11535 if (SDValue V = LowerBuildVectorv8i16(Op, NonZeroMask, NumNonZero, NumZero,
11536 DAG, Subtarget))
11537 return V;
11538
11539 // If element VT is == 32 bits and has 4 elems, try to generate an INSERTPS
11540 if (EVTBits == 32 && NumElems == 4)
11541 if (SDValue V = LowerBuildVectorv4x32(Op, DAG, Subtarget))
11542 return V;
11543
11544 // If element VT is == 32 bits, turn it into a number of shuffles.
11545 if (NumElems == 4 && NumZero > 0) {
11546 SmallVector<SDValue, 8> Ops(NumElems);
11547 for (unsigned i = 0; i < 4; ++i) {
11548 bool isZero = !NonZeroMask[i];
11549 if (isZero)
11550 Ops[i] = getZeroVector(VT, Subtarget, DAG, dl);
11551 else
11552 Ops[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i));
11553 }
11554
11555 for (unsigned i = 0; i < 2; ++i) {
11556 switch (NonZeroMask.extractBitsAsZExtValue(2, i * 2)) {
11557      default: llvm_unreachable("Unexpected NonZero count");
11558 case 0:
11559 Ops[i] = Ops[i*2]; // Must be a zero vector.
11560 break;
11561 case 1:
11562 Ops[i] = getMOVL(DAG, dl, VT, Ops[i*2+1], Ops[i*2]);
11563 break;
11564 case 2:
11565 Ops[i] = getMOVL(DAG, dl, VT, Ops[i*2], Ops[i*2+1]);
11566 break;
11567 case 3:
11568 Ops[i] = getUnpackl(DAG, dl, VT, Ops[i*2], Ops[i*2+1]);
11569 break;
11570 }
11571 }
11572
11573 bool Reverse1 = NonZeroMask.extractBitsAsZExtValue(2, 0) == 2;
11574 bool Reverse2 = NonZeroMask.extractBitsAsZExtValue(2, 2) == 2;
11575 int MaskVec[] = {
11576 Reverse1 ? 1 : 0,
11577 Reverse1 ? 0 : 1,
11578 static_cast<int>(Reverse2 ? NumElems+1 : NumElems),
11579 static_cast<int>(Reverse2 ? NumElems : NumElems+1)
11580 };
11581 return DAG.getVectorShuffle(VT, dl, Ops[0], Ops[1], MaskVec);
11582 }
11583
11584  assert(Values.size() > 1 && "Expected non-undef and non-splat vector");
11585
11586 // Check for a build vector from mostly shuffle plus few inserting.
11587 if (SDValue Sh = buildFromShuffleMostly(Op, DAG))
11588 return Sh;
11589
11590 // For SSE 4.1, use insertps to put the high elements into the low element.
11591 if (Subtarget.hasSSE41() && EltVT != MVT::f16) {
11592 SDValue Result;
11593 if (!Op.getOperand(0).isUndef())
11594 Result = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(0));
11595 else
11596 Result = DAG.getUNDEF(VT);
11597
11598 for (unsigned i = 1; i < NumElems; ++i) {
11599 if (Op.getOperand(i).isUndef()) continue;
11600 Result = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Result,
11601 Op.getOperand(i), DAG.getIntPtrConstant(i, dl));
11602 }
11603 return Result;
11604 }
11605
11606 // Otherwise, expand into a number of unpckl*, start by extending each of
11607 // our (non-undef) elements to the full vector width with the element in the
11608 // bottom slot of the vector (which generates no code for SSE).
11609 SmallVector<SDValue, 8> Ops(NumElems);
11610 for (unsigned i = 0; i < NumElems; ++i) {
11611 if (!Op.getOperand(i).isUndef())
11612 Ops[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i));
11613 else
11614 Ops[i] = DAG.getUNDEF(VT);
11615 }
11616
11617 // Next, we iteratively mix elements, e.g. for v4f32:
11618 // Step 1: unpcklps 0, 1 ==> X: <?, ?, 1, 0>
11619 // : unpcklps 2, 3 ==> Y: <?, ?, 3, 2>
11620 // Step 2: unpcklpd X, Y ==> <3, 2, 1, 0>
11621 for (unsigned Scale = 1; Scale < NumElems; Scale *= 2) {
11622 // Generate scaled UNPCKL shuffle mask.
11623 SmallVector<int, 16> Mask;
11624 for(unsigned i = 0; i != Scale; ++i)
11625 Mask.push_back(i);
11626 for (unsigned i = 0; i != Scale; ++i)
11627 Mask.push_back(NumElems+i);
11628 Mask.append(NumElems - Mask.size(), SM_SentinelUndef);
11629
11630 for (unsigned i = 0, e = NumElems / (2 * Scale); i != e; ++i)
11631 Ops[i] = DAG.getVectorShuffle(VT, dl, Ops[2*i], Ops[(2*i)+1], Mask);
11632 }
11633 return Ops[0];
11634}
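// A minimal standalone sketch (plain C++, no LLVM types) of the final unpckl
// expansion above, printing the scaled shuffle mask used at each step for a
// v4f32 build_vector (-1 marks an undef lane). Step 1 interleaves pairs of
// scalars, step 2 interleaves the two halves. printUnpcklExpansionMasks is an
// illustrative name, not an LLVM API.
#include <cstdio>
#include <vector>

static void printUnpcklExpansionMasks() {
  const unsigned NumElems = 4; // v4f32
  for (unsigned Scale = 1; Scale < NumElems; Scale *= 2) {
    std::vector<int> Mask;
    for (unsigned i = 0; i != Scale; ++i)
      Mask.push_back(i);            // low lanes of the first operand
    for (unsigned i = 0; i != Scale; ++i)
      Mask.push_back(NumElems + i); // low lanes of the second operand
    Mask.resize(NumElems, -1);      // remaining lanes are undef
    std::printf("Scale %u:", Scale);
    for (int M : Mask)
      std::printf(" %d", M);
    std::printf("\n");
  }
}
// Prints "Scale 1: 0 4 -1 -1" (unpcklps) and "Scale 2: 0 1 4 5" (unpcklpd).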
11635
11636// 256-bit AVX can use the vinsertf128 instruction
11637// to create 256-bit vectors from two other 128-bit ones.
11638// TODO: Detect subvector broadcast here instead of DAG combine?
11639static SDValue LowerAVXCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG,
11640 const X86Subtarget &Subtarget) {
11641 SDLoc dl(Op);
11642 MVT ResVT = Op.getSimpleValueType();
11643
11644  assert((ResVT.is256BitVector() ||
11645          ResVT.is512BitVector()) && "Value type must be 256-/512-bit wide");
11646
11647 unsigned NumOperands = Op.getNumOperands();
11648 unsigned NumFreezeUndef = 0;
11649 unsigned NumZero = 0;
11650 unsigned NumNonZero = 0;
11651 unsigned NonZeros = 0;
11652 for (unsigned i = 0; i != NumOperands; ++i) {
11653 SDValue SubVec = Op.getOperand(i);
11654 if (SubVec.isUndef())
11655 continue;
11656 if (ISD::isFreezeUndef(SubVec.getNode()) && SubVec.hasOneUse())
11657 ++NumFreezeUndef;
11658 else if (ISD::isBuildVectorAllZeros(SubVec.getNode()))
11659 ++NumZero;
11660 else {
11661      assert(i < sizeof(NonZeros) * CHAR_BIT); // Ensure the shift is in range.
11662 NonZeros |= 1 << i;
11663 ++NumNonZero;
11664 }
11665 }
11666
11667 // If we have more than 2 non-zeros, build each half separately.
11668 if (NumNonZero > 2) {
11669 MVT HalfVT = ResVT.getHalfNumVectorElementsVT();
11670 ArrayRef<SDUse> Ops = Op->ops();
11671 SDValue Lo = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT,
11672 Ops.slice(0, NumOperands/2));
11673 SDValue Hi = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT,
11674 Ops.slice(NumOperands/2));
11675 return DAG.getNode(ISD::CONCAT_VECTORS, dl, ResVT, Lo, Hi);
11676 }
11677
11678 // Otherwise, build it up through insert_subvectors.
11679 SDValue Vec = NumZero ? getZeroVector(ResVT, Subtarget, DAG, dl)
11680 : (NumFreezeUndef ? DAG.getFreeze(DAG.getUNDEF(ResVT))
11681 : DAG.getUNDEF(ResVT));
11682
11683 MVT SubVT = Op.getOperand(0).getSimpleValueType();
11684 unsigned NumSubElems = SubVT.getVectorNumElements();
11685 for (unsigned i = 0; i != NumOperands; ++i) {
11686 if ((NonZeros & (1 << i)) == 0)
11687 continue;
11688
11689 Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Vec,
11690 Op.getOperand(i),
11691 DAG.getIntPtrConstant(i * NumSubElems, dl));
11692 }
11693
11694 return Vec;
11695}
11696
11697// Returns true if the given node is a type promotion (by concatenating i1
11698// zeros) of the result of a node that already zeros all upper bits of
11699// k-register.
11700// TODO: Merge this with LowerAVXCONCAT_VECTORS?
11701static SDValue LowerCONCAT_VECTORSvXi1(SDValue Op,
11702 const X86Subtarget &Subtarget,
11703 SelectionDAG & DAG) {
11704 SDLoc dl(Op);
11705 MVT ResVT = Op.getSimpleValueType();
11706 unsigned NumOperands = Op.getNumOperands();
11707
11708 assert(NumOperands > 1 && isPowerOf2_32(NumOperands) &&
11709 "Unexpected number of operands in CONCAT_VECTORS");
11710
11711 uint64_t Zeros = 0;
11712 uint64_t NonZeros = 0;
11713 for (unsigned i = 0; i != NumOperands; ++i) {
11714 SDValue SubVec = Op.getOperand(i);
11715 if (SubVec.isUndef())
11716 continue;
11717 assert(i < sizeof(NonZeros) * CHAR_BIT); // Ensure the shift is in range.
11718 if (ISD::isBuildVectorAllZeros(SubVec.getNode()))
11719 Zeros |= (uint64_t)1 << i;
11720 else
11721 NonZeros |= (uint64_t)1 << i;
11722 }
11723
11724 unsigned NumElems = ResVT.getVectorNumElements();
11725
11726 // If we are inserting a non-zero vector and there are zeros in the LSBs and
11727 // undef in the MSBs, we need to emit a KSHIFTL. The generic lowering to
11728 // insert_subvector will give us two kshifts.
11729 if (isPowerOf2_64(NonZeros) && Zeros != 0 && NonZeros > Zeros &&
11730 Log2_64(NonZeros) != NumOperands - 1) {
11731 MVT ShiftVT = ResVT;
11732 if ((!Subtarget.hasDQI() && NumElems == 8) || NumElems < 8)
11733 ShiftVT = Subtarget.hasDQI() ? MVT::v8i1 : MVT::v16i1;
11734 unsigned Idx = Log2_64(NonZeros);
11735 SDValue SubVec = Op.getOperand(Idx);
11736 unsigned SubVecNumElts = SubVec.getSimpleValueType().getVectorNumElements();
11737 SubVec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ShiftVT,
11738 DAG.getUNDEF(ShiftVT), SubVec,
11739 DAG.getIntPtrConstant(0, dl));
11740 Op = DAG.getNode(X86ISD::KSHIFTL, dl, ShiftVT, SubVec,
11741 DAG.getTargetConstant(Idx * SubVecNumElts, dl, MVT::i8));
11742 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, ResVT, Op,
11743 DAG.getIntPtrConstant(0, dl));
11744 }
11745
11746 // If there are zero or one non-zeros we can handle this very simply.
11747 if (NonZeros == 0 || isPowerOf2_64(NonZeros)) {
11748 SDValue Vec = Zeros ? DAG.getConstant(0, dl, ResVT) : DAG.getUNDEF(ResVT);
11749 if (!NonZeros)
11750 return Vec;
11751 unsigned Idx = Log2_64(NonZeros);
11752 SDValue SubVec = Op.getOperand(Idx);
11753 unsigned SubVecNumElts = SubVec.getSimpleValueType().getVectorNumElements();
11754 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Vec, SubVec,
11755 DAG.getIntPtrConstant(Idx * SubVecNumElts, dl));
11756 }
11757
11758 if (NumOperands > 2) {
11759 MVT HalfVT = ResVT.getHalfNumVectorElementsVT();
11760 ArrayRef<SDUse> Ops = Op->ops();
11761 SDValue Lo = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT,
11762 Ops.slice(0, NumOperands/2));
11763 SDValue Hi = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT,
11764 Ops.slice(NumOperands/2));
11765 return DAG.getNode(ISD::CONCAT_VECTORS, dl, ResVT, Lo, Hi);
11766 }
11767
11768 assert(llvm::popcount(NonZeros) == 2 && "Simple cases not handled?");
11769
11770 if (ResVT.getVectorNumElements() >= 16)
11771 return Op; // The operation is legal with KUNPCK
11772
11773 SDValue Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT,
11774 DAG.getUNDEF(ResVT), Op.getOperand(0),
11775 DAG.getIntPtrConstant(0, dl));
11776 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Vec, Op.getOperand(1),
11777 DAG.getIntPtrConstant(NumElems/2, dl));
11778}
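// Illustrative trace of the KSHIFTL path above (operand values assumed):
// concatenating four v2i1 subvectors { zeros, zeros, X, undef } into v8i1
// gives Zeros == 0b0011 and NonZeros == 0b0100. NonZeros is a power of two,
// NonZeros > Zeros, and Log2_64(NonZeros) == 2 is not the last operand index,
// so X is inserted at index 0 of ShiftVT and shifted left by
// Idx * SubVecNumElts == 2 * 2 == 4 positions with a single KSHIFTL.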
11779
11780static SDValue LowerCONCAT_VECTORS(SDValue Op,
11781 const X86Subtarget &Subtarget,
11782 SelectionDAG &DAG) {
11783 MVT VT = Op.getSimpleValueType();
11784 if (VT.getVectorElementType() == MVT::i1)
11785 return LowerCONCAT_VECTORSvXi1(Op, Subtarget, DAG);
11786
11787 assert((VT.is256BitVector() && Op.getNumOperands() == 2) ||
11788 (VT.is512BitVector() && (Op.getNumOperands() == 2 ||
11789 Op.getNumOperands() == 4)));
11790
11791 // AVX can use the vinsertf128 instruction to create 256-bit vectors
11792 // from two other 128-bit ones.
11793
11794 // 512-bit vector may contain 2 256-bit vectors or 4 128-bit vectors
11795 return LowerAVXCONCAT_VECTORS(Op, DAG, Subtarget);
11796}
11797
11798//===----------------------------------------------------------------------===//
11799// Vector shuffle lowering
11800//
11801// This is an experimental code path for lowering vector shuffles on x86. It is
11802// designed to handle arbitrary vector shuffles and blends, gracefully
11803// degrading performance as necessary. It works hard to recognize idiomatic
11804// shuffles and lower them to optimal instruction patterns without leaving
11805// a framework that allows reasonably efficient handling of all vector shuffle
11806// patterns.
11807//===----------------------------------------------------------------------===//
11808
11809/// Tiny helper function to identify a no-op mask.
11810///
11811/// This is a somewhat boring predicate function. It checks whether the mask
11812/// array input, which is assumed to be a single-input shuffle mask of the kind
11813/// used by the X86 shuffle instructions (not a fully general
11814 /// ShuffleVectorSDNode mask), requires any shuffles to occur. Both undef and an
11815/// in-place shuffle are 'no-op's.
11816static bool isNoopShuffleMask(ArrayRef<int> Mask) {
11817 for (int i = 0, Size = Mask.size(); i < Size; ++i) {
11818 assert(Mask[i] >= -1 && "Out of bound mask element!");
11819 if (Mask[i] >= 0 && Mask[i] != i)
11820 return false;
11821 }
11822 return true;
11823}
11824
11825/// Test whether there are elements crossing LaneSizeInBits lanes in this
11826/// shuffle mask.
11827///
11828/// X86 divides up its shuffles into in-lane and cross-lane shuffle operations
11829/// and we routinely test for these.
11830static bool isLaneCrossingShuffleMask(unsigned LaneSizeInBits,
11831 unsigned ScalarSizeInBits,
11832 ArrayRef<int> Mask) {
11833 assert(LaneSizeInBits && ScalarSizeInBits &&
11834 (LaneSizeInBits % ScalarSizeInBits) == 0 &&
11835 "Illegal shuffle lane size");
11836 int LaneSize = LaneSizeInBits / ScalarSizeInBits;
11837 int Size = Mask.size();
11838 for (int i = 0; i < Size; ++i)
11839 if (Mask[i] >= 0 && (Mask[i] % Size) / LaneSize != i / LaneSize)
11840 return true;
11841 return false;
11842}
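// Example (hypothetical masks): with 128-bit lanes and 32-bit scalars,
// LaneSize == 4, so for a v8f32 mask
//   {0, 1, 2, 3, 4, 5, 6, 7} every element stays in its own lane (returns false),
//   {4, 5, 6, 7, 0, 1, 2, 3} pulls e.g. Mask[0] == 4 from lane 1 into lane 0,
// so the second mask is reported as lane-crossing (returns true).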
11843
11844/// Test whether there are elements crossing 128-bit lanes in this
11845/// shuffle mask.
11846static bool is128BitLaneCrossingShuffleMask(MVT VT, ArrayRef<int> Mask) {
11847 return isLaneCrossingShuffleMask(128, VT.getScalarSizeInBits(), Mask);
11848}
11849
11850/// Test whether elements in each LaneSizeInBits lane in this shuffle mask come
11851/// from multiple lanes - this is different to isLaneCrossingShuffleMask to
11852/// better support 'repeated mask + lane permute' style shuffles.
11853static bool isMultiLaneShuffleMask(unsigned LaneSizeInBits,
11854 unsigned ScalarSizeInBits,
11855 ArrayRef<int> Mask) {
11856 assert(LaneSizeInBits && ScalarSizeInBits &&
11857 (LaneSizeInBits % ScalarSizeInBits) == 0 &&
11858 "Illegal shuffle lane size");
11859 int NumElts = Mask.size();
11860 int NumEltsPerLane = LaneSizeInBits / ScalarSizeInBits;
11861 int NumLanes = NumElts / NumEltsPerLane;
11862 if (NumLanes > 1) {
11863 for (int i = 0; i != NumLanes; ++i) {
11864 int SrcLane = -1;
11865 for (int j = 0; j != NumEltsPerLane; ++j) {
11866 int M = Mask[(i * NumEltsPerLane) + j];
11867 if (M < 0)
11868 continue;
11869 int Lane = (M % NumElts) / NumEltsPerLane;
11870 if (SrcLane >= 0 && SrcLane != Lane)
11871 return true;
11872 SrcLane = Lane;
11873 }
11874 }
11875 }
11876 return false;
11877}
11878
11879/// Test whether a shuffle mask is equivalent within each sub-lane.
11880///
11881/// This checks a shuffle mask to see if it is performing the same
11882/// lane-relative shuffle in each sub-lane. This trivially implies
11883/// that it is also not lane-crossing. It may however involve a blend from the
11884/// same lane of a second vector.
11885///
11886/// The specific repeated shuffle mask is populated in \p RepeatedMask, as it is
11887/// non-trivial to compute in the face of undef lanes. The representation is
11888/// suitable for use with existing 128-bit shuffles as entries from the second
11889/// vector have been remapped to [LaneSize, 2*LaneSize).
11890static bool isRepeatedShuffleMask(unsigned LaneSizeInBits, MVT VT,
11891 ArrayRef<int> Mask,
11892 SmallVectorImpl<int> &RepeatedMask) {
11893 auto LaneSize = LaneSizeInBits / VT.getScalarSizeInBits();
11894 RepeatedMask.assign(LaneSize, -1);
11895 int Size = Mask.size();
11896 for (int i = 0; i < Size; ++i) {
11897 assert(Mask[i] == SM_SentinelUndef || Mask[i] >= 0);
11898 if (Mask[i] < 0)
11899 continue;
11900 if ((Mask[i] % Size) / LaneSize != i / LaneSize)
11901 // This entry crosses lanes, so there is no way to model this shuffle.
11902 return false;
11903
11904 // Ok, handle the in-lane shuffles by detecting if and when they repeat.
11905 // Adjust second vector indices to start at LaneSize instead of Size.
11906 int LocalM = Mask[i] < Size ? Mask[i] % LaneSize
11907 : Mask[i] % LaneSize + LaneSize;
11908 if (RepeatedMask[i % LaneSize] < 0)
11909 // This is the first non-undef entry in this slot of a 128-bit lane.
11910 RepeatedMask[i % LaneSize] = LocalM;
11911 else if (RepeatedMask[i % LaneSize] != LocalM)
11912 // Found a mismatch with the repeated mask.
11913 return false;
11914 }
11915 return true;
11916}
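// Example (hypothetical v8f32 masks, 128-bit lanes, LaneSize == 4): the
// two-input mask {0, 8, 1, 9, 4, 12, 5, 13} performs the same lane-relative
// shuffle in both lanes, so this returns true with RepeatedMask == {0, 4, 1, 5}
// (second-vector entries remapped into [LaneSize, 2*LaneSize)). The mask
// {0, 8, 1, 9, 6, 14, 7, 15} fails because the upper lane selects different
// in-lane positions than the lower lane.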
11917
11918/// Test whether a shuffle mask is equivalent within each 128-bit lane.
11919static bool
11920is128BitLaneRepeatedShuffleMask(MVT VT, ArrayRef<int> Mask,
11921 SmallVectorImpl<int> &RepeatedMask) {
11922 return isRepeatedShuffleMask(128, VT, Mask, RepeatedMask);
11923}
11924
11925static bool
11926is128BitLaneRepeatedShuffleMask(MVT VT, ArrayRef<int> Mask) {
11927 SmallVector<int, 32> RepeatedMask;
11928 return isRepeatedShuffleMask(128, VT, Mask, RepeatedMask);
11929}
11930
11931/// Test whether a shuffle mask is equivalent within each 256-bit lane.
11932static bool
11933is256BitLaneRepeatedShuffleMask(MVT VT, ArrayRef<int> Mask,
11934 SmallVectorImpl<int> &RepeatedMask) {
11935 return isRepeatedShuffleMask(256, VT, Mask, RepeatedMask);
11936}
11937
11938/// Test whether a target shuffle mask is equivalent within each sub-lane.
11939/// Unlike isRepeatedShuffleMask we must respect SM_SentinelZero.
11940static bool isRepeatedTargetShuffleMask(unsigned LaneSizeInBits,
11941 unsigned EltSizeInBits,
11942 ArrayRef<int> Mask,
11943 SmallVectorImpl<int> &RepeatedMask) {
11944 int LaneSize = LaneSizeInBits / EltSizeInBits;
11945 RepeatedMask.assign(LaneSize, SM_SentinelUndef);
11946 int Size = Mask.size();
11947 for (int i = 0; i < Size; ++i) {
11948 assert(isUndefOrZero(Mask[i]) || (Mask[i] >= 0));
11949 if (Mask[i] == SM_SentinelUndef)
11950 continue;
11951 if (Mask[i] == SM_SentinelZero) {
11952 if (!isUndefOrZero(RepeatedMask[i % LaneSize]))
11953 return false;
11954 RepeatedMask[i % LaneSize] = SM_SentinelZero;
11955 continue;
11956 }
11957 if ((Mask[i] % Size) / LaneSize != i / LaneSize)
11958 // This entry crosses lanes, so there is no way to model this shuffle.
11959 return false;
11960
11961 // Handle the in-lane shuffles by detecting if and when they repeat. Adjust
11962 // later vector indices to start at multiples of LaneSize instead of Size.
11963 int LaneM = Mask[i] / Size;
11964 int LocalM = (Mask[i] % LaneSize) + (LaneM * LaneSize);
11965 if (RepeatedMask[i % LaneSize] == SM_SentinelUndef)
11966 // This is the first non-undef entry in this slot of a 128-bit lane.
11967 RepeatedMask[i % LaneSize] = LocalM;
11968 else if (RepeatedMask[i % LaneSize] != LocalM)
11969 // Found a mismatch with the repeated mask.
11970 return false;
11971 }
11972 return true;
11973}
11974
11975/// Test whether a target shuffle mask is equivalent within each sub-lane.
11976/// Unlike isRepeatedShuffleMask we must respect SM_SentinelZero.
11977static bool isRepeatedTargetShuffleMask(unsigned LaneSizeInBits, MVT VT,
11978 ArrayRef<int> Mask,
11979 SmallVectorImpl<int> &RepeatedMask) {
11980 return isRepeatedTargetShuffleMask(LaneSizeInBits, VT.getScalarSizeInBits(),
11981 Mask, RepeatedMask);
11982}
11983
11984/// Checks whether the vector elements referenced by two shuffle masks are
11985/// equivalent.
11986static bool IsElementEquivalent(int MaskSize, SDValue Op, SDValue ExpectedOp,
11987 int Idx, int ExpectedIdx) {
11988 assert(0 <= Idx && Idx < MaskSize && 0 <= ExpectedIdx &&
11989 ExpectedIdx < MaskSize && "Out of range element index");
11990 if (!Op || !ExpectedOp || Op.getOpcode() != ExpectedOp.getOpcode())
11991 return false;
11992
11993 switch (Op.getOpcode()) {
11994 case ISD::BUILD_VECTOR:
11995 // If the values are build vectors, we can look through them to find
11996 // equivalent inputs that make the shuffles equivalent.
11997 // TODO: Handle MaskSize != Op.getNumOperands()?
11998 if (MaskSize == (int)Op.getNumOperands() &&
11999 MaskSize == (int)ExpectedOp.getNumOperands())
12000 return Op.getOperand(Idx) == ExpectedOp.getOperand(ExpectedIdx);
12001 break;
12002 case X86ISD::VBROADCAST:
12003 case X86ISD::VBROADCAST_LOAD:
12004 // TODO: Handle MaskSize != Op.getValueType().getVectorNumElements()?
12005 return (Op == ExpectedOp &&
12006 (int)Op.getValueType().getVectorNumElements() == MaskSize);
12007 case X86ISD::HADD:
12008 case X86ISD::HSUB:
12009 case X86ISD::FHADD:
12010 case X86ISD::FHSUB:
12011 case X86ISD::PACKSS:
12012 case X86ISD::PACKUS:
12013 // HOP(X,X) can refer to the elt from the lower/upper half of a lane.
12014 // TODO: Handle MaskSize != NumElts?
12015 // TODO: Handle HOP(X,Y) vs HOP(Y,X) equivalence cases.
12016 if (Op == ExpectedOp && Op.getOperand(0) == Op.getOperand(1)) {
12017 MVT VT = Op.getSimpleValueType();
12018 int NumElts = VT.getVectorNumElements();
12019 if (MaskSize == NumElts) {
12020 int NumLanes = VT.getSizeInBits() / 128;
12021 int NumEltsPerLane = NumElts / NumLanes;
12022 int NumHalfEltsPerLane = NumEltsPerLane / 2;
12023 bool SameLane =
12024 (Idx / NumEltsPerLane) == (ExpectedIdx / NumEltsPerLane);
12025 bool SameElt =
12026 (Idx % NumHalfEltsPerLane) == (ExpectedIdx % NumHalfEltsPerLane);
12027 return SameLane && SameElt;
12028 }
12029 }
12030 break;
12031 }
12032
12033 return false;
12034}
12035
12036/// Checks whether a shuffle mask is equivalent to an explicit list of
12037/// arguments.
12038///
12039/// This is a fast way to test a shuffle mask against a fixed pattern:
12040///
12041 /// if (isShuffleEquivalent(Mask, {3, 2, 1, 0})) { ... }
12042///
12043/// It returns true if the mask is exactly as wide as the argument list, and
12044/// each element of the mask is either -1 (signifying undef) or the value given
12045/// in the argument.
12046static bool isShuffleEquivalent(ArrayRef<int> Mask, ArrayRef<int> ExpectedMask,
12047 SDValue V1 = SDValue(),
12048 SDValue V2 = SDValue()) {
12049 int Size = Mask.size();
12050 if (Size != (int)ExpectedMask.size())
12051 return false;
12052
12053 for (int i = 0; i < Size; ++i) {
12054 assert(Mask[i] >= -1 && "Out of bound mask element!");
12055 int MaskIdx = Mask[i];
12056 int ExpectedIdx = ExpectedMask[i];
12057 if (0 <= MaskIdx && MaskIdx != ExpectedIdx) {
12058 SDValue MaskV = MaskIdx < Size ? V1 : V2;
12059 SDValue ExpectedV = ExpectedIdx < Size ? V1 : V2;
12060 MaskIdx = MaskIdx < Size ? MaskIdx : (MaskIdx - Size);
12061 ExpectedIdx = ExpectedIdx < Size ? ExpectedIdx : (ExpectedIdx - Size);
12062 if (!IsElementEquivalent(Size, MaskV, ExpectedV, MaskIdx, ExpectedIdx))
12063 return false;
12064 }
12065 }
12066 return true;
12067}
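// Example usage (hypothetical): checking a v4f32 mask against the "duplicate
// the low 64 bits" pattern:
//   if (isShuffleEquivalent(Mask, {0, 1, 0, 1}, V1))
//     ...; // Mask is {0, 1, 0, 1} up to undef (-1) entries, or the elements
//          // it references are provably identical (e.g. equal BUILD_VECTOR
//          // operands), so the two shuffles are interchangeable.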
12068
12069/// Checks whether a target shuffle mask is equivalent to an explicit pattern.
12070///
12071/// The masks must be exactly the same width.
12072///
12073/// If an element in Mask matches SM_SentinelUndef (-1) then the corresponding
12074/// value in ExpectedMask is always accepted. Otherwise the indices must match.
12075///
12076/// SM_SentinelZero is accepted as a valid negative index but must match in
12077/// both, or via a known bits test.
12078static bool isTargetShuffleEquivalent(MVT VT, ArrayRef<int> Mask,
12079 ArrayRef<int> ExpectedMask,
12080 const SelectionDAG &DAG,
12081 SDValue V1 = SDValue(),
12082 SDValue V2 = SDValue()) {
12083 int Size = Mask.size();
12084 if (Size != (int)ExpectedMask.size())
12085 return false;
12086 assert(llvm::all_of(ExpectedMask,
12087 [Size](int M) { return isInRange(M, 0, 2 * Size); }) &&
12088 "Illegal target shuffle mask");
12089
12090 // Check for out-of-range target shuffle mask indices.
12091 if (!isUndefOrZeroOrInRange(Mask, 0, 2 * Size))
12092 return false;
12093
12094 // Don't use V1/V2 if they're not the same size as the shuffle mask type.
12095 if (V1 && V1.getValueSizeInBits() != VT.getSizeInBits())
12096 V1 = SDValue();
12097 if (V2 && V2.getValueSizeInBits() != VT.getSizeInBits())
12098 V2 = SDValue();
12099
12100 APInt ZeroV1 = APInt::getZero(Size);
12101 APInt ZeroV2 = APInt::getZero(Size);
12102
12103 for (int i = 0; i < Size; ++i) {
12104 int MaskIdx = Mask[i];
12105 int ExpectedIdx = ExpectedMask[i];
12106 if (MaskIdx == SM_SentinelUndef || MaskIdx == ExpectedIdx)
12107 continue;
12108 if (MaskIdx == SM_SentinelZero) {
12109 // If we need this expected index to be a zero element, then update the
12110 // relevant zero mask and perform the known bits at the end to minimize
12111 // repeated computes.
12112 SDValue ExpectedV = ExpectedIdx < Size ? V1 : V2;
12113 if (ExpectedV &&
12114 Size == (int)ExpectedV.getValueType().getVectorNumElements()) {
12115 int BitIdx = ExpectedIdx < Size ? ExpectedIdx : (ExpectedIdx - Size);
12116 APInt &ZeroMask = ExpectedIdx < Size ? ZeroV1 : ZeroV2;
12117 ZeroMask.setBit(BitIdx);
12118 continue;
12119 }
12120 }
12121 if (MaskIdx >= 0) {
12122 SDValue MaskV = MaskIdx < Size ? V1 : V2;
12123 SDValue ExpectedV = ExpectedIdx < Size ? V1 : V2;
12124 MaskIdx = MaskIdx < Size ? MaskIdx : (MaskIdx - Size);
12125 ExpectedIdx = ExpectedIdx < Size ? ExpectedIdx : (ExpectedIdx - Size);
12126 if (IsElementEquivalent(Size, MaskV, ExpectedV, MaskIdx, ExpectedIdx))
12127 continue;
12128 }
12129 return false;
12130 }
12131 return (ZeroV1.isZero() || DAG.MaskedVectorIsZero(V1, ZeroV1)) &&
12132 (ZeroV2.isZero() || DAG.MaskedVectorIsZero(V2, ZeroV2));
12133}
12134
12135// Check if the shuffle mask is suitable for the AVX vpunpcklwd or vpunpckhwd
12136// instructions.
12137static bool isUnpackWdShuffleMask(ArrayRef<int> Mask, MVT VT,
12138 const SelectionDAG &DAG) {
12139 if (VT != MVT::v8i32 && VT != MVT::v8f32)
12140 return false;
12141
12142 SmallVector<int, 8> Unpcklwd;
12143 createUnpackShuffleMask(MVT::v8i16, Unpcklwd, /* Lo = */ true,
12144 /* Unary = */ false);
12145 SmallVector<int, 8> Unpckhwd;
12146 createUnpackShuffleMask(MVT::v8i16, Unpckhwd, /* Lo = */ false,
12147 /* Unary = */ false);
12148 bool IsUnpackwdMask = (isTargetShuffleEquivalent(VT, Mask, Unpcklwd, DAG) ||
12149 isTargetShuffleEquivalent(VT, Mask, Unpckhwd, DAG));
12150 return IsUnpackwdMask;
12151}
12152
12153static bool is128BitUnpackShuffleMask(ArrayRef<int> Mask,
12154 const SelectionDAG &DAG) {
12155 // Create 128-bit vector type based on mask size.
12156 MVT EltVT = MVT::getIntegerVT(128 / Mask.size());
12157 MVT VT = MVT::getVectorVT(EltVT, Mask.size());
12158
12159 // We can't assume a canonical shuffle mask, so try the commuted version too.
12160 SmallVector<int, 4> CommutedMask(Mask);
12161 ShuffleVectorSDNode::commuteMask(CommutedMask);
12162
12163 // Match any of unary/binary or low/high.
12164 for (unsigned i = 0; i != 4; ++i) {
12165 SmallVector<int, 16> UnpackMask;
12166 createUnpackShuffleMask(VT, UnpackMask, (i >> 1) % 2, i % 2);
12167 if (isTargetShuffleEquivalent(VT, Mask, UnpackMask, DAG) ||
12168 isTargetShuffleEquivalent(VT, CommutedMask, UnpackMask, DAG))
12169 return true;
12170 }
12171 return false;
12172}
12173
12174/// Return true if a shuffle mask chooses elements identically in its top and
12175/// bottom halves. For example, any splat mask has the same top and bottom
12176/// halves. If an element is undefined in only one half of the mask, the halves
12177/// are not considered identical.
12178static bool hasIdenticalHalvesShuffleMask(ArrayRef<int> Mask) {
12179 assert(Mask.size() % 2 == 0 && "Expecting even number of elements in mask");
12180 unsigned HalfSize = Mask.size() / 2;
12181 for (unsigned i = 0; i != HalfSize; ++i) {
12182 if (Mask[i] != Mask[i + HalfSize])
12183 return false;
12184 }
12185 return true;
12186}
12187
12188/// Get a 4-lane 8-bit shuffle immediate for a mask.
12189///
12190/// This helper function produces an 8-bit shuffle immediate corresponding to
12191/// the ubiquitous shuffle encoding scheme used in x86 instructions for
12192/// shuffling 4 lanes. It can be used with most of the PSHUF instructions for
12193/// example.
12194///
12195/// NB: We rely heavily on "undef" masks preserving the input lane.
12196static unsigned getV4X86ShuffleImm(ArrayRef<int> Mask) {
12197 assert(Mask.size() == 4 && "Only 4-lane shuffle masks");
12198 assert(Mask[0] >= -1 && Mask[0] < 4 && "Out of bound mask element!");
12199 assert(Mask[1] >= -1 && Mask[1] < 4 && "Out of bound mask element!");
12200 assert(Mask[2] >= -1 && Mask[2] < 4 && "Out of bound mask element!");
12201 assert(Mask[3] >= -1 && Mask[3] < 4 && "Out of bound mask element!");
12202
12203 // If the mask only uses one non-undef element, then fully 'splat' it to
12204 // improve later broadcast matching.
12205 int FirstIndex = find_if(Mask, [](int M) { return M >= 0; }) - Mask.begin();
12206 assert(0 <= FirstIndex && FirstIndex < 4 && "All undef shuffle mask");
12207
12208 int FirstElt = Mask[FirstIndex];
12209 if (all_of(Mask, [FirstElt](int M) { return M < 0 || M == FirstElt; }))
12210 return (FirstElt << 6) | (FirstElt << 4) | (FirstElt << 2) | FirstElt;
12211
12212 unsigned Imm = 0;
12213 Imm |= (Mask[0] < 0 ? 0 : Mask[0]) << 0;
12214 Imm |= (Mask[1] < 0 ? 1 : Mask[1]) << 2;
12215 Imm |= (Mask[2] < 0 ? 2 : Mask[2]) << 4;
12216 Imm |= (Mask[3] < 0 ? 3 : Mask[3]) << 6;
12217 return Imm;
12218}
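// Worked example (mask values chosen for illustration): Mask = {2, 1, 3, 0}
// packs two bits per lane, lowest lane first:
//   Imm = 2 | (1 << 2) | (3 << 4) | (0 << 6) == 0x36
// while a mostly-undef mask such as {-1, 2, -1, -1} is widened to the full
// splat immediate (2 << 6) | (2 << 4) | (2 << 2) | 2 == 0xAA to help later
// broadcast matching.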
12219
12220static SDValue getV4X86ShuffleImm8ForMask(ArrayRef<int> Mask, const SDLoc &DL,
12221 SelectionDAG &DAG) {
12222 return DAG.getTargetConstant(getV4X86ShuffleImm(Mask), DL, MVT::i8);
12223}
12224
12225 // The shuffle result has the form:
12226 // 0*a[0], 0*a[1], ..., 0*a[n], n >= 0, where the a[] elements are in ascending order.
12227 // Each element of Zeroable corresponds to a particular element of Mask,
12228 // as described in the computeZeroableShuffleElements function.
12229 //
12230 // The function looks for a sub-mask whose nonzero elements are in
12231 // increasing order. If such a sub-mask exists, the function returns true.
12232static bool isNonZeroElementsInOrder(const APInt &Zeroable,
12233 ArrayRef<int> Mask, const EVT &VectorType,
12234 bool &IsZeroSideLeft) {
12235 int NextElement = -1;
12236 // Check if the Mask's nonzero elements are in increasing order.
12237 for (int i = 0, e = Mask.size(); i < e; i++) {
12238 // Checks if the mask's zeros elements are built from only zeros.
12239 assert(Mask[i] >= -1 && "Out of bound mask element!");
12240 if (Mask[i] < 0)
12241 return false;
12242 if (Zeroable[i])
12243 continue;
12244 // Find the lowest non zero element
12245 if (NextElement < 0) {
12246 NextElement = Mask[i] != 0 ? VectorType.getVectorNumElements() : 0;
12247 IsZeroSideLeft = NextElement != 0;
12248 }
12249 // Exit if the mask's non zero elements are not in increasing order.
12250 if (NextElement != Mask[i])
12251 return false;
12252 NextElement++;
12253 }
12254 return true;
12255}
12256
12257/// Try to lower a shuffle with a single PSHUFB of V1 or V2.
12258static SDValue lowerShuffleWithPSHUFB(const SDLoc &DL, MVT VT,
12259 ArrayRef<int> Mask, SDValue V1,
12260 SDValue V2, const APInt &Zeroable,
12261 const X86Subtarget &Subtarget,
12262 SelectionDAG &DAG) {
12263 int Size = Mask.size();
12264 int LaneSize = 128 / VT.getScalarSizeInBits();
12265 const int NumBytes = VT.getSizeInBits() / 8;
12266 const int NumEltBytes = VT.getScalarSizeInBits() / 8;
12267
12268 assert((Subtarget.hasSSSE3() && VT.is128BitVector()) ||
12269 (Subtarget.hasAVX2() && VT.is256BitVector()) ||
12270 (Subtarget.hasBWI() && VT.is512BitVector()));
12271
12272 SmallVector<SDValue, 64> PSHUFBMask(NumBytes);
12273 // Sign bit set in i8 mask means zero element.
12274 SDValue ZeroMask = DAG.getConstant(0x80, DL, MVT::i8);
12275
12276 SDValue V;
12277 for (int i = 0; i < NumBytes; ++i) {
12278 int M = Mask[i / NumEltBytes];
12279 if (M < 0) {
12280 PSHUFBMask[i] = DAG.getUNDEF(MVT::i8);
12281 continue;
12282 }
12283 if (Zeroable[i / NumEltBytes]) {
12284 PSHUFBMask[i] = ZeroMask;
12285 continue;
12286 }
12287
12288 // We can only use a single input of V1 or V2.
12289 SDValue SrcV = (M >= Size ? V2 : V1);
12290 if (V && V != SrcV)
12291 return SDValue();
12292 V = SrcV;
12293 M %= Size;
12294
12295 // PSHUFB can't cross lanes, ensure this doesn't happen.
12296 if ((M / LaneSize) != ((i / NumEltBytes) / LaneSize))
12297 return SDValue();
12298
12299 M = M % LaneSize;
12300 M = M * NumEltBytes + (i % NumEltBytes);
12301 PSHUFBMask[i] = DAG.getConstant(M, DL, MVT::i8);
12302 }
12303 assert(V && "Failed to find a source input");
12304
12305 MVT I8VT = MVT::getVectorVT(MVT::i8, NumBytes);
12306 return DAG.getBitcast(
12307 VT, DAG.getNode(X86ISD::PSHUFB, DL, I8VT, DAG.getBitcast(I8VT, V),
12308 DAG.getBuildVector(I8VT, DL, PSHUFBMask)));
12309}
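// Illustrative example (assumed inputs): a single-source v8i16 shuffle with
// Mask = {4, 5, 6, 7, 0, 1, 2, 3} on SSSE3. NumEltBytes == 2, so each mask
// element expands into the two byte selectors M*2 and M*2+1, producing the
// PSHUFB byte mask {8,9,10,11,12,13,14,15,0,1,2,3,4,5,6,7}; zeroable elements
// would instead receive the 0x80 "zero" byte and undef elements an undef byte.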
12310
12311static SDValue getMaskNode(SDValue Mask, MVT MaskVT,
12312 const X86Subtarget &Subtarget, SelectionDAG &DAG,
12313 const SDLoc &dl);
12314
12315 // X86 has a dedicated shuffle that can be lowered to VEXPAND
12316static SDValue lowerShuffleToEXPAND(const SDLoc &DL, MVT VT,
12317 const APInt &Zeroable,
12318 ArrayRef<int> Mask, SDValue &V1,
12319 SDValue &V2, SelectionDAG &DAG,
12320 const X86Subtarget &Subtarget) {
12321 bool IsLeftZeroSide = true;
12322 if (!isNonZeroElementsInOrder(Zeroable, Mask, V1.getValueType(),
12323 IsLeftZeroSide))
12324 return SDValue();
12325 unsigned VEXPANDMask = (~Zeroable).getZExtValue();
12326 MVT IntegerType =
12327 MVT::getIntegerVT(std::max((int)VT.getVectorNumElements(), 8));
12328 SDValue MaskNode = DAG.getConstant(VEXPANDMask, DL, IntegerType);
12329 unsigned NumElts = VT.getVectorNumElements();
12330 assert((NumElts == 4 || NumElts == 8 || NumElts == 16) &&
12331 "Unexpected number of vector elements");
12332 SDValue VMask = getMaskNode(MaskNode, MVT::getVectorVT(MVT::i1, NumElts),
12333 Subtarget, DAG, DL);
12334 SDValue ZeroVector = getZeroVector(VT, Subtarget, DAG, DL);
12335 SDValue ExpandedVector = IsLeftZeroSide ? V2 : V1;
12336 return DAG.getNode(X86ISD::EXPAND, DL, VT, ExpandedVector, ZeroVector, VMask);
12337}
12338
12339static bool matchShuffleWithUNPCK(MVT VT, SDValue &V1, SDValue &V2,
12340 unsigned &UnpackOpcode, bool IsUnary,
12341 ArrayRef<int> TargetMask, const SDLoc &DL,
12342 SelectionDAG &DAG,
12343 const X86Subtarget &Subtarget) {
12344 int NumElts = VT.getVectorNumElements();
12345
12346 bool Undef1 = true, Undef2 = true, Zero1 = true, Zero2 = true;
12347 for (int i = 0; i != NumElts; i += 2) {
12348 int M1 = TargetMask[i + 0];
12349 int M2 = TargetMask[i + 1];
12350 Undef1 &= (SM_SentinelUndef == M1);
12351 Undef2 &= (SM_SentinelUndef == M2);
12352 Zero1 &= isUndefOrZero(M1);
12353 Zero2 &= isUndefOrZero(M2);
12354 }
12355 assert(!((Undef1 || Zero1) && (Undef2 || Zero2)) &&
12356 "Zeroable shuffle detected");
12357
12358 // Attempt to match the target mask against the unpack lo/hi mask patterns.
12359 SmallVector<int, 64> Unpckl, Unpckh;
12360 createUnpackShuffleMask(VT, Unpckl, /* Lo = */ true, IsUnary);
12361 if (isTargetShuffleEquivalent(VT, TargetMask, Unpckl, DAG, V1,
12362 (IsUnary ? V1 : V2))) {
12363 UnpackOpcode = X86ISD::UNPCKL;
12364 V2 = (Undef2 ? DAG.getUNDEF(VT) : (IsUnary ? V1 : V2));
12365 V1 = (Undef1 ? DAG.getUNDEF(VT) : V1);
12366 return true;
12367 }
12368
12369 createUnpackShuffleMask(VT, Unpckh, /* Lo = */ false, IsUnary);
12370 if (isTargetShuffleEquivalent(VT, TargetMask, Unpckh, DAG, V1,
12371 (IsUnary ? V1 : V2))) {
12372 UnpackOpcode = X86ISD::UNPCKH;
12373 V2 = (Undef2 ? DAG.getUNDEF(VT) : (IsUnary ? V1 : V2));
12374 V1 = (Undef1 ? DAG.getUNDEF(VT) : V1);
12375 return true;
12376 }
12377
12378 // If this is a unary shuffle, attempt to match as an unpack lo/hi with zero.
12379 if (IsUnary && (Zero1 || Zero2)) {
12380 // Don't bother if we can blend instead.
12381 if ((Subtarget.hasSSE41() || VT == MVT::v2i64 || VT == MVT::v2f64) &&
12382 isSequentialOrUndefOrZeroInRange(TargetMask, 0, NumElts, 0))
12383 return false;
12384
12385 bool MatchLo = true, MatchHi = true;
12386 for (int i = 0; (i != NumElts) && (MatchLo || MatchHi); ++i) {
12387 int M = TargetMask[i];
12388
12389 // Ignore if the input is known to be zero or the index is undef.
12390 if ((((i & 1) == 0) && Zero1) || (((i & 1) == 1) && Zero2) ||
12391 (M == SM_SentinelUndef))
12392 continue;
12393
12394 MatchLo &= (M == Unpckl[i]);
12395 MatchHi &= (M == Unpckh[i]);
12396 }
12397
12398 if (MatchLo || MatchHi) {
12399 UnpackOpcode = MatchLo ? X86ISD::UNPCKL : X86ISD::UNPCKH;
12400 V2 = Zero2 ? getZeroVector(VT, Subtarget, DAG, DL) : V1;
12401 V1 = Zero1 ? getZeroVector(VT, Subtarget, DAG, DL) : V1;
12402 return true;
12403 }
12404 }
12405
12406 // If a binary shuffle, commute and try again.
12407 if (!IsUnary) {
12408 ShuffleVectorSDNode::commuteMask(Unpckl);
12409 if (isTargetShuffleEquivalent(VT, TargetMask, Unpckl, DAG)) {
12410 UnpackOpcode = X86ISD::UNPCKL;
12411 std::swap(V1, V2);
12412 return true;
12413 }
12414
12415 ShuffleVectorSDNode::commuteMask(Unpckh);
12416 if (isTargetShuffleEquivalent(VT, TargetMask, Unpckh, DAG)) {
12417 UnpackOpcode = X86ISD::UNPCKH;
12418 std::swap(V1, V2);
12419 return true;
12420 }
12421 }
12422
12423 return false;
12424}
12425
12426// X86 has dedicated unpack instructions that can handle specific blend
12427// operations: UNPCKH and UNPCKL.
12428static SDValue lowerShuffleWithUNPCK(const SDLoc &DL, MVT VT,
12429 ArrayRef<int> Mask, SDValue V1, SDValue V2,
12430 SelectionDAG &DAG) {
12431 SmallVector<int, 8> Unpckl;
12432 createUnpackShuffleMask(VT, Unpckl, /* Lo = */ true, /* Unary = */ false);
12433 if (isShuffleEquivalent(Mask, Unpckl, V1, V2))
12434 return DAG.getNode(X86ISD::UNPCKL, DL, VT, V1, V2);
12435
12436 SmallVector<int, 8> Unpckh;
12437 createUnpackShuffleMask(VT, Unpckh, /* Lo = */ false, /* Unary = */ false);
12438 if (isShuffleEquivalent(Mask, Unpckh, V1, V2))
12439 return DAG.getNode(X86ISD::UNPCKH, DL, VT, V1, V2);
12440
12441 // Commute and try again.
12442 ShuffleVectorSDNode::commuteMask(Unpckl);
12443 if (isShuffleEquivalent(Mask, Unpckl, V1, V2))
12444 return DAG.getNode(X86ISD::UNPCKL, DL, VT, V2, V1);
12445
12446 ShuffleVectorSDNode::commuteMask(Unpckh);
12447 if (isShuffleEquivalent(Mask, Unpckh, V1, V2))
12448 return DAG.getNode(X86ISD::UNPCKH, DL, VT, V2, V1);
12449
12450 return SDValue();
12451}
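// Example (hypothetical v4i32 mask): {0, 4, 1, 5} matches the binary unpack-lo
// pattern produced by createUnpackShuffleMask and lowers to UNPCKL(V1, V2);
// the commuted mask {4, 0, 5, 1} is caught by the retry with the commuted
// pattern and lowers to UNPCKL(V2, V1).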
12452
12453/// Check if the mask can be mapped to a preliminary shuffle (vperm 64-bit)
12454/// followed by unpack 256-bit.
12455static SDValue lowerShuffleWithUNPCK256(const SDLoc &DL, MVT VT,
12456 ArrayRef<int> Mask, SDValue V1,
12457 SDValue V2, SelectionDAG &DAG) {
12458 SmallVector<int, 32> Unpckl, Unpckh;
12459 createSplat2ShuffleMask(VT, Unpckl, /* Lo */ true);
12460 createSplat2ShuffleMask(VT, Unpckh, /* Lo */ false);
12461
12462 unsigned UnpackOpcode;
12463 if (isShuffleEquivalent(Mask, Unpckl, V1, V2))
12464 UnpackOpcode = X86ISD::UNPCKL;
12465 else if (isShuffleEquivalent(Mask, Unpckh, V1, V2))
12466 UnpackOpcode = X86ISD::UNPCKH;
12467 else
12468 return SDValue();
12469
12470 // This is a "natural" unpack operation (rather than the 128-bit sectored
12471 // operation implemented by AVX). We need to rearrange 64-bit chunks of the
12472 // input in order to use the x86 instruction.
12473 V1 = DAG.getVectorShuffle(MVT::v4f64, DL, DAG.getBitcast(MVT::v4f64, V1),
12474 DAG.getUNDEF(MVT::v4f64), {0, 2, 1, 3});
12475 V1 = DAG.getBitcast(VT, V1);
12476 return DAG.getNode(UnpackOpcode, DL, VT, V1, V1);
12477}
12478
12479// Check if the mask can be mapped to a TRUNCATE or VTRUNC, truncating the
12480// source into the lower elements and zeroing the upper elements.
12481static bool matchShuffleAsVTRUNC(MVT &SrcVT, MVT &DstVT, MVT VT,
12482 ArrayRef<int> Mask, const APInt &Zeroable,
12483 const X86Subtarget &Subtarget) {
12484 if (!VT.is512BitVector() && !Subtarget.hasVLX())
12485 return false;
12486
12487 unsigned NumElts = Mask.size();
12488 unsigned EltSizeInBits = VT.getScalarSizeInBits();
12489 unsigned MaxScale = 64 / EltSizeInBits;
12490
12491 for (unsigned Scale = 2; Scale <= MaxScale; Scale += Scale) {
12492 unsigned SrcEltBits = EltSizeInBits * Scale;
12493 if (SrcEltBits < 32 && !Subtarget.hasBWI())
12494 continue;
12495 unsigned NumSrcElts = NumElts / Scale;
12496 if (!isSequentialOrUndefInRange(Mask, 0, NumSrcElts, 0, Scale))
12497 continue;
12498 unsigned UpperElts = NumElts - NumSrcElts;
12499 if (!Zeroable.extractBits(UpperElts, NumSrcElts).isAllOnes())
12500 continue;
12501 SrcVT = MVT::getIntegerVT(EltSizeInBits * Scale);
12502 SrcVT = MVT::getVectorVT(SrcVT, NumSrcElts);
12503 DstVT = MVT::getIntegerVT(EltSizeInBits);
12504 if ((NumSrcElts * EltSizeInBits) >= 128) {
12505 // ISD::TRUNCATE
12506 DstVT = MVT::getVectorVT(DstVT, NumSrcElts);
12507 } else {
12508 // X86ISD::VTRUNC
12509 DstVT = MVT::getVectorVT(DstVT, 128 / EltSizeInBits);
12510 }
12511 return true;
12512 }
12513
12514 return false;
12515}
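// Illustrative match (mask and zeroable bits assumed): a v16i8 shuffle (VLX is
// required for this 128-bit type) with Mask = {0, 2, 4, 6, 8, 10, 12, 14, ...}
// whose upper eight elements are all zeroable succeeds at Scale == 2 (BWI is
// needed since SrcEltBits == 16): SrcVT becomes v8i16 and, because the 8
// result elements only cover 64 bits, DstVT is widened to v16i8 for the
// X86ISD::VTRUNC form, i.e. truncate into the low half and zero the rest.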
12516
12517// Helper to create TRUNCATE/VTRUNC nodes, optionally with zero/undef upper
12518// element padding to the final DstVT.
12519static SDValue getAVX512TruncNode(const SDLoc &DL, MVT DstVT, SDValue Src,
12520 const X86Subtarget &Subtarget,
12521 SelectionDAG &DAG, bool ZeroUppers) {
12522 MVT SrcVT = Src.getSimpleValueType();
12523 MVT DstSVT = DstVT.getScalarType();
12524 unsigned NumDstElts = DstVT.getVectorNumElements();
12525 unsigned NumSrcElts = SrcVT.getVectorNumElements();
12526 unsigned DstEltSizeInBits = DstVT.getScalarSizeInBits();
12527
12528 if (!DAG.getTargetLoweringInfo().isTypeLegal(SrcVT))
12529 return SDValue();
12530
12531 // Perform a direct ISD::TRUNCATE if possible.
12532 if (NumSrcElts == NumDstElts)
12533 return DAG.getNode(ISD::TRUNCATE, DL, DstVT, Src);
12534
12535 if (NumSrcElts > NumDstElts) {
12536 MVT TruncVT = MVT::getVectorVT(DstSVT, NumSrcElts);
12537 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, TruncVT, Src);
12538 return extractSubVector(Trunc, 0, DAG, DL, DstVT.getSizeInBits());
12539 }
12540
12541 if ((NumSrcElts * DstEltSizeInBits) >= 128) {
12542 MVT TruncVT = MVT::getVectorVT(DstSVT, NumSrcElts);
12543 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, TruncVT, Src);
12544 return widenSubVector(Trunc, ZeroUppers, Subtarget, DAG, DL,
12545 DstVT.getSizeInBits());
12546 }
12547
12548 // Non-VLX targets must truncate from a 512-bit type, so we need to
12549 // widen, truncate and then possibly extract the original subvector.
12550 if (!Subtarget.hasVLX() && !SrcVT.is512BitVector()) {
12551 SDValue NewSrc = widenSubVector(Src, ZeroUppers, Subtarget, DAG, DL, 512);
12552 return getAVX512TruncNode(DL, DstVT, NewSrc, Subtarget, DAG, ZeroUppers);
12553 }
12554
12555 // Fallback to a X86ISD::VTRUNC, padding if necessary.
12556 MVT TruncVT = MVT::getVectorVT(DstSVT, 128 / DstEltSizeInBits);
12557 SDValue Trunc = DAG.getNode(X86ISD::VTRUNC, DL, TruncVT, Src);
12558 if (DstVT != TruncVT)
12559 Trunc = widenSubVector(Trunc, ZeroUppers, Subtarget, DAG, DL,
12560 DstVT.getSizeInBits());
12561 return Trunc;
12562}
12563
12564// Try to lower trunc+vector_shuffle to a vpmovdb or a vpmovdw instruction.
12565//
12566// An example is the following:
12567//
12568// t0: ch = EntryToken
12569// t2: v4i64,ch = CopyFromReg t0, Register:v4i64 %0
12570// t25: v4i32 = truncate t2
12571// t41: v8i16 = bitcast t25
12572// t21: v8i16 = BUILD_VECTOR undef:i16, undef:i16, undef:i16, undef:i16,
12573// Constant:i16<0>, Constant:i16<0>, Constant:i16<0>, Constant:i16<0>
12574// t51: v8i16 = vector_shuffle<0,2,4,6,12,13,14,15> t41, t21
12575// t18: v2i64 = bitcast t51
12576//
12577 // A single vpmovdw instruction suffices; without avx512vl we need to use the
12578 // zmm variant and extract the lower subvector, padding with zeroes.
12579// TODO: Merge with lowerShuffleAsVTRUNC.
12580static SDValue lowerShuffleWithVPMOV(const SDLoc &DL, MVT VT, SDValue V1,
12581 SDValue V2, ArrayRef<int> Mask,
12582 const APInt &Zeroable,
12583 const X86Subtarget &Subtarget,
12584 SelectionDAG &DAG) {
12585 assert((VT == MVT::v16i8 || VT == MVT::v8i16) && "Unexpected VTRUNC type");
12586 if (!Subtarget.hasAVX512())
12587 return SDValue();
12588
12589 unsigned NumElts = VT.getVectorNumElements();
12590 unsigned EltSizeInBits = VT.getScalarSizeInBits();
12591 unsigned MaxScale = 64 / EltSizeInBits;
12592 for (unsigned Scale = 2; Scale <= MaxScale; Scale += Scale) {
12593 unsigned SrcEltBits = EltSizeInBits * Scale;
12594 unsigned NumSrcElts = NumElts / Scale;
12595 unsigned UpperElts = NumElts - NumSrcElts;
12596 if (!isSequentialOrUndefInRange(Mask, 0, NumSrcElts, 0, Scale) ||
12597 !Zeroable.extractBits(UpperElts, NumSrcElts).isAllOnes())
12598 continue;
12599
12600 // Attempt to find a matching source truncation, but as a fallback VLX
12601 // targets can use the VPMOV directly.
12602 SDValue Src = peekThroughBitcasts(V1);
12603 if (Src.getOpcode() == ISD::TRUNCATE &&
12604 Src.getScalarValueSizeInBits() == SrcEltBits) {
12605 Src = Src.getOperand(0);
12606 } else if (Subtarget.hasVLX()) {
12607 MVT SrcSVT = MVT::getIntegerVT(SrcEltBits);
12608 MVT SrcVT = MVT::getVectorVT(SrcSVT, NumSrcElts);
12609 Src = DAG.getBitcast(SrcVT, Src);
12610 // Don't do this if PACKSS/PACKUS could perform it cheaper.
12611 if (Scale == 2 &&
12612 ((DAG.ComputeNumSignBits(Src) > EltSizeInBits) ||
12613 (DAG.computeKnownBits(Src).countMinLeadingZeros() >= EltSizeInBits)))
12614 return SDValue();
12615 } else
12616 return SDValue();
12617
12618 // VPMOVWB is only available with avx512bw.
12619 if (!Subtarget.hasBWI() && Src.getScalarValueSizeInBits() < 32)
12620 return SDValue();
12621
12622 bool UndefUppers = isUndefInRange(Mask, NumSrcElts, UpperElts);
12623 return getAVX512TruncNode(DL, VT, Src, Subtarget, DAG, !UndefUppers);
12624 }
12625
12626 return SDValue();
12627}
12628
12629// Attempt to match binary shuffle patterns as a truncate.
12630static SDValue lowerShuffleAsVTRUNC(const SDLoc &DL, MVT VT, SDValue V1,
12631 SDValue V2, ArrayRef<int> Mask,
12632 const APInt &Zeroable,
12633 const X86Subtarget &Subtarget,
12634 SelectionDAG &DAG) {
12635 assert((VT.is128BitVector() || VT.is256BitVector()) &&
12636 "Unexpected VTRUNC type");
12637 if (!Subtarget.hasAVX512())
12638 return SDValue();
12639
12640 unsigned NumElts = VT.getVectorNumElements();
12641 unsigned EltSizeInBits = VT.getScalarSizeInBits();
12642 unsigned MaxScale = 64 / EltSizeInBits;
12643 for (unsigned Scale = 2; Scale <= MaxScale; Scale += Scale) {
12644 // TODO: Support non-BWI VPMOVWB truncations?
12645 unsigned SrcEltBits = EltSizeInBits * Scale;
12646 if (SrcEltBits < 32 && !Subtarget.hasBWI())
12647 continue;
12648
12649 // Match shuffle <Ofs,Ofs+Scale,Ofs+2*Scale,..,undef_or_zero,undef_or_zero>
12650 // Bail if the V2 elements are undef.
12651 unsigned NumHalfSrcElts = NumElts / Scale;
12652 unsigned NumSrcElts = 2 * NumHalfSrcElts;
12653 for (unsigned Offset = 0; Offset != Scale; ++Offset) {
12654 if (!isSequentialOrUndefInRange(Mask, 0, NumSrcElts, Offset, Scale) ||
12655 isUndefInRange(Mask, NumHalfSrcElts, NumHalfSrcElts))
12656 continue;
12657
12658 // The elements beyond the truncation must be undef/zero.
12659 unsigned UpperElts = NumElts - NumSrcElts;
12660 if (UpperElts > 0 &&
12661 !Zeroable.extractBits(UpperElts, NumSrcElts).isAllOnes())
12662 continue;
12663 bool UndefUppers =
12664 UpperElts > 0 && isUndefInRange(Mask, NumSrcElts, UpperElts);
12665
12666 // For offset truncations, ensure that the concat is cheap.
12667 if (Offset) {
12668 auto IsCheapConcat = [&](SDValue Lo, SDValue Hi) {
12669 if (Lo.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
12670 Hi.getOpcode() == ISD::EXTRACT_SUBVECTOR)
12671 return Lo.getOperand(0) == Hi.getOperand(0);
12672 if (ISD::isNormalLoad(Lo.getNode()) &&
12673 ISD::isNormalLoad(Hi.getNode())) {
12674 auto *LDLo = cast<LoadSDNode>(Lo);
12675 auto *LDHi = cast<LoadSDNode>(Hi);
12676 return DAG.areNonVolatileConsecutiveLoads(
12677 LDHi, LDLo, Lo.getValueType().getStoreSize(), 1);
12678 }
12679 return false;
12680 };
12681 if (!IsCheapConcat(V1, V2))
12682 continue;
12683 }
12684
12685 // As we're using both sources, we need to concat them together and
12686 // truncate from the double-sized src.
12687 MVT ConcatVT = MVT::getVectorVT(VT.getScalarType(), NumElts * 2);
12688 SDValue Src = DAG.getNode(ISD::CONCAT_VECTORS, DL, ConcatVT, V1, V2);
12689
12690 MVT SrcSVT = MVT::getIntegerVT(SrcEltBits);
12691 MVT SrcVT = MVT::getVectorVT(SrcSVT, NumSrcElts);
12692 Src = DAG.getBitcast(SrcVT, Src);
12693
12694 // Shift the offset'd elements into place for the truncation.
12695 // TODO: Use getTargetVShiftByConstNode.
12696 if (Offset)
12697 Src = DAG.getNode(
12698 X86ISD::VSRLI, DL, SrcVT, Src,
12699 DAG.getTargetConstant(Offset * EltSizeInBits, DL, MVT::i8));
12700
12701 return getAVX512TruncNode(DL, VT, Src, Subtarget, DAG, !UndefUppers);
12702 }
12703 }
12704
12705 return SDValue();
12706}
12707
12708/// Check whether a compaction lowering can be done by dropping even/odd
12709/// elements and compute how many times even/odd elements must be dropped.
12710///
12711/// This handles shuffles which take every Nth element where N is a power of
12712/// two. Example shuffle masks:
12713///
12714/// (even)
12715/// N = 1: 0, 2, 4, 6, 8, 10, 12, 14, 0, 2, 4, 6, 8, 10, 12, 14
12716/// N = 1: 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30
12717/// N = 2: 0, 4, 8, 12, 0, 4, 8, 12, 0, 4, 8, 12, 0, 4, 8, 12
12718/// N = 2: 0, 4, 8, 12, 16, 20, 24, 28, 0, 4, 8, 12, 16, 20, 24, 28
12719/// N = 3: 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8
12720/// N = 3: 0, 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24
12721///
12722/// (odd)
12723/// N = 1: 1, 3, 5, 7, 9, 11, 13, 15, 0, 2, 4, 6, 8, 10, 12, 14
12724/// N = 1: 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31
12725///
12726/// Any of these lanes can of course be undef.
12727///
12728/// This routine only supports N <= 3.
12729/// FIXME: Evaluate whether either AVX or AVX-512 have any opportunities here
12730/// for larger N.
12731///
12732/// \returns N above, or the number of times even/odd elements must be dropped
12733/// if there is such a number. Otherwise returns zero.
12734static int canLowerByDroppingElements(ArrayRef<int> Mask, bool MatchEven,
12735 bool IsSingleInput) {
12736 // The modulus for the shuffle vector entries is based on whether this is
12737 // a single input or not.
12738 int ShuffleModulus = Mask.size() * (IsSingleInput ? 1 : 2);
12739 assert(isPowerOf2_32((uint32_t)ShuffleModulus) &&
12740 "We should only be called with masks with a power-of-2 size!");
12741
12742 uint64_t ModMask = (uint64_t)ShuffleModulus - 1;
12743 int Offset = MatchEven ? 0 : 1;
12744
12745 // We track whether the input is viable for all power-of-2 strides 2^1, 2^2,
12746 // and 2^3 simultaneously. This is because we may have ambiguity with
12747 // partially undef inputs.
12748 bool ViableForN[3] = {true, true, true};
12749
12750 for (int i = 0, e = Mask.size(); i < e; ++i) {
12751 // Ignore undef lanes, we'll optimistically collapse them to the pattern we
12752 // want.
12753 if (Mask[i] < 0)
12754 continue;
12755
12756 bool IsAnyViable = false;
12757 for (unsigned j = 0; j != std::size(ViableForN); ++j)
12758 if (ViableForN[j]) {
12759 uint64_t N = j + 1;
12760
12761 // The shuffle mask must be equal to (i * 2^N) % M.
12762 if ((uint64_t)(Mask[i] - Offset) == (((uint64_t)i << N) & ModMask))
12763 IsAnyViable = true;
12764 else
12765 ViableForN[j] = false;
12766 }
12767 // Early exit if we exhaust the possible powers of two.
12768 if (!IsAnyViable)
12769 break;
12770 }
12771
12772 for (unsigned j = 0; j != std::size(ViableForN); ++j)
12773 if (ViableForN[j])
12774 return j + 1;
12775
12776 // Return 0 as there is no viable power of two.
12777 return 0;
12778}
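For illustration, a minimal standalone sketch of the same viability test (the helper name isDropCompactionMask is invented and is not part of this file; it also drops the simultaneous tracking of all N that the code above keeps for partially-undef masks). It returns the smallest N in [1,3] for which every defined mask entry i equals ((i << N) + Offset) modulo the shuffle modulus, or 0 if none fits.

#include <cstdint>
#include <vector>

// Sketch only: mirrors the check above without the simultaneous-N tracking.
static int isDropCompactionMask(const std::vector<int> &Mask, bool MatchEven,
                                bool IsSingleInput) {
  uint64_t Modulus = Mask.size() * (IsSingleInput ? 1 : 2);
  uint64_t ModMask = Modulus - 1; // Mask size is assumed to be a power of 2.
  int Offset = MatchEven ? 0 : 1;
  for (int N = 1; N <= 3; ++N) {
    bool Viable = true;
    for (int i = 0, e = (int)Mask.size(); i != e && Viable; ++i)
      if (Mask[i] >= 0)
        Viable = (uint64_t)(Mask[i] - Offset) == (((uint64_t)i << N) & ModMask);
    if (Viable)
      return N;
  }
  return 0;
}
// e.g. {0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30} with MatchEven=true and
// IsSingleInput=false yields N = 1.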
12779
12780// X86 has dedicated pack instructions that can handle specific truncation
12781// operations: PACKSS and PACKUS.
12782// Checks for compaction shuffle masks if MaxStages > 1.
12783// TODO: Add support for matching multiple PACKSS/PACKUS stages.
12784static bool matchShuffleWithPACK(MVT VT, MVT &SrcVT, SDValue &V1, SDValue &V2,
12785 unsigned &PackOpcode, ArrayRef<int> TargetMask,
12786 const SelectionDAG &DAG,
12787 const X86Subtarget &Subtarget,
12788 unsigned MaxStages = 1) {
12789 unsigned NumElts = VT.getVectorNumElements();
12790 unsigned BitSize = VT.getScalarSizeInBits();
12791 assert(0 < MaxStages && MaxStages <= 3 && (BitSize << MaxStages) <= 64 &&
12792 "Illegal maximum compaction");
12793
12794 auto MatchPACK = [&](SDValue N1, SDValue N2, MVT PackVT) {
12795 unsigned NumSrcBits = PackVT.getScalarSizeInBits();
12796 unsigned NumPackedBits = NumSrcBits - BitSize;
12797 N1 = peekThroughBitcasts(N1);
12798 N2 = peekThroughBitcasts(N2);
12799 unsigned NumBits1 = N1.getScalarValueSizeInBits();
12800 unsigned NumBits2 = N2.getScalarValueSizeInBits();
12801 bool IsZero1 = llvm::isNullOrNullSplat(N1, /*AllowUndefs*/ false);
12802 bool IsZero2 = llvm::isNullOrNullSplat(N2, /*AllowUndefs*/ false);
12803 if ((!N1.isUndef() && !IsZero1 && NumBits1 != NumSrcBits) ||
12804 (!N2.isUndef() && !IsZero2 && NumBits2 != NumSrcBits))
12805 return false;
12806 if (Subtarget.hasSSE41() || BitSize == 8) {
12807 APInt ZeroMask = APInt::getHighBitsSet(NumSrcBits, NumPackedBits);
12808 if ((N1.isUndef() || IsZero1 || DAG.MaskedValueIsZero(N1, ZeroMask)) &&
12809 (N2.isUndef() || IsZero2 || DAG.MaskedValueIsZero(N2, ZeroMask))) {
12810 V1 = N1;
12811 V2 = N2;
12812 SrcVT = PackVT;
12813 PackOpcode = X86ISD::PACKUS;
12814 return true;
12815 }
12816 }
12817 bool IsAllOnes1 = llvm::isAllOnesOrAllOnesSplat(N1, /*AllowUndefs*/ false);
12818 bool IsAllOnes2 = llvm::isAllOnesOrAllOnesSplat(N2, /*AllowUndefs*/ false);
12819 if ((N1.isUndef() || IsZero1 || IsAllOnes1 ||
12820 DAG.ComputeNumSignBits(N1) > NumPackedBits) &&
12821 (N2.isUndef() || IsZero2 || IsAllOnes2 ||
12822 DAG.ComputeNumSignBits(N2) > NumPackedBits)) {
12823 V1 = N1;
12824 V2 = N2;
12825 SrcVT = PackVT;
12826 PackOpcode = X86ISD::PACKSS;
12827 return true;
12828 }
12829 return false;
12830 };
12831
12832 // Attempt to match against wider and wider compaction patterns.
12833 for (unsigned NumStages = 1; NumStages <= MaxStages; ++NumStages) {
12834 MVT PackSVT = MVT::getIntegerVT(BitSize << NumStages);
12835 MVT PackVT = MVT::getVectorVT(PackSVT, NumElts >> NumStages);
12836
12837 // Try binary shuffle.
12838 SmallVector<int, 32> BinaryMask;
12839 createPackShuffleMask(VT, BinaryMask, false, NumStages);
12840 if (isTargetShuffleEquivalent(VT, TargetMask, BinaryMask, DAG, V1, V2))
12841 if (MatchPACK(V1, V2, PackVT))
12842 return true;
12843
12844 // Try unary shuffle.
12845 SmallVector<int, 32> UnaryMask;
12846 createPackShuffleMask(VT, UnaryMask, true, NumStages);
12847 if (isTargetShuffleEquivalent(VT, TargetMask, UnaryMask, DAG, V1))
12848 if (MatchPACK(V1, V1, PackVT))
12849 return true;
12850 }
12851
12852 return false;
12853}
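As a worked illustration (the helper name makeBinaryPackMask is invented; the in-tree createPackShuffleMask lives elsewhere in this file, and the element order below assumes the usual PACKSSDW/PACKUSDW semantics), the single-stage binary pack mask compared against above for a v8i16 result keeps the low word of every dword of the two inputs:

#include <vector>

// Sketch only: one-stage binary pack mask, with V2 elements numbered from
// NumElts upwards, as in the target shuffle masks above.
static std::vector<int> makeBinaryPackMask(int NumElts) {
  std::vector<int> Mask(NumElts);
  int Half = NumElts / 2;
  for (int j = 0; j != NumElts; ++j)
    Mask[j] = (j < Half) ? (2 * j)                     // low words of V1
                         : (NumElts + 2 * (j - Half)); // low words of V2
  return Mask;
}
// makeBinaryPackMask(8) == {0, 2, 4, 6, 8, 10, 12, 14}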
12854
12855static SDValue lowerShuffleWithPACK(const SDLoc &DL, MVT VT, ArrayRef<int> Mask,
12856 SDValue V1, SDValue V2, SelectionDAG &DAG,
12857 const X86Subtarget &Subtarget) {
12858 MVT PackVT;
12859 unsigned PackOpcode;
12860 unsigned SizeBits = VT.getSizeInBits();
12861 unsigned EltBits = VT.getScalarSizeInBits();
12862 unsigned MaxStages = Log2_32(64 / EltBits);
12863 if (!matchShuffleWithPACK(VT, PackVT, V1, V2, PackOpcode, Mask, DAG,
12864 Subtarget, MaxStages))
12865 return SDValue();
12866
12867 unsigned CurrentEltBits = PackVT.getScalarSizeInBits();
12868 unsigned NumStages = Log2_32(CurrentEltBits / EltBits);
12869
12870 // Don't lower multi-stage packs on AVX512, truncation is better.
12871 if (NumStages != 1 && SizeBits == 128 && Subtarget.hasVLX())
12872 return SDValue();
12873
12874 // Pack to the largest type possible:
12875 // vXi64/vXi32 -> PACK*SDW and vXi16 -> PACK*SWB.
12876 unsigned MaxPackBits = 16;
12877 if (CurrentEltBits > 16 &&
12878 (PackOpcode == X86ISD::PACKSS || Subtarget.hasSSE41()))
12879 MaxPackBits = 32;
12880
12881 // Repeatedly pack down to the target size.
12882 SDValue Res;
12883 for (unsigned i = 0; i != NumStages; ++i) {
12884 unsigned SrcEltBits = std::min(MaxPackBits, CurrentEltBits);
12885 unsigned NumSrcElts = SizeBits / SrcEltBits;
12886 MVT SrcSVT = MVT::getIntegerVT(SrcEltBits);
12887 MVT DstSVT = MVT::getIntegerVT(SrcEltBits / 2);
12888 MVT SrcVT = MVT::getVectorVT(SrcSVT, NumSrcElts);
12889 MVT DstVT = MVT::getVectorVT(DstSVT, NumSrcElts * 2);
12890 Res = DAG.getNode(PackOpcode, DL, DstVT, DAG.getBitcast(SrcVT, V1),
12891 DAG.getBitcast(SrcVT, V2));
12892 V1 = V2 = Res;
12893 CurrentEltBits /= 2;
12894 }
12895 assert(Res && Res.getValueType() == VT &&
12896 "Failed to lower compaction shuffle");
12897 return Res;
12898}
12899
12900/// Try to emit a bitmask instruction for a shuffle.
12901///
12902/// This handles cases where we can model a blend exactly as a bitmask due to
12903/// one of the inputs being zeroable.
12904static SDValue lowerShuffleAsBitMask(const SDLoc &DL, MVT VT, SDValue V1,
12905 SDValue V2, ArrayRef<int> Mask,
12906 const APInt &Zeroable,
12907 const X86Subtarget &Subtarget,
12908 SelectionDAG &DAG) {
12909 MVT MaskVT = VT;
12910 MVT EltVT = VT.getVectorElementType();
12911 SDValue Zero, AllOnes;
12912 // Use f64 if i64 isn't legal.
12913 if (EltVT == MVT::i64 && !Subtarget.is64Bit()) {
12914 EltVT = MVT::f64;
12915 MaskVT = MVT::getVectorVT(EltVT, Mask.size());
12916 }
12917
12918 MVT LogicVT = VT;
12919 if (EltVT == MVT::f32 || EltVT == MVT::f64) {
12920 Zero = DAG.getConstantFP(0.0, DL, EltVT);
12921 APFloat AllOnesValue =
12922 APFloat::getAllOnesValue(SelectionDAG::EVTToAPFloatSemantics(EltVT));
12923 AllOnes = DAG.getConstantFP(AllOnesValue, DL, EltVT);
12924 LogicVT =
12925 MVT::getVectorVT(EltVT == MVT::f64 ? MVT::i64 : MVT::i32, Mask.size());
12926 } else {
12927 Zero = DAG.getConstant(0, DL, EltVT);
12928 AllOnes = DAG.getAllOnesConstant(DL, EltVT);
12929 }
12930
12931 SmallVector<SDValue, 16> VMaskOps(Mask.size(), Zero);
12932 SDValue V;
12933 for (int i = 0, Size = Mask.size(); i < Size; ++i) {
12934 if (Zeroable[i])
12935 continue;
12936 if (Mask[i] % Size != i)
12937 return SDValue(); // Not a blend.
12938 if (!V)
12939 V = Mask[i] < Size ? V1 : V2;
12940 else if (V != (Mask[i] < Size ? V1 : V2))
12941 return SDValue(); // Can only let one input through the mask.
12942
12943 VMaskOps[i] = AllOnes;
12944 }
12945 if (!V)
12946 return SDValue(); // No non-zeroable elements!
12947
12948 SDValue VMask = DAG.getBuildVector(MaskVT, DL, VMaskOps);
12949 VMask = DAG.getBitcast(LogicVT, VMask);
12950 V = DAG.getBitcast(LogicVT, V);
12951 SDValue And = DAG.getNode(ISD::AND, DL, LogicVT, V, VMask);
12952 return DAG.getBitcast(VT, And);
12953}
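A hedged scalar sketch of the net effect (not part of the original source): once the mask proves that every non-zeroable lane is an in-place pick from a single input V, the shuffle reduces to ANDing V with a constant that is all-ones where an element survives and all-zeros where it is zeroable.

#include <array>
#include <cstdint>

// Sketch only: v4i32 case with one surviving input V.
static std::array<uint32_t, 4> bitmaskBlend(const std::array<uint32_t, 4> &V,
                                            const std::array<bool, 4> &Zeroable) {
  std::array<uint32_t, 4> R{};
  for (int i = 0; i != 4; ++i)
    R[i] = V[i] & (Zeroable[i] ? 0u : ~0u); // AND with the built mask vector.
  return R;
}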
12954
12955/// Try to emit a blend instruction for a shuffle using bit math.
12956///
12957/// This is used as a fallback approach when first class blend instructions are
12958/// unavailable. Currently it is only suitable for integer vectors, but could
12959/// be generalized for floating point vectors if desirable.
12960static SDValue lowerShuffleAsBitBlend(const SDLoc &DL, MVT VT, SDValue V1,
12961 SDValue V2, ArrayRef<int> Mask,
12962 SelectionDAG &DAG) {
12963 assert(VT.isInteger() && "Only supports integer vector types!");
12964 MVT EltVT = VT.getVectorElementType();
12965 SDValue Zero = DAG.getConstant(0, DL, EltVT);
12966 SDValue AllOnes = DAG.getAllOnesConstant(DL, EltVT);
12967 SmallVector<SDValue, 16> MaskOps;
12968 for (int i = 0, Size = Mask.size(); i < Size; ++i) {
12969 if (Mask[i] >= 0 && Mask[i] != i && Mask[i] != i + Size)
12970 return SDValue(); // Shuffled input!
12971 MaskOps.push_back(Mask[i] < Size ? AllOnes : Zero);
12972 }
12973
12974 SDValue V1Mask = DAG.getBuildVector(VT, DL, MaskOps);
12975 V1 = DAG.getNode(ISD::AND, DL, VT, V1, V1Mask);
12976 V2 = DAG.getNode(X86ISD::ANDNP, DL, VT, V1Mask, V2);
12977 return DAG.getNode(ISD::OR, DL, VT, V1, V2);
12978}
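The per-element arithmetic behind the AND/ANDNP/OR sequence above, as a small scalar sketch (illustrative only):

#include <cstdint>

// Sketch only: picks V1 where V1Mask is all-ones and V2 where it is all-zeros.
static uint32_t bitBlend(uint32_t V1, uint32_t V2, uint32_t V1Mask) {
  return (V1 & V1Mask) | (~V1Mask & V2); // AND, ANDNP, OR
}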
12979
12980static SDValue getVectorMaskingNode(SDValue Op, SDValue Mask,
12981 SDValue PreservedSrc,
12982 const X86Subtarget &Subtarget,
12983 SelectionDAG &DAG);
12984
12985static bool matchShuffleAsBlend(MVT VT, SDValue V1, SDValue V2,
12986 MutableArrayRef<int> Mask,
12987 const APInt &Zeroable, bool &ForceV1Zero,
12988 bool &ForceV2Zero, uint64_t &BlendMask) {
12989 bool V1IsZeroOrUndef =
12990 V1.isUndef() || ISD::isBuildVectorAllZeros(V1.getNode());
12991 bool V2IsZeroOrUndef =
12992 V2.isUndef() || ISD::isBuildVectorAllZeros(V2.getNode());
12993
12994 BlendMask = 0;
12995 ForceV1Zero = false, ForceV2Zero = false;
12996 assert(Mask.size() <= 64 && "Shuffle mask too big for blend mask");
12997
12998 int NumElts = Mask.size();
12999 int NumLanes = VT.getSizeInBits() / 128;
13000 int NumEltsPerLane = NumElts / NumLanes;
13001 assert((NumLanes * NumEltsPerLane) == NumElts && "Value type mismatch");
13002
13003 // For 32/64-bit elements, if we only reference one input (plus any undefs),
13004 // then ensure the blend mask part for that lane just references that input.
13005 bool ForceWholeLaneMasks =
13006 VT.is256BitVector() && VT.getScalarSizeInBits() >= 32;
13007
13008 // Attempt to generate the binary blend mask. If an input is zero then
13009 // we can use any lane.
13010 for (int Lane = 0; Lane != NumLanes; ++Lane) {
13011 // Keep track of the inputs used per lane.
13012 bool LaneV1InUse = false;
13013 bool LaneV2InUse = false;
13014 uint64_t LaneBlendMask = 0;
13015 for (int LaneElt = 0; LaneElt != NumEltsPerLane; ++LaneElt) {
13016 int Elt = (Lane * NumEltsPerLane) + LaneElt;
13017 int M = Mask[Elt];
13018 if (M == SM_SentinelUndef)
13019 continue;
13020 if (M == Elt || (0 <= M && M < NumElts &&
13021 IsElementEquivalent(NumElts, V1, V1, M, Elt))) {
13022 Mask[Elt] = Elt;
13023 LaneV1InUse = true;
13024 continue;
13025 }
13026 if (M == (Elt + NumElts) ||
13027 (NumElts <= M &&
13028 IsElementEquivalent(NumElts, V2, V2, M - NumElts, Elt))) {
13029 LaneBlendMask |= 1ull << LaneElt;
13030 Mask[Elt] = Elt + NumElts;
13031 LaneV2InUse = true;
13032 continue;
13033 }
13034 if (Zeroable[Elt]) {
13035 if (V1IsZeroOrUndef) {
13036 ForceV1Zero = true;
13037 Mask[Elt] = Elt;
13038 LaneV1InUse = true;
13039 continue;
13040 }
13041 if (V2IsZeroOrUndef) {
13042 ForceV2Zero = true;
13043 LaneBlendMask |= 1ull << LaneElt;
13044 Mask[Elt] = Elt + NumElts;
13045 LaneV2InUse = true;
13046 continue;
13047 }
13048 }
13049 return false;
13050 }
13051
13052 // If we only used V2 then splat the lane blend mask to avoid any demanded
13053 // elts from V1 in this lane (the V1 equivalent is implicit with a zero
13054 // blend mask bit).
13055 if (ForceWholeLaneMasks && LaneV2InUse && !LaneV1InUse)
13056 LaneBlendMask = (1ull << NumEltsPerLane) - 1;
13057
13058 BlendMask |= LaneBlendMask << (Lane * NumEltsPerLane);
13059 }
13060 return true;
13061}
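A simplified standalone sketch of the blend-mask construction (the helper name simpleBlendMask is invented; it omits the zeroable handling, element-equivalence checks, and per-lane mask forcing that the code above performs): bit i of the result is set exactly when element i is taken in place from V2.

#include <cstdint>
#include <optional>
#include <vector>

// Sketch only.
static std::optional<uint64_t> simpleBlendMask(const std::vector<int> &Mask) {
  uint64_t BlendMask = 0;
  int NumElts = (int)Mask.size();
  for (int i = 0; i != NumElts; ++i) {
    int M = Mask[i];
    if (M < 0 || M == i)
      continue;                 // undef, or taken in place from V1.
    if (M == i + NumElts)
      BlendMask |= 1ull << i;   // taken in place from V2.
    else
      return std::nullopt;      // not a blend.
  }
  return BlendMask;
}
// simpleBlendMask({0, 5, 2, 7}) == 0b1010 for a v4i32 shuffle.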
13062
13063static uint64_t scaleVectorShuffleBlendMask(uint64_t BlendMask, int Size,
13064 int Scale) {
13065 uint64_t ScaledMask = 0;
13066 for (int i = 0; i != Size; ++i)
13067 if (BlendMask & (1ull << i))
13068 ScaledMask |= ((1ull << Scale) - 1) << (i * Scale);
13069 return ScaledMask;
13070}
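For example, widening a per-i32 blend mask to a per-i16 blend mask doubles every bit run; a standalone copy of the helper above (illustrative only) makes the arithmetic concrete:

#include <cstdint>

// Sketch only: identical logic to scaleVectorShuffleBlendMask above.
static uint64_t scaleBlendMask(uint64_t BlendMask, int Size, int Scale) {
  uint64_t ScaledMask = 0;
  for (int i = 0; i != Size; ++i)
    if (BlendMask & (1ull << i))
      ScaledMask |= ((1ull << Scale) - 1) << (i * Scale);
  return ScaledMask;
}
// scaleBlendMask(0b0110, /*Size=*/4, /*Scale=*/2) == 0b00111100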
13071
13072/// Try to emit a blend instruction for a shuffle.
13073///
13074/// This doesn't do any checks for the availability of instructions for blending
13075/// these values. It relies on the availability of the X86ISD::BLENDI pattern to
13076/// be matched in the backend with the type given. What it does check for is
13077/// that the shuffle mask is a blend, or convertible into a blend with zero.
13078static SDValue lowerShuffleAsBlend(const SDLoc &DL, MVT VT, SDValue V1,
13079 SDValue V2, ArrayRef<int> Original,
13080 const APInt &Zeroable,
13081 const X86Subtarget &Subtarget,
13082 SelectionDAG &DAG) {
13083 uint64_t BlendMask = 0;
13084 bool ForceV1Zero = false, ForceV2Zero = false;
13085 SmallVector<int, 64> Mask(Original);
13086 if (!matchShuffleAsBlend(VT, V1, V2, Mask, Zeroable, ForceV1Zero, ForceV2Zero,
13087 BlendMask))
13088 return SDValue();
13089
13090 // Create a REAL zero vector - ISD::isBuildVectorAllZeros allows UNDEFs.
13091 if (ForceV1Zero)
13092 V1 = getZeroVector(VT, Subtarget, DAG, DL);
13093 if (ForceV2Zero)
13094 V2 = getZeroVector(VT, Subtarget, DAG, DL);
13095
13096 unsigned NumElts = VT.getVectorNumElements();
13097
13098 switch (VT.SimpleTy) {
13099 case MVT::v4i64:
13100 case MVT::v8i32:
13101 assert(Subtarget.hasAVX2() && "256-bit integer blends require AVX2!");
13102 [[fallthrough]];
13103 case MVT::v4f64:
13104 case MVT::v8f32:
13105 assert(Subtarget.hasAVX() && "256-bit float blends require AVX!");
13106 [[fallthrough]];
13107 case MVT::v2f64:
13108 case MVT::v2i64:
13109 case MVT::v4f32:
13110 case MVT::v4i32:
13111 case MVT::v8i16:
13112 assert(Subtarget.hasSSE41() && "128-bit blends require SSE41!");
13113 return DAG.getNode(X86ISD::BLENDI, DL, VT, V1, V2,
13114 DAG.getTargetConstant(BlendMask, DL, MVT::i8));
13115 case MVT::v16i16: {
13116 assert(Subtarget.hasAVX2() && "v16i16 blends require AVX2!");
13117 SmallVector<int, 8> RepeatedMask;
13118 if (is128BitLaneRepeatedShuffleMask(MVT::v16i16, Mask, RepeatedMask)) {
13119 // We can lower these with PBLENDW which is mirrored across 128-bit lanes.
13120 assert(RepeatedMask.size() == 8 && "Repeated mask size doesn't match!");
13121 BlendMask = 0;
13122 for (int i = 0; i < 8; ++i)
13123 if (RepeatedMask[i] >= 8)
13124 BlendMask |= 1ull << i;
13125 return DAG.getNode(X86ISD::BLENDI, DL, MVT::v16i16, V1, V2,
13126 DAG.getTargetConstant(BlendMask, DL, MVT::i8));
13127 }
13128 // Use PBLENDW for lower/upper lanes and then blend lanes.
13129 // TODO - we should allow 2 PBLENDW here and leave shuffle combine to
13130 // merge to VSELECT where useful.
13131 uint64_t LoMask = BlendMask & 0xFF;
13132 uint64_t HiMask = (BlendMask >> 8) & 0xFF;
13133 if (LoMask == 0 || LoMask == 255 || HiMask == 0 || HiMask == 255) {
13134 SDValue Lo = DAG.getNode(X86ISD::BLENDI, DL, MVT::v16i16, V1, V2,
13135 DAG.getTargetConstant(LoMask, DL, MVT::i8));
13136 SDValue Hi = DAG.getNode(X86ISD::BLENDI, DL, MVT::v16i16, V1, V2,
13137 DAG.getTargetConstant(HiMask, DL, MVT::i8));
13138 return DAG.getVectorShuffle(
13139 MVT::v16i16, DL, Lo, Hi,
13140 {0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31});
13141 }
13142 [[fallthrough]];
13143 }
13144 case MVT::v32i8:
13145 assert(Subtarget.hasAVX2() && "256-bit byte-blends require AVX2!");
13146 [[fallthrough]];
13147 case MVT::v16i8: {
13148 assert(Subtarget.hasSSE41() && "128-bit byte-blends require SSE41!");
13149
13150 // Attempt to lower to a bitmask if we can. VPAND is faster than VPBLENDVB.
13151 if (SDValue Masked = lowerShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable,
13152 Subtarget, DAG))
13153 return Masked;
13154
13155 if (Subtarget.hasBWI() && Subtarget.hasVLX()) {
13156 MVT IntegerType = MVT::getIntegerVT(std::max<unsigned>(NumElts, 8));
13157 SDValue MaskNode = DAG.getConstant(BlendMask, DL, IntegerType);
13158 return getVectorMaskingNode(V2, MaskNode, V1, Subtarget, DAG);
13159 }
13160
13161 // If we have VPTERNLOG, we can use that as a bit blend.
13162 if (Subtarget.hasVLX())
13163 if (SDValue BitBlend =
13164 lowerShuffleAsBitBlend(DL, VT, V1, V2, Mask, DAG))
13165 return BitBlend;
13166
13167 // Scale the blend by the number of bytes per element.
13168 int Scale = VT.getScalarSizeInBits() / 8;
13169
13170 // This form of blend is always done on bytes. Compute the byte vector
13171 // type.
13172 MVT BlendVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8);
13173
13174 // x86 allows load folding with blendvb from the 2nd source operand. But
13175 // we are still using LLVM select here (see comment below), so that's V1.
13176 // If V2 can be load-folded and V1 cannot be load-folded, then commute to
13177 // allow that load-folding possibility.
13178 if (!ISD::isNormalLoad(V1.getNode()) && ISD::isNormalLoad(V2.getNode())) {
13179 ShuffleVectorSDNode::commuteMask(Mask);
13180 std::swap(V1, V2);
13181 }
13182
13183 // Compute the VSELECT mask. Note that VSELECT is really confusing in the
13184 // mix of LLVM's code generator and the x86 backend. We tell the code
13185 // generator that boolean values in the elements of an x86 vector register
13186 // are -1 for true and 0 for false. We then use the LLVM semantics of 'true'
13187 // mapping a select to operand #1, and 'false' mapping to operand #2. The
13188 // reality in x86 is that vector masks (pre-AVX-512) use only the high bit
13189 // of the element (the remaining are ignored) and 0 in that high bit would
13190 // mean operand #1 while 1 in the high bit would mean operand #2. So while
13191 // the LLVM model for boolean values in vector elements gets the relevant
13192 // bit set, it is set backwards and over constrained relative to x86's
13193 // actual model.
13194 SmallVector<SDValue, 32> VSELECTMask;
13195 for (int i = 0, Size = Mask.size(); i < Size; ++i)
13196 for (int j = 0; j < Scale; ++j)
13197 VSELECTMask.push_back(
13198 Mask[i] < 0 ? DAG.getUNDEF(MVT::i8)
13199 : DAG.getConstant(Mask[i] < Size ? -1 : 0, DL,
13200 MVT::i8));
13201
13202 V1 = DAG.getBitcast(BlendVT, V1);
13203 V2 = DAG.getBitcast(BlendVT, V2);
13204 return DAG.getBitcast(
13205 VT,
13206 DAG.getSelect(DL, BlendVT, DAG.getBuildVector(BlendVT, DL, VSELECTMask),
13207 V1, V2));
13208 }
13209 case MVT::v16f32:
13210 case MVT::v8f64:
13211 case MVT::v8i64:
13212 case MVT::v16i32:
13213 case MVT::v32i16:
13214 case MVT::v64i8: {
13215 // Attempt to lower to a bitmask if we can. Only if not optimizing for size.
13216 bool OptForSize = DAG.shouldOptForSize();
13217 if (!OptForSize) {
13218 if (SDValue Masked = lowerShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable,
13219 Subtarget, DAG))
13220 return Masked;
13221 }
13222
13223 // Otherwise load an immediate into a GPR, cast to k-register, and use a
13224 // masked move.
13225 MVT IntegerType = MVT::getIntegerVT(std::max<unsigned>(NumElts, 8));
13226 SDValue MaskNode = DAG.getConstant(BlendMask, DL, IntegerType);
13227 return getVectorMaskingNode(V2, MaskNode, V1, Subtarget, DAG);
13228 }
13229 default:
13230 llvm_unreachable("Not a supported integer vector type!");
13231 }
13232}
13233
13234/// Try to lower as a blend of elements from two inputs followed by
13235/// a single-input permutation.
13236///
13237/// This matches the pattern where we can blend elements from two inputs and
13238/// then reduce the shuffle to a single-input permutation.
13239static SDValue lowerShuffleAsBlendAndPermute(const SDLoc &DL, MVT VT,
13240 SDValue V1, SDValue V2,
13241 ArrayRef<int> Mask,
13242 SelectionDAG &DAG,
13243 bool ImmBlends = false) {
13244 // We build up the blend mask while checking whether a blend is a viable way
13245 // to reduce the shuffle.
13246 SmallVector<int, 32> BlendMask(Mask.size(), -1);
13247 SmallVector<int, 32> PermuteMask(Mask.size(), -1);
13248
13249 for (int i = 0, Size = Mask.size(); i < Size; ++i) {
13250 if (Mask[i] < 0)
13251 continue;
13252
13253 assert(Mask[i] < Size * 2 && "Shuffle input is out of bounds.");
13254
13255 if (BlendMask[Mask[i] % Size] < 0)
13256 BlendMask[Mask[i] % Size] = Mask[i];
13257 else if (BlendMask[Mask[i] % Size] != Mask[i])
13258 return SDValue(); // Can't blend in the needed input!
13259
13260 PermuteMask[i] = Mask[i] % Size;
13261 }
13262
13263 // If only immediate blends, then bail if the blend mask can't be widened to
13264 // i16.
13265 unsigned EltSize = VT.getScalarSizeInBits();
13266 if (ImmBlends && EltSize == 8 && !canWidenShuffleElements(BlendMask))
13267 return SDValue();
13268
13269 SDValue V = DAG.getVectorShuffle(VT, DL, V1, V2, BlendMask);
13270 return DAG.getVectorShuffle(VT, DL, V, DAG.getUNDEF(VT), PermuteMask);
13271}
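A worked example of the decomposition above (the helper name splitBlendPermute is invented for illustration): the v4 mask {6, 1, 4, 3} becomes a blend with mask {4, 1, 6, 3} followed by the single-input permute {2, 1, 0, 3}.

#include <optional>
#include <utility>
#include <vector>

// Sketch only: mirrors the blend/permute mask construction above.
static std::optional<std::pair<std::vector<int>, std::vector<int>>>
splitBlendPermute(const std::vector<int> &Mask) {
  int Size = (int)Mask.size();
  std::vector<int> Blend(Size, -1), Permute(Size, -1);
  for (int i = 0; i != Size; ++i) {
    if (Mask[i] < 0)
      continue;
    int Slot = Mask[i] % Size;
    if (Blend[Slot] >= 0 && Blend[Slot] != Mask[i])
      return std::nullopt;      // conflicting demands on one blend slot.
    Blend[Slot] = Mask[i];
    Permute[i] = Slot;
  }
  return std::make_pair(Blend, Permute);
}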
13272
13273/// Try to lower as an unpack of elements from two inputs followed by
13274/// a single-input permutation.
13275///
13276/// This matches the pattern where we can unpack elements from two inputs and
13277/// then reduce the shuffle to a single-input (wider) permutation.
13278static SDValue lowerShuffleAsUNPCKAndPermute(const SDLoc &DL, MVT VT,
13279 SDValue V1, SDValue V2,
13280 ArrayRef<int> Mask,
13281 SelectionDAG &DAG) {
13282 int NumElts = Mask.size();
13283 int NumLanes = VT.getSizeInBits() / 128;
13284 int NumLaneElts = NumElts / NumLanes;
13285 int NumHalfLaneElts = NumLaneElts / 2;
13286
13287 bool MatchLo = true, MatchHi = true;
13288 SDValue Ops[2] = {DAG.getUNDEF(VT), DAG.getUNDEF(VT)};
13289
13290 // Determine UNPCKL/UNPCKH type and operand order.
13291 for (int Elt = 0; Elt != NumElts; ++Elt) {
13292 int M = Mask[Elt];
13293 if (M < 0)
13294 continue;
13295
13296 // Normalize the mask value depending on whether it's V1 or V2.
13297 int NormM = M;
13298 SDValue &Op = Ops[Elt & 1];
13299 if (M < NumElts && (Op.isUndef() || Op == V1))
13300 Op = V1;
13301 else if (NumElts <= M && (Op.isUndef() || Op == V2)) {
13302 Op = V2;
13303 NormM -= NumElts;
13304 } else
13305 return SDValue();
13306
13307 bool MatchLoAnyLane = false, MatchHiAnyLane = false;
13308 for (int Lane = 0; Lane != NumElts; Lane += NumLaneElts) {
13309 int Lo = Lane, Mid = Lane + NumHalfLaneElts, Hi = Lane + NumLaneElts;
13310 MatchLoAnyLane |= isUndefOrInRange(NormM, Lo, Mid);
13311 MatchHiAnyLane |= isUndefOrInRange(NormM, Mid, Hi);
13312 if (MatchLoAnyLane || MatchHiAnyLane) {
13313 assert((MatchLoAnyLane ^ MatchHiAnyLane) &&
13314 "Failed to match UNPCKLO/UNPCKHI");
13315 break;
13316 }
13317 }
13318 MatchLo &= MatchLoAnyLane;
13319 MatchHi &= MatchHiAnyLane;
13320 if (!MatchLo && !MatchHi)
13321 return SDValue();
13322 }
13323 assert((MatchLo ^ MatchHi) && "Failed to match UNPCKLO/UNPCKHI");
13324
13325 // Element indices have changed after unpacking. Calculate permute mask
13326 // so that they will be put back to the position as dictated by the
13327 // original shuffle mask indices.
13328 SmallVector<int, 32> PermuteMask(NumElts, -1);
13329 for (int Elt = 0; Elt != NumElts; ++Elt) {
13330 int M = Mask[Elt];
13331 if (NumElts <= M)
13332 PermuteMask[Elt] = NumLaneElts * ((M - NumElts) / NumLaneElts) +
13333 (2 * (M % NumHalfLaneElts)) + 1;
13334 else if (0 <= M)
13335 PermuteMask[Elt] =
13336 NumLaneElts * (M / NumLaneElts) + (2 * (M % NumHalfLaneElts));
13337 }
13338
13339 unsigned UnpckOp = MatchLo ? X86ISD::UNPCKL : X86ISD::UNPCKH;
13340 SDValue Unpck = DAG.getNode(UnpckOp, DL, VT, Ops);
13341 return DAG.getVectorShuffle(VT, DL, Unpck, DAG.getUNDEF(VT), PermuteMask);
13342}
13343
13344/// Try to lower a shuffle as a permute of the inputs followed by an
13345/// UNPCK instruction.
13346///
13347/// This specifically targets cases where we end up with alternating between
13348/// the two inputs, and so can permute them into something that feeds a single
13349/// UNPCK instruction. Note that this routine only targets integer vectors
13350/// because for floating point vectors we have a generalized SHUFPS lowering
13351/// strategy that handles everything that doesn't *exactly* match an unpack,
13352/// making this clever lowering unnecessary.
13353static SDValue lowerShuffleAsPermuteAndUnpack(const SDLoc &DL, MVT VT,
13354 SDValue V1, SDValue V2,
13355 ArrayRef<int> Mask,
13356 const X86Subtarget &Subtarget,
13357 SelectionDAG &DAG) {
13358 int Size = Mask.size();
13359 assert(Mask.size() >= 2 && "Single element masks are invalid.");
13360
13361 // This routine only supports 128-bit integer dual input vectors.
13362 if (VT.isFloatingPoint() || !VT.is128BitVector() || V2.isUndef())
13363 return SDValue();
13364
13365 int NumLoInputs =
13366 count_if(Mask, [Size](int M) { return M >= 0 && M % Size < Size / 2; });
13367 int NumHiInputs =
13368 count_if(Mask, [Size](int M) { return M % Size >= Size / 2; });
13369
13370 bool UnpackLo = NumLoInputs >= NumHiInputs;
13371
13372 auto TryUnpack = [&](int ScalarSize, int Scale) {
13373 SmallVector<int, 16> V1Mask((unsigned)Size, -1);
13374 SmallVector<int, 16> V2Mask((unsigned)Size, -1);
13375
13376 for (int i = 0; i < Size; ++i) {
13377 if (Mask[i] < 0)
13378 continue;
13379
13380 // Each element of the unpack contains Scale elements from this mask.
13381 int UnpackIdx = i / Scale;
13382
13383 // We only handle the case where V1 feeds the first slots of the unpack.
13384 // We rely on canonicalization to ensure this is the case.
13385 if ((UnpackIdx % 2 == 0) != (Mask[i] < Size))
13386 return SDValue();
13387
13388 // Setup the mask for this input. The indexing is tricky as we have to
13389 // handle the unpack stride.
13390 SmallVectorImpl<int> &VMask = (UnpackIdx % 2 == 0) ? V1Mask : V2Mask;
13391 VMask[(UnpackIdx / 2) * Scale + i % Scale + (UnpackLo ? 0 : Size / 2)] =
13392 Mask[i] % Size;
13393 }
13394
13395 // If we will have to shuffle both inputs to use the unpack, check whether
13396 // we can just unpack first and shuffle the result. If so, skip this unpack.
13397 if ((NumLoInputs == 0 || NumHiInputs == 0) && !isNoopShuffleMask(V1Mask) &&
13398 !isNoopShuffleMask(V2Mask))
13399 return SDValue();
13400
13401 // Shuffle the inputs into place.
13402 V1 = DAG.getVectorShuffle(VT, DL, V1, DAG.getUNDEF(VT), V1Mask);
13403 V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Mask);
13404
13405 // Cast the inputs to the type we will use to unpack them.
13406 MVT UnpackVT =
13407 MVT::getVectorVT(MVT::getIntegerVT(ScalarSize), Size / Scale);
13408 V1 = DAG.getBitcast(UnpackVT, V1);
13409 V2 = DAG.getBitcast(UnpackVT, V2);
13410
13411 // Unpack the inputs and cast the result back to the desired type.
13412 return DAG.getBitcast(
13413 VT, DAG.getNode(UnpackLo ? X86ISD::UNPCKL : X86ISD::UNPCKH, DL,
13414 UnpackVT, V1, V2));
13415 };
13416
13417 // We try each unpack from the largest to the smallest to try and find one
13418 // that fits this mask.
13419 int OrigScalarSize = VT.getScalarSizeInBits();
13420 for (int ScalarSize = 64; ScalarSize >= OrigScalarSize; ScalarSize /= 2)
13421 if (SDValue Unpack = TryUnpack(ScalarSize, ScalarSize / OrigScalarSize))
13422 return Unpack;
13423
13424 // If we're shuffling with a zero vector then we're better off not doing
13425 // VECTOR_SHUFFLE(UNPCK()) as we lose track of those zero elements.
13426 if (ISD::isBuildVectorAllZeros(V1.getNode()) ||
13427 ISD::isBuildVectorAllZeros(V2.getNode()))
13428 return SDValue();
13429
13430 // If none of the unpack-rooted lowerings worked (or were profitable) try an
13431 // initial unpack.
13432 if (NumLoInputs == 0 || NumHiInputs == 0) {
13433 assert((NumLoInputs > 0 || NumHiInputs > 0) &&
13434 "We have to have *some* inputs!");
13435 int HalfOffset = NumLoInputs == 0 ? Size / 2 : 0;
13436
13437 // FIXME: We could consider the total complexity of the permute of each
13438 // possible unpacking. Or at the least we should consider how many
13439 // half-crossings are created.
13440 // FIXME: We could consider commuting the unpacks.
13441
13442 SmallVector<int, 32> PermMask((unsigned)Size, -1);
13443 for (int i = 0; i < Size; ++i) {
13444 if (Mask[i] < 0)
13445 continue;
13446
13447 assert(Mask[i] % Size >= HalfOffset && "Found input from wrong half!");
13448
13449 PermMask[i] =
13450 2 * ((Mask[i] % Size) - HalfOffset) + (Mask[i] < Size ? 0 : 1);
13451 }
13452 return DAG.getVectorShuffle(
13453 VT, DL,
13454 DAG.getNode(NumLoInputs == 0 ? X86ISD::UNPCKH : X86ISD::UNPCKL, DL, VT,
13455 V1, V2),
13456 DAG.getUNDEF(VT), PermMask);
13457 }
13458
13459 return SDValue();
13460}
13461
13462/// Helper to form a PALIGNR-based rotate+permute, merging 2 inputs and then
13463/// permuting the elements of the result in place.
13464static SDValue lowerShuffleAsByteRotateAndPermute(
13465 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
13466 const X86Subtarget &Subtarget, SelectionDAG &DAG) {
13467 if ((VT.is128BitVector() && !Subtarget.hasSSSE3()) ||
13468 (VT.is256BitVector() && !Subtarget.hasAVX2()) ||
13469 (VT.is512BitVector() && !Subtarget.hasBWI()))
13470 return SDValue();
13471
13472 // We don't currently support lane crossing permutes.
13473 if (is128BitLaneCrossingShuffleMask(VT, Mask))
13474 return SDValue();
13475
13476 int Scale = VT.getScalarSizeInBits() / 8;
13477 int NumLanes = VT.getSizeInBits() / 128;
13478 int NumElts = VT.getVectorNumElements();
13479 int NumEltsPerLane = NumElts / NumLanes;
13480
13481 // Determine range of mask elts.
13482 bool Blend1 = true;
13483 bool Blend2 = true;
13484 std::pair<int, int> Range1 = std::make_pair(INT_MAX, INT_MIN);
13485 std::pair<int, int> Range2 = std::make_pair(INT_MAX, INT_MIN);
13486 for (int Lane = 0; Lane != NumElts; Lane += NumEltsPerLane) {
13487 for (int Elt = 0; Elt != NumEltsPerLane; ++Elt) {
13488 int M = Mask[Lane + Elt];
13489 if (M < 0)
13490 continue;
13491 if (M < NumElts) {
13492 Blend1 &= (M == (Lane + Elt));
13493 assert(Lane <= M && M < (Lane + NumEltsPerLane) && "Out of range mask");
13494 M = M % NumEltsPerLane;
13495 Range1.first = std::min(Range1.first, M);
13496 Range1.second = std::max(Range1.second, M);
13497 } else {
13498 M -= NumElts;
13499 Blend2 &= (M == (Lane + Elt));
13500 assert(Lane <= M && M < (Lane + NumEltsPerLane) && "Out of range mask");
13501 M = M % NumEltsPerLane;
13502 Range2.first = std::min(Range2.first, M);
13503 Range2.second = std::max(Range2.second, M);
13504 }
13505 }
13506 }
13507
13508 // Bail if we don't need both elements.
13509 // TODO - it might be worth doing this for unary shuffles if the permute
13510 // can be widened.
13511 if (!(0 <= Range1.first && Range1.second < NumEltsPerLane) ||
13512 !(0 <= Range2.first && Range2.second < NumEltsPerLane))
13513 return SDValue();
13514
13515 if (VT.getSizeInBits() > 128 && (Blend1 || Blend2))
13516 return SDValue();
13517
13518 // Rotate the 2 ops so we can access both ranges, then permute the result.
13519 auto RotateAndPermute = [&](SDValue Lo, SDValue Hi, int RotAmt, int Ofs) {
13520 MVT ByteVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8);
13521 SDValue Rotate = DAG.getBitcast(
13522 VT, DAG.getNode(X86ISD::PALIGNR, DL, ByteVT, DAG.getBitcast(ByteVT, Hi),
13523 DAG.getBitcast(ByteVT, Lo),
13524 DAG.getTargetConstant(Scale * RotAmt, DL, MVT::i8)));
13525 SmallVector<int, 64> PermMask(NumElts, SM_SentinelUndef);
13526 for (int Lane = 0; Lane != NumElts; Lane += NumEltsPerLane) {
13527 for (int Elt = 0; Elt != NumEltsPerLane; ++Elt) {
13528 int M = Mask[Lane + Elt];
13529 if (M < 0)
13530 continue;
13531 if (M < NumElts)
13532 PermMask[Lane + Elt] = Lane + ((M + Ofs - RotAmt) % NumEltsPerLane);
13533 else
13534 PermMask[Lane + Elt] = Lane + ((M - Ofs - RotAmt) % NumEltsPerLane);
13535 }
13536 }
13537 return DAG.getVectorShuffle(VT, DL, Rotate, DAG.getUNDEF(VT), PermMask);
13538 };
13539
13540 // Check if the ranges are small enough to rotate from either direction.
13541 if (Range2.second < Range1.first)
13542 return RotateAndPermute(V1, V2, Range1.first, 0);
13543 if (Range1.second < Range2.first)
13544 return RotateAndPermute(V2, V1, Range2.first, NumElts);
13545 return SDValue();
13546}
13547
13548static bool isBroadcastShuffleMask(ArrayRef<int> Mask) {
13549 return isUndefOrEqual(Mask, 0);
13550}
13551
13552static bool isNoopOrBroadcastShuffleMask(ArrayRef<int> Mask) {
13553 return isNoopShuffleMask(Mask) || isBroadcastShuffleMask(Mask);
13554}
13555
13556/// Check if the Mask consists of the same element repeated multiple times.
13557static bool isSingleElementRepeatedMask(ArrayRef<int> Mask) {
13558 size_t NumUndefs = 0;
13559 std::optional<int> UniqueElt;
13560 for (int Elt : Mask) {
13561 if (Elt == SM_SentinelUndef) {
13562 NumUndefs++;
13563 continue;
13564 }
13565 if (UniqueElt.has_value() && UniqueElt.value() != Elt)
13566 return false;
13567 UniqueElt = Elt;
13568 }
13569 // Make sure the element is repeated enough times by checking the number of
13570 // undefs is small.
13571 return NumUndefs <= Mask.size() / 2 && UniqueElt.has_value();
13572}
13573
13574/// Generic routine to decompose a shuffle and blend into independent
13575/// blends and permutes.
13576///
13577/// This matches the extremely common pattern for handling combined
13578/// shuffle+blend operations on newer X86 ISAs where we have very fast blend
13579/// operations. It will try to pick the best arrangement of shuffles and
13580/// blends. For vXi8/vXi16 shuffles we may use unpack instead of blend.
13581static SDValue lowerShuffleAsDecomposedShuffleMerge(
13582 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
13583 const X86Subtarget &Subtarget, SelectionDAG &DAG) {
13584 int NumElts = Mask.size();
13585 int NumLanes = VT.getSizeInBits() / 128;
13586 int NumEltsPerLane = NumElts / NumLanes;
13587
13588 // Shuffle the input elements into the desired positions in V1 and V2 and
13589 // unpack/blend them together.
13590 bool IsAlternating = true;
13591 SmallVector<int, 32> V1Mask(NumElts, -1);
13592 SmallVector<int, 32> V2Mask(NumElts, -1);
13593 SmallVector<int, 32> FinalMask(NumElts, -1);
13594 for (int i = 0; i < NumElts; ++i) {
13595 int M = Mask[i];
13596 if (M >= 0 && M < NumElts) {
13597 V1Mask[i] = M;
13598 FinalMask[i] = i;
13599 IsAlternating &= (i & 1) == 0;
13600 } else if (M >= NumElts) {
13601 V2Mask[i] = M - NumElts;
13602 FinalMask[i] = i + NumElts;
13603 IsAlternating &= (i & 1) == 1;
13604 }
13605 }
13606
13607 // If we effectively only demand the 0'th element of \p Input (and not just
13608 // at the 0'th position), then broadcast said input,
13609 // and change \p InputMask to be a no-op (identity) mask.
13610 auto canonicalizeBroadcastableInput = [DL, VT, &Subtarget,
13611 &DAG](SDValue &Input,
13612 MutableArrayRef<int> InputMask) {
13613 unsigned EltSizeInBits = Input.getScalarValueSizeInBits();
13614 if (!Subtarget.hasAVX2() && (!Subtarget.hasAVX() || EltSizeInBits < 32 ||
13615 !X86::mayFoldLoad(Input, Subtarget)))
13616 return;
13617 if (isNoopShuffleMask(InputMask))
13618 return;
13619 assert(isBroadcastShuffleMask(InputMask) &&
13620 "Expected to demand only the 0'th element.");
13621 Input = DAG.getNode(X86ISD::VBROADCAST, DL, VT, Input);
13622 for (auto I : enumerate(InputMask)) {
13623 int &InputMaskElt = I.value();
13624 if (InputMaskElt >= 0)
13625 InputMaskElt = I.index();
13626 }
13627 };
13628
13629 // Currently, we may need to produce one shuffle per input, and blend results.
13630 // It is possible that the shuffle for one of the inputs is already a no-op.
13631 // See if we can simplify non-no-op shuffles into broadcasts,
13632 // which we consider to be strictly better than an arbitrary shuffle.
13633 if (isNoopOrBroadcastShuffleMask(V1Mask) &&
13634 isNoopOrBroadcastShuffleMask(V2Mask)) {
13635 canonicalizeBroadcastableInput(V1, V1Mask);
13636 canonicalizeBroadcastableInput(V2, V2Mask);
13637 }
13638
13639 // Try to lower with the simpler initial blend/unpack/rotate strategies unless
13640 // one of the input shuffles would be a no-op. We prefer to shuffle inputs as
13641 // the shuffle may be able to fold with a load or other benefit. However, when
13642 // we'll have to do 2x as many shuffles in order to achieve this, a 2-input
13643 // pre-shuffle first is a better strategy.
13644 if (!isNoopShuffleMask(V1Mask) && !isNoopShuffleMask(V2Mask)) {
13645 // Only prefer immediate blends to unpack/rotate.
13646 if (SDValue BlendPerm = lowerShuffleAsBlendAndPermute(DL, VT, V1, V2, Mask,
13647 DAG, true))
13648 return BlendPerm;
13649 // If either input vector provides only a single element which is repeated
13650 // multiple times, unpacking from both input vectors would generate worse
13651 // code. e.g. for
13652 // t5: v16i8 = vector_shuffle<16,0,16,1,16,2,16,3,16,4,16,5,16,6,16,7> t2, t4
13653 // it is better to process t4 first to create a vector of t4[0], then unpack
13654 // that vector with t2.
13655 if (!isSingleElementRepeatedMask(V1Mask) &&
13656 !isSingleElementRepeatedMask(V2Mask))
13657 if (SDValue UnpackPerm =
13658 lowerShuffleAsUNPCKAndPermute(DL, VT, V1, V2, Mask, DAG))
13659 return UnpackPerm;
13660 if (SDValue RotatePerm = lowerShuffleAsByteRotateAndPermute(
13661 DL, VT, V1, V2, Mask, Subtarget, DAG))
13662 return RotatePerm;
13663 // Unpack/rotate failed - try again with variable blends.
13664 if (SDValue BlendPerm = lowerShuffleAsBlendAndPermute(DL, VT, V1, V2, Mask,
13665 DAG))
13666 return BlendPerm;
13667 if (VT.getScalarSizeInBits() >= 32)
13668 if (SDValue PermUnpack = lowerShuffleAsPermuteAndUnpack(
13669 DL, VT, V1, V2, Mask, Subtarget, DAG))
13670 return PermUnpack;
13671 }
13672
13673 // If the final mask is an alternating blend of vXi8/vXi16, convert to an
13674 // UNPCKL(SHUFFLE, SHUFFLE) pattern.
13675 // TODO: It doesn't have to be alternating - but each lane mustn't have more
13676 // than half the elements coming from each source.
13677 if (IsAlternating && VT.getScalarSizeInBits() < 32) {
13678 V1Mask.assign(NumElts, -1);
13679 V2Mask.assign(NumElts, -1);
13680 FinalMask.assign(NumElts, -1);
13681 for (int i = 0; i != NumElts; i += NumEltsPerLane)
13682 for (int j = 0; j != NumEltsPerLane; ++j) {
13683 int M = Mask[i + j];
13684 if (M >= 0 && M < NumElts) {
13685 V1Mask[i + (j / 2)] = M;
13686 FinalMask[i + j] = i + (j / 2);
13687 } else if (M >= NumElts) {
13688 V2Mask[i + (j / 2)] = M - NumElts;
13689 FinalMask[i + j] = i + (j / 2) + NumElts;
13690 }
13691 }
13692 }
13693
13694 V1 = DAG.getVectorShuffle(VT, DL, V1, DAG.getUNDEF(VT), V1Mask);
13695 V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Mask);
13696 return DAG.getVectorShuffle(VT, DL, V1, V2, FinalMask);
13697}
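A standalone sketch of just the initial decomposition step above (the helper name decomposeShuffleMerge is invented): for the v4 mask {0, 7, 2, 5}, V1 is shuffled by {0, -1, 2, -1}, V2 by {-1, 3, -1, 1}, and the results are blended with the final mask {0, 5, 2, 7}.

#include <tuple>
#include <vector>

// Sketch only: per-input masks plus the final blend mask.
static std::tuple<std::vector<int>, std::vector<int>, std::vector<int>>
decomposeShuffleMerge(const std::vector<int> &Mask) {
  int NumElts = (int)Mask.size();
  std::vector<int> V1Mask(NumElts, -1), V2Mask(NumElts, -1),
      FinalMask(NumElts, -1);
  for (int i = 0; i != NumElts; ++i) {
    int M = Mask[i];
    if (M >= 0 && M < NumElts) {
      V1Mask[i] = M;
      FinalMask[i] = i;
    } else if (M >= NumElts) {
      V2Mask[i] = M - NumElts;
      FinalMask[i] = i + NumElts;
    }
  }
  return {V1Mask, V2Mask, FinalMask};
}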
13698
13699/// Try to lower a vector shuffle as a bit rotation.
13700///
13701/// Look for a repeated rotation pattern in each sub group.
13702/// Returns a ISD::ROTL element rotation amount or -1 if failed.
13703static int matchShuffleAsBitRotate(ArrayRef<int> Mask, int NumSubElts) {
13704 int NumElts = Mask.size();
13705 assert((NumElts % NumSubElts) == 0 && "Illegal shuffle mask");
13706
13707 int RotateAmt = -1;
13708 for (int i = 0; i != NumElts; i += NumSubElts) {
13709 for (int j = 0; j != NumSubElts; ++j) {
13710 int M = Mask[i + j];
13711 if (M < 0)
13712 continue;
13713 if (!isInRange(M, i, i + NumSubElts))
13714 return -1;
13715 int Offset = (NumSubElts - (M - (i + j))) % NumSubElts;
13716 if (0 <= RotateAmt && Offset != RotateAmt)
13717 return -1;
13718 RotateAmt = Offset;
13719 }
13720 }
13721 return RotateAmt;
13722}
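For intuition, the repeated byte pattern {3, 0, 1, 2} within every i32 sub-group is matched with a rotation amount of 1 element, i.e. an 8-bit ISD::ROTL of each 32-bit lane. A small standalone sketch (illustrative only, little-endian byte order assumed) shows the equivalence:

#include <cstdint>

// Sketch only: rotate left by R bits, 0 < R < 32.
static uint32_t rotl32(uint32_t X, unsigned R) {
  return (X << R) | (X >> (32 - R));
}

// Applies the byte shuffle {3, 0, 1, 2} to one 32-bit lane.
static uint32_t shuffleBytes3012(uint32_t X) {
  uint8_t B[4] = {uint8_t(X), uint8_t(X >> 8), uint8_t(X >> 16),
                  uint8_t(X >> 24)};
  uint8_t R[4] = {B[3], B[0], B[1], B[2]};
  return uint32_t(R[0]) | uint32_t(R[1]) << 8 | uint32_t(R[2]) << 16 |
         uint32_t(R[3]) << 24;
}
// For any X: shuffleBytes3012(X) == rotl32(X, 8).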
13723
13724static int matchShuffleAsBitRotate(MVT &RotateVT, int EltSizeInBits,
13725 const X86Subtarget &Subtarget,
13726 ArrayRef<int> Mask) {
13727 assert(!isNoopShuffleMask(Mask) && "We shouldn't lower no-op shuffles!");
13728 assert(EltSizeInBits < 64 && "Can't rotate 64-bit integers");
13729
13730 // AVX512 only has vXi32/vXi64 rotates, so limit the rotation sub group size.
13731 int MinSubElts = Subtarget.hasAVX512() ? std::max(32 / EltSizeInBits, 2) : 2;
13732 int MaxSubElts = 64 / EltSizeInBits;
13733 for (int NumSubElts = MinSubElts; NumSubElts <= MaxSubElts; NumSubElts *= 2) {
13734 int RotateAmt = matchShuffleAsBitRotate(Mask, NumSubElts);
13735 if (RotateAmt < 0)
13736 continue;
13737
13738 int NumElts = Mask.size();
13739 MVT RotateSVT = MVT::getIntegerVT(EltSizeInBits * NumSubElts);
13740 RotateVT = MVT::getVectorVT(RotateSVT, NumElts / NumSubElts);
13741 return RotateAmt * EltSizeInBits;
13742 }
13743
13744 return -1;
13745}
13746
13747/// Lower shuffle using X86ISD::VROTLI rotations.
13748static SDValue lowerShuffleAsBitRotate(const SDLoc &DL, MVT VT, SDValue V1,
13749 ArrayRef<int> Mask,
13750 const X86Subtarget &Subtarget,
13751 SelectionDAG &DAG) {
13752 // Only XOP + AVX512 targets have bit rotation instructions.
13753 // If we at least have SSSE3 (PSHUFB) then we shouldn't attempt to use this.
13754 bool IsLegal =
13755 (VT.is128BitVector() && Subtarget.hasXOP()) || Subtarget.hasAVX512();
13756 if (!IsLegal && Subtarget.hasSSE3())
13757 return SDValue();
13758
13759 MVT RotateVT;
13760 int RotateAmt = matchShuffleAsBitRotate(RotateVT, VT.getScalarSizeInBits(),
13761 Subtarget, Mask);
13762 if (RotateAmt < 0)
13763 return SDValue();
13764
13765 // For pre-SSSE3 targets, if we are shuffling vXi8 elts then ISD::ROTL,
13766 // expanded to OR(SRL,SHL), will be more efficient, but if they can
13767 // widen to vXi16 or more, then the existing lowering will be better.
13768 if (!IsLegal) {
13769 if ((RotateAmt % 16) == 0)
13770 return SDValue();
13771 // TODO: Use getTargetVShiftByConstNode.
13772 unsigned ShlAmt = RotateAmt;
13773 unsigned SrlAmt = RotateVT.getScalarSizeInBits() - RotateAmt;
13774 V1 = DAG.getBitcast(RotateVT, V1);
13775 SDValue SHL = DAG.getNode(X86ISD::VSHLI, DL, RotateVT, V1,
13776 DAG.getTargetConstant(ShlAmt, DL, MVT::i8));
13777 SDValue SRL = DAG.getNode(X86ISD::VSRLI, DL, RotateVT, V1,
13778 DAG.getTargetConstant(SrlAmt, DL, MVT::i8));
13779 SDValue Rot = DAG.getNode(ISD::OR, DL, RotateVT, SHL, SRL);
13780 return DAG.getBitcast(VT, Rot);
13781 }
13782
13783 SDValue Rot =
13784 DAG.getNode(X86ISD::VROTLI, DL, RotateVT, DAG.getBitcast(RotateVT, V1),
13785 DAG.getTargetConstant(RotateAmt, DL, MVT::i8));
13786 return DAG.getBitcast(VT, Rot);
13787}
13788
13789/// Try to match a vector shuffle as an element rotation.
13790///
13791/// This is used for support PALIGNR for SSSE3 or VALIGND/Q for AVX512.
13792static int matchShuffleAsElementRotate(SDValue &V1, SDValue &V2,
13793 ArrayRef<int> Mask) {
13794 int NumElts = Mask.size();
13795
13796 // We need to detect various ways of spelling a rotation:
13797 // [11, 12, 13, 14, 15, 0, 1, 2]
13798 // [-1, 12, 13, 14, -1, -1, 1, -1]
13799 // [-1, -1, -1, -1, -1, -1, 1, 2]
13800 // [ 3, 4, 5, 6, 7, 8, 9, 10]
13801 // [-1, 4, 5, 6, -1, -1, 9, -1]
13802 // [-1, 4, 5, 6, -1, -1, -1, -1]
13803 int Rotation = 0;
13804 SDValue Lo, Hi;
13805 for (int i = 0; i < NumElts; ++i) {
13806 int M = Mask[i];
13807 assert((M == SM_SentinelUndef || (0 <= M && M < (2*NumElts))) &&
13808 "Unexpected mask index.");
13809 if (M < 0)
13810 continue;
13811
13812 // Determine where a rotated vector would have started.
13813 int StartIdx = i - (M % NumElts);
13814 if (StartIdx == 0)
13815 // The identity rotation isn't interesting, stop.
13816 return -1;
13817
13818 // If we found the tail of a vector the rotation must be the missing
13819 // front. If we found the head of a vector, it must be how much of the
13820 // head.
13821 int CandidateRotation = StartIdx < 0 ? -StartIdx : NumElts - StartIdx;
13822
13823 if (Rotation == 0)
13824 Rotation = CandidateRotation;
13825 else if (Rotation != CandidateRotation)
13826 // The rotations don't match, so we can't match this mask.
13827 return -1;
13828
13829 // Compute which value this mask is pointing at.
13830 SDValue MaskV = M < NumElts ? V1 : V2;
13831
13832 // Compute which of the two target values this index should be assigned
13833 // to. This reflects whether the high elements are remaining or the low
13834 // elements are remaining.
13835 SDValue &TargetV = StartIdx < 0 ? Hi : Lo;
13836
13837 // Either set up this value if we've not encountered it before, or check
13838 // that it remains consistent.
13839 if (!TargetV)
13840 TargetV = MaskV;
13841 else if (TargetV != MaskV)
13842 // This may be a rotation, but it pulls from the inputs in some
13843 // unsupported interleaving.
13844 return -1;
13845 }
13846
13847 // Check that we successfully analyzed the mask, and normalize the results.
13848 assert(Rotation != 0 && "Failed to locate a viable rotation!");
13849 assert((Lo || Hi) && "Failed to find a rotated input vector!");
13850 if (!Lo)
13851 Lo = Hi;
13852 else if (!Hi)
13853 Hi = Lo;
13854
13855 V1 = Lo;
13856 V2 = Hi;
13857
13858 return Rotation;
13859}
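A simplified standalone sketch of the rotation detection (the helper name detectRotation is invented; it drops the Lo/Hi input bookkeeping): for the v8 mask {11, 12, 13, 14, 15, 0, 1, 2} every defined element yields the same candidate rotation, 3.

#include <vector>

// Sketch only.
static int detectRotation(const std::vector<int> &Mask) {
  int NumElts = (int)Mask.size();
  int Rotation = 0;
  for (int i = 0; i != NumElts; ++i) {
    if (Mask[i] < 0)
      continue;
    int StartIdx = i - (Mask[i] % NumElts);
    if (StartIdx == 0)
      return -1;                // identity rotation isn't interesting.
    int Candidate = StartIdx < 0 ? -StartIdx : NumElts - StartIdx;
    if (Rotation == 0)
      Rotation = Candidate;
    else if (Rotation != Candidate)
      return -1;                // inconsistent rotation amounts.
  }
  return Rotation == 0 ? -1 : Rotation;
}
// detectRotation({11, 12, 13, 14, 15, 0, 1, 2}) == 3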
13860
13861/// Try to lower a vector shuffle as a byte rotation.
13862///
13863/// SSSE3 has a generic PALIGNR instruction in x86 that will do an arbitrary
13864/// byte-rotation of the concatenation of two vectors; pre-SSSE3 can use
13865/// a PSRLDQ/PSLLDQ/POR pattern to get a similar effect. This routine will
13866 /// try to generically lower a vector shuffle through such a pattern. It
13867/// does not check for the profitability of lowering either as PALIGNR or
13868/// PSRLDQ/PSLLDQ/POR, only whether the mask is valid to lower in that form.
13869/// This matches shuffle vectors that look like:
13870///
13871/// v8i16 [11, 12, 13, 14, 15, 0, 1, 2]
13872///
13873/// Essentially it concatenates V1 and V2, shifts right by some number of
13874/// elements, and takes the low elements as the result. Note that while this is
13875/// specified as a *right shift* because x86 is little-endian, it is a *left
13876/// rotate* of the vector lanes.
13877static int matchShuffleAsByteRotate(MVT VT, SDValue &V1, SDValue &V2,
13878 ArrayRef<int> Mask) {
13879 // Don't accept any shuffles with zero elements.
13880 if (isAnyZero(Mask))
13881 return -1;
13882
13883 // PALIGNR works on 128-bit lanes.
13884 SmallVector<int, 16> RepeatedMask;
13885 if (!is128BitLaneRepeatedShuffleMask(VT, Mask, RepeatedMask))
13886 return -1;
13887
13888 int Rotation = matchShuffleAsElementRotate(V1, V2, RepeatedMask);
13889 if (Rotation <= 0)
13890 return -1;
13891
13892 // PALIGNR rotates bytes, so we need to scale the
13893 // rotation based on how many bytes are in the vector lane.
13894 int NumElts = RepeatedMask.size();
13895 int Scale = 16 / NumElts;
13896 return Rotation * Scale;
13897}
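
// Illustrative sketch (not part of X86ISelLowering.cpp): the rotation
// arithmetic used above, restated on a plain constexpr mask. It assumes a
// single 128-bit lane and skips the Lo/Hi source bookkeeping; the names are
// ad hoc. Mask values in [0, 2*N) index the concatenation V1:V2, -1 is undef.
#include <array>
#include <cstddef>

template <std::size_t N>
constexpr int matchElementRotateSketch(const std::array<int, N> &Mask) {
  int Rotation = 0;
  for (std::size_t i = 0; i != N; ++i) {
    int M = Mask[i];
    if (M < 0)
      continue;                                   // Undef matches anything.
    int StartIdx = static_cast<int>(i) - (M % static_cast<int>(N));
    if (StartIdx == 0)
      return -1;                                  // Identity rotation.
    int Candidate = StartIdx < 0 ? -StartIdx : static_cast<int>(N) - StartIdx;
    if (Rotation == 0)
      Rotation = Candidate;                       // First evidence of a rotation.
    else if (Rotation != Candidate)
      return -1;                                  // Conflicting rotations.
  }
  return Rotation;
}

// The v8i16 example from the comment above rotates by 3 elements; PALIGNR
// takes a byte amount, so the immediate is 3 * (16 / 8) = 6.
static_assert(matchElementRotateSketch(
                  std::array<int, 8>{11, 12, 13, 14, 15, 0, 1, 2}) == 3);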
13898
13899static SDValue lowerShuffleAsByteRotate(const SDLoc &DL, MVT VT, SDValue V1,
13900 SDValue V2, ArrayRef<int> Mask,
13901 const X86Subtarget &Subtarget,
13902 SelectionDAG &DAG) {
13903 assert(!isNoopShuffleMask(Mask) && "We shouldn't lower no-op shuffles!");
13904
13905 SDValue Lo = V1, Hi = V2;
13906 int ByteRotation = matchShuffleAsByteRotate(VT, Lo, Hi, Mask);
13907 if (ByteRotation <= 0)
13908 return SDValue();
13909
13910 // Cast the inputs to i8 vector of correct length to match PALIGNR or
13911 // PSLLDQ/PSRLDQ.
13912 MVT ByteVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8);
13913 Lo = DAG.getBitcast(ByteVT, Lo);
13914 Hi = DAG.getBitcast(ByteVT, Hi);
13915
13916 // SSSE3 targets can use the palignr instruction.
13917 if (Subtarget.hasSSSE3()) {
13918 assert((!VT.is512BitVector() || Subtarget.hasBWI()) &&
13919        "512-bit PALIGNR requires BWI instructions");
13920 return DAG.getBitcast(
13921 VT, DAG.getNode(X86ISD::PALIGNR, DL, ByteVT, Lo, Hi,
13922 DAG.getTargetConstant(ByteRotation, DL, MVT::i8)));
13923 }
13924
13925 assert(VT.is128BitVector() &&
13926        "Rotate-based lowering only supports 128-bit lowering!");
13927 assert(Mask.size() <= 16 &&
13928        "Can shuffle at most 16 bytes in a 128-bit vector!");
13929 assert(ByteVT == MVT::v16i8 &&
13930        "SSE2 rotate lowering only needed for v16i8!");
13931
13932 // Default SSE2 implementation
13933 int LoByteShift = 16 - ByteRotation;
13934 int HiByteShift = ByteRotation;
13935
13936 SDValue LoShift =
13937 DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Lo,
13938 DAG.getTargetConstant(LoByteShift, DL, MVT::i8));
13939 SDValue HiShift =
13940 DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v16i8, Hi,
13941 DAG.getTargetConstant(HiByteShift, DL, MVT::i8));
13942 return DAG.getBitcast(VT,
13943 DAG.getNode(ISD::OR, DL, MVT::v16i8, LoShift, HiShift));
13944}
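
// Illustrative sketch (not part of X86ISelLowering.cpp): the pre-SSSE3
// fallback above expressed on plain byte arrays, names ad hoc. PSRLDQ/PSLLDQ
// shift a whole 128-bit value by bytes (index 0 = least significant byte),
// and the OR stitches the two shifted halves together, reproducing what the
// PALIGNR path computes.
#include <array>
#include <cassert>
#include <cstdint>

using Bytes16 = std::array<std::uint8_t, 16>;

// PSRLDQ: shift toward index 0, zero-filling the top bytes.
static Bytes16 psrldq(const Bytes16 &V, int N) {
  Bytes16 R{};
  for (int i = 0; i + N < 16; ++i)
    R[i] = V[i + N];
  return R;
}

// PSLLDQ: shift toward index 15, zero-filling the bottom bytes.
static Bytes16 pslldq(const Bytes16 &V, int N) {
  Bytes16 R{};
  for (int i = N; i < 16; ++i)
    R[i] = V[i - N];
  return R;
}

int main() {
  Bytes16 Lo, Hi;
  for (int i = 0; i < 16; ++i) {
    Lo[i] = static_cast<std::uint8_t>(i);      // "V1" bytes 0..15
    Hi[i] = static_cast<std::uint8_t>(16 + i); // "V2" bytes 16..31
  }
  const int ByteRotation = 6; // e.g. a 3-element rotation of v8i16
  Bytes16 LoShift = pslldq(Lo, 16 - ByteRotation);
  Bytes16 HiShift = psrldq(Hi, ByteRotation);
  Bytes16 Result{};
  for (int i = 0; i < 16; ++i)
    Result[i] = LoShift[i] | HiShift[i];
  // The low 10 bytes come from Hi starting at byte 6, the rest from Lo.
  assert(Result[0] == 16 + 6 && Result[9] == 16 + 15 && Result[10] == 0 &&
         Result[15] == 5);
  return 0;
}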
13945
13946/// Try to lower a vector shuffle as a dword/qword rotation.
13947///
13948/// AVX512 has VALIGND/VALIGNQ instructions that will do an arbitrary
13949/// rotation of the concatenation of two vectors; this routine will
13950/// try to generically lower a vector shuffle through such a pattern.
13951///
13952/// Essentially it concatenates V1 and V2, shifts right by some number of
13953/// elements, and takes the low elements as the result. Note that while this is
13954/// specified as a *right shift* because x86 is little-endian, it is a *left
13955/// rotate* of the vector lanes.
13956static SDValue lowerShuffleAsVALIGN(const SDLoc &DL, MVT VT, SDValue V1,
13957 SDValue V2, ArrayRef<int> Mask,
13958 const X86Subtarget &Subtarget,
13959 SelectionDAG &DAG) {
13960 assert((VT.getScalarType() == MVT::i32 || VT.getScalarType() == MVT::i64) &&
13961        "Only 32-bit and 64-bit elements are supported!");
13962
13963 // 128/256-bit vectors are only supported with VLX.
13964 assert((Subtarget.hasVLX() || (!VT.is128BitVector() && !VT.is256BitVector()))
13965        && "VLX required for 128/256-bit vectors");
13966
13967 SDValue Lo = V1, Hi = V2;
13968 int Rotation = matchShuffleAsElementRotate(Lo, Hi, Mask);
13969 if (Rotation <= 0)
13970 return SDValue();
13971
13972 return DAG.getNode(X86ISD::VALIGN, DL, VT, Lo, Hi,
13973 DAG.getTargetConstant(Rotation, DL, MVT::i8));
13974}
13975
13976/// Try to lower a vector shuffle as a byte shift sequence.
13977static SDValue lowerShuffleAsByteShiftMask(const SDLoc &DL, MVT VT, SDValue V1,
13978 SDValue V2, ArrayRef<int> Mask,
13979 const APInt &Zeroable,
13980 const X86Subtarget &Subtarget,
13981 SelectionDAG &DAG) {
13982 assert(!isNoopShuffleMask(Mask) && "We shouldn't lower no-op shuffles!");
13983 assert(VT.is128BitVector() && "Only 128-bit vectors supported");
13984
13985 // We need a shuffle that has zeros at one/both ends and a sequential
13986 // shuffle from one source within.
13987 unsigned ZeroLo = Zeroable.countr_one();
13988 unsigned ZeroHi = Zeroable.countl_one();
13989 if (!ZeroLo && !ZeroHi)
13990 return SDValue();
13991
13992 unsigned NumElts = Mask.size();
13993 unsigned Len = NumElts - (ZeroLo + ZeroHi);
13994 if (!isSequentialOrUndefInRange(Mask, ZeroLo, Len, Mask[ZeroLo]))
13995 return SDValue();
13996
13997 unsigned Scale = VT.getScalarSizeInBits() / 8;
13998 ArrayRef<int> StubMask = Mask.slice(ZeroLo, Len);
13999 if (!isUndefOrInRange(StubMask, 0, NumElts) &&
14000 !isUndefOrInRange(StubMask, NumElts, 2 * NumElts))
14001 return SDValue();
14002
14003 SDValue Res = Mask[ZeroLo] < (int)NumElts ? V1 : V2;
14004 Res = DAG.getBitcast(MVT::v16i8, Res);
14005
14006 // Use VSHLDQ/VSRLDQ ops to zero the ends of a vector and leave an
14007 // inner sequential set of elements, possibly offset:
14008 // 01234567 --> zzzzzz01 --> 1zzzzzzz
14009 // 01234567 --> 4567zzzz --> zzzzz456
14010 // 01234567 --> z0123456 --> 3456zzzz --> zz3456zz
14011 if (ZeroLo == 0) {
14012 unsigned Shift = (NumElts - 1) - (Mask[ZeroLo + Len - 1] % NumElts);
14013 Res = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Res,
14014 DAG.getTargetConstant(Scale * Shift, DL, MVT::i8));
14015 Res = DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v16i8, Res,
14016 DAG.getTargetConstant(Scale * ZeroHi, DL, MVT::i8));
14017 } else if (ZeroHi == 0) {
14018 unsigned Shift = Mask[ZeroLo] % NumElts;
14019 Res = DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v16i8, Res,
14020 DAG.getTargetConstant(Scale * Shift, DL, MVT::i8));
14021 Res = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Res,
14022 DAG.getTargetConstant(Scale * ZeroLo, DL, MVT::i8));
14023 } else if (!Subtarget.hasSSSE3()) {
14024 // If we don't have PSHUFB then it's worth avoiding an AND constant mask
14025 // by performing 3 byte shifts. Shuffle combining can kick in above that.
14026 // TODO: There may be some cases where VSH{LR}DQ+PAND is still better.
14027 unsigned Shift = (NumElts - 1) - (Mask[ZeroLo + Len - 1] % NumElts);
14028 Res = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Res,
14029 DAG.getTargetConstant(Scale * Shift, DL, MVT::i8));
14030 Shift += Mask[ZeroLo] % NumElts;
14031 Res = DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v16i8, Res,
14032 DAG.getTargetConstant(Scale * Shift, DL, MVT::i8));
14033 Res = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Res,
14034 DAG.getTargetConstant(Scale * ZeroLo, DL, MVT::i8));
14035 } else
14036 return SDValue();
14037
14038 return DAG.getBitcast(VT, Res);
14039}
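
// Illustrative sketch (not part of X86ISelLowering.cpp): the ZeroHi == 0 case
// of the byte-shift-mask lowering above, on plain byte arrays with ad hoc
// names. A PSRLDQ pulls the wanted run down to index 0 and a PSLLDQ pushes it
// back up, letting the shifts zero-fill the low ZeroLo bytes.
#include <array>
#include <cassert>
#include <cstdint>

using Bytes16 = std::array<std::uint8_t, 16>;

static Bytes16 psrldq(const Bytes16 &V, int N) { // shift toward index 0
  Bytes16 R{};
  for (int i = 0; i + N < 16; ++i)
    R[i] = V[i + N];
  return R;
}

static Bytes16 pslldq(const Bytes16 &V, int N) { // shift toward index 15
  Bytes16 R{};
  for (int i = N; i < 16; ++i)
    R[i] = V[i - N];
  return R;
}

int main() {
  Bytes16 Src;
  for (int i = 0; i < 16; ++i)
    Src[i] = static_cast<std::uint8_t>(i + 1);

  // Target: bytes 0..1 zero, bytes 2..15 = Src[1..14] (ZeroLo = 2, Shift = 1).
  const int ZeroLo = 2, Shift = 1;
  Bytes16 Res = pslldq(psrldq(Src, Shift), ZeroLo);

  for (int i = 0; i < ZeroLo; ++i)
    assert(Res[i] == 0);
  for (int i = ZeroLo; i < 16; ++i)
    assert(Res[i] == Src[i - 1]);
  return 0;
}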
14040
14041/// Try to lower a vector shuffle as a bit shift (shifts in zeros).
14042///
14043/// Attempts to match a shuffle mask against the PSLL(W/D/Q/DQ) and
14044/// PSRL(W/D/Q/DQ) SSE2 and AVX2 logical bit-shift instructions. The function
14045/// matches elements from one of the input vectors shuffled to the left or
14046/// right with zeroable elements 'shifted in'. It handles both the strictly
14047/// bit-wise element shifts and the byte shift across an entire 128-bit double
14048/// quad word lane.
14049///
14050/// PSLL : (little-endian) left bit shift.
14051/// [ zz, 0, zz, 2 ]
14052/// [ -1, 4, zz, -1 ]
14053/// PSRL : (little-endian) right bit shift.
14054/// [ 1, zz, 3, zz]
14055/// [ -1, -1, 7, zz]
14056/// PSLLDQ : (little-endian) left byte shift
14057/// [ zz, 0, 1, 2, 3, 4, 5, 6]
14058/// [ zz, zz, -1, -1, 2, 3, 4, -1]
14059/// [ zz, zz, zz, zz, zz, zz, -1, 1]
14060/// PSRLDQ : (little-endian) right byte shift
14061/// [ 5, 6, 7, zz, zz, zz, zz, zz]
14062/// [ -1, 5, 6, 7, zz, zz, zz, zz]
14063/// [ 1, 2, -1, -1, -1, -1, zz, zz]
14064static int matchShuffleAsShift(MVT &ShiftVT, unsigned &Opcode,
14065 unsigned ScalarSizeInBits, ArrayRef<int> Mask,
14066 int MaskOffset, const APInt &Zeroable,
14067 const X86Subtarget &Subtarget) {
14068 int Size = Mask.size();
14069 unsigned SizeInBits = Size * ScalarSizeInBits;
14070
14071 auto CheckZeros = [&](int Shift, int Scale, bool Left) {
14072 for (int i = 0; i < Size; i += Scale)
14073 for (int j = 0; j < Shift; ++j)
14074 if (!Zeroable[i + j + (Left ? 0 : (Scale - Shift))])
14075 return false;
14076
14077 return true;
14078 };
14079
14080 auto MatchShift = [&](int Shift, int Scale, bool Left) {
14081 for (int i = 0; i != Size; i += Scale) {
14082 unsigned Pos = Left ? i + Shift : i;
14083 unsigned Low = Left ? i : i + Shift;
14084 unsigned Len = Scale - Shift;
14085 if (!isSequentialOrUndefInRange(Mask, Pos, Len, Low + MaskOffset))
14086 return -1;
14087 }
14088
14089 int ShiftEltBits = ScalarSizeInBits * Scale;
14090 bool ByteShift = ShiftEltBits > 64;
14091 Opcode = Left ? (ByteShift ? X86ISD::VSHLDQ : X86ISD::VSHLI)
14092 : (ByteShift ? X86ISD::VSRLDQ : X86ISD::VSRLI);
14093 int ShiftAmt = Shift * ScalarSizeInBits / (ByteShift ? 8 : 1);
14094
14095 // Normalize the scale for byte shifts to still produce an i64 element
14096 // type.
14097 Scale = ByteShift ? Scale / 2 : Scale;
14098
14099 // We need to round trip through the appropriate type for the shift.
14100 MVT ShiftSVT = MVT::getIntegerVT(ScalarSizeInBits * Scale);
14101 ShiftVT = ByteShift ? MVT::getVectorVT(MVT::i8, SizeInBits / 8)
14102 : MVT::getVectorVT(ShiftSVT, Size / Scale);
14103 return (int)ShiftAmt;
14104 };
14105
14106 // SSE/AVX supports logical shifts up to 64-bit integers - so we can just
14107 // keep doubling the size of the integer elements up to that. We can
14108 // then shift the elements of the integer vector by whole multiples of
14109 // their width within the elements of the larger integer vector. Test each
14110 // multiple to see if we can find a match with the moved element indices
14111 // and that the shifted in elements are all zeroable.
14112 unsigned MaxWidth = ((SizeInBits == 512) && !Subtarget.hasBWI() ? 64 : 128);
14113 for (int Scale = 2; Scale * ScalarSizeInBits <= MaxWidth; Scale *= 2)
14114 for (int Shift = 1; Shift != Scale; ++Shift)
14115 for (bool Left : {true, false})
14116 if (CheckZeros(Shift, Scale, Left)) {
14117 int ShiftAmt = MatchShift(Shift, Scale, Left);
14118 if (0 < ShiftAmt)
14119 return ShiftAmt;
14120 }
14121
14122 // no match
14123 return -1;
14124}
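
// Illustrative sketch (not part of X86ISelLowering.cpp): the same scale/shift
// search as above, restricted to the first input (MaskOffset == 0) and to a
// fixed 128-bit maximum element width; names are ad hoc. Returns the shift
// amount in bits, negated for a left shift, or 0 when nothing matches.
#include <cassert>
#include <vector>

static int matchShiftSketch(const std::vector<int> &Mask,
                            const std::vector<bool> &Zeroable,
                            int ScalarSizeInBits) {
  const int Size = static_cast<int>(Mask.size());
  for (int Scale = 2; Scale * ScalarSizeInBits <= 128; Scale *= 2)
    for (int Shift = 1; Shift != Scale; ++Shift)
      for (bool Left : {true, false}) {
        bool Ok = true;
        for (int i = 0; Ok && i < Size; i += Scale) {
          // The shifted-in positions must be zeroable...
          for (int j = 0; Ok && j < Shift; ++j)
            Ok = Zeroable[i + j + (Left ? 0 : Scale - Shift)];
          // ...and the kept positions must be sequential elements of V1.
          int Pos = Left ? i + Shift : i;
          int Low = Left ? i : i + Shift;
          for (int k = 0; Ok && k < Scale - Shift; ++k)
            Ok = Mask[Pos + k] < 0 || Mask[Pos + k] == Low + k;
        }
        if (Ok)
          return (Left ? -1 : 1) * Shift * ScalarSizeInBits;
      }
  return 0;
}

int main() {
  // The "[ 1, zz, 3, zz ]" example from the comment above: a v4i32 mask that
  // is really a 32-bit logical right shift of each 64-bit half (VSRLI).
  assert(matchShiftSketch({1, -1, 3, -1}, {false, true, false, true}, 32) ==
         32);
  // "[ zz, 0, zz, 2 ]" is the corresponding left-shift case (VSHLI).
  assert(matchShiftSketch({-1, 0, -1, 2}, {true, false, true, false}, 32) ==
         -32);
  return 0;
}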
14125
14126static SDValue lowerShuffleAsShift(const SDLoc &DL, MVT VT, SDValue V1,
14127 SDValue V2, ArrayRef<int> Mask,
14128 const APInt &Zeroable,
14129 const X86Subtarget &Subtarget,
14130 SelectionDAG &DAG, bool BitwiseOnly) {
14131 int Size = Mask.size();
14132 assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size");
14133
14134 MVT ShiftVT;
14135 SDValue V = V1;
14136 unsigned Opcode;
14137
14138 // Try to match shuffle against V1 shift.
14139 int ShiftAmt = matchShuffleAsShift(ShiftVT, Opcode, VT.getScalarSizeInBits(),
14140 Mask, 0, Zeroable, Subtarget);
14141
14142 // If V1 failed, try to match shuffle against V2 shift.
14143 if (ShiftAmt < 0) {
14144 ShiftAmt = matchShuffleAsShift(ShiftVT, Opcode, VT.getScalarSizeInBits(),
14145 Mask, Size, Zeroable, Subtarget);
14146 V = V2;
14147 }
14148
14149 if (ShiftAmt < 0)
14150 return SDValue();
14151
14152 if (BitwiseOnly && (Opcode == X86ISD::VSHLDQ || Opcode == X86ISD::VSRLDQ))
14153 return SDValue();
14154
14155 assert(DAG.getTargetLoweringInfo().isTypeLegal(ShiftVT) &&
14156        "Illegal integer vector type");
14157 V = DAG.getBitcast(ShiftVT, V);
14158 V = DAG.getNode(Opcode, DL, ShiftVT, V,
14159 DAG.getTargetConstant(ShiftAmt, DL, MVT::i8));
14160 return DAG.getBitcast(VT, V);
14161}
14162
14163// EXTRQ: Extract Len elements from lower half of source, starting at Idx.
14164// Remainder of lower half result is zero and upper half is all undef.
14165static bool matchShuffleAsEXTRQ(MVT VT, SDValue &V1, SDValue &V2,
14166 ArrayRef<int> Mask, uint64_t &BitLen,
14167 uint64_t &BitIdx, const APInt &Zeroable) {
14168 int Size = Mask.size();
14169 int HalfSize = Size / 2;
14170 assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size");
14171 assert(!Zeroable.isAllOnes() && "Fully zeroable shuffle mask");
14172
14173 // Upper half must be undefined.
14174 if (!isUndefUpperHalf(Mask))
14175 return false;
14176
14177 // Determine the extraction length from the part of the
14178 // lower half that isn't zeroable.
14179 int Len = HalfSize;
14180 for (; Len > 0; --Len)
14181 if (!Zeroable[Len - 1])
14182 break;
14183 assert(Len > 0 && "Zeroable shuffle mask");
14184
14185 // Attempt to match first Len sequential elements from the lower half.
14186 SDValue Src;
14187 int Idx = -1;
14188 for (int i = 0; i != Len; ++i) {
14189 int M = Mask[i];
14190 if (M == SM_SentinelUndef)
14191 continue;
14192 SDValue &V = (M < Size ? V1 : V2);
14193 M = M % Size;
14194
14195 // The extracted elements must start at a valid index and all mask
14196 // elements must be in the lower half.
14197 if (i > M || M >= HalfSize)
14198 return false;
14199
14200 if (Idx < 0 || (Src == V && Idx == (M - i))) {
14201 Src = V;
14202 Idx = M - i;
14203 continue;
14204 }
14205 return false;
14206 }
14207
14208 if (!Src || Idx < 0)
14209 return false;
14210
14211 assert((Idx + Len) <= HalfSize && "Illegal extraction mask");
14212 BitLen = (Len * VT.getScalarSizeInBits()) & 0x3f;
14213 BitIdx = (Idx * VT.getScalarSizeInBits()) & 0x3f;
14214 V1 = Src;
14215 return true;
14216}
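
// Illustrative sketch (not part of X86ISelLowering.cpp): how a matched
// element range maps onto the EXTRQ immediates, assuming a hypothetical v8i16
// shuffle whose low result elements are source elements 2, 3, 4 and whose
// remaining elements are zero/undef. Both fields are 6 bits wide, hence the
// & 0x3f.
#include <cstdint>

constexpr std::uint64_t extrqBitLen(int Len, int EltBits) {
  return static_cast<std::uint64_t>(Len * EltBits) & 0x3f;
}
constexpr std::uint64_t extrqBitIdx(int Idx, int EltBits) {
  return static_cast<std::uint64_t>(Idx * EltBits) & 0x3f;
}

// Three 16-bit elements starting at element 2: extract 48 bits from bit 32.
static_assert(extrqBitLen(3, 16) == 48 && extrqBitIdx(2, 16) == 32);
// A full 64-bit extraction wraps to a length field of 0.
static_assert(extrqBitLen(4, 16) == 0);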
14217
14218// INSERTQ: Extract lowest Len elements from lower half of second source and
14219// insert over first source, starting at Idx.
14220// { A[0], .., A[Idx-1], B[0], .., B[Len-1], A[Idx+Len], .., UNDEF, ... }
14221static bool matchShuffleAsINSERTQ(MVT VT, SDValue &V1, SDValue &V2,
14222 ArrayRef<int> Mask, uint64_t &BitLen,
14223 uint64_t &BitIdx) {
14224 int Size = Mask.size();
14225 int HalfSize = Size / 2;
14226 assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size");
14227
14228 // Upper half must be undefined.
14229 if (!isUndefUpperHalf(Mask))
14230 return false;
14231
14232 for (int Idx = 0; Idx != HalfSize; ++Idx) {
14233 SDValue Base;
14234
14235 // Attempt to match first source from mask before insertion point.
14236 if (isUndefInRange(Mask, 0, Idx)) {
14237 /* EMPTY */
14238 } else if (isSequentialOrUndefInRange(Mask, 0, Idx, 0)) {
14239 Base = V1;
14240 } else if (isSequentialOrUndefInRange(Mask, 0, Idx, Size)) {
14241 Base = V2;
14242 } else {
14243 continue;
14244 }
14245
14246 // Extend the extraction length looking to match both the insertion of
14247 // the second source and the remaining elements of the first.
14248 for (int Hi = Idx + 1; Hi <= HalfSize; ++Hi) {
14249 SDValue Insert;
14250 int Len = Hi - Idx;
14251
14252 // Match insertion.
14253 if (isSequentialOrUndefInRange(Mask, Idx, Len, 0)) {
14254 Insert = V1;
14255 } else if (isSequentialOrUndefInRange(Mask, Idx, Len, Size)) {
14256 Insert = V2;
14257 } else {
14258 continue;
14259 }
14260
14261 // Match the remaining elements of the lower half.
14262 if (isUndefInRange(Mask, Hi, HalfSize - Hi)) {
14263 /* EMPTY */
14264 } else if ((!Base || (Base == V1)) &&
14265 isSequentialOrUndefInRange(Mask, Hi, HalfSize - Hi, Hi)) {
14266 Base = V1;
14267 } else if ((!Base || (Base == V2)) &&
14268 isSequentialOrUndefInRange(Mask, Hi, HalfSize - Hi,
14269 Size + Hi)) {
14270 Base = V2;
14271 } else {
14272 continue;
14273 }
14274
14275 BitLen = (Len * VT.getScalarSizeInBits()) & 0x3f;
14276 BitIdx = (Idx * VT.getScalarSizeInBits()) & 0x3f;
14277 V1 = Base;
14278 V2 = Insert;
14279 return true;
14280 }
14281 }
14282
14283 return false;
14284}
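
// Illustrative sketch (not part of X86ISelLowering.cpp): the shuffle mask
// shape the INSERTQ matcher above is looking for, built explicitly for a
// hypothetical v8i16 case with Idx = 1 and Len = 2. Elements of the second
// source are numbered Size + n; the upper half stays undef.
#include <cassert>
#include <vector>

static std::vector<int> makeInsertQMask(int Size, int Idx, int Len) {
  const int HalfSize = Size / 2;
  std::vector<int> Mask(Size, -1);  // Upper half stays undef.
  for (int i = 0; i < Idx; ++i)
    Mask[i] = i;                    // Leading elements of the first source.
  for (int i = 0; i < Len; ++i)
    Mask[Idx + i] = Size + i;       // Inserted run from the second source.
  for (int i = Idx + Len; i < HalfSize; ++i)
    Mask[i] = i;                    // Remaining elements of the first source.
  return Mask;
}

int main() {
  // { A[0], B[0], B[1], A[3], undef x4 } -> BitLen = 2*16 = 32, BitIdx = 16.
  assert(makeInsertQMask(8, /*Idx=*/1, /*Len=*/2) ==
         (std::vector<int>{0, 8, 9, 3, -1, -1, -1, -1}));
  return 0;
}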
14285
14286/// Try to lower a vector shuffle using SSE4a EXTRQ/INSERTQ.
14287static SDValue lowerShuffleWithSSE4A(const SDLoc &DL, MVT VT, SDValue V1,
14288 SDValue V2, ArrayRef<int> Mask,
14289 const APInt &Zeroable, SelectionDAG &DAG) {
14290 uint64_t BitLen, BitIdx;
14291 if (matchShuffleAsEXTRQ(VT, V1, V2, Mask, BitLen, BitIdx, Zeroable))
14292 return DAG.getNode(X86ISD::EXTRQI, DL, VT, V1,
14293 DAG.getTargetConstant(BitLen, DL, MVT::i8),
14294 DAG.getTargetConstant(BitIdx, DL, MVT::i8));
14295
14296 if (matchShuffleAsINSERTQ(VT, V1, V2, Mask, BitLen, BitIdx))
14297 return DAG.getNode(X86ISD::INSERTQI, DL, VT, V1 ? V1 : DAG.getUNDEF(VT),
14298 V2 ? V2 : DAG.getUNDEF(VT),
14299 DAG.getTargetConstant(BitLen, DL, MVT::i8),
14300 DAG.getTargetConstant(BitIdx, DL, MVT::i8));
14301
14302 return SDValue();
14303}
14304
14305/// Lower a vector shuffle as a zero or any extension.
14306///
14307/// Given a specific number of elements, element bit width, and extension
14308/// stride, produce either a zero or any extension based on the available
14309/// features of the subtarget. The extended elements are consecutive and
14310/// can start from an offset element index in the input; to avoid excess
14311/// shuffling, the offset must either be in the bottom lane
14312/// or at the start of a higher lane. All extended elements must be from
14313/// the same lane.
14314static SDValue lowerShuffleAsSpecificZeroOrAnyExtend(
14315 const SDLoc &DL, MVT VT, int Scale, int Offset, bool AnyExt, SDValue InputV,
14316 ArrayRef<int> Mask, const X86Subtarget &Subtarget, SelectionDAG &DAG) {
14317 assert(Scale > 1 && "Need a scale to extend.");
14318 int EltBits = VT.getScalarSizeInBits();
14319 int NumElements = VT.getVectorNumElements();
14320 int NumEltsPerLane = 128 / EltBits;
14321 int OffsetLane = Offset / NumEltsPerLane;
14322 assert((EltBits == 8 || EltBits == 16 || EltBits == 32) &&
14323        "Only 8, 16, and 32 bit elements can be extended.");
14324 assert(Scale * EltBits <= 64 && "Cannot zero extend past 64 bits.");
14325 assert(0 <= Offset && "Extension offset must be positive.");
14326 assert((Offset < NumEltsPerLane || Offset % NumEltsPerLane == 0) &&
14327        "Extension offset must be in the first lane or start an upper lane.");
14328
14329 // Check that an index is in same lane as the base offset.
14330 auto SafeOffset = [&](int Idx) {
14331 return OffsetLane == (Idx / NumEltsPerLane);
14332 };
14333
14334 // Shift along an input so that the offset base moves to the first element.
14335 auto ShuffleOffset = [&](SDValue V) {
14336 if (!Offset)
14337 return V;
14338
14339 SmallVector<int, 8> ShMask((unsigned)NumElements, -1);
14340 for (int i = 0; i * Scale < NumElements; ++i) {
14341 int SrcIdx = i + Offset;
14342 ShMask[i] = SafeOffset(SrcIdx) ? SrcIdx : -1;
14343 }
14344 return DAG.getVectorShuffle(VT, DL, V, DAG.getUNDEF(VT), ShMask);
14345 };
14346
14347 // Found a valid any/zero extension mask! Try various lowering strategies based on the
14348 // input type and available ISA extensions.
14349 if (Subtarget.hasSSE41()) {
14350 // Not worth offsetting 128-bit vectors if scale == 2, a pattern using
14351 // PUNPCK will catch this in a later shuffle match.
14352 if (Offset && Scale == 2 && VT.is128BitVector())
14353 return SDValue();
14354 MVT ExtVT = MVT::getVectorVT(MVT::getIntegerVT(EltBits * Scale),
14355 NumElements / Scale);
14356 InputV = DAG.getBitcast(VT, InputV);
14357 InputV = ShuffleOffset(InputV);
14358 InputV = getEXTEND_VECTOR_INREG(AnyExt ? ISD::ANY_EXTEND : ISD::ZERO_EXTEND,
14359 DL, ExtVT, InputV, DAG);
14360 return DAG.getBitcast(VT, InputV);
14361 }
14362
14363 assert(VT.is128BitVector() && "Only 128-bit vectors can be extended.");
14364 InputV = DAG.getBitcast(VT, InputV);
14365
14366 // For any extends we can cheat for larger element sizes and use shuffle
14367 // instructions that can fold with a load and/or copy.
14368 if (AnyExt && EltBits == 32) {
14369 int PSHUFDMask[4] = {Offset, -1, SafeOffset(Offset + 1) ? Offset + 1 : -1,
14370 -1};
14371 return DAG.getBitcast(
14372 VT, DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32,
14373 DAG.getBitcast(MVT::v4i32, InputV),
14374 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
14375 }
14376 if (AnyExt && EltBits == 16 && Scale > 2) {
14377 int PSHUFDMask[4] = {Offset / 2, -1,
14378 SafeOffset(Offset + 1) ? (Offset + 1) / 2 : -1, -1};
14379 InputV = DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32,
14380 DAG.getBitcast(MVT::v4i32, InputV),
14381 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG));
14382 int PSHUFWMask[4] = {1, -1, -1, -1};
14383 unsigned OddEvenOp = (Offset & 1) ? X86ISD::PSHUFLW : X86ISD::PSHUFHW;
14384 return DAG.getBitcast(
14385 VT, DAG.getNode(OddEvenOp, DL, MVT::v8i16,
14386 DAG.getBitcast(MVT::v8i16, InputV),
14387 getV4X86ShuffleImm8ForMask(PSHUFWMask, DL, DAG)));
14388 }
14389
14390 // The SSE4A EXTRQ instruction can efficiently extend the first 2 lanes
14391 // to 64-bits.
14392 if ((Scale * EltBits) == 64 && EltBits < 32 && Subtarget.hasSSE4A()) {
14393 assert(NumElements == (int)Mask.size() && "Unexpected shuffle mask size!");
14394 assert(VT.is128BitVector() && "Unexpected vector width!");
14395
14396 int LoIdx = Offset * EltBits;
14397 SDValue Lo = DAG.getBitcast(
14398 MVT::v2i64, DAG.getNode(X86ISD::EXTRQI, DL, VT, InputV,
14399 DAG.getTargetConstant(EltBits, DL, MVT::i8),
14400 DAG.getTargetConstant(LoIdx, DL, MVT::i8)));
14401
14402 if (isUndefUpperHalf(Mask) || !SafeOffset(Offset + 1))
14403 return DAG.getBitcast(VT, Lo);
14404
14405 int HiIdx = (Offset + 1) * EltBits;
14406 SDValue Hi = DAG.getBitcast(
14407 MVT::v2i64, DAG.getNode(X86ISD::EXTRQI, DL, VT, InputV,
14408 DAG.getTargetConstant(EltBits, DL, MVT::i8),
14409 DAG.getTargetConstant(HiIdx, DL, MVT::i8)));
14410 return DAG.getBitcast(VT,
14411 DAG.getNode(X86ISD::UNPCKL, DL, MVT::v2i64, Lo, Hi));
14412 }
14413
14414 // If this would require more than 2 unpack instructions to expand, use
14415 // pshufb when available. We can only use more than 2 unpack instructions
14416 // when zero extending i8 elements which also makes it easier to use pshufb.
14417 if (Scale > 4 && EltBits == 8 && Subtarget.hasSSSE3()) {
14418 assert(NumElements == 16 && "Unexpected byte vector width!");
14419 SDValue PSHUFBMask[16];
14420 for (int i = 0; i < 16; ++i) {
14421 int Idx = Offset + (i / Scale);
14422 if ((i % Scale == 0 && SafeOffset(Idx))) {
14423 PSHUFBMask[i] = DAG.getConstant(Idx, DL, MVT::i8);
14424 continue;
14425 }
14426 PSHUFBMask[i] =
14427 AnyExt ? DAG.getUNDEF(MVT::i8) : DAG.getConstant(0x80, DL, MVT::i8);
14428 }
14429 InputV = DAG.getBitcast(MVT::v16i8, InputV);
14430 return DAG.getBitcast(
14431 VT, DAG.getNode(X86ISD::PSHUFB, DL, MVT::v16i8, InputV,
14432 DAG.getBuildVector(MVT::v16i8, DL, PSHUFBMask)));
14433 }
14434
14435 // If we are extending from an offset, ensure we start on a boundary that
14436 // we can unpack from.
14437 int AlignToUnpack = Offset % (NumElements / Scale);
14438 if (AlignToUnpack) {
14439 SmallVector<int, 8> ShMask((unsigned)NumElements, -1);
14440 for (int i = AlignToUnpack; i < NumElements; ++i)
14441 ShMask[i - AlignToUnpack] = i;
14442 InputV = DAG.getVectorShuffle(VT, DL, InputV, DAG.getUNDEF(VT), ShMask);
14443 Offset -= AlignToUnpack;
14444 }
14445
14446 // Otherwise emit a sequence of unpacks.
14447 do {
14448 unsigned UnpackLoHi = X86ISD::UNPCKL;
14449 if (Offset >= (NumElements / 2)) {
14450 UnpackLoHi = X86ISD::UNPCKH;
14451 Offset -= (NumElements / 2);
14452 }
14453
14454 MVT InputVT = MVT::getVectorVT(MVT::getIntegerVT(EltBits), NumElements);
14455 SDValue Ext = AnyExt ? DAG.getUNDEF(InputVT)
14456 : getZeroVector(InputVT, Subtarget, DAG, DL);
14457 InputV = DAG.getBitcast(InputVT, InputV);
14458 InputV = DAG.getNode(UnpackLoHi, DL, InputVT, InputV, Ext);
14459 Scale /= 2;
14460 EltBits *= 2;
14461 NumElements /= 2;
14462 } while (Scale > 1);
14463 return DAG.getBitcast(VT, InputV);
14464}
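
// Illustrative sketch (not part of X86ISelLowering.cpp): the unpack fallback
// above on a plain 16-byte array, names ad hoc. PUNPCKLBW with a zero
// register interleaves the low 8 source bytes with zero bytes; read back as
// eight little-endian u16 lanes, that is a zero extension. The do/while loop
// above simply repeats this at growing element widths until the requested
// scale is reached.
#include <array>
#include <cassert>
#include <cstdint>

int main() {
  std::array<std::uint8_t, 16> Src{};
  for (int i = 0; i < 16; ++i)
    Src[i] = static_cast<std::uint8_t>(0xF0 + i);

  // punpcklbw Src, zero : interleave the low 8 bytes of Src with zeros.
  std::array<std::uint8_t, 16> Unpacked{};
  for (int i = 0; i < 8; ++i) {
    Unpacked[2 * i] = Src[i];
    Unpacked[2 * i + 1] = 0;
  }

  // Reinterpreting the result as v8i16 (little-endian) gives zext(Src[i]).
  for (int i = 0; i < 8; ++i) {
    std::uint16_t Lane = static_cast<std::uint16_t>(
        Unpacked[2 * i] | (Unpacked[2 * i + 1] << 8));
    assert(Lane == Src[i]);
  }
  return 0;
}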
14465
14466/// Try to lower a vector shuffle as a zero extension on any microarch.
14467///
14468/// This routine will try to do everything in its power to cleverly lower
14469/// a shuffle which happens to match the pattern of a zero extend. It doesn't
14470/// check for the profitability of this lowering, it tries to aggressively
14471/// match this pattern. It will use all of the micro-architectural details it
14472/// can to emit an efficient lowering. It handles both blends with all-zero
14473/// inputs to explicitly zero-extend and undef-lanes (sometimes undef due to
14474/// masking out later).
14475///
14476/// The reason we have dedicated lowering for zext-style shuffles is that they
14477/// are both incredibly common and often quite performance sensitive.
14478static SDValue lowerShuffleAsZeroOrAnyExtend(
14479 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
14480 const APInt &Zeroable, const X86Subtarget &Subtarget,
14481 SelectionDAG &DAG) {
14482 int Bits = VT.getSizeInBits();
14483 int NumLanes = Bits / 128;
14484 int NumElements = VT.getVectorNumElements();
14485 int NumEltsPerLane = NumElements / NumLanes;
14486 assert(VT.getScalarSizeInBits() <= 32 &&
14487        "Exceeds 32-bit integer zero extension limit");
14488 assert((int)Mask.size() == NumElements && "Unexpected shuffle mask size");
14489
14490 // Define a helper function to check a particular ext-scale and lower to it if
14491 // valid.
14492 auto Lower = [&](int Scale) -> SDValue {
14493 SDValue InputV;
14494 bool AnyExt = true;
14495 int Offset = 0;
14496 int Matches = 0;
14497 for (int i = 0; i < NumElements; ++i) {
14498 int M = Mask[i];
14499 if (M < 0)
14500 continue; // Valid anywhere but doesn't tell us anything.
14501 if (i % Scale != 0) {
14502 // Each of the extended elements needs to be zeroable.
14503 if (!Zeroable[i])
14504 return SDValue();
14505
14506 // We no longer are in the anyext case.
14507 AnyExt = false;
14508 continue;
14509 }
14510
14511 // The base elements need to be consecutive indices into the
14512 // same input vector.
14513 SDValue V = M < NumElements ? V1 : V2;
14514 M = M % NumElements;
14515 if (!InputV) {
14516 InputV = V;
14517 Offset = M - (i / Scale);
14518 } else if (InputV != V)
14519 return SDValue(); // Flip-flopping inputs.
14520
14521 // Offset must start in the lowest 128-bit lane or at the start of an
14522 // upper lane.
14523 // FIXME: Is it ever worth allowing a negative base offset?
14524 if (!((0 <= Offset && Offset < NumEltsPerLane) ||
14525 (Offset % NumEltsPerLane) == 0))
14526 return SDValue();
14527
14528 // If we are offsetting, all referenced entries must come from the same
14529 // lane.
14530 if (Offset && (Offset / NumEltsPerLane) != (M / NumEltsPerLane))
14531 return SDValue();
14532
14533 if ((M % NumElements) != (Offset + (i / Scale)))
14534 return SDValue(); // Non-consecutive strided elements.
14535 Matches++;
14536 }
14537
14538 // If we fail to find an input, we have a zero-shuffle which should always
14539 // have already been handled.
14540 // FIXME: Maybe handle this here in case during blending we end up with one?
14541 if (!InputV)
14542 return SDValue();
14543
14544 // If we are offsetting, don't extend if we only match a single input; we
14545 // can always do better by using a basic PSHUF or PUNPCK.
14546 if (Offset != 0 && Matches < 2)
14547 return SDValue();
14548
14549 return lowerShuffleAsSpecificZeroOrAnyExtend(DL, VT, Scale, Offset, AnyExt,
14550 InputV, Mask, Subtarget, DAG);
14551 };
14552
14553 // The widest scale possible for extending is to a 64-bit integer.
14554 assert(Bits % 64 == 0 &&
14555        "The number of bits in a vector must be divisible by 64 on x86!");
14556 int NumExtElements = Bits / 64;
14557
14558 // Each iteration, try extending the elements half as much, but into twice as
14559 // many elements.
14560 for (; NumExtElements < NumElements; NumExtElements *= 2) {
14561 assert(NumElements % NumExtElements == 0 &&
14562        "The input vector size must be divisible by the extended size.");
14563 if (SDValue V = Lower(NumElements / NumExtElements))
14564 return V;
14565 }
14566
14567 // General extends failed, but 128-bit vectors may be able to use MOVQ.
14568 if (Bits != 128)
14569 return SDValue();
14570
14571 // Returns one of the source operands if the shuffle can be reduced to a
14572 // MOVQ, copying the lower 64-bits and zero-extending to the upper 64-bits.
14573 auto CanZExtLowHalf = [&]() {
14574 for (int i = NumElements / 2; i != NumElements; ++i)
14575 if (!Zeroable[i])
14576 return SDValue();
14577 if (isSequentialOrUndefInRange(Mask, 0, NumElements / 2, 0))
14578 return V1;
14579 if (isSequentialOrUndefInRange(Mask, 0, NumElements / 2, NumElements))
14580 return V2;
14581 return SDValue();
14582 };
14583
14584 if (SDValue V = CanZExtLowHalf()) {
14585 V = DAG.getBitcast(MVT::v2i64, V);
14586 V = DAG.getNode(X86ISD::VZEXT_MOVL, DL, MVT::v2i64, V);
14587 return DAG.getBitcast(VT, V);
14588 }
14589
14590 // No viable ext lowering found.
14591 return SDValue();
14592}
14593
14594/// Try to get a scalar value for a specific element of a vector.
14595///
14596/// Looks through BUILD_VECTOR and SCALAR_TO_VECTOR nodes to find a scalar.
14597static SDValue getScalarValueForVectorElement(SDValue V, int Idx,
14598 SelectionDAG &DAG) {
14599 MVT VT = V.getSimpleValueType();
14600 MVT EltVT = VT.getVectorElementType();
14601 V = peekThroughBitcasts(V);
14602
14603 // If the bitcasts shift the element size, we can't extract an equivalent
14604 // element from it.
14605 MVT NewVT = V.getSimpleValueType();
14606 if (!NewVT.isVector() || NewVT.getScalarSizeInBits() != VT.getScalarSizeInBits())
14607 return SDValue();
14608
14609 if (V.getOpcode() == ISD::BUILD_VECTOR ||
14610 (Idx == 0 && V.getOpcode() == ISD::SCALAR_TO_VECTOR)) {
14611 // Ensure the scalar operand is the same size as the destination.
14612 // FIXME: Add support for scalar truncation where possible.
14613 SDValue S = V.getOperand(Idx);
14614 if (EltVT.getSizeInBits() == S.getSimpleValueType().getSizeInBits())
14615 return DAG.getBitcast(EltVT, S);
14616 }
14617
14618 return SDValue();
14619}
14620
14621/// Helper to test for a load that can be folded with x86 shuffles.
14622///
14623/// This is particularly important because the set of instructions varies
14624/// significantly based on whether the operand is a load or not.
14625static bool isShuffleFoldableLoad(SDValue V) {
14626 return V->hasOneUse() &&
14627 ISD::isNON_EXTLoad(peekThroughOneUseBitcasts(V).getNode());
14628}
14629
14630template<typename T>
14631static bool isSoftFP16(T VT, const X86Subtarget &Subtarget) {
14632 return VT.getScalarType() == MVT::f16 && !Subtarget.hasFP16();
14633}
14634
14635template<typename T>
14636bool X86TargetLowering::isSoftFP16(T VT) const {
14637 return ::isSoftFP16(VT, Subtarget);
14638}
14639
14640/// Try to lower insertion of a single element into a zero vector.
14641///
14642/// This is a common pattern for which we have especially efficient lowerings
14643/// across all subtarget feature sets.
14644static SDValue lowerShuffleAsElementInsertion(
14645 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
14646 const APInt &Zeroable, const X86Subtarget &Subtarget,
14647 SelectionDAG &DAG) {
14648 MVT ExtVT = VT;
14649 MVT EltVT = VT.getVectorElementType();
14650
14651 if (isSoftFP16(EltVT, Subtarget))
14652 return SDValue();
14653
14654 int V2Index =
14655 find_if(Mask, [&Mask](int M) { return M >= (int)Mask.size(); }) -
14656 Mask.begin();
14657 bool IsV1Zeroable = true;
14658 for (int i = 0, Size = Mask.size(); i < Size; ++i)
14659 if (i != V2Index && !Zeroable[i]) {
14660 IsV1Zeroable = false;
14661 break;
14662 }
14663
14664 // Check for a single input from a SCALAR_TO_VECTOR node.
14665 // FIXME: All of this should be canonicalized into INSERT_VECTOR_ELT and
14666 // all the smarts here sunk into that routine. However, the current
14667 // lowering of BUILD_VECTOR makes that nearly impossible until the old
14668 // vector shuffle lowering is dead.
14669 SDValue V2S = getScalarValueForVectorElement(V2, Mask[V2Index] - Mask.size(),
14670 DAG);
14671 if (V2S && DAG.getTargetLoweringInfo().isTypeLegal(V2S.getValueType())) {
14672 // We need to zext the scalar if it is smaller than an i32.
14673 V2S = DAG.getBitcast(EltVT, V2S);
14674 if (EltVT == MVT::i8 || (EltVT == MVT::i16 && !Subtarget.hasFP16())) {
14675 // Using zext to expand a narrow element won't work for non-zero
14676 // insertions.
14677 if (!IsV1Zeroable)
14678 return SDValue();
14679
14680 // Zero-extend directly to i32.
14681 ExtVT = MVT::getVectorVT(MVT::i32, ExtVT.getSizeInBits() / 32);
14682 V2S = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, V2S);
14683 }
14684 V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, ExtVT, V2S);
14685 } else if (Mask[V2Index] != (int)Mask.size() || EltVT == MVT::i8 ||
14686 EltVT == MVT::i16) {
14687 // Either not inserting from the low element of the input or the input
14688 // element size is too small to use VZEXT_MOVL to clear the high bits.
14689 return SDValue();
14690 }
14691
14692 if (!IsV1Zeroable) {
14693 // If V1 can't be treated as a zero vector we have fewer options to lower
14694 // this. We can't support integer vectors or non-zero targets cheaply, and
14695 // the V1 elements can't be permuted in any way.
14696 assert(VT == ExtVT && "Cannot change extended type when non-zeroable!");
14697 if (!VT.isFloatingPoint() || V2Index != 0)
14698 return SDValue();
14699 SmallVector<int, 8> V1Mask(Mask);
14700 V1Mask[V2Index] = -1;
14701 if (!isNoopShuffleMask(V1Mask))
14702 return SDValue();
14703 if (!VT.is128BitVector())
14704 return SDValue();
14705
14706 // Otherwise, use MOVSD, MOVSS or MOVSH.
14707 unsigned MovOpc = 0;
14708 if (EltVT == MVT::f16)
14709 MovOpc = X86ISD::MOVSH;
14710 else if (EltVT == MVT::f32)
14711 MovOpc = X86ISD::MOVSS;
14712 else if (EltVT == MVT::f64)
14713 MovOpc = X86ISD::MOVSD;
14714 else
14715 llvm_unreachable("Unsupported floating point element type to handle!");
14716 return DAG.getNode(MovOpc, DL, ExtVT, V1, V2);
14717 }
14718
14719 // This lowering only works for the low element with floating point vectors.
14720 if (VT.isFloatingPoint() && V2Index != 0)
14721 return SDValue();
14722
14723 V2 = DAG.getNode(X86ISD::VZEXT_MOVL, DL, ExtVT, V2);
14724 if (ExtVT != VT)
14725 V2 = DAG.getBitcast(VT, V2);
14726
14727 if (V2Index != 0) {
14728 // If we have 4 or fewer lanes we can cheaply shuffle the element into
14729 // the desired position. Otherwise it is more efficient to do a vector
14730 // shift left. We know that we can do a vector shift left because all
14731 // the inputs are zero.
14732 if (VT.isFloatingPoint() || VT.getVectorNumElements() <= 4) {
14733 SmallVector<int, 4> V2Shuffle(Mask.size(), 1);
14734 V2Shuffle[V2Index] = 0;
14735 V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Shuffle);
14736 } else {
14737 V2 = DAG.getBitcast(MVT::v16i8, V2);
14738 V2 = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, V2,
14739 DAG.getTargetConstant(
14740 V2Index * EltVT.getSizeInBits() / 8, DL, MVT::i8));
14741 V2 = DAG.getBitcast(VT, V2);
14742 }
14743 }
14744 return V2;
14745}
14746
14747/// Try to lower broadcast of a single - truncated - integer element,
14748/// coming from a scalar_to_vector/build_vector node \p V0 with larger elements.
14749///
14750/// This assumes we have AVX2.
14751static SDValue lowerShuffleAsTruncBroadcast(const SDLoc &DL, MVT VT, SDValue V0,
14752 int BroadcastIdx,
14753 const X86Subtarget &Subtarget,
14754 SelectionDAG &DAG) {
14755 assert(Subtarget.hasAVX2() &&
14756        "We can only lower integer broadcasts with AVX2!");
14757
14758 MVT EltVT = VT.getVectorElementType();
14759 MVT V0VT = V0.getSimpleValueType();
14760
14761 assert(VT.isInteger() && "Unexpected non-integer trunc broadcast!");
14762 assert(V0VT.isVector() && "Unexpected non-vector vector-sized value!");
14763
14764 MVT V0EltVT = V0VT.getVectorElementType();
14765 if (!V0EltVT.isInteger())
14766 return SDValue();
14767
14768 const unsigned EltSize = EltVT.getSizeInBits();
14769 const unsigned V0EltSize = V0EltVT.getSizeInBits();
14770
14771 // This is only a truncation if the original element type is larger.
14772 if (V0EltSize <= EltSize)
14773 return SDValue();
14774
14775 assert(((V0EltSize % EltSize) == 0) &&
14776        "Scalar type sizes must all be powers of 2 on x86!");
14777
14778 const unsigned V0Opc = V0.getOpcode();
14779 const unsigned Scale = V0EltSize / EltSize;
14780 const unsigned V0BroadcastIdx = BroadcastIdx / Scale;
14781
14782 if ((V0Opc != ISD::SCALAR_TO_VECTOR || V0BroadcastIdx != 0) &&
14783 V0Opc != ISD::BUILD_VECTOR)
14784 return SDValue();
14785
14786 SDValue Scalar = V0.getOperand(V0BroadcastIdx);
14787
14788 // If we're extracting non-least-significant bits, shift so we can truncate.
14789 // Hopefully, we can fold away the trunc/srl/load into the broadcast.
14790 // Even if we can't (and !isShuffleFoldableLoad(Scalar)), prefer
14791 // vpbroadcast+vmovd+shr to vpshufb(m)+vmovd.
14792 if (const int OffsetIdx = BroadcastIdx % Scale)
14793 Scalar = DAG.getNode(ISD::SRL, DL, Scalar.getValueType(), Scalar,
14794 DAG.getConstant(OffsetIdx * EltSize, DL, MVT::i8));
14795
14796 return DAG.getNode(X86ISD::VBROADCAST, DL, VT,
14797 DAG.getNode(ISD::TRUNCATE, DL, EltVT, Scalar));
14798}
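
// Illustrative sketch (not part of X86ISelLowering.cpp): the shift-then-
// truncate step above on plain integers, names ad hoc. Broadcasting the i16
// element at index 3 of a hypothetical build_vector of i64s means taking
// bits [48, 64) of the i64 that covers it: Scale = 64 / 16 = 4,
// V0BroadcastIdx = 3 / 4 = 0, OffsetIdx = 3 % 4 = 3.
#include <cstdint>

constexpr std::uint16_t truncatedBroadcastElt(std::uint64_t WideElt,
                                              unsigned OffsetIdx,
                                              unsigned EltSize) {
  // Shift the wanted bits down to the LSB, then truncate to the element type.
  return static_cast<std::uint16_t>(WideElt >> (OffsetIdx * EltSize));
}

// 0x1111222233334444: the i16 elements are 0x4444, 0x3333, 0x2222, 0x1111
// from least to most significant, so OffsetIdx == 3 selects 0x1111.
static_assert(truncatedBroadcastElt(0x1111222233334444ULL, 3, 16) == 0x1111);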
14799
14800/// Test whether this can be lowered with a single SHUFPS instruction.
14801///
14802/// This is used to disable more specialized lowerings when the shufps lowering
14803/// will happen to be efficient.
14804static bool isSingleSHUFPSMask(ArrayRef<int> Mask) {
14805 // This routine only handles 128-bit shufps.
14806 assert(Mask.size() == 4 && "Unsupported mask size!");
14807 assert(Mask[0] >= -1 && Mask[0] < 8 && "Out of bound mask element!");
14808 assert(Mask[1] >= -1 && Mask[1] < 8 && "Out of bound mask element!");
14809 assert(Mask[2] >= -1 && Mask[2] < 8 && "Out of bound mask element!");
14810 assert(Mask[3] >= -1 && Mask[3] < 8 && "Out of bound mask element!");
14811
14812 // To lower with a single SHUFPS we need to have the low half and high half
14813 // each requiring a single input.
14814 if (Mask[0] >= 0 && Mask[1] >= 0 && (Mask[0] < 4) != (Mask[1] < 4))
14815 return false;
14816 if (Mask[2] >= 0 && Mask[3] >= 0 && (Mask[2] < 4) != (Mask[3] < 4))
14817 return false;
14818
14819 return true;
14820}
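
// Illustrative sketch (not part of X86ISelLowering.cpp): the constraint the
// check above encodes, names ad hoc. SHUFPS takes its two low result lanes
// from one operand and its two high result lanes from the other, so each
// half of the 4-element mask may reference only a single input (values 0-3
// = V1, 4-7 = V2, -1 = undef).
#include <array>

constexpr bool isSingleSHUFPSMaskSketch(const std::array<int, 4> &Mask) {
  if (Mask[0] >= 0 && Mask[1] >= 0 && (Mask[0] < 4) != (Mask[1] < 4))
    return false;
  if (Mask[2] >= 0 && Mask[3] >= 0 && (Mask[2] < 4) != (Mask[3] < 4))
    return false;
  return true;
}

static_assert(isSingleSHUFPSMaskSketch({0, 3, 6, 5}));  // lo from V1, hi from V2
static_assert(!isSingleSHUFPSMaskSketch({0, 4, 1, 5})); // lo half mixes inputs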
14821
14822/// Test whether the specified input (0 or 1) is in-place blended by the
14823/// given mask.
14824///
14825/// This returns true if the elements from a particular input are already in the
14826/// slot required by the given mask and require no permutation.
14827static bool isShuffleMaskInputInPlace(int Input, ArrayRef<int> Mask) {
14828 assert((Input == 0 || Input == 1) && "Only two inputs to shuffles.");
14829 int Size = Mask.size();
14830 for (int i = 0; i < Size; ++i)
14831 if (Mask[i] >= 0 && Mask[i] / Size == Input && Mask[i] % Size != i)
14832 return false;
14833
14834 return true;
14835}
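
// Illustrative sketch (not part of X86ISelLowering.cpp): what "in place"
// means for the check above on a 4-element mask, names ad hoc. Input 0 covers
// mask values 0-3 and input 1 covers 4-7; an input is in place when each of
// its referenced elements already sits at its own index.
#include <array>

constexpr bool isInputInPlaceSketch(int Input, const std::array<int, 4> &Mask) {
  const int Size = 4;
  for (int i = 0; i < Size; ++i)
    if (Mask[i] >= 0 && Mask[i] / Size == Input && Mask[i] % Size != i)
      return false;
  return true;
}

// In {0, 5, 2, 7}, V1's elements 0 and 2 are already in place, and so are
// V2's elements 1 and 3 (mask values 5 and 7).
static_assert(isInputInPlaceSketch(0, {0, 5, 2, 7}) &&
              isInputInPlaceSketch(1, {0, 5, 2, 7}));
static_assert(!isInputInPlaceSketch(1, {0, 4, 2, 7})); // value 4 belongs at 0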
14836
14837/// If we are extracting two 128-bit halves of a vector and shuffling the
14838/// result, match that to a 256-bit AVX2 vperm* instruction to avoid a
14839/// multi-shuffle lowering.
14840static SDValue lowerShuffleOfExtractsAsVperm(const SDLoc &DL, SDValue N0,
14841 SDValue N1, ArrayRef<int> Mask,
14842 SelectionDAG &DAG) {
14843 MVT VT = N0.getSimpleValueType();
14844 assert((VT.is128BitVector() &&
14845         (VT.getScalarSizeInBits() == 32 || VT.getScalarSizeInBits() == 64)) &&
14846        "VPERM* family of shuffles requires 32-bit or 64-bit elements");
14847
14848 // Check that both sources are extracts of the same source vector.
14849 if (N0.getOpcode() != ISD::EXTRACT_SUBVECTOR ||
14850 N1.getOpcode() != ISD::EXTRACT_SUBVECTOR ||
14851 N0.getOperand(0) != N1.getOperand(0) ||
14852 !N0.hasOneUse() || !N1.hasOneUse())
14853 return SDValue();
14854
14855 SDValue WideVec = N0.getOperand(0);
14856 MVT WideVT = WideVec.getSimpleValueType();
14857 if (!WideVT.is256BitVector())
14858 return SDValue();
14859
14860 // Match extracts of each half of the wide source vector. Commute the shuffle
14861 // if the extract of the low half is N1.
14862 unsigned NumElts = VT.getVectorNumElements();
14863 SmallVector<int, 4> NewMask(Mask);
14864 const APInt &ExtIndex0 = N0.getConstantOperandAPInt(1);
14865 const APInt &ExtIndex1 = N1.getConstantOperandAPInt(1);
14866 if (ExtIndex1 == 0 && ExtIndex0 == NumElts)
14867 ShuffleVectorSDNode::commuteMask(NewMask);
14868 else if (ExtIndex0 != 0 || ExtIndex1 != NumElts)
14869 return SDValue();
14870
14871 // Final bailout: if the mask is simple, we are better off using an extract
14872 // and a simple narrow shuffle. Prefer extract+unpack(h/l)ps to vpermps
14873 // because that avoids a constant load from memory.
14874 if (NumElts == 4 &&
14875 (isSingleSHUFPSMask(NewMask) || is128BitUnpackShuffleMask(NewMask, DAG)))
14876 return SDValue();
14877
14878 // Extend the shuffle mask with undef elements.
14879 NewMask.append(NumElts, -1);
14880
14881 // shuf (extract X, 0), (extract X, 4), M --> extract (shuf X, undef, M'), 0
14882 SDValue Shuf = DAG.getVectorShuffle(WideVT, DL, WideVec, DAG.getUNDEF(WideVT),
14883 NewMask);
14884 // This is free: ymm -> xmm.
14885 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Shuf,
14886 DAG.getIntPtrConstant(0, DL));
14887}
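// [Editorial sketch, not part of the annotated X86ISelLowering.cpp source.]
// Shows how the wide mask M' from the comment above is formed once the low
// half is known to be N0; the helper name is hypothetical.
#include <vector>
static std::vector<int> buildWideVpermMask(const std::vector<int> &NarrowMask) {
  // The narrow indices already address the wide vector: 0..NumElts-1 pick the
  // low half and NumElts..2*NumElts-1 pick the high half, so we only need to
  // pad with undef (-1) lanes, mirroring NewMask.append(NumElts, -1) above.
  std::vector<int> Wide(NarrowMask);
  Wide.insert(Wide.end(), NarrowMask.size(), -1);
  return Wide;
}
// Example (v4f32 halves of a v8f32): a narrow mask {0, 2, 4, 6} becomes
// {0, 2, 4, 6, -1, -1, -1, -1} on the 256-bit source, and the result is
// EXTRACT_SUBVECTOR of that wide shuffle at index 0.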
14888
14889/// Try to lower broadcast of a single element.
14890///
14891/// For convenience, this code also bundles all of the subtarget feature set
14892/// filtering. While a little annoying to re-dispatch on type here, there isn't
14893/// a convenient way to factor it out.
14894static SDValue lowerShuffleAsBroadcast(const SDLoc &DL, MVT VT, SDValue V1,
14895 SDValue V2, ArrayRef<int> Mask,
14896 const X86Subtarget &Subtarget,
14897 SelectionDAG &DAG) {
14898 if (!((Subtarget.hasSSE3() && VT == MVT::v2f64) ||
14899 (Subtarget.hasAVX() && VT.isFloatingPoint()) ||
14900 (Subtarget.hasAVX2() && VT.isInteger())))
14901 return SDValue();
14902
14903 // With MOVDDUP (v2f64) we can broadcast from a register or a load, otherwise
14904 // we can only broadcast from a register with AVX2.
14905 unsigned NumEltBits = VT.getScalarSizeInBits();
14906 unsigned Opcode = (VT == MVT::v2f64 && !Subtarget.hasAVX2())
14907 ? X86ISD::MOVDDUP
14908 : X86ISD::VBROADCAST;
14909 bool BroadcastFromReg = (Opcode == X86ISD::MOVDDUP) || Subtarget.hasAVX2();
14910
14911 // Check that the mask is a broadcast.
14912 int BroadcastIdx = getSplatIndex(Mask);
14913 if (BroadcastIdx < 0)
14914 return SDValue();
14915 assert(BroadcastIdx < (int)Mask.size() && "We only expect to be called with "
14916 "a sorted mask where the broadcast "
14917 "comes from V1.");
14918
14919 // Go up the chain of (vector) values to find a scalar load that we can
14920 // combine with the broadcast.
14921 // TODO: Combine this logic with findEltLoadSrc() used by
14922 // EltsFromConsecutiveLoads().
14923 int BitOffset = BroadcastIdx * NumEltBits;
14924 SDValue V = V1;
14925 for (;;) {
14926 switch (V.getOpcode()) {
14927 case ISD::BITCAST: {
14928 V = V.getOperand(0);
14929 continue;
14930 }
14931 case ISD::CONCAT_VECTORS: {
14932 int OpBitWidth = V.getOperand(0).getValueSizeInBits();
14933 int OpIdx = BitOffset / OpBitWidth;
14934 V = V.getOperand(OpIdx);
14935 BitOffset %= OpBitWidth;
14936 continue;
14937 }
14938 case ISD::EXTRACT_SUBVECTOR: {
14939 // The extraction index adds to the existing offset.
14940 unsigned EltBitWidth = V.getScalarValueSizeInBits();
14941 unsigned Idx = V.getConstantOperandVal(1);
14942 unsigned BeginOffset = Idx * EltBitWidth;
14943 BitOffset += BeginOffset;
14944 V = V.getOperand(0);
14945 continue;
14946 }
14947 case ISD::INSERT_SUBVECTOR: {
14948 SDValue VOuter = V.getOperand(0), VInner = V.getOperand(1);
14949 int EltBitWidth = VOuter.getScalarValueSizeInBits();
14950 int Idx = (int)V.getConstantOperandVal(2);
14951 int NumSubElts = (int)VInner.getSimpleValueType().getVectorNumElements();
14952 int BeginOffset = Idx * EltBitWidth;
14953 int EndOffset = BeginOffset + NumSubElts * EltBitWidth;
14954 if (BeginOffset <= BitOffset && BitOffset < EndOffset) {
14955 BitOffset -= BeginOffset;
14956 V = VInner;
14957 } else {
14958 V = VOuter;
14959 }
14960 continue;
14961 }
14962 }
14963 break;
14964 }
14965 assert((BitOffset % NumEltBits) == 0 && "Illegal bit-offset");
14966 BroadcastIdx = BitOffset / NumEltBits;
14967
14968 // Do we need to bitcast the source to retrieve the original broadcast index?
14969 bool BitCastSrc = V.getScalarValueSizeInBits() != NumEltBits;
14970
14971 // Check if this is a broadcast of a scalar. We special case lowering
14972 // for scalars so that we can more effectively fold with loads.
14973 // If the original value has a larger element type than the shuffle, the
14974 // broadcast element is in essence truncated. Make that explicit to ease
14975 // folding.
14976 if (BitCastSrc && VT.isInteger())
14977 if (SDValue TruncBroadcast = lowerShuffleAsTruncBroadcast(
14978 DL, VT, V, BroadcastIdx, Subtarget, DAG))
14979 return TruncBroadcast;
14980
14981 // Also check the simpler case, where we can directly reuse the scalar.
14982 if (!BitCastSrc &&
14983 ((V.getOpcode() == ISD::BUILD_VECTOR && V.hasOneUse()) ||
14984 (V.getOpcode() == ISD::SCALAR_TO_VECTOR && BroadcastIdx == 0))) {
14985 V = V.getOperand(BroadcastIdx);
14986
14987 // If we can't broadcast from a register, check that the input is a load.
14988 if (!BroadcastFromReg && !isShuffleFoldableLoad(V))
14989 return SDValue();
14990 } else if (ISD::isNormalLoad(V.getNode()) &&
14991 cast<LoadSDNode>(V)->isSimple()) {
14992 // We do not check for one-use of the vector load because a broadcast load
14993 // is expected to be a win for code size, register pressure, and possibly
14994 // uops even if the original vector load is not eliminated.
14995
14996 // Reduce the vector load and shuffle to a broadcasted scalar load.
14997 LoadSDNode *Ld = cast<LoadSDNode>(V);
14998 SDValue BaseAddr = Ld->getOperand(1);
14999 MVT SVT = VT.getScalarType();
15000 unsigned Offset = BroadcastIdx * SVT.getStoreSize();
15001 assert((int)(Offset * 8) == BitOffset && "Unexpected bit-offset");
15002 SDValue NewAddr =
15003 DAG.getMemBasePlusOffset(BaseAddr, TypeSize::Fixed(Offset), DL);
15004
15005 // Directly form VBROADCAST_LOAD if we're using VBROADCAST opcode rather
15006 // than MOVDDUP.
15007 // FIXME: Should we add VBROADCAST_LOAD isel patterns for pre-AVX?
15008 if (Opcode == X86ISD::VBROADCAST) {
15009 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
15010 SDValue Ops[] = {Ld->getChain(), NewAddr};
15011 V = DAG.getMemIntrinsicNode(
15012 X86ISD::VBROADCAST_LOAD, DL, Tys, Ops, SVT,
15013 DAG.getMachineFunction().getMachineMemOperand(
15014 Ld->getMemOperand(), Offset, SVT.getStoreSize()));
15015 DAG.makeEquivalentMemoryOrdering(Ld, V);
15016 return DAG.getBitcast(VT, V);
15017 }
15018 assert(SVT == MVT::f64 && "Unexpected VT!");
15019 V = DAG.getLoad(SVT, DL, Ld->getChain(), NewAddr,
15020 DAG.getMachineFunction().getMachineMemOperand(
15021 Ld->getMemOperand(), Offset, SVT.getStoreSize()));
15022 DAG.makeEquivalentMemoryOrdering(Ld, V);
15023 } else if (!BroadcastFromReg) {
15024 // We can't broadcast from a vector register.
15025 return SDValue();
15026 } else if (BitOffset != 0) {
15027 // We can only broadcast from the zero-element of a vector register,
15028 // but it can be advantageous to broadcast from the zero-element of a
15029 // subvector.
15030 if (!VT.is256BitVector() && !VT.is512BitVector())
15031 return SDValue();
15032
15033 // VPERMQ/VPERMPD can perform the cross-lane shuffle directly.
15034 if (VT == MVT::v4f64 || VT == MVT::v4i64)
15035 return SDValue();
15036
15037 // Only broadcast the zero-element of a 128-bit subvector.
15038 if ((BitOffset % 128) != 0)
15039 return SDValue();
15040
15041 assert((BitOffset % V.getScalarValueSizeInBits()) == 0 &&
15042 "Unexpected bit-offset");
15043 assert((V.getValueSizeInBits() == 256 || V.getValueSizeInBits() == 512) &&
15044 "Unexpected vector size");
15045 unsigned ExtractIdx = BitOffset / V.getScalarValueSizeInBits();
15046 V = extract128BitVector(V, ExtractIdx, DAG, DL);
15047 }
15048
15049 // On AVX we can use VBROADCAST directly for scalar sources.
15050 if (Opcode == X86ISD::MOVDDUP && !V.getValueType().isVector()) {
15051 V = DAG.getBitcast(MVT::f64, V);
15052 if (Subtarget.hasAVX()) {
15053 V = DAG.getNode(X86ISD::VBROADCAST, DL, MVT::v2f64, V);
15054 return DAG.getBitcast(VT, V);
15055 }
15056 V = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64, V);
15057 }
15058
15059 // If this is a scalar, do the broadcast on this type and bitcast.
15060 if (!V.getValueType().isVector()) {
15061 assert(V.getScalarValueSizeInBits() == NumEltBits &&
15062 "Unexpected scalar size");
15063 MVT BroadcastVT = MVT::getVectorVT(V.getSimpleValueType(),
15064 VT.getVectorNumElements());
15065 return DAG.getBitcast(VT, DAG.getNode(Opcode, DL, BroadcastVT, V));
15066 }
15067
15068 // We only support broadcasting from 128-bit vectors to minimize the
15069 // number of patterns we need to deal with in isel. So extract down to
15070 // 128-bits, removing as many bitcasts as possible.
15071 if (V.getValueSizeInBits() > 128)
15072 V = extract128BitVector(peekThroughBitcasts(V), 0, DAG, DL);
15073
15074 // Otherwise cast V to a vector with the same element type as VT, but
15075 // possibly narrower than VT. Then perform the broadcast.
15076 unsigned NumSrcElts = V.getValueSizeInBits() / NumEltBits;
15077 MVT CastVT = MVT::getVectorVT(VT.getVectorElementType(), NumSrcElts);
15078 return DAG.getNode(Opcode, DL, VT, DAG.getBitcast(CastVT, V));
15079}
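// [Editorial sketch, not part of the annotated X86ISelLowering.cpp source.]
// Models only the bit-offset arithmetic performed by the walk above; the DAG
// nodes are abstracted away and the helper names are hypothetical.
static int concatVectorsStep(int BitOffset, int OpBitWidth, int &OpIdx) {
  // CONCAT_VECTORS: select the operand holding the offset, keep the remainder.
  OpIdx = BitOffset / OpBitWidth;
  return BitOffset % OpBitWidth;
}
static int extractSubvectorStep(int BitOffset, int EltBitWidth, int ExtractIdx) {
  // EXTRACT_SUBVECTOR: the extraction index adds to the existing offset.
  return BitOffset + ExtractIdx * EltBitWidth;
}
// Example: broadcasting element 5 of a v8i32 gives BitOffset = 160; if that
// vector was concat(v4i32 A, v4i32 B), the walk resolves to operand B with
// BitOffset 32, i.e. element 1 of B.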
15080
15081// Check for whether we can use INSERTPS to perform the shuffle. We only use
15082// INSERTPS when the V1 elements are already in the correct locations
15083// because otherwise we can just always use two SHUFPS instructions which
15084// are much smaller to encode than a SHUFPS and an INSERTPS. We can also
15085// perform INSERTPS if a single V1 element is out of place and all V2
15086// elements are zeroable.
15087static bool matchShuffleAsInsertPS(SDValue &V1, SDValue &V2,
15088 unsigned &InsertPSMask,
15089 const APInt &Zeroable,
15090 ArrayRef<int> Mask, SelectionDAG &DAG) {
15091 assert(V1.getSimpleValueType().is128BitVector() && "Bad operand type!");
15092 assert(V2.getSimpleValueType().is128BitVector() && "Bad operand type!");
15093 assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
15094
15095 // Attempt to match INSERTPS with one element from VA or VB being
15096 // inserted into VA (or undef). If successful, V1, V2 and InsertPSMask
15097 // are updated.
15098 auto matchAsInsertPS = [&](SDValue VA, SDValue VB,
15099 ArrayRef<int> CandidateMask) {
15100 unsigned ZMask = 0;
15101 int VADstIndex = -1;
15102 int VBDstIndex = -1;
15103 bool VAUsedInPlace = false;
15104
15105 for (int i = 0; i < 4; ++i) {
15106 // Synthesize a zero mask from the zeroable elements (includes undefs).
15107 if (Zeroable[i]) {
15108 ZMask |= 1 << i;
15109 continue;
15110 }
15111
15112 // Flag if we use any VA inputs in place.
15113 if (i == CandidateMask[i]) {
15114 VAUsedInPlace = true;
15115 continue;
15116 }
15117
15118 // We can only insert a single non-zeroable element.
15119 if (VADstIndex >= 0 || VBDstIndex >= 0)
15120 return false;
15121
15122 if (CandidateMask[i] < 4) {
15123 // VA input out of place for insertion.
15124 VADstIndex = i;
15125 } else {
15126 // VB input for insertion.
15127 VBDstIndex = i;
15128 }
15129 }
15130
15131 // Don't bother if we have no (non-zeroable) element for insertion.
15132 if (VADstIndex < 0 && VBDstIndex < 0)
15133 return false;
15134
15135 // Determine element insertion src/dst indices. The src index is from the
15136 // start of the inserted vector, not the start of the concatenated vector.
15137 unsigned VBSrcIndex = 0;
15138 if (VADstIndex >= 0) {
15139 // If we have a VA input out of place, we use VA as the V2 element
15140 // insertion and don't use the original V2 at all.
15141 VBSrcIndex = CandidateMask[VADstIndex];
15142 VBDstIndex = VADstIndex;
15143 VB = VA;
15144 } else {
15145 VBSrcIndex = CandidateMask[VBDstIndex] - 4;
15146 }
15147
15148 // If no V1 inputs are used in place, then the result is created only from
15149 // the zero mask and the V2 insertion - so remove V1 dependency.
15150 if (!VAUsedInPlace)
15151 VA = DAG.getUNDEF(MVT::v4f32);
15152
15153 // Update V1, V2 and InsertPSMask accordingly.
15154 V1 = VA;
15155 V2 = VB;
15156
15157 // Insert the V2 element into the desired position.
15158 InsertPSMask = VBSrcIndex << 6 | VBDstIndex << 4 | ZMask;
15159 assert((InsertPSMask & ~0xFFu) == 0 && "Invalid mask!");
15160 return true;
15161 };
15162
15163 if (matchAsInsertPS(V1, V2, Mask))
15164 return true;
15165
15166 // Commute and try again.
15167 SmallVector<int, 4> CommutedMask(Mask);
15168 ShuffleVectorSDNode::commuteMask(CommutedMask);
15169 if (matchAsInsertPS(V2, V1, CommutedMask))
15170 return true;
15171
15172 return false;
15173}
15174
15175static SDValue lowerShuffleAsInsertPS(const SDLoc &DL, SDValue V1, SDValue V2,
15176 ArrayRef<int> Mask, const APInt &Zeroable,
15177 SelectionDAG &DAG) {
15178 assert(V1.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
15179 assert(V2.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
15180
15181 // Attempt to match the insertps pattern.
15182 unsigned InsertPSMask = 0;
15183 if (!matchShuffleAsInsertPS(V1, V2, InsertPSMask, Zeroable, Mask, DAG))
15184 return SDValue();
15185
15186 // Insert the V2 element into the desired position.
15187 return DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32, V1, V2,
15188 DAG.getTargetConstant(InsertPSMask, DL, MVT::i8));
15189}
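// [Editorial sketch, not part of the annotated X86ISelLowering.cpp source.]
// Shows the layout of the InsertPSMask byte built by the matcher above; the
// encoder name is hypothetical.
static unsigned encodeInsertPSImm(unsigned SrcIdx, unsigned DstIdx,
                                  unsigned ZMask) {
  // Bits [7:6] select the source element of V2, bits [5:4] the destination
  // slot in V1, and bits [3:0] zero individual result lanes, mirroring
  // "VBSrcIndex << 6 | VBDstIndex << 4 | ZMask" above.
  return (SrcIdx << 6) | (DstIdx << 4) | (ZMask & 0xF);
}
// Example: inserting V2 element 2 into slot 1 while zeroing lane 3 gives
// encodeInsertPSImm(2, 1, 0x8) == 0x98.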
15190
15191/// Handle lowering of 2-lane 64-bit floating point shuffles.
15192///
15193/// This is the basis function for the 2-lane 64-bit shuffles as we have full
15194/// support for floating point shuffles but not integer shuffles. These
15195/// instructions will incur a domain crossing penalty on some chips though so
15196/// it is better to avoid lowering through this for integer vectors where
15197/// possible.
15198static SDValue lowerV2F64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
15199 const APInt &Zeroable, SDValue V1, SDValue V2,
15200 const X86Subtarget &Subtarget,
15201 SelectionDAG &DAG) {
15202 assert(V1.getSimpleValueType() == MVT::v2f64 && "Bad operand type!");
15203 assert(V2.getSimpleValueType() == MVT::v2f64 && "Bad operand type!");
15204 assert(Mask.size() == 2 && "Unexpected mask size for v2 shuffle!");
15205
15206 if (V2.isUndef()) {
15207 // Check for being able to broadcast a single element.
15208 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v2f64, V1, V2,
15209 Mask, Subtarget, DAG))
15210 return Broadcast;
15211
15212 // Straight shuffle of a single input vector. Simulate this by using the
15213 // single input as both of the "inputs" to this instruction.
15214 unsigned SHUFPDMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1);
15215
15216 if (Subtarget.hasAVX()) {
15217 // If we have AVX, we can use VPERMILPS which will allow folding a load
15218 // into the shuffle.
15219 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v2f64, V1,
15220 DAG.getTargetConstant(SHUFPDMask, DL, MVT::i8));
15221 }
15222
15223 return DAG.getNode(
15224 X86ISD::SHUFP, DL, MVT::v2f64,
15225 Mask[0] == SM_SentinelUndef ? DAG.getUNDEF(MVT::v2f64) : V1,
15226 Mask[1] == SM_SentinelUndef ? DAG.getUNDEF(MVT::v2f64) : V1,
15227 DAG.getTargetConstant(SHUFPDMask, DL, MVT::i8));
15228 }
15229 assert(Mask[0] >= 0 && "No undef lanes in multi-input v2 shuffles!");
15230 assert(Mask[1] >= 0 && "No undef lanes in multi-input v2 shuffles!");
15231 assert(Mask[0] < 2 && "We sort V1 to be the first input.");
15232 assert(Mask[1] >= 2 && "We sort V2 to be the second input.");
15233
15234 if (Subtarget.hasAVX2())
15235 if (SDValue Extract = lowerShuffleOfExtractsAsVperm(DL, V1, V2, Mask, DAG))
15236 return Extract;
15237
15238 // When loading a scalar and then shuffling it into a vector we can often do
15239 // the insertion cheaply.
15240 if (SDValue Insertion = lowerShuffleAsElementInsertion(
15241 DL, MVT::v2f64, V1, V2, Mask, Zeroable, Subtarget, DAG))
15242 return Insertion;
15243 // Try inverting the insertion since for v2 masks it is easy to do and we
15244 // can't reliably sort the mask one way or the other.
15245 int InverseMask[2] = {Mask[0] < 0 ? -1 : (Mask[0] ^ 2),
15246 Mask[1] < 0 ? -1 : (Mask[1] ^ 2)};
15247 if (SDValue Insertion = lowerShuffleAsElementInsertion(
15248 DL, MVT::v2f64, V2, V1, InverseMask, Zeroable, Subtarget, DAG))
15249 return Insertion;
15250
15251 // Try to use one of the special instruction patterns to handle two common
15252 // blend patterns if a zero-blend above didn't work.
15253 if (isShuffleEquivalent(Mask, {0, 3}, V1, V2) ||
15254 isShuffleEquivalent(Mask, {1, 3}, V1, V2))
15255 if (SDValue V1S = getScalarValueForVectorElement(V1, Mask[0], DAG))
15256 // We can either use a special instruction to load over the low double or
15257 // to move just the low double.
15258 return DAG.getNode(
15259 X86ISD::MOVSD, DL, MVT::v2f64, V2,
15260 DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64, V1S));
15261
15262 if (Subtarget.hasSSE41())
15263 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v2f64, V1, V2, Mask,
15264 Zeroable, Subtarget, DAG))
15265 return Blend;
15266
15267 // Use dedicated unpack instructions for masks that match their pattern.
15268 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v2f64, Mask, V1, V2, DAG))
15269 return V;
15270
15271 unsigned SHUFPDMask = (Mask[0] == 1) | (((Mask[1] - 2) == 1) << 1);
15272 return DAG.getNode(X86ISD::SHUFP, DL, MVT::v2f64, V1, V2,
15273 DAG.getTargetConstant(SHUFPDMask, DL, MVT::i8));
15274}
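// [Editorial sketch, not part of the annotated X86ISelLowering.cpp source.]
// Models the two-bit SHUFPD immediate built in the two-input path above; the
// helper name is hypothetical.
static unsigned buildShufpdImm(int Mask0, int Mask1) {
  // Bit 0 selects element 0 or 1 of the first source for result lane 0; bit 1
  // does the same within the second source for lane 1. Mask1 is in [2, 3], so
  // subtract 2 first, as in "(Mask[0] == 1) | (((Mask[1] - 2) == 1) << 1)".
  return unsigned(Mask0 == 1) | (unsigned((Mask1 - 2) == 1) << 1);
}
// Example: the mask {1, 2} (high element of V1, low element of V2) encodes as
// immediate 0b01.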
15275
15276/// Handle lowering of 2-lane 64-bit integer shuffles.
15277///
15278/// Tries to lower a 2-lane 64-bit shuffle using shuffle operations provided by
15279/// the integer unit to minimize domain crossing penalties. However, for blends
15280/// it falls back to the floating point shuffle operation with appropriate bit
15281/// casting.
15282static SDValue lowerV2I64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
15283 const APInt &Zeroable, SDValue V1, SDValue V2,
15284 const X86Subtarget &Subtarget,
15285 SelectionDAG &DAG) {
15286 assert(V1.getSimpleValueType() == MVT::v2i64 && "Bad operand type!");
15287 assert(V2.getSimpleValueType() == MVT::v2i64 && "Bad operand type!");
15288 assert(Mask.size() == 2 && "Unexpected mask size for v2 shuffle!");
15289
15290 if (V2.isUndef()) {
15291 // Check for being able to broadcast a single element.
15292 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v2i64, V1, V2,
15293 Mask, Subtarget, DAG))
15294 return Broadcast;
15295
15296 // Straight shuffle of a single input vector. For everything from SSE2
15297 // onward this has a single fast instruction with no scary immediates.
15298 // We have to map the mask as it is actually a v4i32 shuffle instruction.
15299 V1 = DAG.getBitcast(MVT::v4i32, V1);
15300 int WidenedMask[4] = {Mask[0] < 0 ? -1 : (Mask[0] * 2),
15301 Mask[0] < 0 ? -1 : ((Mask[0] * 2) + 1),
15302 Mask[1] < 0 ? -1 : (Mask[1] * 2),
15303 Mask[1] < 0 ? -1 : ((Mask[1] * 2) + 1)};
15304 return DAG.getBitcast(
15305 MVT::v2i64,
15306 DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, V1,
15307 getV4X86ShuffleImm8ForMask(WidenedMask, DL, DAG)));
15308 }
15309 assert(Mask[0] != -1 && "No undef lanes in multi-input v2 shuffles!");
15310 assert(Mask[1] != -1 && "No undef lanes in multi-input v2 shuffles!");
15311 assert(Mask[0] < 2 && "We sort V1 to be the first input.");
15312 assert(Mask[1] >= 2 && "We sort V2 to be the second input.");
15313
15314 if (Subtarget.hasAVX2())
15315 if (SDValue Extract = lowerShuffleOfExtractsAsVperm(DL, V1, V2, Mask, DAG))
15316 return Extract;
15317
15318 // Try to use shift instructions.
15319 if (SDValue Shift =
15320 lowerShuffleAsShift(DL, MVT::v2i64, V1, V2, Mask, Zeroable, Subtarget,
15321 DAG, /*BitwiseOnly*/ false))
15322 return Shift;
15323
15324 // When loading a scalar and then shuffling it into a vector we can often do
15325 // the insertion cheaply.
15326 if (SDValue Insertion = lowerShuffleAsElementInsertion(
15327 DL, MVT::v2i64, V1, V2, Mask, Zeroable, Subtarget, DAG))
15328 return Insertion;
15329 // Try inverting the insertion since for v2 masks it is easy to do and we
15330 // can't reliably sort the mask one way or the other.
15331 int InverseMask[2] = {Mask[0] ^ 2, Mask[1] ^ 2};
15332 if (SDValue Insertion = lowerShuffleAsElementInsertion(
15333 DL, MVT::v2i64, V2, V1, InverseMask, Zeroable, Subtarget, DAG))
15334 return Insertion;
15335
15336 // We have different paths for blend lowering, but they all must use the
15337 // *exact* same predicate.
15338 bool IsBlendSupported = Subtarget.hasSSE41();
15339 if (IsBlendSupported)
15340 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v2i64, V1, V2, Mask,
15341 Zeroable, Subtarget, DAG))
15342 return Blend;
15343
15344 // Use dedicated unpack instructions for masks that match their pattern.
15345 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v2i64, Mask, V1, V2, DAG))
15346 return V;
15347
15348 // Try to use byte rotation instructions.
15349 // It's more profitable for pre-SSSE3 to use shuffles/unpacks.
15350 if (Subtarget.hasSSSE3()) {
15351 if (Subtarget.hasVLX())
15352 if (SDValue Rotate = lowerShuffleAsVALIGN(DL, MVT::v2i64, V1, V2, Mask,
15353 Subtarget, DAG))
15354 return Rotate;
15355
15356 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v2i64, V1, V2, Mask,
15357 Subtarget, DAG))
15358 return Rotate;
15359 }
15360
15361 // If we have direct support for blends, we should lower by decomposing into
15362 // a permute. That will be faster than the domain cross.
15363 if (IsBlendSupported)
15364 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v2i64, V1, V2, Mask,
15365 Subtarget, DAG);
15366
15367 // We implement this with SHUFPD which is pretty lame because it will likely
15368 // incur 2 cycles of stall for integer vectors on Nehalem and older chips.
15369 // However, all the alternatives are still more cycles and newer chips don't
15370 // have this problem. It would be really nice if x86 had better shuffles here.
15371 V1 = DAG.getBitcast(MVT::v2f64, V1);
15372 V2 = DAG.getBitcast(MVT::v2f64, V2);
15373 return DAG.getBitcast(MVT::v2i64,
15374 DAG.getVectorShuffle(MVT::v2f64, DL, V1, V2, Mask));
15375}
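// [Editorial sketch, not part of the annotated X86ISelLowering.cpp source.]
// Models the v2i64 -> v4i32 mask widening used by the single-input PSHUFD path
// above; the helper name is hypothetical.
static void widenV2MaskToV4(const int Mask[2], int Widened[4]) {
  // Each 64-bit element k expands to the pair of 32-bit elements 2k and 2k+1;
  // undef (-1) lanes stay undef.
  for (int i = 0; i != 2; ++i) {
    Widened[2 * i + 0] = Mask[i] < 0 ? -1 : Mask[i] * 2;
    Widened[2 * i + 1] = Mask[i] < 0 ? -1 : Mask[i] * 2 + 1;
  }
}
// Example: the v2i64 mask {1, -1} widens to the v4i32 mask {2, 3, -1, -1}.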
15376
15377/// Lower a vector shuffle using the SHUFPS instruction.
15378///
15379/// This is a helper routine dedicated to lowering vector shuffles using SHUFPS.
15380/// It makes no assumptions about whether this is the *best* lowering, it simply
15381/// uses it.
15382static SDValue lowerShuffleWithSHUFPS(const SDLoc &DL, MVT VT,
15383 ArrayRef<int> Mask, SDValue V1,
15384 SDValue V2, SelectionDAG &DAG) {
15385 SDValue LowV = V1, HighV = V2;
15386 SmallVector<int, 4> NewMask(Mask);
15387 int NumV2Elements = count_if(Mask, [](int M) { return M >= 4; });
15388
15389 if (NumV2Elements == 1) {
15390 int V2Index = find_if(Mask, [](int M) { return M >= 4; }) - Mask.begin();
15391
15392 // Compute the index adjacent to V2Index and in the same half by toggling
15393 // the low bit.
15394 int V2AdjIndex = V2Index ^ 1;
15395
15396 if (Mask[V2AdjIndex] < 0) {
15397 // Handles all the cases where we have a single V2 element and an undef.
15398 // This will only ever happen in the high lanes because we commute the
15399 // vector otherwise.
15400 if (V2Index < 2)
15401 std::swap(LowV, HighV);
15402 NewMask[V2Index] -= 4;
15403 } else {
15404 // Handle the case where the V2 element ends up adjacent to a V1 element.
15405 // To make this work, blend them together as the first step.
15406 int V1Index = V2AdjIndex;
15407 int BlendMask[4] = {Mask[V2Index] - 4, 0, Mask[V1Index], 0};
15408 V2 = DAG.getNode(X86ISD::SHUFP, DL, VT, V2, V1,
15409 getV4X86ShuffleImm8ForMask(BlendMask, DL, DAG));
15410
15411 // Now proceed to reconstruct the final blend as we have the necessary
15412 // high or low half formed.
15413 if (V2Index < 2) {
15414 LowV = V2;
15415 HighV = V1;
15416 } else {
15417 HighV = V2;
15418 }
15419 NewMask[V1Index] = 2; // We put the V1 element in V2[2].
15420 NewMask[V2Index] = 0; // We shifted the V2 element into V2[0].
15421 }
15422 } else if (NumV2Elements == 2) {
15423 if (Mask[0] < 4 && Mask[1] < 4) {
15424 // Handle the easy case where we have V1 in the low lanes and V2 in the
15425 // high lanes.
15426 NewMask[2] -= 4;
15427 NewMask[3] -= 4;
15428 } else if (Mask[2] < 4 && Mask[3] < 4) {
15429 // We also handle the reversed case because this utility may get called
15430 // when we detect a SHUFPS pattern but can't easily commute the shuffle to
15431 // arrange things in the right direction.
15432 NewMask[0] -= 4;
15433 NewMask[1] -= 4;
15434 HighV = V1;
15435 LowV = V2;
15436 } else {
15437 // We have a mixture of V1 and V2 in both low and high lanes. Rather than
15438 // trying to place elements directly, just blend them and set up the final
15439 // shuffle to place them.
15440
15441 // The first two blend mask elements are for V1, the second two are for
15442 // V2.
15443 int BlendMask[4] = {Mask[0] < 4 ? Mask[0] : Mask[1],
15444 Mask[2] < 4 ? Mask[2] : Mask[3],
15445 (Mask[0] >= 4 ? Mask[0] : Mask[1]) - 4,
15446 (Mask[2] >= 4 ? Mask[2] : Mask[3]) - 4};
15447 V1 = DAG.getNode(X86ISD::SHUFP, DL, VT, V1, V2,
15448 getV4X86ShuffleImm8ForMask(BlendMask, DL, DAG));
15449
15450 // Now we do a normal shuffle of V1 by giving V1 as both operands to
15451 // a blend.
15452 LowV = HighV = V1;
15453 NewMask[0] = Mask[0] < 4 ? 0 : 2;
15454 NewMask[1] = Mask[0] < 4 ? 2 : 0;
15455 NewMask[2] = Mask[2] < 4 ? 1 : 3;
15456 NewMask[3] = Mask[2] < 4 ? 3 : 1;
15457 }
15458 } else if (NumV2Elements == 3) {
15459 // Ideally canonicalizeShuffleMaskWithCommute should have caught this, but
15460 // we can get here due to other paths (e.g. repeated mask matching) that we
15461 // don't want to do another round of lowerVECTOR_SHUFFLE.
15462 ShuffleVectorSDNode::commuteMask(NewMask);
15463 return lowerShuffleWithSHUFPS(DL, VT, NewMask, V2, V1, DAG);
15464 }
15465 return DAG.getNode(X86ISD::SHUFP, DL, VT, LowV, HighV,
15466 getV4X86ShuffleImm8ForMask(NewMask, DL, DAG));
15467}
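// [Editorial sketch, not part of the annotated X86ISelLowering.cpp source.]
// Approximates what getV4X86ShuffleImm8ForMask packs into the SHUFPS
// immediate; the helper name is hypothetical and undef lanes are simplified
// to "treat as 0".
static unsigned packV4ShuffleImm8(const int Mask[4]) {
  // Result lane i gets a 2-bit selector in bits [2*i+1 : 2*i]; lanes 0 and 1
  // read from the first operand, lanes 2 and 3 from the second.
  unsigned Imm = 0;
  for (int i = 0; i != 4; ++i)
    Imm |= unsigned(Mask[i] < 0 ? 0 : (Mask[i] & 3)) << (2 * i);
  return Imm;
}
// Example: NewMask = {2, 0, 1, 3} packs to 0xD2 (binary 11 01 00 10).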
15468
15469/// Lower 4-lane 32-bit floating point shuffles.
15470///
15471/// Uses instructions exclusively from the floating point unit to minimize
15472/// domain crossing penalties, as these are sufficient to implement all v4f32
15473/// shuffles.
15474static SDValue lowerV4F32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
15475 const APInt &Zeroable, SDValue V1, SDValue V2,
15476 const X86Subtarget &Subtarget,
15477 SelectionDAG &DAG) {
15478 assert(V1.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
15479 assert(V2.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
15480 assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
15481
15482 if (Subtarget.hasSSE41())
15483 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v4f32, V1, V2, Mask,
15484 Zeroable, Subtarget, DAG))
15485 return Blend;
15486
15487 int NumV2Elements = count_if(Mask, [](int M) { return M >= 4; });
15488
15489 if (NumV2Elements == 0) {
15490 // Check for being able to broadcast a single element.
15491 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v4f32, V1, V2,
15492 Mask, Subtarget, DAG))
15493 return Broadcast;
15494
15495 // Use even/odd duplicate instructions for masks that match their pattern.
15496 if (Subtarget.hasSSE3()) {
15497 if (isShuffleEquivalent(Mask, {0, 0, 2, 2}, V1, V2))
15498 return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v4f32, V1);
15499 if (isShuffleEquivalent(Mask, {1, 1, 3, 3}, V1, V2))
15500 return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v4f32, V1);
15501 }
15502
15503 if (Subtarget.hasAVX()) {
15504 // If we have AVX, we can use VPERMILPS which will allow folding a load
15505 // into the shuffle.
15506 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v4f32, V1,
15507 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
15508 }
15509
15510 // Use MOVLHPS/MOVHLPS to simulate unary shuffles. These are only valid
15511 // in SSE1 because otherwise they are widened to v2f64 and never get here.
15512 if (!Subtarget.hasSSE2()) {
15513 if (isShuffleEquivalent(Mask, {0, 1, 0, 1}, V1, V2))
15514 return DAG.getNode(X86ISD::MOVLHPS, DL, MVT::v4f32, V1, V1);
15515 if (isShuffleEquivalent(Mask, {2, 3, 2, 3}, V1, V2))
15516 return DAG.getNode(X86ISD::MOVHLPS, DL, MVT::v4f32, V1, V1);
15517 }
15518
15519 // Otherwise, use a straight shuffle of a single input vector. We pass the
15520 // input vector to both operands to simulate this with a SHUFPS.
15521 return DAG.getNode(X86ISD::SHUFP, DL, MVT::v4f32, V1, V1,
15522 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
15523 }
15524
15525 if (Subtarget.hasSSE2())
15526 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(
15527 DL, MVT::v4i32, V1, V2, Mask, Zeroable, Subtarget, DAG)) {
15528 ZExt = DAG.getBitcast(MVT::v4f32, ZExt);
15529 return ZExt;
15530 }
15531
15532 if (Subtarget.hasAVX2())
15533 if (SDValue Extract = lowerShuffleOfExtractsAsVperm(DL, V1, V2, Mask, DAG))
15534 return Extract;
15535
15536 // There are special ways we can lower some single-element blends. However, we
15537 // have custom ways we can lower more complex single-element blends below that
15538 // we defer to if both this and BLENDPS fail to match, so restrict this to
15539 // when the V2 input is targeting element 0 of the mask -- that is the fast
15540 // case here.
15541 if (NumV2Elements == 1 && Mask[0] >= 4)
15542 if (SDValue V = lowerShuffleAsElementInsertion(
15543 DL, MVT::v4f32, V1, V2, Mask, Zeroable, Subtarget, DAG))
15544 return V;
15545
15546 if (Subtarget.hasSSE41()) {
15547 // Use INSERTPS if we can complete the shuffle efficiently.
15548 if (SDValue V = lowerShuffleAsInsertPS(DL, V1, V2, Mask, Zeroable, DAG))
15549 return V;
15550
15551 if (!isSingleSHUFPSMask(Mask))
15552 if (SDValue BlendPerm = lowerShuffleAsBlendAndPermute(DL, MVT::v4f32, V1,
15553 V2, Mask, DAG))
15554 return BlendPerm;
15555 }
15556
15557 // Use low/high mov instructions. These are only valid in SSE1 because
15558 // otherwise they are widened to v2f64 and never get here.
15559 if (!Subtarget.hasSSE2()) {
15560 if (isShuffleEquivalent(Mask, {0, 1, 4, 5}, V1, V2))
15561 return DAG.getNode(X86ISD::MOVLHPS, DL, MVT::v4f32, V1, V2);
15562 if (isShuffleEquivalent(Mask, {2, 3, 6, 7}, V1, V2))
15563 return DAG.getNode(X86ISD::MOVHLPS, DL, MVT::v4f32, V2, V1);
15564 }
15565
15566 // Use dedicated unpack instructions for masks that match their pattern.
15567 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v4f32, Mask, V1, V2, DAG))
15568 return V;
15569
15570 // Otherwise fall back to a SHUFPS lowering strategy.
15571 return lowerShuffleWithSHUFPS(DL, MVT::v4f32, Mask, V1, V2, DAG);
15572}
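// [Editorial sketch, not part of the annotated X86ISelLowering.cpp source.]
// The v4f32/v4i32 lowerings above begin by counting how many result lanes come
// from V2; a standalone model of that classification (helper name is
// hypothetical):
static int countV2Elements(const int Mask[4]) {
  // Indices 0..3 address V1 and 4..7 address V2, matching the
  // count_if(Mask, [](int M) { return M >= 4; }) calls above.
  int N = 0;
  for (int i = 0; i != 4; ++i)
    N += Mask[i] >= 4;
  return N;
}
// Example: {0, 1, 2, 3} is a unary shuffle (0 lanes from V2), {4, 1, 2, 3} is
// a single-element insertion candidate, and {0, 5, 2, 7} is a blend-style mix.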
15573
15574/// Lower 4-lane i32 vector shuffles.
15575///
15576/// We try to handle these with integer-domain shuffles where we can, but for
15577/// blends we use the floating point domain blend instructions.
15578static SDValue lowerV4I32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
15579 const APInt &Zeroable, SDValue V1, SDValue V2,
15580 const X86Subtarget &Subtarget,
15581 SelectionDAG &DAG) {
15582 assert(V1.getSimpleValueType() == MVT::v4i32 && "Bad operand type!");
15583 assert(V2.getSimpleValueType() == MVT::v4i32 && "Bad operand type!");
15584 assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
15585
15586 // Whenever we can lower this as a zext, that instruction is strictly faster
15587 // than any alternative. It also allows us to fold memory operands into the
15588 // shuffle in many cases.
15589 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(DL, MVT::v4i32, V1, V2, Mask,
15590 Zeroable, Subtarget, DAG))
15591 return ZExt;
15592
15593 int NumV2Elements = count_if(Mask, [](int M) { return M >= 4; });
15594
15595 // Try to use shift instructions if fast.
15596 if (Subtarget.preferLowerShuffleAsShift()) {
15597 if (SDValue Shift =
15598 lowerShuffleAsShift(DL, MVT::v4i32, V1, V2, Mask, Zeroable,
15599 Subtarget, DAG, /*BitwiseOnly*/ true))
15600 return Shift;
15601 if (NumV2Elements == 0)
15602 if (SDValue Rotate =
15603 lowerShuffleAsBitRotate(DL, MVT::v4i32, V1, Mask, Subtarget, DAG))
15604 return Rotate;
15605 }
15606
15607 if (NumV2Elements == 0) {
15608 // Try to use broadcast unless the mask only has one non-undef element.
15609 if (count_if(Mask, [](int M) { return M >= 0 && M < 4; }) > 1) {
15610 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v4i32, V1, V2,
15611 Mask, Subtarget, DAG))
15612 return Broadcast;
15613 }
15614
15615 // Straight shuffle of a single input vector. For everything from SSE2
15616 // onward this has a single fast instruction with no scary immediates.
15617 // We coerce the shuffle pattern to be compatible with UNPCK instructions
15618 // but we aren't actually going to use the UNPCK instruction because doing
15619 // so prevents folding a load into this instruction or making a copy.
15620 const int UnpackLoMask[] = {0, 0, 1, 1};
15621 const int UnpackHiMask[] = {2, 2, 3, 3};
15622 if (isShuffleEquivalent(Mask, {0, 0, 1, 1}, V1, V2))
15623 Mask = UnpackLoMask;
15624 else if (isShuffleEquivalent(Mask, {2, 2, 3, 3}, V1, V2))
15625 Mask = UnpackHiMask;
15626
15627 return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, V1,
15628 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
15629 }
15630
15631 if (Subtarget.hasAVX2())
15632 if (SDValue Extract = lowerShuffleOfExtractsAsVperm(DL, V1, V2, Mask, DAG))
15633 return Extract;
15634
15635 // Try to use shift instructions.
15636 if (SDValue Shift =
15637 lowerShuffleAsShift(DL, MVT::v4i32, V1, V2, Mask, Zeroable, Subtarget,
15638 DAG, /*BitwiseOnly*/ false))
15639 return Shift;
15640
15641 // There are special ways we can lower some single-element blends.
15642 if (NumV2Elements == 1)
15643 if (SDValue V = lowerShuffleAsElementInsertion(
15644 DL, MVT::v4i32, V1, V2, Mask, Zeroable, Subtarget, DAG))
15645 return V;
15646
15647 // We have different paths for blend lowering, but they all must use the
15648 // *exact* same predicate.
15649 bool IsBlendSupported = Subtarget.hasSSE41();
15650 if (IsBlendSupported)
15651 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v4i32, V1, V2, Mask,
15652 Zeroable, Subtarget, DAG))
15653 return Blend;
15654
15655 if (SDValue Masked = lowerShuffleAsBitMask(DL, MVT::v4i32, V1, V2, Mask,
15656 Zeroable, Subtarget, DAG))
15657 return Masked;
15658
15659 // Use dedicated unpack instructions for masks that match their pattern.
15660 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v4i32, Mask, V1, V2, DAG))
15661 return V;
15662
15663 // Try to use byte rotation instructions.
15664 // It's more profitable for pre-SSSE3 to use shuffles/unpacks.
15665 if (Subtarget.hasSSSE3()) {
15666 if (Subtarget.hasVLX())
15667 if (SDValue Rotate = lowerShuffleAsVALIGN(DL, MVT::v4i32, V1, V2, Mask,
15668 Subtarget, DAG))
15669 return Rotate;
15670
15671 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v4i32, V1, V2, Mask,
15672 Subtarget, DAG))
15673 return Rotate;
15674 }
15675
15676 // Assume that a single SHUFPS is faster than an alternative sequence of
15677 // multiple instructions (even if the CPU has a domain penalty).
15678 // If some CPU is harmed by the domain switch, we can fix it in a later pass.
15679 if (!isSingleSHUFPSMask(Mask)) {
15680 // If we have direct support for blends, we should lower by decomposing into
15681 // a permute. That will be faster than the domain cross.
15682 if (IsBlendSupported)
15683 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v4i32, V1, V2, Mask,
15684 Subtarget, DAG);
15685
15686 // Try to lower by permuting the inputs into an unpack instruction.
15687 if (SDValue Unpack = lowerShuffleAsPermuteAndUnpack(DL, MVT::v4i32, V1, V2,
15688 Mask, Subtarget, DAG))
15689 return Unpack;
15690 }
15691
15692 // We implement this with SHUFPS because it can blend from two vectors.
15693 // Because we're going to eventually use SHUFPS, we use SHUFPS even to build
15694 // up the inputs, bypassing domain shift penalties that we would incur if we
15695 // directly used PSHUFD on Nehalem and older. For newer chips, this isn't
15696 // relevant.
15697 SDValue CastV1 = DAG.getBitcast(MVT::v4f32, V1);
15698 SDValue CastV2 = DAG.getBitcast(MVT::v4f32, V2);
15699 SDValue ShufPS = DAG.getVectorShuffle(MVT::v4f32, DL, CastV1, CastV2, Mask);
15700 return DAG.getBitcast(MVT::v4i32, ShufPS);
15701}
15702
15703/// Lowering of single-input v8i16 shuffles is the cornerstone of SSE2
15704/// shuffle lowering, and the most complex part.
15705///
15706/// The lowering strategy is to try to form pairs of input lanes which are
15707/// targeted at the same half of the final vector, and then use a dword shuffle
15708/// to place them onto the right half, and finally unpack the paired lanes into
15709/// their final position.
15710///
15711/// The exact breakdown of how to form these dword pairs and align them on the
15712/// correct sides is really tricky. See the comments within the function for
15713/// more of the details.
15714///
15715/// This code also handles repeated 128-bit lanes of v8i16 shuffles, but each
15716/// lane must shuffle the *exact* same way. In fact, you must pass a v8 Mask to
15717/// this routine for it to work correctly. To shuffle a 256-bit or 512-bit i16
15718/// vector, form the analogous 128-bit 8-element Mask.
15719static SDValue lowerV8I16GeneralSingleInputShuffle(
15720 const SDLoc &DL, MVT VT, SDValue V, MutableArrayRef<int> Mask,
15721 const X86Subtarget &Subtarget, SelectionDAG &DAG) {
15722 assert(VT.getVectorElementType() == MVT::i16 && "Bad input type!");
15723 MVT PSHUFDVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() / 2);
15724
15725 assert(Mask.size() == 8 && "Shuffle mask length doesn't match!");
15726 MutableArrayRef<int> LoMask = Mask.slice(0, 4);
15727 MutableArrayRef<int> HiMask = Mask.slice(4, 4);
15728
15729 // Attempt to directly match PSHUFLW or PSHUFHW.
15730 if (isUndefOrInRange(LoMask, 0, 4) &&
15731 isSequentialOrUndefInRange(HiMask, 0, 4, 4)) {
15732 return DAG.getNode(X86ISD::PSHUFLW, DL, VT, V,
15733 getV4X86ShuffleImm8ForMask(LoMask, DL, DAG));
15734 }
15735 if (isUndefOrInRange(HiMask, 4, 8) &&
15736 isSequentialOrUndefInRange(LoMask, 0, 4, 0)) {
15737 for (int i = 0; i != 4; ++i)
15738 HiMask[i] = (HiMask[i] < 0 ? HiMask[i] : (HiMask[i] - 4));
15739 return DAG.getNode(X86ISD::PSHUFHW, DL, VT, V,
15740 getV4X86ShuffleImm8ForMask(HiMask, DL, DAG));
15741 }
15742
15743 SmallVector<int, 4> LoInputs;
15744 copy_if(LoMask, std::back_inserter(LoInputs), [](int M) { return M >= 0; });
15745 array_pod_sort(LoInputs.begin(), LoInputs.end());
15746 LoInputs.erase(std::unique(LoInputs.begin(), LoInputs.end()), LoInputs.end());
15747 SmallVector<int, 4> HiInputs;
15748 copy_if(HiMask, std::back_inserter(HiInputs), [](int M) { return M >= 0; });
15749 array_pod_sort(HiInputs.begin(), HiInputs.end());
15750 HiInputs.erase(std::unique(HiInputs.begin(), HiInputs.end()), HiInputs.end());
15751 int NumLToL = llvm::lower_bound(LoInputs, 4) - LoInputs.begin();
15752 int NumHToL = LoInputs.size() - NumLToL;
15753 int NumLToH = llvm::lower_bound(HiInputs, 4) - HiInputs.begin();
15754 int NumHToH = HiInputs.size() - NumLToH;
15755 MutableArrayRef<int> LToLInputs(LoInputs.data(), NumLToL);
15756 MutableArrayRef<int> LToHInputs(HiInputs.data(), NumLToH);
15757 MutableArrayRef<int> HToLInputs(LoInputs.data() + NumLToL, NumHToL);
15758 MutableArrayRef<int> HToHInputs(HiInputs.data() + NumLToH, NumHToH);
15759
15760 // If we are shuffling values from one half - check how many different DWORD
15761 // pairs we need to create. If only 1 or 2 then we can perform this as a
15762 // PSHUFLW/PSHUFHW + PSHUFD instead of the PSHUFD+PSHUFLW+PSHUFHW chain below.
15763 auto ShuffleDWordPairs = [&](ArrayRef<int> PSHUFHalfMask,
15764 ArrayRef<int> PSHUFDMask, unsigned ShufWOp) {
15765 V = DAG.getNode(ShufWOp, DL, VT, V,
15766 getV4X86ShuffleImm8ForMask(PSHUFHalfMask, DL, DAG));
15767 V = DAG.getBitcast(PSHUFDVT, V);
15768 V = DAG.getNode(X86ISD::PSHUFD, DL, PSHUFDVT, V,
15769 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG));
15770 return DAG.getBitcast(VT, V);
15771 };
15772
15773 if ((NumHToL + NumHToH) == 0 || (NumLToL + NumLToH) == 0) {
15774 int PSHUFDMask[4] = { -1, -1, -1, -1 };
15775 SmallVector<std::pair<int, int>, 4> DWordPairs;
15776 int DOffset = ((NumHToL + NumHToH) == 0 ? 0 : 2);
15777
15778 // Collect the different DWORD pairs.
15779 for (int DWord = 0; DWord != 4; ++DWord) {
15780 int M0 = Mask[2 * DWord + 0];
15781 int M1 = Mask[2 * DWord + 1];
15782 M0 = (M0 >= 0 ? M0 % 4 : M0);
15783 M1 = (M1 >= 0 ? M1 % 4 : M1);
15784 if (M0 < 0 && M1 < 0)
15785 continue;
15786
15787 bool Match = false;
15788 for (int j = 0, e = DWordPairs.size(); j < e; ++j) {
15789 auto &DWordPair = DWordPairs[j];
15790 if ((M0 < 0 || isUndefOrEqual(DWordPair.first, M0)) &&
15791 (M1 < 0 || isUndefOrEqual(DWordPair.second, M1))) {
15792 DWordPair.first = (M0 >= 0 ? M0 : DWordPair.first);
15793 DWordPair.second = (M1 >= 0 ? M1 : DWordPair.second);
15794 PSHUFDMask[DWord] = DOffset + j;
15795 Match = true;
15796 break;
15797 }
15798 }
15799 if (!Match) {
15800 PSHUFDMask[DWord] = DOffset + DWordPairs.size();
15801 DWordPairs.push_back(std::make_pair(M0, M1));
15802 }
15803 }
15804
15805 if (DWordPairs.size() <= 2) {
15806 DWordPairs.resize(2, std::make_pair(-1, -1));
15807 int PSHUFHalfMask[4] = {DWordPairs[0].first, DWordPairs[0].second,
15808 DWordPairs[1].first, DWordPairs[1].second};
15809 if ((NumHToL + NumHToH) == 0)
15810 return ShuffleDWordPairs(PSHUFHalfMask, PSHUFDMask, X86ISD::PSHUFLW);
15811 if ((NumLToL + NumLToH) == 0)
15812 return ShuffleDWordPairs(PSHUFHalfMask, PSHUFDMask, X86ISD::PSHUFHW);
15813 }
15814 }
15815
15816 // Simplify the 1-into-3 and 3-into-1 cases with a single pshufd. For all
15817 // such inputs we can swap two of the dwords across the half mark and end up
15818 // with <=2 inputs to each half in each half. Once there, we can fall through
15819 // to the generic code below. For example:
15820 //
15821 // Input: [a, b, c, d, e, f, g, h] -PSHUFD[0,2,1,3]-> [a, b, e, f, c, d, g, h]
15822 // Mask: [0, 1, 2, 7, 4, 5, 6, 3] -----------------> [0, 1, 4, 7, 2, 3, 6, 5]
15823 //
15824 // However in some very rare cases we have a 1-into-3 or 3-into-1 on one half
15825 // and an existing 2-into-2 on the other half. In this case we may have to
15826 // pre-shuffle the 2-into-2 half to avoid turning it into a 3-into-1 or
15827 // 1-into-3 which could cause us to cycle endlessly fixing each side in turn.
15828 // Fortunately, we don't have to handle anything but a 2-into-2 pattern
15829 // because any other situation (including a 3-into-1 or 1-into-3 in the other
15830 // half than the one we target for fixing) will be fixed when we re-enter this
15831 // path. We will also combine away any sequence of PSHUFD instructions that
15832 // result into a single instruction. Here is an example of the tricky case:
15833 //
15834 // Input: [a, b, c, d, e, f, g, h] -PSHUFD[0,2,1,3]-> [a, b, e, f, c, d, g, h]
15835 // Mask: [3, 7, 1, 0, 2, 7, 3, 5] -THIS-IS-BAD!!!!-> [5, 7, 1, 0, 4, 7, 5, 3]
15836 //
15837 // This now has a 1-into-3 in the high half! Instead, we do two shuffles:
15838 //
15839 // Input: [a, b, c, d, e, f, g, h] PSHUFHW[0,2,1,3]-> [a, b, c, d, e, g, f, h]
15840 // Mask: [3, 7, 1, 0, 2, 7, 3, 5] -----------------> [3, 7, 1, 0, 2, 7, 3, 6]
15841 //
15842 // Input: [a, b, c, d, e, g, f, h] -PSHUFD[0,2,1,3]-> [a, b, e, g, c, d, f, h]
15843 // Mask: [3, 7, 1, 0, 2, 7, 3, 6] -----------------> [5, 7, 1, 0, 4, 7, 5, 6]
15844 //
15845 // The result is fine to be handled by the generic logic.
15846 auto balanceSides = [&](ArrayRef<int> AToAInputs, ArrayRef<int> BToAInputs,
15847 ArrayRef<int> BToBInputs, ArrayRef<int> AToBInputs,
15848 int AOffset, int BOffset) {
15849 assert((AToAInputs.size() == 3 || AToAInputs.size() == 1) &&
15850 "Must call this with A having 3 or 1 inputs from the A half.");
15851 assert((BToAInputs.size() == 1 || BToAInputs.size() == 3) &&
15852 "Must call this with B having 1 or 3 inputs from the B half.");
15853 assert(AToAInputs.size() + BToAInputs.size() == 4 &&
15854 "Must call this with either 3:1 or 1:3 inputs (summing to 4).");
15855
15856 bool ThreeAInputs = AToAInputs.size() == 3;
15857
15858 // Compute the index of the dword with only one word among the three inputs in
15859 // a half by taking the sum of the half with three inputs and subtracting
15860 // the sum of the actual three inputs. The difference is the remaining
15861 // slot.
15862 int ADWord = 0, BDWord = 0;
15863 int &TripleDWord = ThreeAInputs ? ADWord : BDWord;
15864 int &OneInputDWord = ThreeAInputs ? BDWord : ADWord;
15865 int TripleInputOffset = ThreeAInputs ? AOffset : BOffset;
15866 ArrayRef<int> TripleInputs = ThreeAInputs ? AToAInputs : BToAInputs;
15867 int OneInput = ThreeAInputs ? BToAInputs[0] : AToAInputs[0];
15868 int TripleInputSum = 0 + 1 + 2 + 3 + (4 * TripleInputOffset);
15869 int TripleNonInputIdx =
15870 TripleInputSum - std::accumulate(TripleInputs.begin(), TripleInputs.end(), 0);
15871 TripleDWord = TripleNonInputIdx / 2;
15872
15873 // We use xor with one to compute the adjacent DWord to whichever one the
15874 // OneInput is in.
15875 OneInputDWord = (OneInput / 2) ^ 1;
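 // For a concrete example of the two computations above: with AOffset == 4,
 // AToAInputs == {4, 6, 7} and BToAInputs == {1}, TripleInputSum is
 // (0+1+2+3) + 4*4 == 22 and the three inputs sum to 17, so
 // TripleNonInputIdx == 5 and TripleDWord == 2 (the dword of the triple half
 // that still contains a free word). OneInput == 1 lies in dword 0, so
 // OneInputDWord == (1 / 2) ^ 1 == 1, the dword adjacent to it.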
15876
15877 // Check for one tricky case: We're fixing a 3<-1 or a 1<-3 shuffle for AToA
15878 // and BToA inputs. If there is also such a problem with the BToB and AToB
15879 // inputs, we don't try to fix it necessarily -- we'll recurse and see it in
15880 // the next pass. However, if we have a 2<-2 in the BToB and AToB inputs, it
15881 // is essential that we don't *create* a 3<-1 as then we might oscillate.
15882 if (BToBInputs.size() == 2 && AToBInputs.size() == 2) {
15883 // Compute how many inputs will be flipped by swapping these DWords. We
15884 // need
15885 // to balance this to ensure we don't form a 3-1 shuffle in the other
15886 // half.
15887 int NumFlippedAToBInputs = llvm::count(AToBInputs, 2 * ADWord) +
15888 llvm::count(AToBInputs, 2 * ADWord + 1);
15889 int NumFlippedBToBInputs = llvm::count(BToBInputs, 2 * BDWord) +
15890 llvm::count(BToBInputs, 2 * BDWord + 1);
15891 if ((NumFlippedAToBInputs == 1 &&
15892 (NumFlippedBToBInputs == 0 || NumFlippedBToBInputs == 2)) ||
15893 (NumFlippedBToBInputs == 1 &&
15894 (NumFlippedAToBInputs == 0 || NumFlippedAToBInputs == 2))) {
15895 // We choose whether to fix the A half or B half based on whether that
15896 // half has zero flipped inputs. At zero, we may not be able to fix it
15897 // with that half. We also bias towards fixing the B half because that
15898 // will more commonly be the high half, and we have to bias one way.
15899 auto FixFlippedInputs = [&V, &DL, &Mask, &DAG](int PinnedIdx, int DWord,
15900 ArrayRef<int> Inputs) {
15901 int FixIdx = PinnedIdx ^ 1; // The adjacent slot to the pinned slot.
15902 bool IsFixIdxInput = is_contained(Inputs, PinnedIdx ^ 1);
15903 // Determine whether the free index is in the flipped dword or the
15904 // unflipped dword based on where the pinned index is. We use this bit
15905 // in an xor to conditionally select the adjacent dword.
15906 int FixFreeIdx = 2 * (DWord ^ (PinnedIdx / 2 == DWord));
15907 bool IsFixFreeIdxInput = is_contained(Inputs, FixFreeIdx);
15908 if (IsFixIdxInput == IsFixFreeIdxInput)
15909 FixFreeIdx += 1;
15910 IsFixFreeIdxInput = is_contained(Inputs, FixFreeIdx);
15911           assert(IsFixIdxInput != IsFixFreeIdxInput &&
15912                  "We need to be changing the number of flipped inputs!");
15913 int PSHUFHalfMask[] = {0, 1, 2, 3};
15914 std::swap(PSHUFHalfMask[FixFreeIdx % 4], PSHUFHalfMask[FixIdx % 4]);
15915 V = DAG.getNode(
15916 FixIdx < 4 ? X86ISD::PSHUFLW : X86ISD::PSHUFHW, DL,
15917 MVT::getVectorVT(MVT::i16, V.getValueSizeInBits() / 16), V,
15918 getV4X86ShuffleImm8ForMask(PSHUFHalfMask, DL, DAG));
15919
15920 for (int &M : Mask)
15921 if (M >= 0 && M == FixIdx)
15922 M = FixFreeIdx;
15923 else if (M >= 0 && M == FixFreeIdx)
15924 M = FixIdx;
15925 };
15926 if (NumFlippedBToBInputs != 0) {
15927 int BPinnedIdx =
15928 BToAInputs.size() == 3 ? TripleNonInputIdx : OneInput;
15929 FixFlippedInputs(BPinnedIdx, BDWord, BToBInputs);
15930 } else {
15931           assert(NumFlippedAToBInputs != 0 && "Impossible given predicates!");
15932 int APinnedIdx = ThreeAInputs ? TripleNonInputIdx : OneInput;
15933 FixFlippedInputs(APinnedIdx, ADWord, AToBInputs);
15934 }
15935 }
15936 }
15937
15938 int PSHUFDMask[] = {0, 1, 2, 3};
15939 PSHUFDMask[ADWord] = BDWord;
15940 PSHUFDMask[BDWord] = ADWord;
15941 V = DAG.getBitcast(
15942 VT,
15943 DAG.getNode(X86ISD::PSHUFD, DL, PSHUFDVT, DAG.getBitcast(PSHUFDVT, V),
15944 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
15945
15946 // Adjust the mask to match the new locations of A and B.
15947 for (int &M : Mask)
15948 if (M >= 0 && M/2 == ADWord)
15949 M = 2 * BDWord + M % 2;
15950 else if (M >= 0 && M/2 == BDWord)
15951 M = 2 * ADWord + M % 2;
15952
15953 // Recurse back into this routine to re-compute state now that this isn't
15954 // a 3 and 1 problem.
15955 return lowerV8I16GeneralSingleInputShuffle(DL, VT, V, Mask, Subtarget, DAG);
15956 };
15957 if ((NumLToL == 3 && NumHToL == 1) || (NumLToL == 1 && NumHToL == 3))
15958 return balanceSides(LToLInputs, HToLInputs, HToHInputs, LToHInputs, 0, 4);
15959 if ((NumHToH == 3 && NumLToH == 1) || (NumHToH == 1 && NumLToH == 3))
15960 return balanceSides(HToHInputs, LToHInputs, LToLInputs, HToLInputs, 4, 0);
15961
15962 // At this point there are at most two inputs to the low and high halves from
15963 // each half. That means the inputs can always be grouped into dwords and
15964 // those dwords can then be moved to the correct half with a dword shuffle.
15965 // We use at most one low and one high word shuffle to collect these paired
15966 // inputs into dwords, and finally a dword shuffle to place them.
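 // For example, if the remaining mask were {4, 5, 6, 7, 0, 1, 2, 3} (each
 // half wanting the other half's dwords), the word shuffles below would be
 // no-ops and the whole fix-up would collapse to a single PSHUFD with dword
 // mask {2, 3, 0, 1}. Whether such a mask actually reaches this code depends
 // on the earlier lowering steps; the numbers are shown only to illustrate
 // the dword-grouping idea.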
15967 int PSHUFLMask[4] = {-1, -1, -1, -1};
15968 int PSHUFHMask[4] = {-1, -1, -1, -1};
15969 int PSHUFDMask[4] = {-1, -1, -1, -1};
15970
15971 // First fix the masks for all the inputs that are staying in their
15972 // original halves. This will then dictate the targets of the cross-half
15973 // shuffles.
15974 auto fixInPlaceInputs =
15975 [&PSHUFDMask](ArrayRef<int> InPlaceInputs, ArrayRef<int> IncomingInputs,
15976 MutableArrayRef<int> SourceHalfMask,
15977 MutableArrayRef<int> HalfMask, int HalfOffset) {
15978 if (InPlaceInputs.empty())
15979 return;
15980 if (InPlaceInputs.size() == 1) {
15981 SourceHalfMask[InPlaceInputs[0] - HalfOffset] =
15982 InPlaceInputs[0] - HalfOffset;
15983 PSHUFDMask[InPlaceInputs[0] / 2] = InPlaceInputs[0] / 2;
15984 return;
15985 }
15986 if (IncomingInputs.empty()) {
15987 // Just fix all of the in place inputs.
15988 for (int Input : InPlaceInputs) {
15989 SourceHalfMask[Input - HalfOffset] = Input - HalfOffset;
15990 PSHUFDMask[Input / 2] = Input / 2;
15991 }
15992 return;
15993 }
15994
15995         assert(InPlaceInputs.size() == 2 && "Cannot handle 3 or 4 inputs!");
15996 SourceHalfMask[InPlaceInputs[0] - HalfOffset] =
15997 InPlaceInputs[0] - HalfOffset;
15998 // Put the second input next to the first so that they are packed into
15999 // a dword. We find the adjacent index by toggling the low bit.
16000 int AdjIndex = InPlaceInputs[0] ^ 1;
16001 SourceHalfMask[AdjIndex - HalfOffset] = InPlaceInputs[1] - HalfOffset;
16002 std::replace(HalfMask.begin(), HalfMask.end(), InPlaceInputs[1], AdjIndex);
16003 PSHUFDMask[AdjIndex / 2] = AdjIndex / 2;
16004 };
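 // As an example of the two-input case above: with HalfOffset == 0 and
 // InPlaceInputs == {0, 3}, SourceHalfMask becomes {0, 3, -1, -1}, every 3 in
 // HalfMask is rewritten to 1, and PSHUFDMask[0] is pinned to 0, leaving
 // dword 1 of this half free for the incoming cross-half inputs.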
16005 fixInPlaceInputs(LToLInputs, HToLInputs, PSHUFLMask, LoMask, 0);
16006 fixInPlaceInputs(HToHInputs, LToHInputs, PSHUFHMask, HiMask, 4);
16007
16008 // Now gather the cross-half inputs and place them into a free dword of
16009 // their target half.
16010 // FIXME: This operation could almost certainly be simplified dramatically to
16011 // look more like the 3-1 fixing operation.
16012 auto moveInputsToRightHalf = [&PSHUFDMask](
16013 MutableArrayRef<int> IncomingInputs, ArrayRef<int> ExistingInputs,
16014 MutableArrayRef<int> SourceHalfMask, MutableArrayRef<int> HalfMask,
16015 MutableArrayRef<int> FinalSourceHalfMask, int SourceOffset,
16016 int DestOffset) {
16017 auto isWordClobbered = [](ArrayRef<int> SourceHalfMask, int Word) {
16018 return SourceHalfMask[Word] >= 0 && SourceHalfMask[Word] != Word;
16019 };
16020 auto isDWordClobbered = [&isWordClobbered](ArrayRef<int> SourceHalfMask,
16021 int Word) {
16022 int LowWord = Word & ~1;
16023 int HighWord = Word | 1;
16024 return isWordClobbered(SourceHalfMask, LowWord) ||
16025 isWordClobbered(SourceHalfMask, HighWord);
16026 };
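 // E.g. a SourceHalfMask of {2, -1, -1, -1} means word 0 of this half has
 // already been repurposed to hold word 2, so word 0 (and hence dword 0) is
 // considered clobbered while dword 1 is still clean.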
16027
16028 if (IncomingInputs.empty())
16029 return;
16030
16031 if (ExistingInputs.empty()) {
16032 // Map any dwords with inputs from them into the right half.
16033 for (int Input : IncomingInputs) {
16034 // If the source half mask maps over the inputs, turn those into
16035 // swaps and use the swapped lane.
16036 if (isWordClobbered(SourceHalfMask, Input - SourceOffset)) {
16037 if (SourceHalfMask[SourceHalfMask[Input - SourceOffset]] < 0) {
16038 SourceHalfMask[SourceHalfMask[Input - SourceOffset]] =
16039 Input - SourceOffset;
16040 // We have to swap the uses in our half mask in one sweep.
16041 for (int &M : HalfMask)
16042 if (M == SourceHalfMask[Input - SourceOffset] + SourceOffset)
16043 M = Input;
16044 else if (M == Input)
16045 M = SourceHalfMask[Input - SourceOffset] + SourceOffset;
16046 } else {
16047             assert(SourceHalfMask[SourceHalfMask[Input - SourceOffset]] ==
16048                        Input - SourceOffset &&
16049                    "Previous placement doesn't match!");
16050 }
16051 // Note that this correctly re-maps both when we do a swap and when
16052 // we observe the other side of the swap above. We rely on that to
16053 // avoid swapping the members of the input list directly.
16054 Input = SourceHalfMask[Input - SourceOffset] + SourceOffset;
16055 }
16056
16057 // Map the input's dword into the correct half.
16058 if (PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] < 0)
16059 PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] = Input / 2;
16060 else
16061           assert(PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] ==
16062                      Input / 2 &&
16063                  "Previous placement doesn't match!");
16064 }
16065
16066 // And just directly shift any other-half mask elements to be same-half
16067 // as we will have mirrored the dword containing the element into the
16068 // same position within that half.
16069 for (int &M : HalfMask)
16070 if (M >= SourceOffset && M < SourceOffset + 4) {
16071 M = M - SourceOffset + DestOffset;
16072           assert(M >= 0 && "This should never wrap below zero!");
16073 }
16074 return;
16075 }
16076
16077 // Ensure we have the input in a viable dword of its current half. This
16078 // is particularly tricky because the original position may be clobbered
16079 // by inputs being moved and *staying* in that half.
16080 if (IncomingInputs.size() == 1) {
16081 if (isWordClobbered(SourceHalfMask, IncomingInputs[0] - SourceOffset)) {
16082 int InputFixed = find(SourceHalfMask, -1) - std::begin(SourceHalfMask) +
16083 SourceOffset;
16084 SourceHalfMask[InputFixed - SourceOffset] =
16085 IncomingInputs[0] - SourceOffset;
16086 std::replace(HalfMask.begin(), HalfMask.end(), IncomingInputs[0],
16087 InputFixed);
16088 IncomingInputs[0] = InputFixed;
16089 }
16090 } else if (IncomingInputs.size() == 2) {
16091 if (IncomingInputs[0] / 2 != IncomingInputs[1] / 2 ||
16092 isDWordClobbered(SourceHalfMask, IncomingInputs[0] - SourceOffset)) {
16093 // We have two non-adjacent or clobbered inputs we need to extract from
16094 // the source half. To do this, we need to map them into some adjacent
16095 // dword slot in the source mask.
16096 int InputsFixed[2] = {IncomingInputs[0] - SourceOffset,
16097 IncomingInputs[1] - SourceOffset};
16098
16099 // If there is a free slot in the source half mask adjacent to one of
16100 // the inputs, place the other input in it. We use (Index XOR 1) to
16101 // compute an adjacent index.
16102 if (!isWordClobbered(SourceHalfMask, InputsFixed[0]) &&
16103 SourceHalfMask[InputsFixed[0] ^ 1] < 0) {
16104 SourceHalfMask[InputsFixed[0]] = InputsFixed[0];
16105 SourceHalfMask[InputsFixed[0] ^ 1] = InputsFixed[1];
16106 InputsFixed[1] = InputsFixed[0] ^ 1;
16107 } else if (!isWordClobbered(SourceHalfMask, InputsFixed[1]) &&
16108 SourceHalfMask[InputsFixed[1] ^ 1] < 0) {
16109 SourceHalfMask[InputsFixed[1]] = InputsFixed[1];
16110 SourceHalfMask[InputsFixed[1] ^ 1] = InputsFixed[0];
16111 InputsFixed[0] = InputsFixed[1] ^ 1;
16112 } else if (SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1)] < 0 &&
16113 SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1) + 1] < 0) {
16114 // The two inputs are in the same DWord but it is clobbered and the
16115 // adjacent DWord isn't used at all. Move both inputs to the free
16116 // slot.
16117 SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1)] = InputsFixed[0];
16118 SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1) + 1] = InputsFixed[1];
16119 InputsFixed[0] = 2 * ((InputsFixed[0] / 2) ^ 1);
16120 InputsFixed[1] = 2 * ((InputsFixed[0] / 2) ^ 1) + 1;
16121 } else {
16122 // The only way we hit this point is if there is no clobbering
16123 // (because there are no off-half inputs to this half) and there is no
16124 // free slot adjacent to one of the inputs. In this case, we have to
16125 // swap an input with a non-input.
16126 for (int i = 0; i < 4; ++i)
16127             assert((SourceHalfMask[i] < 0 || SourceHalfMask[i] == i) &&
16128                    "We can't handle any clobbers here!");
16129           assert(InputsFixed[1] != (InputsFixed[0] ^ 1) &&
16130                  "Cannot have adjacent inputs here!");
16131
16132 SourceHalfMask[InputsFixed[0] ^ 1] = InputsFixed[1];
16133 SourceHalfMask[InputsFixed[1]] = InputsFixed[0] ^ 1;
16134
16135 // We also have to update the final source mask in this case because
16136 // it may need to undo the above swap.
16137 for (int &M : FinalSourceHalfMask)
16138 if (M == (InputsFixed[0] ^ 1) + SourceOffset)
16139 M = InputsFixed[1] + SourceOffset;
16140 else if (M == InputsFixed[1] + SourceOffset)
16141 M = (InputsFixed[0] ^ 1) + SourceOffset;
16142
16143 InputsFixed[1] = InputsFixed[0] ^ 1;
16144 }
16145
16146 // Point everything at the fixed inputs.
16147 for (int &M : HalfMask)
16148 if (M == IncomingInputs[0])
16149 M = InputsFixed[0] + SourceOffset;
16150 else if (M == IncomingInputs[1])
16151 M = InputsFixed[1] + SourceOffset;
16152
16153 IncomingInputs[0] = InputsFixed[0] + SourceOffset;
16154 IncomingInputs[1] = InputsFixed[1] + SourceOffset;
16155 }
16156 } else {
16157       llvm_unreachable("Unhandled input size!");
16158 }
16159
16160 // Now hoist the DWord down to the right half.
16161 int FreeDWord = (PSHUFDMask[DestOffset / 2] < 0 ? 0 : 1) + DestOffset / 2;
16162     assert(PSHUFDMask[FreeDWord] < 0 && "DWord not free");
16163 PSHUFDMask[FreeDWord] = IncomingInputs[0] / 2;
16164 for (int &M : HalfMask)
16165 for (int Input : IncomingInputs)
16166 if (M == Input)
16167 M = FreeDWord * 2 + Input % 2;
16168 };
16169 moveInputsToRightHalf(HToLInputs, LToLInputs, PSHUFHMask, LoMask, HiMask,
16170 /*SourceOffset*/ 4, /*DestOffset*/ 0);
16171 moveInputsToRightHalf(LToHInputs, HToHInputs, PSHUFLMask, HiMask, LoMask,
16172 /*SourceOffset*/ 0, /*DestOffset*/ 4);
16173
16174 // Now enact all the shuffles we've computed to move the inputs into their
16175 // target half.
16176 if (!isNoopShuffleMask(PSHUFLMask))
16177 V = DAG.getNode(X86ISD::PSHUFLW, DL, VT, V,
16178 getV4X86ShuffleImm8ForMask(PSHUFLMask, DL, DAG));
16179 if (!isNoopShuffleMask(PSHUFHMask))
16180 V = DAG.getNode(X86ISD::PSHUFHW, DL, VT, V,
16181 getV4X86ShuffleImm8ForMask(PSHUFHMask, DL, DAG));
16182 if (!isNoopShuffleMask(PSHUFDMask))
16183 V = DAG.getBitcast(
16184 VT,
16185 DAG.getNode(X86ISD::PSHUFD, DL, PSHUFDVT, DAG.getBitcast(PSHUFDVT, V),
16186 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
16187
16188 // At this point, each half should contain all its inputs, and we can then
16189 // just shuffle them into their final position.
16190   assert(count_if(LoMask, [](int M) { return M >= 4; }) == 0 &&
16191          "Failed to lift all the high half inputs to the low mask!");
16192   assert(count_if(HiMask, [](int M) { return M >= 0 && M < 4; }) == 0 &&
16193          "Failed to lift all the low half inputs to the high mask!");
16194
16195 // Do a half shuffle for the low mask.
16196 if (!isNoopShuffleMask(LoMask))
16197 V = DAG.getNode(X86ISD::PSHUFLW, DL, VT, V,
16198 getV4X86ShuffleImm8ForMask(LoMask, DL, DAG));
16199
16200 // Do a half shuffle with the high mask after shifting its values down.
16201 for (int &M : HiMask)
16202 if (M >= 0)
16203 M -= 4;
16204 if (!isNoopShuffleMask(HiMask))
16205 V = DAG.getNode(X86ISD::PSHUFHW, DL, VT, V,
16206 getV4X86ShuffleImm8ForMask(HiMask, DL, DAG));
16207
16208 return V;
16209}
16210
16211/// Helper to form a PSHUFB-based shuffle+blend, opportunistically avoiding the
16212/// blend if only one input is used.
16213static SDValue lowerShuffleAsBlendOfPSHUFBs(
16214 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
16215 const APInt &Zeroable, SelectionDAG &DAG, bool &V1InUse, bool &V2InUse) {
16216   assert(!is128BitLaneCrossingShuffleMask(VT, Mask) &&
16217          "Lane crossing shuffle masks not supported");
16218
16219 int NumBytes = VT.getSizeInBits() / 8;
16220 int Size = Mask.size();
16221 int Scale = NumBytes / Size;
16222
16223 SmallVector<SDValue, 64> V1Mask(NumBytes, DAG.getUNDEF(MVT::i8));
16224 SmallVector<SDValue, 64> V2Mask(NumBytes, DAG.getUNDEF(MVT::i8));
16225 V1InUse = false;
16226 V2InUse = false;
16227
16228 for (int i = 0; i < NumBytes; ++i) {
16229 int M = Mask[i / Scale];
16230 if (M < 0)
16231 continue;
16232
16233 const int ZeroMask = 0x80;
16234 int V1Idx = M < Size ? M * Scale + i % Scale : ZeroMask;
16235 int V2Idx = M < Size ? ZeroMask : (M - Size) * Scale + i % Scale;
16236 if (Zeroable[i / Scale])
16237 V1Idx = V2Idx = ZeroMask;
16238
16239 V1Mask[i] = DAG.getConstant(V1Idx, DL, MVT::i8);
16240 V2Mask[i] = DAG.getConstant(V2Idx, DL, MVT::i8);
16241 V1InUse |= (ZeroMask != V1Idx);
16242 V2InUse |= (ZeroMask != V2Idx);
16243 }
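 // A sketch of the masks this loop builds, assuming VT == MVT::v8i16,
 // Mask == {0, 9, 2, 11, 4, 13, 6, 15} and no zeroable elements: Scale == 2, so
 //   V1Mask = {0,1, 0x80,0x80, 4,5, 0x80,0x80, 8,9, 0x80,0x80, 12,13, 0x80,0x80}
 //   V2Mask = {0x80,0x80, 2,3, 0x80,0x80, 6,7, 0x80,0x80, 10,11, 0x80,0x80, 14,15}
 // Both inputs end up in use, so the two PSHUFBs below are ORed together.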
16244
16245 MVT ShufVT = MVT::getVectorVT(MVT::i8, NumBytes);
16246 if (V1InUse)
16247 V1 = DAG.getNode(X86ISD::PSHUFB, DL, ShufVT, DAG.getBitcast(ShufVT, V1),
16248 DAG.getBuildVector(ShufVT, DL, V1Mask));
16249 if (V2InUse)
16250 V2 = DAG.getNode(X86ISD::PSHUFB, DL, ShufVT, DAG.getBitcast(ShufVT, V2),
16251 DAG.getBuildVector(ShufVT, DL, V2Mask));
16252
16253 // If we need shuffled inputs from both, blend the two.
16254 SDValue V;
16255 if (V1InUse && V2InUse)
16256 V = DAG.getNode(ISD::OR, DL, ShufVT, V1, V2);
16257 else
16258 V = V1InUse ? V1 : V2;
16259
16260 // Cast the result back to the correct type.
16261 return DAG.getBitcast(VT, V);
16262}
16263
16264/// Generic lowering of 8-lane i16 shuffles.
16265///
16266/// This handles both single-input shuffles and combined shuffle/blends with
16267/// two inputs. The single input shuffles are immediately delegated to
16268/// a dedicated lowering routine.
16269///
16270/// The blends are lowered in one of three fundamental ways. If there are few
16271/// enough inputs, it delegates to a basic UNPCK-based strategy. If the shuffle
16272/// of the input is significantly cheaper when lowered as an interleaving of
16273/// the two inputs, try to interleave them. Otherwise, blend the low and high
16274/// halves of the inputs separately (making them have relatively few inputs)
16275/// and then concatenate them.
16276static SDValue lowerV8I16Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
16277 const APInt &Zeroable, SDValue V1, SDValue V2,
16278 const X86Subtarget &Subtarget,
16279 SelectionDAG &DAG) {
16280   assert(V1.getSimpleValueType() == MVT::v8i16 && "Bad operand type!");
16281   assert(V2.getSimpleValueType() == MVT::v8i16 && "Bad operand type!");
16282   assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
16283
16284 // Whenever we can lower this as a zext, that instruction is strictly faster
16285 // than any alternative.
16286 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(DL, MVT::v8i16, V1, V2, Mask,
16287 Zeroable, Subtarget, DAG))
16288 return ZExt;
16289
16290 // Try to lower using a truncation.
16291 if (SDValue V = lowerShuffleWithVPMOV(DL, MVT::v8i16, V1, V2, Mask, Zeroable,
16292 Subtarget, DAG))
16293 return V;
16294
16295 int NumV2Inputs = count_if(Mask, [](int M) { return M >= 8; });
16296
16297 if (NumV2Inputs == 0) {
16298 // Try to use shift instructions.
16299 if (SDValue Shift =
16300 lowerShuffleAsShift(DL, MVT::v8i16, V1, V1, Mask, Zeroable,
16301 Subtarget, DAG, /*BitwiseOnly*/ false))
16302 return Shift;
16303
16304 // Check for being able to broadcast a single element.
16305 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v8i16, V1, V2,
16306 Mask, Subtarget, DAG))
16307 return Broadcast;
16308
16309 // Try to use bit rotation instructions.
16310 if (SDValue Rotate = lowerShuffleAsBitRotate(DL, MVT::v8i16, V1, Mask,
16311 Subtarget, DAG))
16312 return Rotate;
16313
16314 // Use dedicated unpack instructions for masks that match their pattern.
16315 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v8i16, Mask, V1, V2, DAG))
16316 return V;
16317
16318 // Use dedicated pack instructions for masks that match their pattern.
16319 if (SDValue V = lowerShuffleWithPACK(DL, MVT::v8i16, Mask, V1, V2, DAG,
16320 Subtarget))
16321 return V;
16322
16323 // Try to use byte rotation instructions.
16324 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v8i16, V1, V1, Mask,
16325 Subtarget, DAG))
16326 return Rotate;
16327
16328 // Make a copy of the mask so it can be modified.
16329 SmallVector<int, 8> MutableMask(Mask);
16330 return lowerV8I16GeneralSingleInputShuffle(DL, MVT::v8i16, V1, MutableMask,
16331 Subtarget, DAG);
16332 }
16333
16334   assert(llvm::any_of(Mask, [](int M) { return M >= 0 && M < 8; }) &&
16335          "All single-input shuffles should be canonicalized to be V1-input "
16336          "shuffles.");
16337
16338 // Try to use shift instructions.
16339 if (SDValue Shift =
16340 lowerShuffleAsShift(DL, MVT::v8i16, V1, V2, Mask, Zeroable, Subtarget,
16341 DAG, /*BitwiseOnly*/ false))
16342 return Shift;
16343
16344 // See if we can use SSE4A Extraction / Insertion.
16345 if (Subtarget.hasSSE4A())
16346 if (SDValue V = lowerShuffleWithSSE4A(DL, MVT::v8i16, V1, V2, Mask,
16347 Zeroable, DAG))
16348 return V;
16349
16350 // There are special ways we can lower some single-element blends.
16351 if (NumV2Inputs == 1)
16352 if (SDValue V = lowerShuffleAsElementInsertion(
16353 DL, MVT::v8i16, V1, V2, Mask, Zeroable, Subtarget, DAG))
16354 return V;
16355
16356 // We have different paths for blend lowering, but they all must use the
16357 // *exact* same predicate.
16358 bool IsBlendSupported = Subtarget.hasSSE41();
16359 if (IsBlendSupported)
16360 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v8i16, V1, V2, Mask,
16361 Zeroable, Subtarget, DAG))
16362 return Blend;
16363
16364 if (SDValue Masked = lowerShuffleAsBitMask(DL, MVT::v8i16, V1, V2, Mask,
16365 Zeroable, Subtarget, DAG))
16366 return Masked;
16367
16368 // Use dedicated unpack instructions for masks that match their pattern.
16369 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v8i16, Mask, V1, V2, DAG))
16370 return V;
16371
16372 // Use dedicated pack instructions for masks that match their pattern.
16373 if (SDValue V = lowerShuffleWithPACK(DL, MVT::v8i16, Mask, V1, V2, DAG,
16374 Subtarget))
16375 return V;
16376
16377 // Try to lower using a truncation.
16378 if (SDValue V = lowerShuffleAsVTRUNC(DL, MVT::v8i16, V1, V2, Mask, Zeroable,
16379 Subtarget, DAG))
16380 return V;
16381
16382 // Try to use byte rotation instructions.
16383 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v8i16, V1, V2, Mask,
16384 Subtarget, DAG))
16385 return Rotate;
16386
16387 if (SDValue BitBlend =
16388 lowerShuffleAsBitBlend(DL, MVT::v8i16, V1, V2, Mask, DAG))
16389 return BitBlend;
16390
16391 // Try to use byte shift instructions to mask.
16392 if (SDValue V = lowerShuffleAsByteShiftMask(DL, MVT::v8i16, V1, V2, Mask,
16393 Zeroable, Subtarget, DAG))
16394 return V;
16395
16396 // Attempt to lower using compaction, SSE41 is necessary for PACKUSDW.
16397 // We could use SIGN_EXTEND_INREG+PACKSSDW for older targets but this seems to
16398 // be slower than a PSHUFLW+PSHUFHW+PSHUFD chain.
16399 int NumEvenDrops = canLowerByDroppingElements(Mask, true, false);
16400 if ((NumEvenDrops == 1 || NumEvenDrops == 2) && Subtarget.hasSSE41() &&
16401 !Subtarget.hasVLX()) {
16402 // Check if this is part of a 256-bit vector truncation.
16403 if (NumEvenDrops == 2 && Subtarget.hasAVX2() &&
16404 peekThroughBitcasts(V1).getOpcode() == ISD::EXTRACT_SUBVECTOR &&
16405 peekThroughBitcasts(V2).getOpcode() == ISD::EXTRACT_SUBVECTOR) {
16406 SDValue V1V2 = concatSubVectors(V1, V2, DAG, DL);
16407 V1V2 = DAG.getNode(X86ISD::BLENDI, DL, MVT::v16i16, V1V2,
16408 getZeroVector(MVT::v16i16, Subtarget, DAG, DL),
16409 DAG.getTargetConstant(0xEE, DL, MVT::i8));
16410 V1V2 = DAG.getBitcast(MVT::v8i32, V1V2);
16411 V1 = extract128BitVector(V1V2, 0, DAG, DL);
16412 V2 = extract128BitVector(V1V2, 4, DAG, DL);
16413 } else {
16414 SmallVector<SDValue, 4> DWordClearOps(4,
16415 DAG.getConstant(0, DL, MVT::i32));
16416 for (unsigned i = 0; i != 4; i += 1 << (NumEvenDrops - 1))
16417 DWordClearOps[i] = DAG.getConstant(0xFFFF, DL, MVT::i32);
16418 SDValue DWordClearMask =
16419 DAG.getBuildVector(MVT::v4i32, DL, DWordClearOps);
16420 V1 = DAG.getNode(ISD::AND, DL, MVT::v4i32, DAG.getBitcast(MVT::v4i32, V1),
16421 DWordClearMask);
16422 V2 = DAG.getNode(ISD::AND, DL, MVT::v4i32, DAG.getBitcast(MVT::v4i32, V2),
16423 DWordClearMask);
16424 }
16425 // Now pack things back together.
16426 SDValue Result = DAG.getNode(X86ISD::PACKUS, DL, MVT::v8i16, V1, V2);
16427 if (NumEvenDrops == 2) {
16428 Result = DAG.getBitcast(MVT::v4i32, Result);
16429 Result = DAG.getNode(X86ISD::PACKUS, DL, MVT::v8i16, Result, Result);
16430 }
16431 return Result;
16432 }
16433
16434 // When compacting odd (upper) elements, use PACKSS pre-SSE41.
16435 int NumOddDrops = canLowerByDroppingElements(Mask, false, false);
16436 if (NumOddDrops == 1) {
16437 bool HasSSE41 = Subtarget.hasSSE41();
16438 V1 = DAG.getNode(HasSSE41 ? X86ISD::VSRLI : X86ISD::VSRAI, DL, MVT::v4i32,
16439 DAG.getBitcast(MVT::v4i32, V1),
16440 DAG.getTargetConstant(16, DL, MVT::i8));
16441 V2 = DAG.getNode(HasSSE41 ? X86ISD::VSRLI : X86ISD::VSRAI, DL, MVT::v4i32,
16442 DAG.getBitcast(MVT::v4i32, V2),
16443 DAG.getTargetConstant(16, DL, MVT::i8));
16444 return DAG.getNode(HasSSE41 ? X86ISD::PACKUS : X86ISD::PACKSS, DL,
16445 MVT::v8i16, V1, V2);
16446 }
16447
16448 // Try to lower by permuting the inputs into an unpack instruction.
16449 if (SDValue Unpack = lowerShuffleAsPermuteAndUnpack(DL, MVT::v8i16, V1, V2,
16450 Mask, Subtarget, DAG))
16451 return Unpack;
16452
16453 // If we can't directly blend but can use PSHUFB, that will be better as it
16454 // can both shuffle and set up the inefficient blend.
16455 if (!IsBlendSupported && Subtarget.hasSSSE3()) {
16456 bool V1InUse, V2InUse;
16457 return lowerShuffleAsBlendOfPSHUFBs(DL, MVT::v8i16, V1, V2, Mask,
16458 Zeroable, DAG, V1InUse, V2InUse);
16459 }
16460
16461 // We can always bit-blend if we have to, so the fallback strategy is to
16462 // decompose into single-input permutes and blends/unpacks.
16463 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v8i16, V1, V2,
16464 Mask, Subtarget, DAG);
16465}
16466
16467/// Lower 8-lane 16-bit floating point shuffles.
16468static SDValue lowerV8F16Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
16469 const APInt &Zeroable, SDValue V1, SDValue V2,
16470 const X86Subtarget &Subtarget,
16471 SelectionDAG &DAG) {
16472   assert(V1.getSimpleValueType() == MVT::v8f16 && "Bad operand type!");
16473   assert(V2.getSimpleValueType() == MVT::v8f16 && "Bad operand type!");
16474   assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
16475 int NumV2Elements = count_if(Mask, [](int M) { return M >= 8; });
16476
16477 if (Subtarget.hasFP16()) {
16478 if (NumV2Elements == 0) {
16479 // Check for being able to broadcast a single element.
16480 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v8f16, V1, V2,
16481 Mask, Subtarget, DAG))
16482 return Broadcast;
16483 }
16484 if (NumV2Elements == 1 && Mask[0] >= 8)
16485 if (SDValue V = lowerShuffleAsElementInsertion(
16486 DL, MVT::v8f16, V1, V2, Mask, Zeroable, Subtarget, DAG))
16487 return V;
16488 }
16489
16490 V1 = DAG.getBitcast(MVT::v8i16, V1);
16491 V2 = DAG.getBitcast(MVT::v8i16, V2);
16492 return DAG.getBitcast(MVT::v8f16,
16493 DAG.getVectorShuffle(MVT::v8i16, DL, V1, V2, Mask));
16494}
16495
16496 // Lowers a unary/binary shuffle as VPERMV/VPERMV3. For non-VLX targets,
16497// sub-512-bit shuffles are padded to 512-bits for the shuffle and then
16498// the active subvector is extracted.
16499static SDValue lowerShuffleWithPERMV(const SDLoc &DL, MVT VT,
16500 ArrayRef<int> Mask, SDValue V1, SDValue V2,
16501 const X86Subtarget &Subtarget,
16502 SelectionDAG &DAG) {
16503 MVT MaskVT = VT.changeTypeToInteger();
16504 SDValue MaskNode;
16505 MVT ShuffleVT = VT;
16506 if (!VT.is512BitVector() && !Subtarget.hasVLX()) {
16507 V1 = widenSubVector(V1, false, Subtarget, DAG, DL, 512);
16508 V2 = widenSubVector(V2, false, Subtarget, DAG, DL, 512);
16509 ShuffleVT = V1.getSimpleValueType();
16510
16511 // Adjust mask to correct indices for the second input.
16512 int NumElts = VT.getVectorNumElements();
16513 unsigned Scale = 512 / VT.getSizeInBits();
16514 SmallVector<int, 32> AdjustedMask(Mask);
16515 for (int &M : AdjustedMask)
16516 if (NumElts <= M)
16517 M += (Scale - 1) * NumElts;
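 // For instance, a v16i8 shuffle on a non-VLX target is widened to v64i8
 // here: Scale == 512 / 128 == 4, so a mask entry of 16 (the first element
 // of V2) becomes 16 + 3 * 16 == 64, which is where V2's first element lives
 // in the widened VPERMV3 operand numbering.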
16518 MaskNode = getConstVector(AdjustedMask, MaskVT, DAG, DL, true);
16519 MaskNode = widenSubVector(MaskNode, false, Subtarget, DAG, DL, 512);
16520 } else {
16521 MaskNode = getConstVector(Mask, MaskVT, DAG, DL, true);
16522 }
16523
16524 SDValue Result;
16525 if (V2.isUndef())
16526 Result = DAG.getNode(X86ISD::VPERMV, DL, ShuffleVT, MaskNode, V1);
16527 else
16528 Result = DAG.getNode(X86ISD::VPERMV3, DL, ShuffleVT, V1, MaskNode, V2);
16529
16530 if (VT != ShuffleVT)
16531 Result = extractSubVector(Result, 0, DAG, DL, VT.getSizeInBits());
16532
16533 return Result;
16534}
16535
16536/// Generic lowering of v16i8 shuffles.
16537///
16538/// This is a hybrid strategy to lower v16i8 vectors. It first attempts to
16539/// detect any complexity reducing interleaving. If that doesn't help, it uses
16540/// UNPCK to spread the i8 elements across two i16-element vectors, and uses
16541/// the existing lowering for v8i16 blends on each half, finally PACK-ing them
16542/// back together.
16543static SDValue lowerV16I8Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
16544 const APInt &Zeroable, SDValue V1, SDValue V2,
16545 const X86Subtarget &Subtarget,
16546 SelectionDAG &DAG) {
16547   assert(V1.getSimpleValueType() == MVT::v16i8 && "Bad operand type!");
16548   assert(V2.getSimpleValueType() == MVT::v16i8 && "Bad operand type!");
16549   assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
16550
16551 // Try to use shift instructions.
16552 if (SDValue Shift =
16553 lowerShuffleAsShift(DL, MVT::v16i8, V1, V2, Mask, Zeroable, Subtarget,
16554 DAG, /*BitwiseOnly*/ false))
16555 return Shift;
16556
16557 // Try to use byte rotation instructions.
16558 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v16i8, V1, V2, Mask,
16559 Subtarget, DAG))
16560 return Rotate;
16561
16562 // Use dedicated pack instructions for masks that match their pattern.
16563 if (SDValue V = lowerShuffleWithPACK(DL, MVT::v16i8, Mask, V1, V2, DAG,
16564 Subtarget))
16565 return V;
16566
16567 // Try to use a zext lowering.
16568 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(DL, MVT::v16i8, V1, V2, Mask,
16569 Zeroable, Subtarget, DAG))
16570 return ZExt;
16571
16572 // Try to lower using a truncation.
16573 if (SDValue V = lowerShuffleWithVPMOV(DL, MVT::v16i8, V1, V2, Mask, Zeroable,
16574 Subtarget, DAG))
16575 return V;
16576
16577 if (SDValue V = lowerShuffleAsVTRUNC(DL, MVT::v16i8, V1, V2, Mask, Zeroable,
16578 Subtarget, DAG))
16579 return V;
16580
16581 // See if we can use SSE4A Extraction / Insertion.
16582 if (Subtarget.hasSSE4A())
16583 if (SDValue V = lowerShuffleWithSSE4A(DL, MVT::v16i8, V1, V2, Mask,
16584 Zeroable, DAG))
16585 return V;
16586
16587 int NumV2Elements = count_if(Mask, [](int M) { return M >= 16; });
16588
16589 // For single-input shuffles, there are some nicer lowering tricks we can use.
16590 if (NumV2Elements == 0) {
16591 // Check for being able to broadcast a single element.
16592 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v16i8, V1, V2,
16593 Mask, Subtarget, DAG))
16594 return Broadcast;
16595
16596 // Try to use bit rotation instructions.
16597 if (SDValue Rotate = lowerShuffleAsBitRotate(DL, MVT::v16i8, V1, Mask,
16598 Subtarget, DAG))
16599 return Rotate;
16600
16601 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v16i8, Mask, V1, V2, DAG))
16602 return V;
16603
16604 // Check whether we can widen this to an i16 shuffle by duplicating bytes.
16605 // Notably, this handles splat and partial-splat shuffles more efficiently.
16606 // However, it only makes sense if the pre-duplication shuffle simplifies
16607 // things significantly. Currently, this means we need to be able to
16608 // express the pre-duplication shuffle as an i16 shuffle.
16609 //
16610 // FIXME: We should check for other patterns which can be widened into an
16611 // i16 shuffle as well.
16612 auto canWidenViaDuplication = [](ArrayRef<int> Mask) {
16613 for (int i = 0; i < 16; i += 2)
16614 if (Mask[i] >= 0 && Mask[i + 1] >= 0 && Mask[i] != Mask[i + 1])
16615 return false;
16616
16617 return true;
16618 };
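 // In other words, every even/odd byte pair must agree on its source byte
 // (or be undef): a mask such as {3,3, 7,7, 1,1, 5,5, ...} can be widened,
 // while {0,1, ...} is rejected at the first pair because Mask[0] != Mask[1].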
16619 auto tryToWidenViaDuplication = [&]() -> SDValue {
16620 if (!canWidenViaDuplication(Mask))
16621 return SDValue();
16622 SmallVector<int, 4> LoInputs;
16623 copy_if(Mask, std::back_inserter(LoInputs),
16624 [](int M) { return M >= 0 && M < 8; });
16625 array_pod_sort(LoInputs.begin(), LoInputs.end());
16626 LoInputs.erase(std::unique(LoInputs.begin(), LoInputs.end()),
16627 LoInputs.end());
16628 SmallVector<int, 4> HiInputs;
16629 copy_if(Mask, std::back_inserter(HiInputs), [](int M) { return M >= 8; });
16630 array_pod_sort(HiInputs.begin(), HiInputs.end());
16631 HiInputs.erase(std::unique(HiInputs.begin(), HiInputs.end()),
16632 HiInputs.end());
16633
16634 bool TargetLo = LoInputs.size() >= HiInputs.size();
16635 ArrayRef<int> InPlaceInputs = TargetLo ? LoInputs : HiInputs;
16636 ArrayRef<int> MovingInputs = TargetLo ? HiInputs : LoInputs;
16637
16638 int PreDupI16Shuffle[] = {-1, -1, -1, -1, -1, -1, -1, -1};
16639 SmallDenseMap<int, int, 8> LaneMap;
16640 for (int I : InPlaceInputs) {
16641 PreDupI16Shuffle[I/2] = I/2;
16642 LaneMap[I] = I;
16643 }
16644 int j = TargetLo ? 0 : 4, je = j + 4;
16645 for (int i = 0, ie = MovingInputs.size(); i < ie; ++i) {
16646 // Check if j is already a shuffle of this input. This happens when
16647 // there are two adjacent bytes after we move the low one.
16648 if (PreDupI16Shuffle[j] != MovingInputs[i] / 2) {
16649 // If we haven't yet mapped the input, search for a slot into which
16650 // we can map it.
16651 while (j < je && PreDupI16Shuffle[j] >= 0)
16652 ++j;
16653
16654 if (j == je)
16655 // We can't place the inputs into a single half with a simple i16 shuffle, so bail.
16656 return SDValue();
16657
16658 // Map this input with the i16 shuffle.
16659 PreDupI16Shuffle[j] = MovingInputs[i] / 2;
16660 }
16661
16662 // Update the lane map based on the mapping we ended up with.
16663 LaneMap[MovingInputs[i]] = 2 * j + MovingInputs[i] % 2;
16664 }
16665 V1 = DAG.getBitcast(
16666 MVT::v16i8,
16667 DAG.getVectorShuffle(MVT::v8i16, DL, DAG.getBitcast(MVT::v8i16, V1),
16668 DAG.getUNDEF(MVT::v8i16), PreDupI16Shuffle));
16669
16670 // Unpack the bytes to form the i16s that will be shuffled into place.
16671 bool EvenInUse = false, OddInUse = false;
16672 for (int i = 0; i < 16; i += 2) {
16673 EvenInUse |= (Mask[i + 0] >= 0);
16674 OddInUse |= (Mask[i + 1] >= 0);
16675 if (EvenInUse && OddInUse)
16676 break;
16677 }
16678 V1 = DAG.getNode(TargetLo ? X86ISD::UNPCKL : X86ISD::UNPCKH, DL,
16679 MVT::v16i8, EvenInUse ? V1 : DAG.getUNDEF(MVT::v16i8),
16680 OddInUse ? V1 : DAG.getUNDEF(MVT::v16i8));
16681
16682 int PostDupI16Shuffle[8] = {-1, -1, -1, -1, -1, -1, -1, -1};
16683 for (int i = 0; i < 16; ++i)
16684 if (Mask[i] >= 0) {
16685 int MappedMask = LaneMap[Mask[i]] - (TargetLo ? 0 : 8);
16686         assert(MappedMask < 8 && "Invalid v8 shuffle mask!");
16687 if (PostDupI16Shuffle[i / 2] < 0)
16688 PostDupI16Shuffle[i / 2] = MappedMask;
16689 else
16690           assert(PostDupI16Shuffle[i / 2] == MappedMask &&
16691                  "Conflicting entries in the original shuffle!");
16692 }
16693 return DAG.getBitcast(
16694 MVT::v16i8,
16695 DAG.getVectorShuffle(MVT::v8i16, DL, DAG.getBitcast(MVT::v8i16, V1),
16696 DAG.getUNDEF(MVT::v8i16), PostDupI16Shuffle));
16697 };
16698 if (SDValue V = tryToWidenViaDuplication())
16699 return V;
16700 }
16701
16702 if (SDValue Masked = lowerShuffleAsBitMask(DL, MVT::v16i8, V1, V2, Mask,
16703 Zeroable, Subtarget, DAG))
16704 return Masked;
16705
16706 // Use dedicated unpack instructions for masks that match their pattern.
16707 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v16i8, Mask, V1, V2, DAG))
16708 return V;
16709
16710 // Try to use byte shift instructions to mask.
16711 if (SDValue V = lowerShuffleAsByteShiftMask(DL, MVT::v16i8, V1, V2, Mask,
16712 Zeroable, Subtarget, DAG))
16713 return V;
16714
16715 // Check for compaction patterns.
16716 bool IsSingleInput = V2.isUndef();
16717 int NumEvenDrops = canLowerByDroppingElements(Mask, true, IsSingleInput);
16718
16719 // Check for SSSE3 which lets us lower all v16i8 shuffles much more directly
16720 // with PSHUFB. It is important to do this before we attempt to generate any
16721 // blends but after all of the single-input lowerings. If the single input
16722 // lowerings can find an instruction sequence that is faster than a PSHUFB, we
16723 // want to preserve that and we can DAG combine any longer sequences into
16724 // a PSHUFB in the end. But once we start blending from multiple inputs,
16725 // the complexity of DAG combining bad patterns back into PSHUFB is too high,
16726 // and there are *very* few patterns that would actually be faster than the
16727 // PSHUFB approach because of its ability to zero lanes.
16728 //
16729 // If the mask is a binary compaction, we can more efficiently perform this
16730 // as a PACKUS(AND(),AND()) - which is quicker than UNPACK(PSHUFB(),PSHUFB()).
16731 //
16732 // FIXME: The only exceptions to the above are blends which are exact
16733 // interleavings with direct instructions supporting them. We currently don't
16734 // handle those well here.
16735 if (Subtarget.hasSSSE3() && (IsSingleInput || NumEvenDrops != 1)) {
16736 bool V1InUse = false;
16737 bool V2InUse = false;
16738
16739 SDValue PSHUFB = lowerShuffleAsBlendOfPSHUFBs(
16740 DL, MVT::v16i8, V1, V2, Mask, Zeroable, DAG, V1InUse, V2InUse);
16741
16742 // If both V1 and V2 are in use and we can use a direct blend or an unpack,
16743 // do so. This avoids using them to handle blends-with-zero which is
16744 // important as a single pshufb is significantly faster for that.
16745 if (V1InUse && V2InUse) {
16746 if (Subtarget.hasSSE41())
16747 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v16i8, V1, V2, Mask,
16748 Zeroable, Subtarget, DAG))
16749 return Blend;
16750
16751 // We can use an unpack to do the blending rather than an or in some
16752 // cases. Even though the or may be (very minorly) more efficient, we
16753 // prefer this lowering because there are common cases where part of
16754 // the complexity of the shuffles goes away when we do the final blend as
16755 // an unpack.
16756 // FIXME: It might be worth trying to detect if the unpack-feeding
16757 // shuffles will both be pshufb, in which case we shouldn't bother with
16758 // this.
16759 if (SDValue Unpack = lowerShuffleAsPermuteAndUnpack(
16760 DL, MVT::v16i8, V1, V2, Mask, Subtarget, DAG))
16761 return Unpack;
16762
16763 // AVX512VBMI can lower to VPERMB (non-VLX will pad to v64i8).
16764 if (Subtarget.hasVBMI())
16765 return lowerShuffleWithPERMV(DL, MVT::v16i8, Mask, V1, V2, Subtarget,
16766 DAG);
16767
16768 // If we have XOP we can use one VPPERM instead of multiple PSHUFBs.
16769 if (Subtarget.hasXOP()) {
16770 SDValue MaskNode = getConstVector(Mask, MVT::v16i8, DAG, DL, true);
16771 return DAG.getNode(X86ISD::VPPERM, DL, MVT::v16i8, V1, V2, MaskNode);
16772 }
16773
16774 // Use PALIGNR+Permute if possible - permute might become PSHUFB but the
16775 // PALIGNR will be cheaper than the second PSHUFB+OR.
16776 if (SDValue V = lowerShuffleAsByteRotateAndPermute(
16777 DL, MVT::v16i8, V1, V2, Mask, Subtarget, DAG))
16778 return V;
16779 }
16780
16781 return PSHUFB;
16782 }
16783
16784 // There are special ways we can lower some single-element blends.
16785 if (NumV2Elements == 1)
16786 if (SDValue V = lowerShuffleAsElementInsertion(
16787 DL, MVT::v16i8, V1, V2, Mask, Zeroable, Subtarget, DAG))
16788 return V;
16789
16790 if (SDValue Blend = lowerShuffleAsBitBlend(DL, MVT::v16i8, V1, V2, Mask, DAG))
16791 return Blend;
16792
16793 // Check whether a compaction lowering can be done. This handles shuffles
16794 // which take every Nth element for some even N. See the helper function for
16795 // details.
16796 //
16797 // We special case these as they can be particularly efficiently handled with
16798 // the PACKUSWB instruction on x86 and they show up in common patterns of
16799 // rearranging bytes to truncate wide elements.
16800 if (NumEvenDrops) {
16801 // NumEvenDrops is the power of two stride of the elements. Another way of
16802 // thinking about it is that we need to drop the even elements this many
16803 // times to get the original input.
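 // For example, the truncating mask {0, 2, 4, ..., 30} gives NumEvenDrops ==
 // 1: every 16-bit word below is ANDed with 0x00FF to clear the dropped high
 // bytes, and a single PACKUS then emits the surviving low bytes of V1
 // followed by those of V2.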
16804
16805 // First we need to zero all the dropped bytes.
16806     assert(NumEvenDrops <= 3 &&
16807            "No support for dropping even elements more than 3 times.");
16808 SmallVector<SDValue, 8> WordClearOps(8, DAG.getConstant(0, DL, MVT::i16));
16809 for (unsigned i = 0; i != 8; i += 1 << (NumEvenDrops - 1))
16810 WordClearOps[i] = DAG.getConstant(0xFF, DL, MVT::i16);
16811 SDValue WordClearMask = DAG.getBuildVector(MVT::v8i16, DL, WordClearOps);
16812 V1 = DAG.getNode(ISD::AND, DL, MVT::v8i16, DAG.getBitcast(MVT::v8i16, V1),
16813 WordClearMask);
16814 if (!IsSingleInput)
16815 V2 = DAG.getNode(ISD::AND, DL, MVT::v8i16, DAG.getBitcast(MVT::v8i16, V2),
16816 WordClearMask);
16817
16818 // Now pack things back together.
16819 SDValue Result = DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, V1,
16820 IsSingleInput ? V1 : V2);
16821 for (int i = 1; i < NumEvenDrops; ++i) {
16822 Result = DAG.getBitcast(MVT::v8i16, Result);
16823 Result = DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, Result, Result);
16824 }
16825 return Result;
16826 }
16827
16828 int NumOddDrops = canLowerByDroppingElements(Mask, false, IsSingleInput);
16829 if (NumOddDrops == 1) {
16830 V1 = DAG.getNode(X86ISD::VSRLI, DL, MVT::v8i16,
16831 DAG.getBitcast(MVT::v8i16, V1),
16832 DAG.getTargetConstant(8, DL, MVT::i8));
16833 if (!IsSingleInput)
16834 V2 = DAG.getNode(X86ISD::VSRLI, DL, MVT::v8i16,
16835 DAG.getBitcast(MVT::v8i16, V2),
16836 DAG.getTargetConstant(8, DL, MVT::i8));
16837 return DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, V1,
16838 IsSingleInput ? V1 : V2);
16839 }
16840
16841 // Handle multi-input cases by blending/unpacking single-input shuffles.
16842 if (NumV2Elements > 0)
16843 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v16i8, V1, V2, Mask,
16844 Subtarget, DAG);
16845
16846 // The fallback path for single-input shuffles widens this into two v8i16
16847 // vectors with unpacks, shuffles those, and then pulls them back together
16848 // with a pack.
16849 SDValue V = V1;
16850
16851 std::array<int, 8> LoBlendMask = {{-1, -1, -1, -1, -1, -1, -1, -1}};
16852 std::array<int, 8> HiBlendMask = {{-1, -1, -1, -1, -1, -1, -1, -1}};
16853 for (int i = 0; i < 16; ++i)
16854 if (Mask[i] >= 0)
16855 (i < 8 ? LoBlendMask[i] : HiBlendMask[i % 8]) = Mask[i];
16856
16857 SDValue VLoHalf, VHiHalf;
16858 // Check if any of the odd lanes in the v16i8 are used. If not, we can mask
16859 // them out and avoid using UNPCK{L,H} to extract the elements of V as
16860 // i16s.
16861 if (none_of(LoBlendMask, [](int M) { return M >= 0 && M % 2 == 1; }) &&
16862 none_of(HiBlendMask, [](int M) { return M >= 0 && M % 2 == 1; })) {
16863 // Use a mask to drop the high bytes.
16864 VLoHalf = DAG.getBitcast(MVT::v8i16, V);
16865 VLoHalf = DAG.getNode(ISD::AND, DL, MVT::v8i16, VLoHalf,
16866 DAG.getConstant(0x00FF, DL, MVT::v8i16));
16867
16868 // This will be a single vector shuffle instead of a blend so nuke VHiHalf.
16869 VHiHalf = DAG.getUNDEF(MVT::v8i16);
16870
16871 // Squash the masks to point directly into VLoHalf.
16872 for (int &M : LoBlendMask)
16873 if (M >= 0)
16874 M /= 2;
16875 for (int &M : HiBlendMask)
16876 if (M >= 0)
16877 M /= 2;
16878 } else {
16879 // Otherwise just unpack the low half of V into VLoHalf and the high half into
16880 // VHiHalf so that we can blend them as i16s.
16881 SDValue Zero = getZeroVector(MVT::v16i8, Subtarget, DAG, DL);
16882
16883 VLoHalf = DAG.getBitcast(
16884 MVT::v8i16, DAG.getNode(X86ISD::UNPCKL, DL, MVT::v16i8, V, Zero));
16885 VHiHalf = DAG.getBitcast(
16886 MVT::v8i16, DAG.getNode(X86ISD::UNPCKH, DL, MVT::v16i8, V, Zero));
16887 }
16888
16889 SDValue LoV = DAG.getVectorShuffle(MVT::v8i16, DL, VLoHalf, VHiHalf, LoBlendMask);
16890 SDValue HiV = DAG.getVectorShuffle(MVT::v8i16, DL, VLoHalf, VHiHalf, HiBlendMask);
16891
16892 return DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, LoV, HiV);
16893}
16894
16895/// Dispatching routine to lower various 128-bit x86 vector shuffles.
16896///
16897/// This routine breaks down the specific type of 128-bit shuffle and
16898/// dispatches to the lowering routines accordingly.
16899static SDValue lower128BitShuffle(const SDLoc &DL, ArrayRef<int> Mask,
16900 MVT VT, SDValue V1, SDValue V2,
16901 const APInt &Zeroable,
16902 const X86Subtarget &Subtarget,
16903 SelectionDAG &DAG) {
16904 switch (VT.SimpleTy) {
16905 case MVT::v2i64:
16906 return lowerV2I64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
16907 case MVT::v2f64:
16908 return lowerV2F64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
16909 case MVT::v4i32:
16910 return lowerV4I32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
16911 case MVT::v4f32:
16912 return lowerV4F32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
16913 case MVT::v8i16:
16914 return lowerV8I16Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
16915 case MVT::v8f16:
16916 return lowerV8F16Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
16917 case MVT::v16i8:
16918 return lowerV16I8Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
16919
16920 default:
16921 llvm_unreachable("Unimplemented!");
16922 }
16923}
16924
16925/// Generic routine to split vector shuffle into half-sized shuffles.
16926///
16927/// This routine just extracts two subvectors, shuffles them independently, and
16928/// then concatenates them back together. This should work effectively with all
16929/// AVX vector shuffle types.
16930static SDValue splitAndLowerShuffle(const SDLoc &DL, MVT VT, SDValue V1,
16931 SDValue V2, ArrayRef<int> Mask,
16932 SelectionDAG &DAG, bool SimpleOnly) {
16933 assert(VT.getSizeInBits() >= 256 &&
16934 "Only for 256-bit or wider vector shuffles!");
16935 assert(V1.getSimpleValueType() == VT && "Bad operand type!");
16936 assert(V2.getSimpleValueType() == VT && "Bad operand type!");
16937
16938 ArrayRef<int> LoMask = Mask.slice(0, Mask.size() / 2);
16939 ArrayRef<int> HiMask = Mask.slice(Mask.size() / 2);
16940
16941 int NumElements = VT.getVectorNumElements();
16942 int SplitNumElements = NumElements / 2;
16943 MVT ScalarVT = VT.getVectorElementType();
16944 MVT SplitVT = MVT::getVectorVT(ScalarVT, SplitNumElements);
16945
16946 // Use splitVector/extractSubVector so that split build-vectors just build two
16947 // narrower build vectors. This helps shuffling with splats and zeros.
16948 auto SplitVector = [&](SDValue V) {
16949 SDValue LoV, HiV;
16950 std::tie(LoV, HiV) = splitVector(peekThroughBitcasts(V), DAG, DL);
16951 return std::make_pair(DAG.getBitcast(SplitVT, LoV),
16952 DAG.getBitcast(SplitVT, HiV));
16953 };
16954
16955 SDValue LoV1, HiV1, LoV2, HiV2;
16956 std::tie(LoV1, HiV1) = SplitVector(V1);
16957 std::tie(LoV2, HiV2) = SplitVector(V2);
16958
16959 // Now create two 4-way blends of these half-width vectors.
16960 auto GetHalfBlendPiecesReq = [&](const ArrayRef<int> &HalfMask, bool &UseLoV1,
16961 bool &UseHiV1, bool &UseLoV2,
16962 bool &UseHiV2) {
16963 UseLoV1 = UseHiV1 = UseLoV2 = UseHiV2 = false;
16964 for (int i = 0; i < SplitNumElements; ++i) {
16965 int M = HalfMask[i];
16966 if (M >= NumElements) {
16967 if (M >= NumElements + SplitNumElements)
16968 UseHiV2 = true;
16969 else
16970 UseLoV2 = true;
16971 } else if (M >= 0) {
16972 if (M >= SplitNumElements)
16973 UseHiV1 = true;
16974 else
16975 UseLoV1 = true;
16976 }
16977 }
16978 };
16979
16980 auto CheckHalfBlendUsable = [&](const ArrayRef<int> &HalfMask) -> bool {
16981 if (!SimpleOnly)
16982 return true;
16983
16984 bool UseLoV1, UseHiV1, UseLoV2, UseHiV2;
16985 GetHalfBlendPiecesReq(HalfMask, UseLoV1, UseHiV1, UseLoV2, UseHiV2);
16986
16987 return !(UseHiV1 || UseHiV2);
16988 };
16989
16990 auto HalfBlend = [&](ArrayRef<int> HalfMask) {
16991 SmallVector<int, 32> V1BlendMask((unsigned)SplitNumElements, -1);
16992 SmallVector<int, 32> V2BlendMask((unsigned)SplitNumElements, -1);
16993 SmallVector<int, 32> BlendMask((unsigned)SplitNumElements, -1);
16994 for (int i = 0; i < SplitNumElements; ++i) {
16995 int M = HalfMask[i];
16996 if (M >= NumElements) {
16997 V2BlendMask[i] = M - NumElements;
16998 BlendMask[i] = SplitNumElements + i;
16999 } else if (M >= 0) {
17000 V1BlendMask[i] = M;
17001 BlendMask[i] = i;
17002 }
17003 }
17004
17005 bool UseLoV1, UseHiV1, UseLoV2, UseHiV2;
17006 GetHalfBlendPiecesReq(HalfMask, UseLoV1, UseHiV1, UseLoV2, UseHiV2);
17007
17008 // Because the lowering happens after all combining takes place, we need to
17009 // manually combine these blend masks as much as possible so that we create
17010 // a minimal number of high-level vector shuffle nodes.
17011 assert((!SimpleOnly || (!UseHiV1 && !UseHiV2)) && "Shuffle isn't simple");
17012
17013 // First try just blending the halves of V1 or V2.
17014 if (!UseLoV1 && !UseHiV1 && !UseLoV2 && !UseHiV2)
17015 return DAG.getUNDEF(SplitVT);
17016 if (!UseLoV2 && !UseHiV2)
17017 return DAG.getVectorShuffle(SplitVT, DL, LoV1, HiV1, V1BlendMask);
17018 if (!UseLoV1 && !UseHiV1)
17019 return DAG.getVectorShuffle(SplitVT, DL, LoV2, HiV2, V2BlendMask);
17020
17021 SDValue V1Blend, V2Blend;
17022 if (UseLoV1 && UseHiV1) {
17023 V1Blend = DAG.getVectorShuffle(SplitVT, DL, LoV1, HiV1, V1BlendMask);
17024 } else {
17025 // We only use half of V1 so map the usage down into the final blend mask.
17026 V1Blend = UseLoV1 ? LoV1 : HiV1;
17027 for (int i = 0; i < SplitNumElements; ++i)
17028 if (BlendMask[i] >= 0 && BlendMask[i] < SplitNumElements)
17029 BlendMask[i] = V1BlendMask[i] - (UseLoV1 ? 0 : SplitNumElements);
17030 }
17031 if (UseLoV2 && UseHiV2) {
17032 V2Blend = DAG.getVectorShuffle(SplitVT, DL, LoV2, HiV2, V2BlendMask);
17033 } else {
17034 // We only use half of V2 so map the usage down into the final blend mask.
17035 V2Blend = UseLoV2 ? LoV2 : HiV2;
17036 for (int i = 0; i < SplitNumElements; ++i)
17037 if (BlendMask[i] >= SplitNumElements)
17038 BlendMask[i] = V2BlendMask[i] + (UseLoV2 ? SplitNumElements : 0);
17039 }
17040 return DAG.getVectorShuffle(SplitVT, DL, V1Blend, V2Blend, BlendMask);
17041 };
17042
17043 if (!CheckHalfBlendUsable(LoMask) || !CheckHalfBlendUsable(HiMask))
17044 return SDValue();
17045
17046 SDValue Lo = HalfBlend(LoMask);
17047 SDValue Hi = HalfBlend(HiMask);
17048 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi);
17049}
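// Simplified standalone sketch of the HalfBlend mask bookkeeping above; all names
// and the sample mask here are illustrative assumptions, not LLVM APIs. Each
// half-mask entry is classified into a per-operand shuffle mask plus a final
// blend mask, exactly as the lambda does for each split half.
#include <array>
#include <cstdio>

int main() {
  constexpr int NumElements = 8;        // full shuffle width (e.g. v8i32)
  constexpr int SplitNumElements = 4;   // half width after splitting the vector
  std::array<int, SplitNumElements> HalfMask = {1, 9, 6, 14}; // sample lo-half mask
  std::array<int, SplitNumElements> V1Blend, V2Blend, Blend;
  V1Blend.fill(-1); V2Blend.fill(-1); Blend.fill(-1);
  for (int i = 0; i < SplitNumElements; ++i) {
    int M = HalfMask[i];
    if (M >= NumElements) {             // element sourced from V2
      V2Blend[i] = M - NumElements;
      Blend[i] = SplitNumElements + i;
    } else if (M >= 0) {                // element sourced from V1
      V1Blend[i] = M;
      Blend[i] = i;
    }                                   // M < 0 stays undef (-1) everywhere
  }
  for (int i = 0; i < SplitNumElements; ++i)
    std::printf("i=%d V1Blend=%d V2Blend=%d Blend=%d\n", i, V1Blend[i],
                V2Blend[i], Blend[i]);
  return 0;
}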
17050
17051/// Either split a vector in halves or decompose the shuffles and the
17052/// blend/unpack.
17053///
17054/// This is provided as a good fallback for many lowerings of non-single-input
17055/// shuffles with more than one 128-bit lane. In those cases, we want to select
17056/// between splitting the shuffle into 128-bit components and stitching those
17057/// back together vs. extracting the single-input shuffles and blending those
17058/// results.
17059static SDValue lowerShuffleAsSplitOrBlend(const SDLoc &DL, MVT VT, SDValue V1,
17060 SDValue V2, ArrayRef<int> Mask,
17061 const X86Subtarget &Subtarget,
17062 SelectionDAG &DAG) {
17063 assert(!V2.isUndef() && "This routine must not be used to lower single-input "
17064 "shuffles as it could then recurse on itself.");
17065 int Size = Mask.size();
17066
17067 // If this can be modeled as a broadcast of two elements followed by a blend,
17068 // prefer that lowering. This is especially important because broadcasts can
17069 // often fold with memory operands.
17070 auto DoBothBroadcast = [&] {
17071 int V1BroadcastIdx = -1, V2BroadcastIdx = -1;
17072 for (int M : Mask)
17073 if (M >= Size) {
17074 if (V2BroadcastIdx < 0)
17075 V2BroadcastIdx = M - Size;
17076 else if (M - Size != V2BroadcastIdx)
17077 return false;
17078 } else if (M >= 0) {
17079 if (V1BroadcastIdx < 0)
17080 V1BroadcastIdx = M;
17081 else if (M != V1BroadcastIdx)
17082 return false;
17083 }
17084 return true;
17085 };
17086 if (DoBothBroadcast())
17087 return lowerShuffleAsDecomposedShuffleMerge(DL, VT, V1, V2, Mask, Subtarget,
17088 DAG);
17089
17090 // If the inputs all stem from a single 128-bit lane of each input, then we
17091 // split them rather than blending because the split will decompose to
17092 // unusually few instructions.
17093 int LaneCount = VT.getSizeInBits() / 128;
17094 int LaneSize = Size / LaneCount;
17095 SmallBitVector LaneInputs[2];
17096 LaneInputs[0].resize(LaneCount, false);
17097 LaneInputs[1].resize(LaneCount, false);
17098 for (int i = 0; i < Size; ++i)
17099 if (Mask[i] >= 0)
17100 LaneInputs[Mask[i] / Size][(Mask[i] % Size) / LaneSize] = true;
17101 if (LaneInputs[0].count() <= 1 && LaneInputs[1].count() <= 1)
17102 return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG,
17103 /*SimpleOnly*/ false);
17104
17105 // Otherwise, just fall back to decomposed shuffles and a blend/unpack. This
17106 // requires that the decomposed single-input shuffles don't end up here.
17107 return lowerShuffleAsDecomposedShuffleMerge(DL, VT, V1, V2, Mask, Subtarget,
17108 DAG);
17109}
17110
17111// Lower as SHUFPD(VPERM2F128(V1, V2), VPERM2F128(V1, V2)).
17112// TODO: Extend to support v8f32 (+ 512-bit shuffles).
17113static SDValue lowerShuffleAsLanePermuteAndSHUFP(const SDLoc &DL, MVT VT,
17114 SDValue V1, SDValue V2,
17115 ArrayRef<int> Mask,
17116 SelectionDAG &DAG) {
17117 assert(VT == MVT::v4f64 && "Only for v4f64 shuffles");
17118
17119 int LHSMask[4] = {-1, -1, -1, -1};
17120 int RHSMask[4] = {-1, -1, -1, -1};
17121 unsigned SHUFPMask = 0;
17122
17123 // As SHUFPD uses a single LHS/RHS element per lane, we can always
17124 // perform the shuffle once the lanes have been shuffled in place.
17125 for (int i = 0; i != 4; ++i) {
17126 int M = Mask[i];
17127 if (M < 0)
17128 continue;
17129 int LaneBase = i & ~1;
17130 auto &LaneMask = (i & 1) ? RHSMask : LHSMask;
17131 LaneMask[LaneBase + (M & 1)] = M;
17132 SHUFPMask |= (M & 1) << i;
17133 }
17134
17135 SDValue LHS = DAG.getVectorShuffle(VT, DL, V1, V2, LHSMask);
17136 SDValue RHS = DAG.getVectorShuffle(VT, DL, V1, V2, RHSMask);
17137 return DAG.getNode(X86ISD::SHUFP, DL, VT, LHS, RHS,
17138 DAG.getTargetConstant(SHUFPMask, DL, MVT::i8));
17139}
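// Worked example (hypothetical mask): for Mask = <1, 6, 3, 4> the loop above
// produces LHSMask = <-1, 1, -1, 3>, RHSMask = <6, -1, 4, -1> and
// SHUFPMask = 0b0101. The two lane shuffles place each source element into the
// correct 128-bit lane, and SHUFPD's immediate bit i then picks the low or high
// element of that lane, giving the result <1, 6, 3, 4> as required.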
17140
17141/// Lower a vector shuffle crossing multiple 128-bit lanes as
17142/// a lane permutation followed by a per-lane permutation.
17143///
17144/// This is mainly for cases where we can have non-repeating permutes
17145/// in each lane.
17146///
17147/// TODO: This is very similar to lowerShuffleAsLanePermuteAndRepeatedMask,
17148/// we should investigate merging them.
17149static SDValue lowerShuffleAsLanePermuteAndPermute(
17150 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
17151 SelectionDAG &DAG, const X86Subtarget &Subtarget) {
17152 int NumElts = VT.getVectorNumElements();
17153 int NumLanes = VT.getSizeInBits() / 128;
17154 int NumEltsPerLane = NumElts / NumLanes;
17155 bool CanUseSublanes = Subtarget.hasAVX2() && V2.isUndef();
17156
17157 /// Attempts to find a sublane permute with the given size
17158 /// that gets all elements into their target lanes.
17159 ///
17160 /// If successful, fills CrossLaneMask and InLaneMask and returns true.
17161 /// If unsuccessful, returns false and may overwrite InLaneMask.
17162 auto getSublanePermute = [&](int NumSublanes) -> SDValue {
17163 int NumSublanesPerLane = NumSublanes / NumLanes;
17164 int NumEltsPerSublane = NumElts / NumSublanes;
17165
17166 SmallVector<int, 16> CrossLaneMask;
17167 SmallVector<int, 16> InLaneMask(NumElts, SM_SentinelUndef);
17168 // CrossLaneMask but one entry == one sublane.
17169 SmallVector<int, 16> CrossLaneMaskLarge(NumSublanes, SM_SentinelUndef);
17170
17171 for (int i = 0; i != NumElts; ++i) {
17172 int M = Mask[i];
17173 if (M < 0)
17174 continue;
17175
17176 int SrcSublane = M / NumEltsPerSublane;
17177 int DstLane = i / NumEltsPerLane;
17178
17179 // We only need to get the elements into the right lane, not sublane.
17180 // So search all sublanes that make up the destination lane.
17181 bool Found = false;
17182 int DstSubStart = DstLane * NumSublanesPerLane;
17183 int DstSubEnd = DstSubStart + NumSublanesPerLane;
17184 for (int DstSublane = DstSubStart; DstSublane < DstSubEnd; ++DstSublane) {
17185 if (!isUndefOrEqual(CrossLaneMaskLarge[DstSublane], SrcSublane))
17186 continue;
17187
17188 Found = true;
17189 CrossLaneMaskLarge[DstSublane] = SrcSublane;
17190 int DstSublaneOffset = DstSublane * NumEltsPerSublane;
17191 InLaneMask[i] = DstSublaneOffset + M % NumEltsPerSublane;
17192 break;
17193 }
17194 if (!Found)
17195 return SDValue();
17196 }
17197
17198 // Fill CrossLaneMask using CrossLaneMaskLarge.
17199 narrowShuffleMaskElts(NumEltsPerSublane, CrossLaneMaskLarge, CrossLaneMask);
17200
17201 if (!CanUseSublanes) {
17202 // If we're only shuffling a single lowest lane and the rest are identity
17203 // then don't bother.
17204 // TODO - isShuffleMaskInputInPlace could be extended to something like
17205 // this.
17206 int NumIdentityLanes = 0;
17207 bool OnlyShuffleLowestLane = true;
17208 for (int i = 0; i != NumLanes; ++i) {
17209 int LaneOffset = i * NumEltsPerLane;
17210 if (isSequentialOrUndefInRange(InLaneMask, LaneOffset, NumEltsPerLane,
17211 i * NumEltsPerLane))
17212 NumIdentityLanes++;
17213 else if (CrossLaneMask[LaneOffset] != 0)
17214 OnlyShuffleLowestLane = false;
17215 }
17216 if (OnlyShuffleLowestLane && NumIdentityLanes == (NumLanes - 1))
17217 return SDValue();
17218 }
17219
17220 // Avoid returning the same shuffle operation. For example,
17221 // t7: v16i16 = vector_shuffle<8,9,10,11,4,5,6,7,0,1,2,3,12,13,14,15> t5,
17222 // undef:v16i16
17223 if (CrossLaneMask == Mask || InLaneMask == Mask)
17224 return SDValue();
17225
17226 SDValue CrossLane = DAG.getVectorShuffle(VT, DL, V1, V2, CrossLaneMask);
17227 return DAG.getVectorShuffle(VT, DL, CrossLane, DAG.getUNDEF(VT),
17228 InLaneMask);
17229 };
17230
17231 // First attempt a solution with full lanes.
17232 if (SDValue V = getSublanePermute(/*NumSublanes=*/NumLanes))
17233 return V;
17234
17235 // The rest of the solutions use sublanes.
17236 if (!CanUseSublanes)
17237 return SDValue();
17238
17239 // Then attempt a solution with 64-bit sublanes (vpermq).
17240 if (SDValue V = getSublanePermute(/*NumSublanes=*/NumLanes * 2))
17241 return V;
17242
17243 // If that doesn't work and we have fast variable cross-lane shuffle,
17244 // attempt 32-bit sublanes (vpermd).
17245 if (!Subtarget.hasFastVariableCrossLaneShuffle())
17246 return SDValue();
17247
17248 return getSublanePermute(/*NumSublanes=*/NumLanes * 4);
17249}
17250
17251/// Helper to compute the in-lane shuffle mask for a complete shuffle mask.
17252static void computeInLaneShuffleMask(const ArrayRef<int> &Mask, int LaneSize,
17253 SmallVector<int> &InLaneMask) {
17254 int Size = Mask.size();
17255 InLaneMask.assign(Mask.begin(), Mask.end());
17256 for (int i = 0; i < Size; ++i) {
17257 int &M = InLaneMask[i];
17258 if (M < 0)
17259 continue;
17260 if (((M % Size) / LaneSize) != (i / LaneSize))
17261 M = (M % LaneSize) + ((i / LaneSize) * LaneSize) + Size;
17262 }
17263}
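// Worked example (hypothetical mask): with Size = 8, LaneSize = 4 and
// Mask = <6,7,0,1,2,3,4,5>, elements whose source lane differs from their
// destination lane are rewritten as (M % LaneSize) + destination-lane base + Size,
// giving InLaneMask = <10,11,0,1,14,15,4,5>. The +Size offset marks entries that
// must later be taken from a lane-flipped copy of the input (see the caller below,
// which shuffles V1 against Flipped with this mask).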
17264
17265/// Lower a vector shuffle crossing multiple 128-bit lanes by shuffling one
17266/// source with a lane permutation.
17267///
17268/// This lowering strategy results in four instructions in the worst case for a
17269/// single-input cross lane shuffle which is lower than any other fully general
17270/// cross-lane shuffle strategy I'm aware of. Special cases for each particular
17271/// shuffle pattern should be handled prior to trying this lowering.
17272static SDValue lowerShuffleAsLanePermuteAndShuffle(
17273 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
17274 SelectionDAG &DAG, const X86Subtarget &Subtarget) {
17275 // FIXME: This should probably be generalized for 512-bit vectors as well.
17276 assert(VT.is256BitVector() && "Only for 256-bit vector shuffles!");
17277 int Size = Mask.size();
17278 int LaneSize = Size / 2;
17279
17280 // Fold to SHUFPD(VPERM2F128(V1, V2), VPERM2F128(V1, V2)).
17281 // Only do this if the elements aren't all from the lower lane,
17282 // otherwise we're (probably) better off doing a split.
17283 if (VT == MVT::v4f64 &&
17284 !all_of(Mask, [LaneSize](int M) { return M < LaneSize; }))
17285 return lowerShuffleAsLanePermuteAndSHUFP(DL, VT, V1, V2, Mask, DAG);
17286
17287 // If there are only inputs from one 128-bit lane, splitting will in fact be
17288 // less expensive. The flags track whether the given lane contains an element
17289 // that crosses to another lane.
17290 bool AllLanes;
17291 if (!Subtarget.hasAVX2()) {
17292 bool LaneCrossing[2] = {false, false};
17293 for (int i = 0; i < Size; ++i)
17294 if (Mask[i] >= 0 && ((Mask[i] % Size) / LaneSize) != (i / LaneSize))
17295 LaneCrossing[(Mask[i] % Size) / LaneSize] = true;
17296 AllLanes = LaneCrossing[0] && LaneCrossing[1];
17297 } else {
17298 bool LaneUsed[2] = {false, false};
17299 for (int i = 0; i < Size; ++i)
17300 if (Mask[i] >= 0)
17301 LaneUsed[(Mask[i] % Size) / LaneSize] = true;
17302 AllLanes = LaneUsed[0] && LaneUsed[1];
17303 }
17304
17305 // TODO - we could support shuffling V2 in the Flipped input.
17306 assert(V2.isUndef() &&
17307 "This last part of this routine only works on single input shuffles");
17308
17309 SmallVector<int> InLaneMask;
17310 computeInLaneShuffleMask(Mask, Mask.size() / 2, InLaneMask);
17311
17312 assert(!is128BitLaneCrossingShuffleMask(VT, InLaneMask) &&
17313 "In-lane shuffle mask expected");
17314
17315 // If we're not using both lanes in each lane and the inlane mask is not
17316 // repeating, then we're better off splitting.
17317 if (!AllLanes && !is128BitLaneRepeatedShuffleMask(VT, InLaneMask))
17318 return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG,
17319 /*SimpleOnly*/ false);
17320
17321 // Flip the lanes, and shuffle the results which should now be in-lane.
17322 MVT PVT = VT.isFloatingPoint() ? MVT::v4f64 : MVT::v4i64;
17323 SDValue Flipped = DAG.getBitcast(PVT, V1);
17324 Flipped =
17325 DAG.getVectorShuffle(PVT, DL, Flipped, DAG.getUNDEF(PVT), {2, 3, 0, 1});
17326 Flipped = DAG.getBitcast(VT, Flipped);
17327 return DAG.getVectorShuffle(VT, DL, V1, Flipped, InLaneMask);
17328}
17329
17330/// Handle lowering 2-lane 128-bit shuffles.
17331static SDValue lowerV2X128Shuffle(const SDLoc &DL, MVT VT, SDValue V1,
17332 SDValue V2, ArrayRef<int> Mask,
17333 const APInt &Zeroable,
17334 const X86Subtarget &Subtarget,
17335 SelectionDAG &DAG) {
17336 if (V2.isUndef()) {
17337 // Attempt to match VBROADCAST*128 subvector broadcast load.
17338 bool SplatLo = isShuffleEquivalent(Mask, {0, 1, 0, 1}, V1);
17339 bool SplatHi = isShuffleEquivalent(Mask, {2, 3, 2, 3}, V1);
17340 if ((SplatLo || SplatHi) && !Subtarget.hasAVX512() && V1.hasOneUse() &&
17341 X86::mayFoldLoad(peekThroughOneUseBitcasts(V1), Subtarget)) {
17342 MVT MemVT = VT.getHalfNumVectorElementsVT();
17343 unsigned Ofs = SplatLo ? 0 : MemVT.getStoreSize();
17344 auto *Ld = cast<LoadSDNode>(peekThroughOneUseBitcasts(V1));
17345 if (SDValue BcstLd = getBROADCAST_LOAD(X86ISD::SUBV_BROADCAST_LOAD, DL,
17346 VT, MemVT, Ld, Ofs, DAG))
17347 return BcstLd;
17348 }
17349
17350 // With AVX2, use VPERMQ/VPERMPD for unary shuffles to allow memory folding.
17351 if (Subtarget.hasAVX2())
17352 return SDValue();
17353 }
17354
17355 bool V2IsZero = !V2.isUndef() && ISD::isBuildVectorAllZeros(V2.getNode());
17356
17357 SmallVector<int, 4> WidenedMask;
17358 if (!canWidenShuffleElements(Mask, Zeroable, V2IsZero, WidenedMask))
17359 return SDValue();
17360
17361 bool IsLowZero = (Zeroable & 0x3) == 0x3;
17362 bool IsHighZero = (Zeroable & 0xc) == 0xc;
17363
17364 // Try to use an insert into a zero vector.
17365 if (WidenedMask[0] == 0 && IsHighZero) {
17366 MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), 2);
17367 SDValue LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V1,
17368 DAG.getIntPtrConstant(0, DL));
17369 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT,
17370 getZeroVector(VT, Subtarget, DAG, DL), LoV,
17371 DAG.getIntPtrConstant(0, DL));
17372 }
17373
17374 // TODO: If minimizing size and one of the inputs is a zero vector and
17375 // the zero vector has only one use, we could use a VPERM2X128 to save the
17376 // instruction bytes needed to explicitly generate the zero vector.
17377
17378 // Blends are faster and handle all the non-lane-crossing cases.
17379 if (SDValue Blend = lowerShuffleAsBlend(DL, VT, V1, V2, Mask, Zeroable,
17380 Subtarget, DAG))
17381 return Blend;
17382
17383 // If either input operand is a zero vector, use VPERM2X128 because its mask
17384 // allows us to replace the zero input with an implicit zero.
17385 if (!IsLowZero && !IsHighZero) {
17386 // Check for patterns which can be matched with a single insert of a 128-bit
17387 // subvector.
17388 bool OnlyUsesV1 = isShuffleEquivalent(Mask, {0, 1, 0, 1}, V1, V2);
17389 if (OnlyUsesV1 || isShuffleEquivalent(Mask, {0, 1, 4, 5}, V1, V2)) {
17390
17391 // With AVX1, use vperm2f128 (below) to allow load folding. Otherwise,
17392 // this will likely become vinsertf128 which can't fold a 256-bit memop.
17393 if (!isa<LoadSDNode>(peekThroughBitcasts(V1))) {
17394 MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), 2);
17395 SDValue SubVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT,
17396 OnlyUsesV1 ? V1 : V2,
17397 DAG.getIntPtrConstant(0, DL));
17398 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, V1, SubVec,
17399 DAG.getIntPtrConstant(2, DL));
17400 }
17401 }
17402
17403 // Try to use SHUF128 if possible.
17404 if (Subtarget.hasVLX()) {
17405 if (WidenedMask[0] < 2 && WidenedMask[1] >= 2) {
17406 unsigned PermMask = ((WidenedMask[0] % 2) << 0) |
17407 ((WidenedMask[1] % 2) << 1);
17408 return DAG.getNode(X86ISD::SHUF128, DL, VT, V1, V2,
17409 DAG.getTargetConstant(PermMask, DL, MVT::i8));
17410 }
17411 }
17412 }
17413
17414 // Otherwise form a 128-bit permutation. After accounting for undefs,
17415 // convert the 64-bit shuffle mask selection values into 128-bit
17416 // selection bits by dividing the indexes by 2 and shifting into positions
17417 // defined by a vperm2*128 instruction's immediate control byte.
17418
17419 // The immediate permute control byte looks like this:
17420 // [1:0] - select 128 bits from sources for low half of destination
17421 // [2] - ignore
17422 // [3] - zero low half of destination
17423 // [5:4] - select 128 bits from sources for high half of destination
17424 // [6] - ignore
17425 // [7] - zero high half of destination
17426
17427 assert((WidenedMask[0] >= 0 || IsLowZero) &&
17428 (WidenedMask[1] >= 0 || IsHighZero) && "Undef half?");
17429
17430 unsigned PermMask = 0;
17431 PermMask |= IsLowZero ? 0x08 : (WidenedMask[0] << 0);
17432 PermMask |= IsHighZero ? 0x80 : (WidenedMask[1] << 4);
17433
17434 // Check the immediate mask and replace unused sources with undef.
17435 if ((PermMask & 0x0a) != 0x00 && (PermMask & 0xa0) != 0x00)
17436 V1 = DAG.getUNDEF(VT);
17437 if ((PermMask & 0x0a) != 0x02 && (PermMask & 0xa0) != 0x20)
17438 V2 = DAG.getUNDEF(VT);
17439
17440 return DAG.getNode(X86ISD::VPERM2X128, DL, VT, V1, V2,
17441 DAG.getTargetConstant(PermMask, DL, MVT::i8));
17442}
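// Worked example (hypothetical widened mask): for WidenedMask = <1, 2> with
// neither half zeroable, the code above builds PermMask = (1 << 0) | (2 << 4) =
// 0x21, i.e. the low 128 bits of the result come from V1's high half and the high
// 128 bits from V2's low half. When IsLowZero/IsHighZero is set, bits 3/7
// (0x08/0x80) are used instead so VPERM2X128 materializes the zero half implicitly.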
17443
17444/// Lower a vector shuffle by first fixing the 128-bit lanes and then
17445/// shuffling each lane.
17446///
17447/// This attempts to create a repeated lane shuffle where each lane uses one
17448/// or two of the lanes of the inputs. The lanes of the input vectors are
17449/// shuffled in one or two independent shuffles to get the lanes into the
17450/// position needed by the final shuffle.
17451static SDValue lowerShuffleAsLanePermuteAndRepeatedMask(
17452 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
17453 const X86Subtarget &Subtarget, SelectionDAG &DAG) {
17454 assert(!V2.isUndef() && "This is only useful with multiple inputs.");
14. '?' condition is true
17455
17456 if (is128BitLaneRepeatedShuffleMask(VT, Mask))
15. Assuming the condition is false
16. Taking false branch
17457 return SDValue();
17458
17459 int NumElts = Mask.size();
17460 int NumLanes = VT.getSizeInBits() / 128;
17461 int NumLaneElts = 128 / VT.getScalarSizeInBits();
17. 'NumLaneElts' initialized here
17462 SmallVector<int, 16> RepeatMask(NumLaneElts, -1);
17463 SmallVector<std::array<int, 2>, 2> LaneSrcs(NumLanes, {{-1, -1}});
17464
17465 // First pass will try to fill in the RepeatMask from lanes that need two
17466 // sources.
17467 for (int Lane = 0; Lane != NumLanes; ++Lane) {
18. Assuming 'Lane' is not equal to 'NumLanes'
19. Loop condition is true. Entering loop body
24. Assuming 'Lane' is equal to 'NumLanes'
25. Loop condition is false. Execution continues on line 17539
17468 int Srcs[2] = {-1, -1};
17469 SmallVector<int, 16> InLaneMask(NumLaneElts, -1);
17470 for (int i = 0; i != NumLaneElts; ++i) {
20. Assuming 'i' is equal to 'NumLaneElts'
21. Loop condition is false. Execution continues on line 17492
17471 int M = Mask[(Lane * NumLaneElts) + i];
17472 if (M < 0)
17473 continue;
17474 // Determine which of the possible input lanes (NumLanes from each source)
17475 // this element comes from. Assign that as one of the sources for this
17476 // lane. We can assign up to 2 sources for this lane. If we run out of
17477 // sources we can't do anything.
17478 int LaneSrc = M / NumLaneElts;
17479 int Src;
17480 if (Srcs[0] < 0 || Srcs[0] == LaneSrc)
17481 Src = 0;
17482 else if (Srcs[1] < 0 || Srcs[1] == LaneSrc)
17483 Src = 1;
17484 else
17485 return SDValue();
17486
17487 Srcs[Src] = LaneSrc;
17488 InLaneMask[i] = (M % NumLaneElts) + Src * NumElts;
17489 }
17490
17491 // If this lane has two sources, see if it fits with the repeat mask so far.
17492 if (Srcs[1] < 0)
22. Taking true branch
17493 continue;
23. Execution continues on line 17467
17494
17495 LaneSrcs[Lane][0] = Srcs[0];
17496 LaneSrcs[Lane][1] = Srcs[1];
17497
17498 auto MatchMasks = [](ArrayRef<int> M1, ArrayRef<int> M2) {
17499 assert(M1.size() == M2.size() && "Unexpected mask size");
17500 for (int i = 0, e = M1.size(); i != e; ++i)
17501 if (M1[i] >= 0 && M2[i] >= 0 && M1[i] != M2[i])
17502 return false;
17503 return true;
17504 };
17505
17506 auto MergeMasks = [](ArrayRef<int> Mask, MutableArrayRef<int> MergedMask) {
17507 assert(Mask.size() == MergedMask.size() && "Unexpected mask size");
17508 for (int i = 0, e = MergedMask.size(); i != e; ++i) {
17509 int M = Mask[i];
17510 if (M < 0)
17511 continue;
17512 assert((MergedMask[i] < 0 || MergedMask[i] == M) &&
17513 "Unexpected mask element");
17514 MergedMask[i] = M;
17515 }
17516 };
17517
17518 if (MatchMasks(InLaneMask, RepeatMask)) {
17519 // Merge this lane mask into the final repeat mask.
17520 MergeMasks(InLaneMask, RepeatMask);
17521 continue;
17522 }
17523
17524 // Didn't find a match. Swap the operands and try again.
17525 std::swap(LaneSrcs[Lane][0], LaneSrcs[Lane][1]);
17526 ShuffleVectorSDNode::commuteMask(InLaneMask);
17527
17528 if (MatchMasks(InLaneMask, RepeatMask)) {
17529 // Merge this lane mask into the final repeat mask.
17530 MergeMasks(InLaneMask, RepeatMask);
17531 continue;
17532 }
17533
17534 // Couldn't find a match with the operands in either order.
17535 return SDValue();
17536 }
17537
17538 // Now handle any lanes with only one source.
17539 for (int Lane = 0; Lane != NumLanes; ++Lane) {
26. Loop condition is true. Entering loop body
30. Loop condition is false. Execution continues on line 17568
17540 // If this lane has already been processed, skip it.
17541 if (LaneSrcs[Lane][0] >= 0)
27. Assuming the condition is true
28. Taking true branch
17542 continue;
29. Execution continues on line 17539
17543
17544 for (int i = 0; i != NumLaneElts; ++i) {
17545 int M = Mask[(Lane * NumLaneElts) + i];
17546 if (M < 0)
17547 continue;
17548
17549 // If RepeatMask isn't defined yet we can define it ourself.
17550 if (RepeatMask[i] < 0)
17551 RepeatMask[i] = M % NumLaneElts;
17552
17553 if (RepeatMask[i] < NumElts) {
17554 if (RepeatMask[i] != M % NumLaneElts)
17555 return SDValue();
17556 LaneSrcs[Lane][0] = M / NumLaneElts;
17557 } else {
17558 if (RepeatMask[i] != ((M % NumLaneElts) + NumElts))
17559 return SDValue();
17560 LaneSrcs[Lane][1] = M / NumLaneElts;
17561 }
17562 }
17563
17564 if (LaneSrcs[Lane][0] < 0 && LaneSrcs[Lane][1] < 0)
17565 return SDValue();
17566 }
17567
17568 SmallVector<int, 16> NewMask(NumElts, -1);
17569 for (int Lane = 0; Lane != NumLanes; ++Lane) {
31. Loop condition is true. Entering loop body
33. Loop condition is false. Execution continues on line 17578
17570 int Src = LaneSrcs[Lane][0];
17571 for (int i = 0; i != NumLaneElts; ++i) {
32. Loop condition is false. Execution continues on line 17569
17572 int M = -1;
17573 if (Src >= 0)
17574 M = Src * NumLaneElts + i;
17575 NewMask[Lane * NumLaneElts + i] = M;
17576 }
17577 }
17578 SDValue NewV1 = DAG.getVectorShuffle(VT, DL, V1, V2, NewMask);
17579 // Ensure we didn't get back the shuffle we started with.
17580 // FIXME: This is a hack to make up for some splat handling code in
17581 // getVectorShuffle.
17582 if (isa<ShuffleVectorSDNode>(NewV1) &&
34. Assuming 'NewV1' is not a 'ShuffleVectorSDNode'
17583 cast<ShuffleVectorSDNode>(NewV1)->getMask() == Mask)
17584 return SDValue();
17585
17586 for (int Lane = 0; Lane != NumLanes; ++Lane) {
35. Loop condition is true. Entering loop body
37. Loop condition is false. Execution continues on line 17595
17587 int Src = LaneSrcs[Lane][1];
17588 for (int i = 0; i != NumLaneElts; ++i) {
36. Loop condition is false. Execution continues on line 17586
17589 int M = -1;
17590 if (Src >= 0)
17591 M = Src * NumLaneElts + i;
17592 NewMask[Lane * NumLaneElts + i] = M;
17593 }
17594 }
17595 SDValue NewV2 = DAG.getVectorShuffle(VT, DL, V1, V2, NewMask);
17596 // Ensure we didn't get back the shuffle we started with.
17597 // FIXME: This is a hack to make up for some splat handling code in
17598 // getVectorShuffle.
17599 if (isa<ShuffleVectorSDNode>(NewV2) &&
38. Assuming 'NewV2' is not a 'ShuffleVectorSDNode'
17600 cast<ShuffleVectorSDNode>(NewV2)->getMask() == Mask)
17601 return SDValue();
17602
17603 for (int i = 0; i != NumElts; ++i) {
39. Assuming 'i' is not equal to 'NumElts'
40. Loop condition is true. Entering loop body
17604 if (Mask[i] < 0) {
41. Assuming the condition is false
42. Taking false branch
17605 NewMask[i] = -1;
17606 continue;
17607 }
17608 NewMask[i] = RepeatMask[i % NumLaneElts];
43. Division by zero
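// Note on the warning above: NumLaneElts was computed on line 17461 as
// 128 / VT.getScalarSizeInBits(), and on this path the analyzer places no
// constraint on the scalar size, so it treats NumLaneElts == 0 as feasible;
// 'i % NumLaneElts' on line 17608 would then divide by zero. In practice this
// routine appears to be reached only for 256/512-bit vector types whose elements
// are at most 64 bits wide, so NumLaneElts should be at least 2; one way to state
// that invariant explicitly (illustrative sketch only) would be:
//   assert(NumLaneElts > 0 && "Expected whole 128-bit lanes");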
17609 if (NewMask[i] < 0)
17610 continue;
17611
17612 NewMask[i] += (i / NumLaneElts) * NumLaneElts;
17613 }
17614 return DAG.getVectorShuffle(VT, DL, NewV1, NewV2, NewMask);
17615}
17616
17617/// If the input shuffle mask results in a vector that is undefined in all upper
17618/// or lower half elements and that mask accesses only 2 halves of the
17619/// shuffle's operands, return true. A mask of half the width with mask indexes
17620/// adjusted to access the extracted halves of the original shuffle operands is
17621/// returned in HalfMask. HalfIdx1 and HalfIdx2 return whether the upper or
17622/// lower half of each input operand is accessed.
17623static bool
17624getHalfShuffleMask(ArrayRef<int> Mask, MutableArrayRef<int> HalfMask,
17625 int &HalfIdx1, int &HalfIdx2) {
17626 assert((Mask.size() == HalfMask.size() * 2) &&
17627 "Expected input mask to be twice as long as output");
17628
17629 // Exactly one half of the result must be undef to allow narrowing.
17630 bool UndefLower = isUndefLowerHalf(Mask);
17631 bool UndefUpper = isUndefUpperHalf(Mask);
17632 if (UndefLower == UndefUpper)
17633 return false;
17634
17635 unsigned HalfNumElts = HalfMask.size();
17636 unsigned MaskIndexOffset = UndefLower ? HalfNumElts : 0;
17637 HalfIdx1 = -1;
17638 HalfIdx2 = -1;
17639 for (unsigned i = 0; i != HalfNumElts; ++i) {
17640 int M = Mask[i + MaskIndexOffset];
17641 if (M < 0) {
17642 HalfMask[i] = M;
17643 continue;
17644 }
17645
17646 // Determine which of the 4 half vectors this element is from.
17647 // i.e. 0 = Lower V1, 1 = Upper V1, 2 = Lower V2, 3 = Upper V2.
17648 int HalfIdx = M / HalfNumElts;
17649
17650 // Determine the element index into its half vector source.
17651 int HalfElt = M % HalfNumElts;
17652
17653 // We can shuffle with up to 2 half vectors, set the new 'half'
17654 // shuffle mask accordingly.
17655 if (HalfIdx1 < 0 || HalfIdx1 == HalfIdx) {
17656 HalfMask[i] = HalfElt;
17657 HalfIdx1 = HalfIdx;
17658 continue;
17659 }
17660 if (HalfIdx2 < 0 || HalfIdx2 == HalfIdx) {
17661 HalfMask[i] = HalfElt + HalfNumElts;
17662 HalfIdx2 = HalfIdx;
17663 continue;
17664 }
17665
17666 // Too many half vectors referenced.
17667 return false;
17668 }
17669
17670 return true;
17671}
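// Worked example (hypothetical mask): for an 8-element mask <u,u,u,u,0,1,10,11>
// the lower half of the result is undef, so the loop above reads elements 4..7
// and produces HalfMask = <0,1,6,7> with HalfIdx1 = 0 (lower half of V1) and
// HalfIdx2 = 2 (lower half of V2); the caller can then shuffle just those two
// 128-bit halves and insert the result back at the required offset.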
17672
17673/// Given the output values from getHalfShuffleMask(), create a half width
17674/// shuffle of extracted vectors followed by an insert back to full width.
17675static SDValue getShuffleHalfVectors(const SDLoc &DL, SDValue V1, SDValue V2,
17676 ArrayRef<int> HalfMask, int HalfIdx1,
17677 int HalfIdx2, bool UndefLower,
17678 SelectionDAG &DAG, bool UseConcat = false) {
17679 assert(V1.getValueType() == V2.getValueType() && "Different sized vectors?");
17680 assert(V1.getValueType().isSimple() && "Expecting only simple types");
17681
17682 MVT VT = V1.getSimpleValueType();
17683 MVT HalfVT = VT.getHalfNumVectorElementsVT();
17684 unsigned HalfNumElts = HalfVT.getVectorNumElements();
17685
17686 auto getHalfVector = [&](int HalfIdx) {
17687 if (HalfIdx < 0)
17688 return DAG.getUNDEF(HalfVT);
17689 SDValue V = (HalfIdx < 2 ? V1 : V2);
17690 HalfIdx = (HalfIdx % 2) * HalfNumElts;
17691 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, V,
17692 DAG.getIntPtrConstant(HalfIdx, DL));
17693 };
17694
17695 // ins undef, (shuf (ext V1, HalfIdx1), (ext V2, HalfIdx2), HalfMask), Offset
17696 SDValue Half1 = getHalfVector(HalfIdx1);
17697 SDValue Half2 = getHalfVector(HalfIdx2);
17698 SDValue V = DAG.getVectorShuffle(HalfVT, DL, Half1, Half2, HalfMask);
17699 if (UseConcat) {
17700 SDValue Op0 = V;
17701 SDValue Op1 = DAG.getUNDEF(HalfVT);
17702 if (UndefLower)
17703 std::swap(Op0, Op1);
17704 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Op0, Op1);
17705 }
17706
17707 unsigned Offset = UndefLower ? HalfNumElts : 0;
17708 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), V,
17709 DAG.getIntPtrConstant(Offset, DL));
17710}
17711
17712/// Lower shuffles where an entire half of a 256 or 512-bit vector is UNDEF.
17713/// This allows for fast cases such as subvector extraction/insertion
17714/// or shuffling smaller vector types which can lower more efficiently.
17715static SDValue lowerShuffleWithUndefHalf(const SDLoc &DL, MVT VT, SDValue V1,
17716 SDValue V2, ArrayRef<int> Mask,
17717 const X86Subtarget &Subtarget,
17718 SelectionDAG &DAG) {
17719 assert((VT.is256BitVector() || VT.is512BitVector()) &&
17720 "Expected 256-bit or 512-bit vector");
17721
17722 bool UndefLower = isUndefLowerHalf(Mask);
17723 if (!UndefLower && !isUndefUpperHalf(Mask))
17724 return SDValue();
17725
17726 assert((!UndefLower || !isUndefUpperHalf(Mask)) &&
17727 "Completely undef shuffle mask should have been simplified already");
17728
17729 // Upper half is undef and lower half is whole upper subvector.
17730 // e.g. vector_shuffle <4, 5, 6, 7, u, u, u, u> or <2, 3, u, u>
17731 MVT HalfVT = VT.getHalfNumVectorElementsVT();
17732 unsigned HalfNumElts = HalfVT.getVectorNumElements();
17733 if (!UndefLower &&
17734 isSequentialOrUndefInRange(Mask, 0, HalfNumElts, HalfNumElts)) {
17735 SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, V1,
17736 DAG.getIntPtrConstant(HalfNumElts, DL));
17737 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), Hi,
17738 DAG.getIntPtrConstant(0, DL));
17739 }
17740
17741 // Lower half is undef and upper half is whole lower subvector.
17742 // e.g. vector_shuffle <u, u, u, u, 0, 1, 2, 3> or <u, u, 0, 1>
17743 if (UndefLower &&
17744 isSequentialOrUndefInRange(Mask, HalfNumElts, HalfNumElts, 0)) {
17745 SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, V1,
17746 DAG.getIntPtrConstant(0, DL));
17747 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), Hi,
17748 DAG.getIntPtrConstant(HalfNumElts, DL));
17749 }
17750
17751 int HalfIdx1, HalfIdx2;
17752 SmallVector<int, 8> HalfMask(HalfNumElts);
17753 if (!getHalfShuffleMask(Mask, HalfMask, HalfIdx1, HalfIdx2))
17754 return SDValue();
17755
17756 assert(HalfMask.size() == HalfNumElts && "Unexpected shuffle mask length");
17757
17758 // Only shuffle the halves of the inputs when useful.
17759 unsigned NumLowerHalves =
17760 (HalfIdx1 == 0 || HalfIdx1 == 2) + (HalfIdx2 == 0 || HalfIdx2 == 2);
17761 unsigned NumUpperHalves =
17762 (HalfIdx1 == 1 || HalfIdx1 == 3) + (HalfIdx2 == 1 || HalfIdx2 == 3);
17763 assert(NumLowerHalves + NumUpperHalves <= 2 && "Only 1 or 2 halves allowed");
17764
17765 // Determine the larger pattern of undef/halves, then decide if it's worth
17766 // splitting the shuffle based on subtarget capabilities and types.
17767 unsigned EltWidth = VT.getVectorElementType().getSizeInBits();
17768 if (!UndefLower) {
17769 // XXXXuuuu: no insert is needed.
17770 // Always extract lowers when setting lower - these are all free subreg ops.
17771 if (NumUpperHalves == 0)
17772 return getShuffleHalfVectors(DL, V1, V2, HalfMask, HalfIdx1, HalfIdx2,
17773 UndefLower, DAG);
17774
17775 if (NumUpperHalves == 1) {
17776 // AVX2 has efficient 32/64-bit element cross-lane shuffles.
17777 if (Subtarget.hasAVX2()) {
17778 // extract128 + vunpckhps/vshufps, is better than vblend + vpermps.
17779 if (EltWidth == 32 && NumLowerHalves && HalfVT.is128BitVector() &&
17780 !is128BitUnpackShuffleMask(HalfMask, DAG) &&
17781 (!isSingleSHUFPSMask(HalfMask) ||
17782 Subtarget.hasFastVariableCrossLaneShuffle()))
17783 return SDValue();
17784 // If this is a unary shuffle (assume that the 2nd operand is
17785 // canonicalized to undef), then we can use vpermpd. Otherwise, we
17786 // are better off extracting the upper half of 1 operand and using a
17787 // narrow shuffle.
17788 if (EltWidth == 64 && V2.isUndef())
17789 return SDValue();
17790 }
17791 // AVX512 has efficient cross-lane shuffles for all legal 512-bit types.
17792 if (Subtarget.hasAVX512() && VT.is512BitVector())
17793 return SDValue();
17794 // Extract + narrow shuffle is better than the wide alternative.
17795 return getShuffleHalfVectors(DL, V1, V2, HalfMask, HalfIdx1, HalfIdx2,
17796 UndefLower, DAG);
17797 }
17798
17799 // Don't extract both uppers, instead shuffle and then extract.
17800 assert(NumUpperHalves == 2 && "Half vector count went wrong");
17801 return SDValue();
17802 }
17803
17804 // UndefLower - uuuuXXXX: an insert to high half is required if we split this.
17805 if (NumUpperHalves == 0) {
17806 // AVX2 has efficient 64-bit element cross-lane shuffles.
17807 // TODO: Refine to account for unary shuffle, splat, and other masks?
17808 if (Subtarget.hasAVX2() && EltWidth == 64)
17809 return SDValue();
17810 // AVX512 has efficient cross-lane shuffles for all legal 512-bit types.
17811 if (Subtarget.hasAVX512() && VT.is512BitVector())
17812 return SDValue();
17813 // Narrow shuffle + insert is better than the wide alternative.
17814 return getShuffleHalfVectors(DL, V1, V2, HalfMask, HalfIdx1, HalfIdx2,
17815 UndefLower, DAG);
17816 }
17817
17818 // NumUpperHalves != 0: don't bother with extract, shuffle, and then insert.
17819 return SDValue();
17820}
17821
17822/// Handle case where shuffle sources are coming from the same 128-bit lane and
17823/// every lane can be represented as the same repeating mask - allowing us to
17824/// shuffle the sources with the repeating shuffle and then permute the result
17825/// to the destination lanes.
17826static SDValue lowerShuffleAsRepeatedMaskAndLanePermute(
17827 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
17828 const X86Subtarget &Subtarget, SelectionDAG &DAG) {
17829 int NumElts = VT.getVectorNumElements();
17830 int NumLanes = VT.getSizeInBits() / 128;
17831 int NumLaneElts = NumElts / NumLanes;
17832
17833 // On AVX2 we may be able to just shuffle the lowest elements and then
17834 // broadcast the result.
17835 if (Subtarget.hasAVX2()) {
17836 for (unsigned BroadcastSize : {16, 32, 64}) {
17837 if (BroadcastSize <= VT.getScalarSizeInBits())
17838 continue;
17839 int NumBroadcastElts = BroadcastSize / VT.getScalarSizeInBits();
17840
17841 // Attempt to match a repeating pattern every NumBroadcastElts,
17842 // accounting for UNDEFs but only referencing the lowest 128-bit
17843 // lane of the inputs.
17844 auto FindRepeatingBroadcastMask = [&](SmallVectorImpl<int> &RepeatMask) {
17845 for (int i = 0; i != NumElts; i += NumBroadcastElts)
17846 for (int j = 0; j != NumBroadcastElts; ++j) {
17847 int M = Mask[i + j];
17848 if (M < 0)
17849 continue;
17850 int &R = RepeatMask[j];
17851 if (0 != ((M % NumElts) / NumLaneElts))
17852 return false;
17853 if (0 <= R && R != M)
17854 return false;
17855 R = M;
17856 }
17857 return true;
17858 };
17859
17860 SmallVector<int, 8> RepeatMask((unsigned)NumElts, -1);
17861 if (!FindRepeatingBroadcastMask(RepeatMask))
17862 continue;
17863
17864 // Shuffle the (lowest) repeated elements in place for broadcast.
17865 SDValue RepeatShuf = DAG.getVectorShuffle(VT, DL, V1, V2, RepeatMask);
17866
17867 // Shuffle the actual broadcast.
17868 SmallVector<int, 8> BroadcastMask((unsigned)NumElts, -1);
17869 for (int i = 0; i != NumElts; i += NumBroadcastElts)
17870 for (int j = 0; j != NumBroadcastElts; ++j)
17871 BroadcastMask[i + j] = j;
17872 return DAG.getVectorShuffle(VT, DL, RepeatShuf, DAG.getUNDEF(VT),
17873 BroadcastMask);
17874 }
17875 }
17876
17877 // Bail if the shuffle mask doesn't cross 128-bit lanes.
17878 if (!is128BitLaneCrossingShuffleMask(VT, Mask))
17879 return SDValue();
17880
17881 // Bail if we already have a repeated lane shuffle mask.
17882 if (is128BitLaneRepeatedShuffleMask(VT, Mask))
17883 return SDValue();
17884
17885 // Helper to look for repeated mask in each split sublane, and that those
17886 // sublanes can then be permuted into place.
17887 auto ShuffleSubLanes = [&](int SubLaneScale) {
17888 int NumSubLanes = NumLanes * SubLaneScale;
17889 int NumSubLaneElts = NumLaneElts / SubLaneScale;
17890
17891 // Check that all the sources are coming from the same lane and see if we
17892 // can form a repeating shuffle mask (local to each sub-lane). At the same
17893 // time, determine the source sub-lane for each destination sub-lane.
17894 int TopSrcSubLane = -1;
17895 SmallVector<int, 8> Dst2SrcSubLanes((unsigned)NumSubLanes, -1);
17896 SmallVector<SmallVector<int, 8>> RepeatedSubLaneMasks(
17897 SubLaneScale,
17898 SmallVector<int, 8>((unsigned)NumSubLaneElts, SM_SentinelUndef));
17899
17900 for (int DstSubLane = 0; DstSubLane != NumSubLanes; ++DstSubLane) {
17901 // Extract the sub-lane mask, check that it all comes from the same lane
17902 // and normalize the mask entries to come from the first lane.
17903 int SrcLane = -1;
17904 SmallVector<int, 8> SubLaneMask((unsigned)NumSubLaneElts, -1);
17905 for (int Elt = 0; Elt != NumSubLaneElts; ++Elt) {
17906 int M = Mask[(DstSubLane * NumSubLaneElts) + Elt];
17907 if (M < 0)
17908 continue;
17909 int Lane = (M % NumElts) / NumLaneElts;
17910 if ((0 <= SrcLane) && (SrcLane != Lane))
17911 return SDValue();
17912 SrcLane = Lane;
17913 int LocalM = (M % NumLaneElts) + (M < NumElts ? 0 : NumElts);
17914 SubLaneMask[Elt] = LocalM;
17915 }
17916
17917 // Whole sub-lane is UNDEF.
17918 if (SrcLane < 0)
17919 continue;
17920
17921 // Attempt to match against the candidate repeated sub-lane masks.
17922 for (int SubLane = 0; SubLane != SubLaneScale; ++SubLane) {
17923 auto MatchMasks = [NumSubLaneElts](ArrayRef<int> M1, ArrayRef<int> M2) {
17924 for (int i = 0; i != NumSubLaneElts; ++i) {
17925 if (M1[i] < 0 || M2[i] < 0)
17926 continue;
17927 if (M1[i] != M2[i])
17928 return false;
17929 }
17930 return true;
17931 };
17932
17933 auto &RepeatedSubLaneMask = RepeatedSubLaneMasks[SubLane];
17934 if (!MatchMasks(SubLaneMask, RepeatedSubLaneMask))
17935 continue;
17936
17937 // Merge the sub-lane mask into the matching repeated sub-lane mask.
17938 for (int i = 0; i != NumSubLaneElts; ++i) {
17939 int M = SubLaneMask[i];
17940 if (M < 0)
17941 continue;
17942 assert((RepeatedSubLaneMask[i] < 0 || RepeatedSubLaneMask[i] == M) &&
17943 "Unexpected mask element");
17944 RepeatedSubLaneMask[i] = M;
17945 }
17946
17947 // Track the top most source sub-lane - by setting the remaining to
17948 // UNDEF we can greatly simplify shuffle matching.
17949 int SrcSubLane = (SrcLane * SubLaneScale) + SubLane;
17950 TopSrcSubLane = std::max(TopSrcSubLane, SrcSubLane);
17951 Dst2SrcSubLanes[DstSubLane] = SrcSubLane;
17952 break;
17953 }
17954
17955 // Bail if we failed to find a matching repeated sub-lane mask.
17956 if (Dst2SrcSubLanes[DstSubLane] < 0)
17957 return SDValue();
17958 }
17959 assert(0 <= TopSrcSubLane && TopSrcSubLane < NumSubLanes &&
17960 "Unexpected source lane");
17961
17962 // Create a repeating shuffle mask for the entire vector.
17963 SmallVector<int, 8> RepeatedMask((unsigned)NumElts, -1);
17964 for (int SubLane = 0; SubLane <= TopSrcSubLane; ++SubLane) {
17965 int Lane = SubLane / SubLaneScale;
17966 auto &RepeatedSubLaneMask = RepeatedSubLaneMasks[SubLane % SubLaneScale];
17967 for (int Elt = 0; Elt != NumSubLaneElts; ++Elt) {
17968 int M = RepeatedSubLaneMask[Elt];
17969 if (M < 0)
17970 continue;
17971 int Idx = (SubLane * NumSubLaneElts) + Elt;
17972 RepeatedMask[Idx] = M + (Lane * NumLaneElts);
17973 }
17974 }
17975
17976 // Shuffle each source sub-lane to its destination.
17977 SmallVector<int, 8> SubLaneMask((unsigned)NumElts, -1);
17978 for (int i = 0; i != NumElts; i += NumSubLaneElts) {
17979 int SrcSubLane = Dst2SrcSubLanes[i / NumSubLaneElts];
17980 if (SrcSubLane < 0)
17981 continue;
17982 for (int j = 0; j != NumSubLaneElts; ++j)
17983 SubLaneMask[i + j] = j + (SrcSubLane * NumSubLaneElts);
17984 }
17985
17986 // Avoid returning the same shuffle operation.
17987 // v8i32 = vector_shuffle<0,1,4,5,2,3,6,7> t5, undef:v8i32
17988 if (RepeatedMask == Mask || SubLaneMask == Mask)
17989 return SDValue();
17990
17991 SDValue RepeatedShuffle =
17992 DAG.getVectorShuffle(VT, DL, V1, V2, RepeatedMask);
17993
17994 return DAG.getVectorShuffle(VT, DL, RepeatedShuffle, DAG.getUNDEF(VT),
17995 SubLaneMask);
17996 };
17997
17998 // On AVX2 targets we can permute 256-bit vectors as 64-bit sub-lanes
17999 // (with PERMQ/PERMPD). On AVX2/AVX512BW targets, permuting 32-bit sub-lanes,
18000 // even with a variable shuffle, can be worth it for v32i8/v64i8 vectors.
18001 // Otherwise we can only permute whole 128-bit lanes.
18002 int MinSubLaneScale = 1, MaxSubLaneScale = 1;
18003 if (Subtarget.hasAVX2() && VT.is256BitVector()) {
18004 bool OnlyLowestElts = isUndefOrInRange(Mask, 0, NumLaneElts);
18005 MinSubLaneScale = 2;
18006 MaxSubLaneScale =
18007 (!OnlyLowestElts && V2.isUndef() && VT == MVT::v32i8) ? 4 : 2;
18008 }
18009 if (Subtarget.hasBWI() && VT == MVT::v64i8)
18010 MinSubLaneScale = MaxSubLaneScale = 4;
18011
18012 for (int Scale = MinSubLaneScale; Scale <= MaxSubLaneScale; Scale *= 2)
18013 if (SDValue Shuffle = ShuffleSubLanes(Scale))
18014 return Shuffle;
18015
18016 return SDValue();
18017}
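// A minimal standalone sketch of the sub-lane arithmetic exercised by the
// scale loop above (helper name is illustrative only and not part of the
// original file; 128-bit lanes are assumed, as in the surrounding code):
static inline int sketchNumSubLaneElts(int NumElts, int NumLanes, int Scale) {
  int NumLaneElts = NumElts / NumLanes; // elements per 128-bit lane
  return NumLaneElts / Scale;           // elements per repeated sub-lane
}
// e.g. for v32i8 (32 elements, 2 lanes): Scale 2 -> 8 elements (64-bit
// sub-lanes, PERMQ/PERMPD granularity), Scale 4 -> 4 elements (32-bit
// sub-lanes).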
18018
18019static bool matchShuffleWithSHUFPD(MVT VT, SDValue &V1, SDValue &V2,
18020 bool &ForceV1Zero, bool &ForceV2Zero,
18021 unsigned &ShuffleImm, ArrayRef<int> Mask,
18022 const APInt &Zeroable) {
18023 int NumElts = VT.getVectorNumElements();
18024 assert(VT.getScalarSizeInBits() == 64 &&
18025 (NumElts == 2 || NumElts == 4 || NumElts == 8) &&
18026 "Unexpected data type for VSHUFPD");
18027 assert(isUndefOrZeroOrInRange(Mask, 0, 2 * NumElts) &&
18028 "Illegal shuffle mask");
18029
18030 bool ZeroLane[2] = { true, true };
18031 for (int i = 0; i < NumElts; ++i)
18032 ZeroLane[i & 1] &= Zeroable[i];
18033
18034 // Mask for V8F64: 0/1, 8/9, 2/3, 10/11, 4/5, ..
18035 // Mask for V4F64: 0/1, 4/5, 2/3, 6/7..
18036 ShuffleImm = 0;
18037 bool ShufpdMask = true;
18038 bool CommutableMask = true;
18039 for (int i = 0; i < NumElts; ++i) {
18040 if (Mask[i] == SM_SentinelUndef || ZeroLane[i & 1])
18041 continue;
18042 if (Mask[i] < 0)
18043 return false;
18044 int Val = (i & 6) + NumElts * (i & 1);
18045 int CommutVal = (i & 0xe) + NumElts * ((i & 1) ^ 1);
18046 if (Mask[i] < Val || Mask[i] > Val + 1)
18047 ShufpdMask = false;
18048 if (Mask[i] < CommutVal || Mask[i] > CommutVal + 1)
18049 CommutableMask = false;
18050 ShuffleImm |= (Mask[i] % 2) << i;
18051 }
18052
18053 if (!ShufpdMask && !CommutableMask)
18054 return false;
18055
18056 if (!ShufpdMask && CommutableMask)
18057 std::swap(V1, V2);
18058
18059 ForceV1Zero = ZeroLane[0];
18060 ForceV2Zero = ZeroLane[1];
18061 return true;
18062}
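// A minimal standalone sketch of the VSHUFPD immediate accumulated above
// (function name is illustrative only, not from this file): bit i of the
// immediate selects the odd (1) or even (0) double feeding result element i.
static unsigned sketchShufpdImm(const int Mask[4]) {
  unsigned Imm = 0;
  for (int i = 0; i != 4; ++i)
    if (Mask[i] >= 0)
      Imm |= (Mask[i] % 2) << i; // only the low bit of each index matters
  return Imm;                    // e.g. Mask = {0, 5, 2, 7} -> 0b1010
}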
18063
18064static SDValue lowerShuffleWithSHUFPD(const SDLoc &DL, MVT VT, SDValue V1,
18065 SDValue V2, ArrayRef<int> Mask,
18066 const APInt &Zeroable,
18067 const X86Subtarget &Subtarget,
18068 SelectionDAG &DAG) {
18069 assert((VT == MVT::v2f64 || VT == MVT::v4f64 || VT == MVT::v8f64) &&
18070 "Unexpected data type for VSHUFPD");
18071
18072 unsigned Immediate = 0;
18073 bool ForceV1Zero = false, ForceV2Zero = false;
18074 if (!matchShuffleWithSHUFPD(VT, V1, V2, ForceV1Zero, ForceV2Zero, Immediate,
18075 Mask, Zeroable))
18076 return SDValue();
18077
18078 // Create a REAL zero vector - ISD::isBuildVectorAllZeros allows UNDEFs.
18079 if (ForceV1Zero)
18080 V1 = getZeroVector(VT, Subtarget, DAG, DL);
18081 if (ForceV2Zero)
18082 V2 = getZeroVector(VT, Subtarget, DAG, DL);
18083
18084 return DAG.getNode(X86ISD::SHUFP, DL, VT, V1, V2,
18085 DAG.getTargetConstant(Immediate, DL, MVT::i8));
18086}
18087
18088 // Look for {0, 8, 16, 24, 32, 40, 48, 56} in the first 8 elements, followed
18089 // by zeroable elements in the remaining 24 elements. Turn this into two
18090 // vmovqb instructions shuffled together.
18091static SDValue lowerShuffleAsVTRUNCAndUnpack(const SDLoc &DL, MVT VT,
18092 SDValue V1, SDValue V2,
18093 ArrayRef<int> Mask,
18094 const APInt &Zeroable,
18095 SelectionDAG &DAG) {
18096 assert(VT == MVT::v32i8 && "Unexpected type!");
18097
18098 // The first 8 indices should be every 8th element.
18099 if (!isSequentialOrUndefInRange(Mask, 0, 8, 0, 8))
18100 return SDValue();
18101
18102 // Remaining elements need to be zeroable.
18103 if (Zeroable.countl_one() < (Mask.size() - 8))
18104 return SDValue();
18105
18106 V1 = DAG.getBitcast(MVT::v4i64, V1);
18107 V2 = DAG.getBitcast(MVT::v4i64, V2);
18108
18109 V1 = DAG.getNode(X86ISD::VTRUNC, DL, MVT::v16i8, V1);
18110 V2 = DAG.getNode(X86ISD::VTRUNC, DL, MVT::v16i8, V2);
18111
18112 // The VTRUNCs will put 0s in the upper 12 bytes. Use them to put zeroes in
18113 // the upper bits of the result using an unpckldq.
18114 SDValue Unpack = DAG.getVectorShuffle(MVT::v16i8, DL, V1, V2,
18115 { 0, 1, 2, 3, 16, 17, 18, 19,
18116 4, 5, 6, 7, 20, 21, 22, 23 });
18117 // Insert the unpckldq into a zero vector to widen to v32i8.
18118 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, MVT::v32i8,
18119 DAG.getConstant(0, DL, MVT::v32i8), Unpack,
18120 DAG.getIntPtrConstant(0, DL));
18121}
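// A standalone sketch of the pattern checked above (helper and parameter
// names are illustrative only; -1 stands in for an undef mask element): the
// first 8 entries must walk every 8th element and the remaining 24 positions
// must be zeroable.
static bool sketchIsVTruncUnpackMask(const int Mask[32],
                                     const bool Zeroable[32]) {
  for (int i = 0; i != 8; ++i)
    if (Mask[i] != -1 && Mask[i] != i * 8)
      return false;
  for (int i = 8; i != 32; ++i)
    if (!Zeroable[i])
      return false;
  return true;
}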
18122
18123// a = shuffle v1, v2, mask1 ; interleaving lower lanes of v1 and v2
18124// b = shuffle v1, v2, mask2 ; interleaving higher lanes of v1 and v2
18125// =>
18126// ul = unpckl v1, v2
18127// uh = unpckh v1, v2
18128// a = vperm ul, uh
18129// b = vperm ul, uh
18130//
18131// Pattern-match interleave(256b v1, 256b v2) -> 512b v3 and lower it into unpck
18132// and permute. We cannot directly match v3 because it is split into two
18133// 256-bit vectors in earlier isel stages. Therefore, this function matches a
18134// pair of 256-bit shuffles and makes sure the masks are consecutive.
18135//
18136// Once unpck and permute nodes are created, the permute corresponding to this
18137// shuffle is returned, while the other permute replaces the other half of the
18138// shuffle in the selection dag.
18139static SDValue lowerShufflePairAsUNPCKAndPermute(const SDLoc &DL, MVT VT,
18140 SDValue V1, SDValue V2,
18141 ArrayRef<int> Mask,
18142 SelectionDAG &DAG) {
18143 if (VT != MVT::v8f32 && VT != MVT::v8i32 && VT != MVT::v16i16 &&
18144 VT != MVT::v32i8)
18145 return SDValue();
18146 // <B0, B1, B0+1, B1+1, ..., >
18147 auto IsInterleavingPattern = [&](ArrayRef<int> Mask, unsigned Begin0,
18148 unsigned Begin1) {
18149 size_t Size = Mask.size();
18150 assert(Size % 2 == 0 && "Expected even mask size");
18151 for (unsigned I = 0; I < Size; I += 2) {
18152 if (Mask[I] != (int)(Begin0 + I / 2) ||
18153 Mask[I + 1] != (int)(Begin1 + I / 2))
18154 return false;
18155 }
18156 return true;
18157 };
18158 // Check which half of the interleave this shuffle node is
18159 int NumElts = VT.getVectorNumElements();
18160 size_t FirstQtr = NumElts / 2;
18161 size_t ThirdQtr = NumElts + NumElts / 2;
18162 bool IsFirstHalf = IsInterleavingPattern(Mask, 0, NumElts);
18163 bool IsSecondHalf = IsInterleavingPattern(Mask, FirstQtr, ThirdQtr);
18164 if (!IsFirstHalf && !IsSecondHalf)
18165 return SDValue();
18166
18167 // Find the intersection between shuffle users of V1 and V2.
18168 SmallVector<SDNode *, 2> Shuffles;
18169 for (SDNode *User : V1->uses())
18170 if (User->getOpcode() == ISD::VECTOR_SHUFFLE && User->getOperand(0) == V1 &&
18171 User->getOperand(1) == V2)
18172 Shuffles.push_back(User);
18173 // Limit user size to two for now.
18174 if (Shuffles.size() != 2)
18175 return SDValue();
18176 // Find out which half of the 512-bit shuffle each smaller shuffle is
18177 auto *SVN1 = cast<ShuffleVectorSDNode>(Shuffles[0]);
18178 auto *SVN2 = cast<ShuffleVectorSDNode>(Shuffles[1]);
18179 SDNode *FirstHalf;
18180 SDNode *SecondHalf;
18181 if (IsInterleavingPattern(SVN1->getMask(), 0, NumElts) &&
18182 IsInterleavingPattern(SVN2->getMask(), FirstQtr, ThirdQtr)) {
18183 FirstHalf = Shuffles[0];
18184 SecondHalf = Shuffles[1];
18185 } else if (IsInterleavingPattern(SVN1->getMask(), FirstQtr, ThirdQtr) &&
18186 IsInterleavingPattern(SVN2->getMask(), 0, NumElts)) {
18187 FirstHalf = Shuffles[1];
18188 SecondHalf = Shuffles[0];
18189 } else {
18190 return SDValue();
18191 }
18192 // Lower into unpck and perm. Return the perm of this shuffle and replace
18193 // the other.
18194 SDValue Unpckl = DAG.getNode(X86ISD::UNPCKL, DL, VT, V1, V2);
18195 SDValue Unpckh = DAG.getNode(X86ISD::UNPCKH, DL, VT, V1, V2);
18196 SDValue Perm1 = DAG.getNode(X86ISD::VPERM2X128, DL, VT, Unpckl, Unpckh,
18197 DAG.getTargetConstant(0x20, DL, MVT::i8));
18198 SDValue Perm2 = DAG.getNode(X86ISD::VPERM2X128, DL, VT, Unpckl, Unpckh,
18199 DAG.getTargetConstant(0x31, DL, MVT::i8));
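// (In the VPERM2X128 immediate, bits [1:0] choose the 128-bit half feeding
// the low half of the result and bits [5:4] the high half, with values 0-1
// indexing the first operand's halves and 2-3 the second's; so 0x20
// concatenates the low halves of Unpckl and Unpckh, and 0x31 their high
// halves.)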
18200 if (IsFirstHalf) {
18201 DAG.ReplaceAllUsesWith(SecondHalf, &Perm2);
18202 return Perm1;
18203 }
18204 DAG.ReplaceAllUsesWith(FirstHalf, &Perm1);
18205 return Perm2;
18206}
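// As a concrete worked example (illustrative, for v8i32): the shuffle pair
// with masks {0,8,1,9,2,10,3,11} and {4,12,5,13,6,14,7,15} is rewritten into
// UNPCKL/UNPCKH of V1 and V2 plus the two VPERM2X128 permutes built above,
// which together produce the full 16-element interleave of the two inputs.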
18207
18208/// Handle lowering of 4-lane 64-bit floating point shuffles.
18209///
18210/// Also ends up handling lowering of 4-lane 64-bit integer shuffles when AVX2
18211/// isn't available.
18212static SDValue lowerV4F64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
18213 const APInt &Zeroable, SDValue V1, SDValue V2,
18214 const X86Subtarget &Subtarget,
18215 SelectionDAG &DAG) {
18216 assert(V1.getSimpleValueType() == MVT::v4f64 && "Bad operand type!");
1) '?' condition is true
18217 assert(V2.getSimpleValueType() == MVT::v4f64 && "Bad operand type!");
2) '?' condition is true
18218 assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
3) Assuming the condition is true
4) '?' condition is true
18219
18220 if (SDValue V = lowerV2X128Shuffle(DL, MVT::v4f64, V1, V2, Mask, Zeroable,
5) Taking false branch
18221 Subtarget, DAG))
18222 return V;
18223
18224 if (V2.isUndef()) {
6) Taking false branch
18225 // Check for being able to broadcast a single element.
18226 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v4f64, V1, V2,
18227 Mask, Subtarget, DAG))
18228 return Broadcast;
18229
18230 // Use low duplicate instructions for masks that match their pattern.
18231 if (isShuffleEquivalent(Mask, {0, 0, 2, 2}, V1, V2))
18232 return DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v4f64, V1);
18233
18234 if (!is128BitLaneCrossingShuffleMask(MVT::v4f64, Mask)) {
18235 // Non-half-crossing single input shuffles can be lowered with an
18236 // interleaved permutation.
18237 unsigned VPERMILPMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1) |
18238 ((Mask[2] == 3) << 2) | ((Mask[3] == 3) << 3);
18239 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v4f64, V1,
18240 DAG.getTargetConstant(VPERMILPMask, DL, MVT::i8));
18241 }
18242
18243 // With AVX2 we have direct support for this permutation.
18244 if (Subtarget.hasAVX2())
18245 return DAG.getNode(X86ISD::VPERMI, DL, MVT::v4f64, V1,
18246 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
18247
18248 // Try to create an in-lane repeating shuffle mask and then shuffle the
18249 // results into the target lanes.
18250 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
18251 DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
18252 return V;
18253
18254 // Try to permute the lanes and then use a per-lane permute.
18255 if (SDValue V = lowerShuffleAsLanePermuteAndPermute(DL, MVT::v4f64, V1, V2,
18256 Mask, DAG, Subtarget))
18257 return V;
18258
18259 // Otherwise, fall back.
18260 return lowerShuffleAsLanePermuteAndShuffle(DL, MVT::v4f64, V1, V2, Mask,
18261 DAG, Subtarget);
18262 }
18263
18264 // Use dedicated unpack instructions for masks that match their pattern.
18265 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v4f64, Mask, V1, V2, DAG))
7) Taking false branch
18266 return V;
18267
18268 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v4f64, V1, V2, Mask,
8) Taking false branch
18269 Zeroable, Subtarget, DAG))
18270 return Blend;
18271
18272 // Check if the blend happens to exactly fit that of SHUFPD.
18273 if (SDValue Op = lowerShuffleWithSHUFPD(DL, MVT::v4f64, V1, V2, Mask,
9) Taking false branch
18274 Zeroable, Subtarget, DAG))
18275 return Op;
18276
18277 bool V1IsInPlace = isShuffleMaskInputInPlace(0, Mask);
18278 bool V2IsInPlace = isShuffleMaskInputInPlace(1, Mask);
18279
18280 // If we have lane crossing shuffles AND they don't all come from the lower
18281 // lane elements, lower to SHUFPD(VPERM2F128(V1, V2), VPERM2F128(V1, V2)).
18282 // TODO: Handle BUILD_VECTOR sources, which getVectorShuffle currently
18283 // canonicalizes to a blend of splats that isn't necessary for this combine.
18284 if (is128BitLaneCrossingShuffleMask(MVT::v4f64, Mask) &&
10) Assuming the condition is false
18285 !all_of(Mask, [](int M) { return M < 2 || (4 <= M && M < 6); }) &&
18286 (V1.getOpcode() != ISD::BUILD_VECTOR) &&
18287 (V2.getOpcode() != ISD::BUILD_VECTOR))
18288 return lowerShuffleAsLanePermuteAndSHUFP(DL, MVT::v4f64, V1, V2, Mask, DAG);
18289
18290 // If we have one input in place, then we can permute the other input and
18291 // blend the result.
18292 if (V1IsInPlace || V2IsInPlace)
10.1) 'V1IsInPlace' is false
10.2) 'V2IsInPlace' is false
11) Taking false branch
18293 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v4f64, V1, V2, Mask,
18294 Subtarget, DAG);
18295
18296 // Try to create an in-lane repeating shuffle mask and then shuffle the
18297 // results into the target lanes.
18298 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
18299 DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
18300 return V;
18301
18302 // Try to simplify this by merging 128-bit lanes to enable a lane-based
18303 // shuffle. However, if we have AVX2 and either input is already in place,
18304 // we will be able to shuffle the other input even across lanes in a single
18305 // instruction, so skip this pattern.
18306 if (!(Subtarget.hasAVX2() && (V1IsInPlace || V2IsInPlace)))
11.1) 'V1IsInPlace' is false
12) Taking true branch
18307 if (SDValue V = lowerShuffleAsLanePermuteAndRepeatedMask(
13) Calling 'lowerShuffleAsLanePermuteAndRepeatedMask'
18308 DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
18309 return V;
18310
18311 // If we have VLX support, we can use VEXPAND.
18312 if (Subtarget.hasVLX())
18313 if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v4f64, Zeroable, Mask, V1, V2,
18314 DAG, Subtarget))
18315 return V;
18316
18317 // If we have AVX2 then we always want to lower with a blend because at v4 we
18318 // can fully permute the elements.
18319 if (Subtarget.hasAVX2())
18320 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v4f64, V1, V2, Mask,
18321 Subtarget, DAG);
18322
18323 // Otherwise fall back on generic lowering.
18324 return lowerShuffleAsSplitOrBlend(DL, MVT::v4f64, V1, V2, Mask,
18325 Subtarget, DAG);
18326}
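// A minimal standalone sketch of the VPERMILPD immediate built in the
// non-lane-crossing single-input case above (helper name is illustrative
// only): bit i picks the high (1) or low (0) double of the 128-bit lane that
// feeds result element i.
static unsigned sketchVPermilPDImm(const int Mask[4]) {
  return (Mask[0] == 1) | ((Mask[1] == 1) << 1) |
         ((Mask[2] == 3) << 2) | ((Mask[3] == 3) << 3);
}
// e.g. Mask = {1, 0, 3, 2} (swap within each lane) -> 0b0101.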
18327
18328/// Handle lowering of 4-lane 64-bit integer shuffles.
18329///
18330/// This routine is only called when we have AVX2 and thus a reasonable
18331/// instruction set for v4i64 shuffling.
18332static SDValue lowerV4I64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
18333 const APInt &Zeroable, SDValue V1, SDValue V2,
18334 const X86Subtarget &Subtarget,
18335 SelectionDAG &DAG) {
18336 assert(V1.getSimpleValueType() == MVT::v4i64 && "Bad operand type!");
18337 assert(V2.getSimpleValueType() == MVT::v4i64 && "Bad operand type!");
18338 assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
18339 assert(Subtarget.hasAVX2() && "We can only lower v4i64 with AVX2!");
18340
18341 if (SDValue V = lowerV2X128Shuffle(DL, MVT::v4i64, V1, V2, Mask, Zeroable,
18342 Subtarget, DAG))
18343 return V;
18344
18345 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v4i64, V1, V2, Mask,
18346 Zeroable, Subtarget, DAG))
18347 return Blend;
18348
18349 // Check for being able to broadcast a single element.
18350 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v4i64, V1, V2, Mask,
18351 Subtarget, DAG))
18352 return Broadcast;
18353
18354 // Try to use shift instructions if fast.
18355 if (Subtarget.preferLowerShuffleAsShift())
18356 if (SDValue Shift =
18357 lowerShuffleAsShift(DL, MVT::v4i64, V1, V2, Mask, Zeroable,
18358 Subtarget, DAG, /*BitwiseOnly*/ true))
18359 return Shift;
18360
18361 if (V2.isUndef()) {
18362 // When the shuffle is mirrored between the 128-bit lanes of the unit, we
18363 // can use lower latency instructions that will operate on both lanes.
18364 SmallVector<int, 2> RepeatedMask;
18365 if (is128BitLaneRepeatedShuffleMask(MVT::v4i64, Mask, RepeatedMask)) {
18366 SmallVector<int, 4> PSHUFDMask;
18367 narrowShuffleMaskElts(2, RepeatedMask, PSHUFDMask);
18368 return DAG.getBitcast(
18369 MVT::v4i64,
18370 DAG.getNode(X86ISD::PSHUFD, DL, MVT::v8i32,
18371 DAG.getBitcast(MVT::v8i32, V1),
18372 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
18373 }
18374
18375 // AVX2 provides a direct instruction for permuting a single input across
18376 // lanes.
18377 return DAG.getNode(X86ISD::VPERMI, DL, MVT::v4i64, V1,
18378 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
18379 }
18380
18381 // Try to use shift instructions.
18382 if (SDValue Shift =
18383 lowerShuffleAsShift(DL, MVT::v4i64, V1, V2, Mask, Zeroable, Subtarget,
18384 DAG, /*BitwiseOnly*/ false))
18385 return Shift;
18386
18387 // If we have VLX support, we can use VALIGN or VEXPAND.
18388 if (Subtarget.hasVLX()) {
18389 if (SDValue Rotate = lowerShuffleAsVALIGN(DL, MVT::v4i64, V1, V2, Mask,
18390 Subtarget, DAG))
18391 return Rotate;
18392
18393 if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v4i64, Zeroable, Mask, V1, V2,
18394 DAG, Subtarget))
18395 return V;
18396 }
18397
18398 // Try to use PALIGNR.
18399 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v4i64, V1, V2, Mask,
18400 Subtarget, DAG))
18401 return Rotate;
18402
18403 // Use dedicated unpack instructions for masks that match their pattern.
18404 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v4i64, Mask, V1, V2, DAG))
18405 return V;
18406
18407 bool V1IsInPlace = isShuffleMaskInputInPlace(0, Mask);
18408 bool V2IsInPlace = isShuffleMaskInputInPlace(1, Mask);
18409
18410 // If we have one input in place, then we can permute the other input and
18411 // blend the result.
18412 if (V1IsInPlace || V2IsInPlace)
18413 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v4i64, V1, V2, Mask,
18414 Subtarget, DAG);
18415
18416 // Try to create an in-lane repeating shuffle mask and then shuffle the
18417 // results into the target lanes.
18418 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
18419 DL, MVT::v4i64, V1, V2, Mask, Subtarget, DAG))
18420 return V;
18421
18422 // Try to lower to PERMQ(BLENDD(V1,V2)).
18423 if (SDValue V =
18424 lowerShuffleAsBlendAndPermute(DL, MVT::v4i64, V1, V2, Mask, DAG))
18425 return V;
18426
18427 // Try to simplify this by merging 128-bit lanes to enable a lane-based
18428 // shuffle. However, if we have AVX2 and either input is already in place,
18429 // we will be able to shuffle the other input even across lanes in a single
18430 // instruction, so skip this pattern.
18431 if (!V1IsInPlace && !V2IsInPlace)
18432 if (SDValue Result = lowerShuffleAsLanePermuteAndRepeatedMask(
18433 DL, MVT::v4i64, V1, V2, Mask, Subtarget, DAG))
18434 return Result;
18435
18436 // Otherwise fall back on generic blend lowering.
18437 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v4i64, V1, V2, Mask,
18438 Subtarget, DAG);
18439}
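// A minimal standalone sketch (assumed semantics, names illustrative only) of
// how the repeated 2-element v4i64 mask is narrowed into the 4-element PSHUFD
// mask used above: each 64-bit index i expands to the 32-bit indices 2*i and
// 2*i+1, with undef (-1) staying undef.
static void sketchNarrowMaskBy2(const int Repeated2[2], int PSHUFD4[4]) {
  for (int i = 0; i != 2; ++i) {
    PSHUFD4[2 * i + 0] = Repeated2[i] < 0 ? -1 : 2 * Repeated2[i] + 0;
    PSHUFD4[2 * i + 1] = Repeated2[i] < 0 ? -1 : 2 * Repeated2[i] + 1;
  }
}
// e.g. {1, 0} -> {2, 3, 0, 1}, which PSHUFD encodes directly as an immediate.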
18440
18441/// Handle lowering of 8-lane 32-bit floating point shuffles.
18442///
18443/// Also ends up handling lowering of 8-lane 32-bit integer shuffles when AVX2
18444/// isn't available.
18445static SDValue lowerV8F32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
18446 const APInt &Zeroable, SDValue V1, SDValue V2,
18447 const X86Subtarget &Subtarget,
18448 SelectionDAG &DAG) {
18449 assert(V1.getSimpleValueType() == MVT::v8f32 && "Bad operand type!");
18450 assert(V2.getSimpleValueType() == MVT::v8f32 && "Bad operand type!");
18451 assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
18452
18453 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v8f32, V1, V2, Mask,
18454 Zeroable, Subtarget, DAG))
18455 return Blend;
18456
18457 // Check for being able to broadcast a single element.
18458 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v8f32, V1, V2, Mask,
18459 Subtarget, DAG))
18460 return Broadcast;
18461
18462 if (!Subtarget.hasAVX2()) {
18463 SmallVector<int> InLaneMask;
18464 computeInLaneShuffleMask(Mask, Mask.size() / 2, InLaneMask);
18465
18466 if (!is128BitLaneRepeatedShuffleMask(MVT::v8f32, InLaneMask))
18467 if (SDValue R = splitAndLowerShuffle(DL, MVT::v8f32, V1, V2, Mask, DAG,
18468 /*SimpleOnly*/ true))
18469 return R;
18470 }
18471 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(DL, MVT::v8i32, V1, V2, Mask,
18472 Zeroable, Subtarget, DAG))
18473 return DAG.getBitcast(MVT::v8f32, ZExt);
18474
18475 // If the shuffle mask is repeated in each 128-bit lane, we have many more
18476 // options to efficiently lower the shuffle.
18477 SmallVector<int, 4> RepeatedMask;
18478 if (is128BitLaneRepeatedShuffleMask(MVT::v8f32, Mask, RepeatedMask)) {
18479 assert(RepeatedMask.size() == 4 &&
18480 "Repeated masks must be half the mask width!");
18481
18482 // Use even/odd duplicate instructions for masks that match their pattern.
18483 if (isShuffleEquivalent(RepeatedMask, {0, 0, 2, 2}, V1, V2))
18484 return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v8f32, V1);
18485 if (isShuffleEquivalent(RepeatedMask, {1, 1, 3, 3}, V1, V2))
18486 return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v8f32, V1);
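// (For reference: the repeated mask {0,0,2,2} corresponds to the full v8f32
// mask {0,0,2,2,4,4,6,6}, which is exactly what MOVSLDUP produces, and
// {1,1,3,3} to {1,1,3,3,5,5,7,7}, matching MOVSHDUP.)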
18487
18488 if (V2.isUndef())
18489 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v8f32, V1,
18490 getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
18491
18492 // Use dedicated unpack instructions for masks that match their pattern.
18493 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v8f32, Mask, V1, V2, DAG))
18494 return V;
18495
18496 // Otherwise, fall back to a SHUFPS sequence. Here it is important that we
18497 // have already handled any direct blends.
18498 return lowerShuffleWithSHUFPS(DL, MVT::v8f32, RepeatedMask, V1, V2, DAG);
18499 }
18500
18501 // Try to create an in-lane repeating shuffle mask and then shuffle the
18502 // results into the target lanes.
18503 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
18504 DL, MVT::v8f32, V1, V2, Mask, Subtarget, DAG))
18505 return V;
18506
18507 // If we have a single input shuffle with different shuffle patterns in the
18508 // two 128-bit lanes use the variable mask to VPERMILPS.
18509 if (V2.isUndef()) {
18510 if (!is128BitLaneCrossingShuffleMask(MVT::v8f32, Mask)) {
18511 SDValue VPermMask = getConstVector(Mask, MVT::v8i32, DAG, DL, true);
18512 return DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v8f32, V1, VPermMask);
18513 }
18514 if (Subtarget.hasAVX2()) {
18515 SDValue VPermMask = getConstVector(Mask, MVT::v8i32, DAG, DL, true);
18516 return DAG.getNode(X86ISD::VPERMV, DL, MVT::v8f32, VPermMask, V1);
18517 }
18518 // Otherwise, fall back.
18519 return lowerShuffleAsLanePermuteAndShuffle(DL, MVT::v8f32, V1, V2, Mask,
18520 DAG, Subtarget);
18521 }
18522
18523 // Try to simplify this by merging 128-bit lanes to enable a lane-based
18524 // shuffle.
18525 if (SDValue Result = lowerShuffleAsLanePermuteAndRepeatedMask(
18526 DL, MVT::v8f32, V1, V2, Mask, Subtarget, DAG))
18527 return Result;
18528
18529 // If we have VLX support, we can use VEXPAND.
18530 if (Subtarget.hasVLX())
18531 if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v8f32, Zeroable, Mask, V1, V2,
18532 DAG, Subtarget))
18533 return V;
18534
18535 // Try to match an interleave of two v8f32s and lower them as unpck and
18536 // permutes using ymms. This needs to go before we try to split the vectors.
18537 //
18538 // TODO: Expand this to AVX1. Currently v8i32 is cast to v8f32 and hits
18539 // this path inadvertently.
18540 if (Subtarget.hasAVX2() && !Subtarget.hasAVX512())
18541 if (SDValue V = lowerShufflePairAsUNPCKAndPermute(DL, MVT::v8f32, V1, V2,
18542 Mask, DAG))
18543 return V;
18544
18545 // For non-AVX512, if the mask is of in-lane 16-bit elements then try to split,
18546 // since after the split we get more efficient code using vpunpcklwd and
18547 // vpunpckhwd instructions than with vblend.
18548 if (!Subtarget.hasAVX512() && isUnpackWdShuffleMask(Mask, MVT::v8f32, DAG))
18549 return lowerShuffleAsSplitOrBlend(DL, MVT::v8f32, V1, V2, Mask, Subtarget,
18550 DAG);
18551
18552 // If we have AVX2 then we always want to lower with a blend because at v8 we
18553 // can fully permute the elements.
18554 if (Subtarget.hasAVX2())
18555 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v8f32, V1, V2, Mask,
18556 Subtarget, DAG);
18557
18558 // Otherwise fall back on generic lowering.
18559 return lowerShuffleAsSplitOrBlend(DL, MVT::v8f32, V1, V2, Mask,
18560 Subtarget, DAG);
18561}
18562
18563/// Handle lowering of 8-lane 32-bit integer shuffles.
18564///
18565/// This routine is only called when we have AVX2 and thus a reasonable
18566/// instruction set for v8i32 shuffling.
18567static SDValue lowerV8I32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
18568 const APInt &Zeroable, SDValue V1, SDValue V2,
18569 const X86Subtarget &Subtarget,
18570 SelectionDAG &DAG) {
18571 assert(V1.getSimpleValueType() == MVT::v8i32 && "Bad operand type!");
18572 assert(V2.getSimpleValueType() == MVT::v8i32 && "Bad operand type!");
18573 assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
18574 assert(Subtarget.hasAVX2() && "We can only lower v8i32 with AVX2!");
18575
18576 int NumV2Elements = count_if(Mask, [](int M) { return M >= 8; });
18577
18578 // Whenever we can lower this as a zext, that instruction is strictly faster
18579 // than any alternative. It also allows us to fold memory operands into the
18580 // shuffle in many cases.
18581 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(DL, MVT::v8i32, V1, V2, Mask,
18582 Zeroable, Subtarget, DAG))
18583 return ZExt;
18584
18585 // Try to match an interleave of two v8i32s and lower them as unpck and
18586 // permutes using ymms. This needs to go before we try to split the vectors.
18587 if (!Subtarget.hasAVX512())
18588 if (SDValue V = lowerShufflePairAsUNPCKAndPermute(DL, MVT::v8i32, V1, V2,
18589 Mask, DAG))
18590 return V;
18591
18592 // For non-AVX512, if the mask is of in-lane 16-bit elements then try to split,
18593 // since after the split we get more efficient code than with vblend by using
18594 // vpunpcklwd and vpunpckhwd instructions.
18595 if (isUnpackWdShuffleMask(Mask, MVT::v8i32, DAG) && !V2.isUndef() &&
18596 !Subtarget.hasAVX512())
18597 return lowerShuffleAsSplitOrBlend(DL, MVT::v8i32, V1, V2, Mask, Subtarget,
18598 DAG);
18599
18600 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v8i32, V1, V2, Mask,
18601 Zeroable, Subtarget, DAG))
18602 return Blend;
18603
18604 // Check for being able to broadcast a single element.
18605 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v8i32, V1, V2, Mask,
18606 Subtarget, DAG))
18607 return Broadcast;
18608
18609 // Try to use shift instructions if fast.
18610 if (Subtarget.preferLowerShuffleAsShift()) {
18611 if (SDValue Shift =
18612 lowerShuffleAsShift(DL, MVT::v8i32, V1, V2, Mask, Zeroable,
18613 Subtarget, DAG, /*BitwiseOnly*/ true))
18614 return Shift;
18615 if (NumV2Elements == 0)
18616 if (SDValue Rotate =
18617 lowerShuffleAsBitRotate(DL, MVT::v8i32, V1, Mask, Subtarget, DAG))
18618 return Rotate;
18619 }
18620
18621 // If the shuffle mask is repeated in each 128-bit lane we can use more
18622 // efficient instructions that mirror the shuffles across the two 128-bit
18623 // lanes.
18624 SmallVector<int, 4> RepeatedMask;
18625 bool Is128BitLaneRepeatedShuffle =
18626 is128BitLaneRepeatedShuffleMask(MVT::v8i32, Mask, RepeatedMask);
18627 if (Is128BitLaneRepeatedShuffle) {
18628 assert(RepeatedMask.size() == 4 && "Unexpected repeated mask size!");
18629 if (V2.isUndef())
18630 return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v8i32, V1,
18631 getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
18632
18633 // Use dedicated unpack instructions for masks that match their pattern.
18634 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v8i32, Mask, V1, V2, DAG))
18635 return V;
18636 }
18637
18638 // Try to use shift instructions.
18639 if (SDValue Shift =
18640 lowerShuffleAsShift(DL, MVT::v8i32, V1, V2, Mask, Zeroable, Subtarget,
18641 DAG, /*BitwiseOnly*/ false))
18642 return Shift;
18643
18644 if (!Subtarget.preferLowerShuffleAsShift() && NumV2Elements == 0)
18645 if (SDValue Rotate =
18646 lowerShuffleAsBitRotate(DL, MVT::v8i32, V1, Mask, Subtarget, DAG))
18647 return Rotate;
18648
18649 // If we have VLX support, we can use VALIGN or EXPAND.
18650 if (Subtarget.hasVLX()) {
18651 if (SDValue Rotate = lowerShuffleAsVALIGN(DL, MVT::v8i32, V1, V2, Mask,
18652 Subtarget, DAG))
18653 return Rotate;
18654
18655 if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v8i32, Zeroable, Mask, V1, V2,
18656 DAG, Subtarget))
18657 return V;
18658 }
18659
18660 // Try to use byte rotation instructions.
18661 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v8i32, V1, V2, Mask,
18662 Subtarget, DAG))
18663 return Rotate;
18664
18665 // Try to create an in-lane repeating shuffle mask and then shuffle the
18666 // results into the target lanes.
18667 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
18668 DL, MVT::v8i32, V1, V2, Mask, Subtarget, DAG))
18669 return V;
18670
18671 if (V2.isUndef()) {
18672 // Try to produce a fixed cross-128-bit lane permute followed by unpack
18673 // because that should be faster than the variable permute alternatives.
18674 if (SDValue V = lowerShuffleWithUNPCK256(DL, MVT::v8i32, Mask, V1, V2, DAG))
18675 return V;
18676
18677 // If the shuffle patterns aren't repeated but it's a single input, directly
18678 // generate a cross-lane VPERMD instruction.
18679 SDValue VPermMask = getConstVector(Mask, MVT::v8i32, DAG, DL, true);
18680 return DAG.getNode(X86ISD::VPERMV, DL, MVT::v8i32, VPermMask, V1);
18681 }
18682
18683 // Assume that a single SHUFPS is faster than an alternative sequence of
18684 // multiple instructions (even if the CPU has a domain penalty).
18685 // If some CPU is harmed by the domain switch, we can fix it in a later pass.
18686 if (Is128BitLaneRepeatedShuffle && isSingleSHUFPSMask(RepeatedMask)) {
18687 SDValue CastV1 = DAG.getBitcast(MVT::v8f32, V1);
18688 SDValue CastV2 = DAG.getBitcast(MVT::v8f32, V2);
18689 SDValue ShufPS = lowerShuffleWithSHUFPS(DL, MVT::v8f32, RepeatedMask,
18690 CastV1, CastV2, DAG);
18691 return DAG.getBitcast(MVT::v8i32, ShufPS);
18692 }
18693
18694 // Try to simplify this by merging 128-bit lanes to enable a lane-based
18695 // shuffle.
18696 if (SDValue Result = lowerShuffleAsLanePermuteAndRepeatedMask(
18697 DL, MVT::v8i32, V1, V2, Mask, Subtarget, DAG))
18698 return Result;
18699
18700 // Otherwise fall back on generic blend lowering.
18701 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v8i32, V1, V2, Mask,
18702 Subtarget, DAG);
18703}
18704
18705/// Handle lowering of 16-lane 16-bit integer shuffles.
18706///
18707/// This routine is only called when we have AVX2 and thus a reasonable
18708/// instruction set for v16i16 shuffling.
18709static SDValue lowerV16I16Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
18710 const APInt &Zeroable, SDValue V1, SDValue V2,
18711 const X86Subtarget &Subtarget,
18712 SelectionDAG &DAG) {
18713 assert(V1.getSimpleValueType() == MVT::v16i16 && "Bad operand type!");
18714 assert(V2.getSimpleValueType() == MVT::v16i16 && "Bad operand type!");
18715 assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
18716 assert(Subtarget.hasAVX2() && "We can only lower v16i16 with AVX2!");
18717
18718 // Whenever we can lower this as a zext, that instruction is strictly faster
18719 // than any alternative. It also allows us to fold memory operands into the
18720 // shuffle in many cases.
18721 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(
18722 DL, MVT::v16i16, V1, V2, Mask, Zeroable, Subtarget, DAG))
18723 return ZExt;
18724
18725 // Check for being able to broadcast a single element.
18726 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v16i16, V1, V2, Mask,
18727 Subtarget, DAG))
18728 return Broadcast;
18729
18730 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v16i16, V1, V2, Mask,
18731 Zeroable, Subtarget, DAG))
18732 return Blend;
18733
18734 // Use dedicated unpack instructions for masks that match their pattern.
18735 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v16i16, Mask, V1, V2, DAG))
18736 return V;
18737
18738 // Use dedicated pack instructions for masks that match their pattern.
18739 if (SDValue V = lowerShuffleWithPACK(DL, MVT::v16i16, Mask, V1, V2, DAG,
18740 Subtarget))
18741 return V;
18742
18743 // Try to lower using a truncation.
18744 if (SDValue V = lowerShuffleAsVTRUNC(DL, MVT::v16i16, V1, V2, Mask, Zeroable,
18745 Subtarget, DAG))
18746 return V;
18747
18748 // Try to use shift instructions.
18749 if (SDValue Shift =
18750 lowerShuffleAsShift(DL, MVT::v16i16, V1, V2, Mask, Zeroable,
18751 Subtarget, DAG, /*BitwiseOnly*/ false))
18752 return Shift;
18753
18754 // Try to use byte rotation instructions.
18755 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v16i16, V1, V2, Mask,
18756 Subtarget, DAG))
18757 return Rotate;
18758
18759 // Try to create an in-lane repeating shuffle mask and then shuffle the
18760 // results into the target lanes.
18761 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
18762 DL, MVT::v16i16, V1, V2, Mask, Subtarget, DAG))
18763 return V;
18764
18765 if (V2.isUndef()) {
18766 // Try to use bit rotation instructions.
18767 if (SDValue Rotate =
18768 lowerShuffleAsBitRotate(DL, MVT::v16i16, V1, Mask, Subtarget, DAG))
18769 return Rotate;
18770
18771 // Try to produce a fixed cross-128-bit lane permute followed by unpack
18772 // because that should be faster than the variable permute alternatives.
18773 if (SDValue V = lowerShuffleWithUNPCK256(DL, MVT::v16i16, Mask, V1, V2, DAG))
18774 return V;
18775
18776 // There are no generalized cross-lane shuffle operations available on i16
18777 // element types.
18778 if (is128BitLaneCrossingShuffleMask(MVT::v16i16, Mask)) {
18779 if (SDValue V = lowerShuffleAsLanePermuteAndPermute(
18780 DL, MVT::v16i16, V1, V2, Mask, DAG, Subtarget))
18781 return V;
18782
18783 return lowerShuffleAsLanePermuteAndShuffle(DL, MVT::v16i16, V1, V2, Mask,
18784 DAG, Subtarget);
18785 }
18786
18787 SmallVector<int, 8> RepeatedMask;
18788 if (is128BitLaneRepeatedShuffleMask(MVT::v16i16, Mask, RepeatedMask)) {
18789 // As this is a single-input shuffle, the repeated mask should be
18790 // a strictly valid v8i16 mask that we can pass through to the v8i16
18791 // lowering to handle even the v16 case.
18792 return lowerV8I16GeneralSingleInputShuffle(
18793 DL, MVT::v16i16, V1, RepeatedMask, Subtarget, DAG);
18794 }
18795 }
18796
18797 if (SDValue PSHUFB = lowerShuffleWithPSHUFB(DL, MVT::v16i16, Mask, V1, V2,
18798 Zeroable, Subtarget, DAG))
18799 return PSHUFB;
18800
18801 // AVX512BW can lower to VPERMW (non-VLX will pad to v32i16).
18802 if (Subtarget.hasBWI())
18803 return lowerShuffleWithPERMV(DL, MVT::v16i16, Mask, V1, V2, Subtarget, DAG);
18804
18805 // Try to simplify this by merging 128-bit lanes to enable a lane-based
18806 // shuffle.
18807 if (SDValue Result = lowerShuffleAsLanePermuteAndRepeatedMask(
18808 DL, MVT::v16i16, V1, V2, Mask, Subtarget, DAG))
18809 return Result;
18810
18811 // Try to permute the lanes and then use a per-lane permute.
18812 if (SDValue V = lowerShuffleAsLanePermuteAndPermute(
18813 DL, MVT::v16i16, V1, V2, Mask, DAG, Subtarget))
18814 return V;
18815
18816 // Try to match an interleave of two v16i16s and lower them as unpck and
18817 // permutes using ymms.
18818 if (!Subtarget.hasAVX512())
18819 if (SDValue V = lowerShufflePairAsUNPCKAndPermute(DL, MVT::v16i16, V1, V2,
18820 Mask, DAG))
18821 return V;
18822
18823 // Otherwise fall back on generic lowering.
18824 return lowerShuffleAsSplitOrBlend(DL, MVT::v16i16, V1, V2, Mask,
18825 Subtarget, DAG);
18826}
18827
18828/// Handle lowering of 32-lane 8-bit integer shuffles.
18829///
18830/// This routine is only called when we have AVX2 and thus a reasonable
18831/// instruction set for v32i8 shuffling.
18832static SDValue lowerV32I8Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
18833 const APInt &Zeroable, SDValue V1, SDValue V2,
18834 const X86Subtarget &Subtarget,
18835 SelectionDAG &DAG) {
18836 assert(V1.getSimpleValueType() == MVT::v32i8 && "Bad operand type!");
18837 assert(V2.getSimpleValueType() == MVT::v32i8 && "Bad operand type!");
18838 assert(Mask.size() == 32 && "Unexpected mask size for v32 shuffle!");
18839 assert(Subtarget.hasAVX2() && "We can only lower v32i8 with AVX2!");
18840
18841 // Whenever we can lower this as a zext, that instruction is strictly faster
18842 // than any alternative. It also allows us to fold memory operands into the
18843 // shuffle in many cases.
18844 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(DL, MVT::v32i8, V1, V2, Mask,
18845 Zeroable, Subtarget, DAG))
18846 return ZExt;
18847
18848 // Check for being able to broadcast a single element.
18849 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v32i8, V1, V2, Mask,
18850 Subtarget, DAG))
18851 return Broadcast;
18852
18853 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v32i8, V1, V2, Mask,
18854 Zeroable, Subtarget, DAG))
18855 return Blend;
18856
18857 // Use dedicated unpack instructions for masks that match their pattern.
18858 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v32i8, Mask, V1, V2, DAG))
18859 return V;
18860
18861 // Use dedicated pack instructions for masks that match their pattern.
18862 if (SDValue V = lowerShuffleWithPACK(DL, MVT::v32i8, Mask, V1, V2, DAG,
18863 Subtarget))
18864 return V;
18865
18866 // Try to lower using a truncation.
18867 if (SDValue V = lowerShuffleAsVTRUNC(DL, MVT::v32i8, V1, V2, Mask, Zeroable,
18868 Subtarget, DAG))
18869 return V;
18870
18871 // Try to use shift instructions.
18872 if (SDValue Shift =
18873 lowerShuffleAsShift(DL, MVT::v32i8, V1, V2, Mask, Zeroable, Subtarget,
18874 DAG, /*BitwiseOnly*/ false))
18875 return Shift;
18876
18877 // Try to use byte rotation instructions.
18878 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v32i8, V1, V2, Mask,
18879 Subtarget, DAG))
18880 return Rotate;
18881
18882 // Try to use bit rotation instructions.
18883 if (V2.isUndef())
18884 if (SDValue Rotate =
18885 lowerShuffleAsBitRotate(DL, MVT::v32i8, V1, Mask, Subtarget, DAG))
18886 return Rotate;
18887
18888 // Try to create an in-lane repeating shuffle mask and then shuffle the
18889 // results into the target lanes.
18890 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
18891 DL, MVT::v32i8, V1, V2, Mask, Subtarget, DAG))
18892 return V;
18893
18894 // There are no generalized cross-lane shuffle operations available on i8
18895 // element types.
18896 if (V2.isUndef() && is128BitLaneCrossingShuffleMask(MVT::v32i8, Mask)) {
18897 // Try to produce a fixed cross-128-bit lane permute followed by unpack
18898 // because that should be faster than the variable permute alternatives.
18899 if (SDValue V = lowerShuffleWithUNPCK256(DL, MVT::v32i8, Mask, V1, V2, DAG))
18900 return V;
18901
18902 if (SDValue V = lowerShuffleAsLanePermuteAndPermute(
18903 DL, MVT::v32i8, V1, V2, Mask, DAG, Subtarget))
18904 return V;
18905
18906 return lowerShuffleAsLanePermuteAndShuffle(DL, MVT::v32i8, V1, V2, Mask,
18907 DAG, Subtarget);
18908 }
18909
18910 if (SDValue PSHUFB = lowerShuffleWithPSHUFB(DL, MVT::v32i8, Mask, V1, V2,
18911 Zeroable, Subtarget, DAG))
18912 return PSHUFB;
18913
18914 // AVX512VBMI can lower to VPERMB (non-VLX will pad to v64i8).
18915 if (Subtarget.hasVBMI())
18916 return lowerShuffleWithPERMV(DL, MVT::v32i8, Mask, V1, V2, Subtarget, DAG);
18917
18918 // Try to simplify this by merging 128-bit lanes to enable a lane-based
18919 // shuffle.
18920 if (SDValue Result = lowerShuffleAsLanePermuteAndRepeatedMask(
18921 DL, MVT::v32i8, V1, V2, Mask, Subtarget, DAG))
18922 return Result;
18923
18924 // Try to permute the lanes and then use a per-lane permute.
18925 if (SDValue V = lowerShuffleAsLanePermuteAndPermute(
18926 DL, MVT::v32i8, V1, V2, Mask, DAG, Subtarget))
18927 return V;
18928
18929 // Look for {0, 8, 16, 24, 32, 40, 48, 56} in the first 8 elements, followed
18930 // by zeroable elements in the remaining 24 elements. Turn this into two
18931 // vmovqb instructions shuffled together.
18932 if (Subtarget.hasVLX())
18933 if (SDValue V = lowerShuffleAsVTRUNCAndUnpack(DL, MVT::v32i8, V1, V2,
18934 Mask, Zeroable, DAG))
18935 return V;
18936
18937 // Try to match an interleave of two v32i8s and lower them as unpck and
18938 // permutes using ymms.
18939 if (!Subtarget.hasAVX512())
18940 if (SDValue V = lowerShufflePairAsUNPCKAndPermute(DL, MVT::v32i8, V1, V2,
18941 Mask, DAG))
18942 return V;
18943
18944 // Otherwise fall back on generic lowering.
18945 return lowerShuffleAsSplitOrBlend(DL, MVT::v32i8, V1, V2, Mask,
18946 Subtarget, DAG);
18947}
18948
18949/// High-level routine to lower various 256-bit x86 vector shuffles.
18950///
18951/// This routine either breaks down the specific type of a 256-bit x86 vector
18952/// shuffle or splits it into two 128-bit shuffles and fuses the results back
18953/// together based on the available instructions.
18954static SDValue lower256BitShuffle(const SDLoc &DL, ArrayRef<int> Mask, MVT VT,
18955 SDValue V1, SDValue V2, const APInt &Zeroable,
18956 const X86Subtarget &Subtarget,
18957 SelectionDAG &DAG) {
18958 // If we have a single input to the zero element, insert that into V1 if we
18959 // can do so cheaply.
18960 int NumElts = VT.getVectorNumElements();
18961 int NumV2Elements = count_if(Mask, [NumElts](int M) { return M >= NumElts; });
18962
18963 if (NumV2Elements == 1 && Mask[0] >= NumElts)
18964 if (SDValue Insertion = lowerShuffleAsElementInsertion(
18965 DL, VT, V1, V2, Mask, Zeroable, Subtarget, DAG))
18966 return Insertion;
18967
18968 // Handle special cases where the lower or upper half is UNDEF.
18969 if (SDValue V =
18970 lowerShuffleWithUndefHalf(DL, VT, V1, V2, Mask, Subtarget, DAG))
18971 return V;
18972
18973 // There is a really nice hard cut-over between AVX1 and AVX2 that means we
18974 // can check for those subtargets here and avoid much of the subtarget
18975 // querying in the per-vector-type lowering routines. With AVX1 we have
18976 // essentially *zero* ability to manipulate a 256-bit vector with integer
18977 // types. Since we'll use floating point types there eventually, just
18978 // immediately cast everything to a float and operate entirely in that domain.
18979 if (VT.isInteger() && !Subtarget.hasAVX2()) {
18980 int ElementBits = VT.getScalarSizeInBits();
18981 if (ElementBits < 32) {
18982 // No floating point type available, if we can't use the bit operations
18983 // for masking/blending then decompose into 128-bit vectors.
18984 if (SDValue V = lowerShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable,
18985 Subtarget, DAG))
18986 return V;
18987 if (SDValue V = lowerShuffleAsBitBlend(DL, VT, V1, V2, Mask, DAG))
18988 return V;
18989 return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG, /*SimpleOnly*/ false);
18990 }
18991
18992 MVT FpVT = MVT::getVectorVT(MVT::getFloatingPointVT(ElementBits),
18993 VT.getVectorNumElements());
18994 V1 = DAG.getBitcast(FpVT, V1);
18995 V2 = DAG.getBitcast(FpVT, V2);
18996 return DAG.getBitcast(VT, DAG.getVectorShuffle(FpVT, DL, V1, V2, Mask));
18997 }
18998
18999 if (VT == MVT::v16f16) {
19000 V1 = DAG.getBitcast(MVT::v16i16, V1);
19001 V2 = DAG.getBitcast(MVT::v16i16, V2);
19002 return DAG.getBitcast(MVT::v16f16,
19003 DAG.getVectorShuffle(MVT::v16i16, DL, V1, V2, Mask));
19004 }
19005
19006 switch (VT.SimpleTy) {
19007 case MVT::v4f64:
19008 return lowerV4F64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
19009 case MVT::v4i64:
19010 return lowerV4I64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
19011 case MVT::v8f32:
19012 return lowerV8F32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
19013 case MVT::v8i32:
19014 return lowerV8I32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
19015 case MVT::v16i16:
19016 return lowerV16I16Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
19017 case MVT::v32i8:
19018 return lowerV32I8Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
19019
19020 default:
19021 llvm_unreachable("Not a valid 256-bit x86 vector type!");
19022 }
19023}
19024
19025/// Try to lower a vector shuffle as 128-bit shuffles.
19026static SDValue lowerV4X128Shuffle(const SDLoc &DL, MVT VT, ArrayRef<int> Mask,
19027 const APInt &Zeroable, SDValue V1, SDValue V2,
19028 const X86Subtarget &Subtarget,
19029 SelectionDAG &DAG) {
19030 assert(VT.getScalarSizeInBits() == 64 &&
19031 "Unexpected element type size for 128bit shuffle.");
19032
19033 // Handling a 256-bit vector requires VLX, and lowerV2X128VectorShuffle() is
19034 // most probably the better solution for that case.
19035 assert(VT.is512BitVector() && "Unexpected vector size for 512bit shuffle.");
19036
19037 // TODO - use Zeroable like we do for lowerV2X128VectorShuffle?
19038 SmallVector<int, 4> Widened128Mask;
19039 if (!canWidenShuffleElements(Mask, Widened128Mask))
19040 return SDValue();
19041 assert(Widened128Mask.size() == 4 && "Shuffle widening mismatch");
19042
19043 // Try to use an insert into a zero vector.
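// (Zeroable has one bit per 64-bit element of the 512-bit type: 0xf0 set
// means elements 4-7 - the upper 256 bits - are zeroable, and 0x0c means
// elements 2-3 are as well, in which case only the low 128 bits of V1 need
// to be kept.)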
19044 if (Widened128Mask[0] == 0 && (Zeroable & 0xf0) == 0xf0 &&
19045 (Widened128Mask[1] == 1 || (Zeroable & 0x0c) == 0x0c)) {
19046 unsigned NumElts = ((Zeroable & 0x0c) == 0x0c) ? 2 : 4;
19047 MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), NumElts);
19048 SDValue LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V1,
19049 DAG.getIntPtrConstant(0, DL));
19050 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT,
19051 getZeroVector(VT, Subtarget, DAG, DL), LoV,
19052 DAG.getIntPtrConstant(0, DL));
19053 }
19054
19055 // Check for patterns which can be matched with a single insert of a 256-bit
19056 // subvector.
19057 bool OnlyUsesV1 = isShuffleEquivalent(Mask, {0, 1, 2, 3, 0, 1, 2, 3}, V1, V2);
19058 if (OnlyUsesV1 ||
19059 isShuffleEquivalent(Mask, {0, 1, 2, 3, 8, 9, 10, 11}, V1, V2)) {
19060 MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), 4);
19061 SDValue SubVec =
19062 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, OnlyUsesV1 ? V1 : V2,
19063 DAG.getIntPtrConstant(0, DL));
19064 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, V1, SubVec,
19065 DAG.getIntPtrConstant(4, DL));
19066 }
19067
19068 // See if this is an insertion of the lower 128-bits of V2 into V1.
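  // For example, Widened128Mask {0, 1, 4, 3} keeps every V1 lane in place and
  // references only V2's lowest 128 bits (value 4), so V2Index becomes 2 and
  // we insert V2's low two elements into V1 at element offset 4.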
19069 bool IsInsert = true;
19070 int V2Index = -1;
19071 for (int i = 0; i < 4; ++i) {
19072    assert(Widened128Mask[i] >= -1 && "Illegal shuffle sentinel value");
19073 if (Widened128Mask[i] < 0)
19074 continue;
19075
19076 // Make sure all V1 subvectors are in place.
19077 if (Widened128Mask[i] < 4) {
19078 if (Widened128Mask[i] != i) {
19079 IsInsert = false;
19080 break;
19081 }
19082 } else {
19083      // Make sure we only have a single V2 index and it's the lowest 128 bits.
19084 if (V2Index >= 0 || Widened128Mask[i] != 4) {
19085 IsInsert = false;
19086 break;
19087 }
19088 V2Index = i;
19089 }
19090 }
19091 if (IsInsert && V2Index >= 0) {
19092 MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), 2);
19093 SDValue Subvec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V2,
19094 DAG.getIntPtrConstant(0, DL));
19095 return insert128BitVector(V1, Subvec, V2Index * 2, DAG, DL);
19096 }
19097
19098  // See if we can widen to a 256-bit lane shuffle; we're going to lose the
19099  // 128-bit lane UNDEF info by lowering to X86ISD::SHUF128 anyway, so by
19100  // widening where possible we at least ensure the lanes stay sequential to
19101  // help later combines.
19102 SmallVector<int, 2> Widened256Mask;
19103 if (canWidenShuffleElements(Widened128Mask, Widened256Mask)) {
19104 Widened128Mask.clear();
19105 narrowShuffleMaskElts(2, Widened256Mask, Widened128Mask);
19106 }
19107
19108 // Try to lower to vshuf64x2/vshuf32x4.
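  // For example, Widened128Mask {1, 0, 5, 4} takes its two low result lanes
  // from V1 and its two high result lanes from V2, giving PermMask 0x11 below
  // (lane 1 then lane 0 of each source operand).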
19109 SDValue Ops[2] = {DAG.getUNDEF(VT), DAG.getUNDEF(VT)};
19110 unsigned PermMask = 0;
19111  // Ensure all elements came from the same Op.
19112 for (int i = 0; i < 4; ++i) {
19113    assert(Widened128Mask[i] >= -1 && "Illegal shuffle sentinel value");
19114 if (Widened128Mask[i] < 0)
19115 continue;
19116
19117 SDValue Op = Widened128Mask[i] >= 4 ? V2 : V1;
19118 unsigned OpIndex = i / 2;
19119 if (Ops[OpIndex].isUndef())
19120 Ops[OpIndex] = Op;
19121 else if (Ops[OpIndex] != Op)
19122 return SDValue();
19123
19124 // Convert the 128-bit shuffle mask selection values into 128-bit selection
19125 // bits defined by a vshuf64x2 instruction's immediate control byte.
19126 PermMask |= (Widened128Mask[i] % 4) << (i * 2);
19127 }
19128
19129 return DAG.getNode(X86ISD::SHUF128, DL, VT, Ops[0], Ops[1],
19130 DAG.getTargetConstant(PermMask, DL, MVT::i8));
19131}
19132
19133/// Handle lowering of 8-lane 64-bit floating point shuffles.
19134static SDValue lowerV8F64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
19135 const APInt &Zeroable, SDValue V1, SDValue V2,
19136 const X86Subtarget &Subtarget,
19137 SelectionDAG &DAG) {
19138  assert(V1.getSimpleValueType() == MVT::v8f64 && "Bad operand type!");
19139  assert(V2.getSimpleValueType() == MVT::v8f64 && "Bad operand type!");
19140  assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
19141
19142 if (V2.isUndef()) {
19143 // Use low duplicate instructions for masks that match their pattern.
19144 if (isShuffleEquivalent(Mask, {0, 0, 2, 2, 4, 4, 6, 6}, V1, V2))
19145 return DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v8f64, V1);
19146
19147 if (!is128BitLaneCrossingShuffleMask(MVT::v8f64, Mask)) {
19148 // Non-half-crossing single input shuffles can be lowered with an
19149 // interleaved permutation.
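      // For example, Mask {1, 0, 3, 2, 5, 4, 7, 6} sets bits 0, 2, 4 and 6
      // below, giving the immediate 0b01010101 (0x55), i.e. swap the two
      // elements within every 128-bit lane.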
19150 unsigned VPERMILPMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1) |
19151 ((Mask[2] == 3) << 2) | ((Mask[3] == 3) << 3) |
19152 ((Mask[4] == 5) << 4) | ((Mask[5] == 5) << 5) |
19153 ((Mask[6] == 7) << 6) | ((Mask[7] == 7) << 7);
19154 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v8f64, V1,
19155 DAG.getTargetConstant(VPERMILPMask, DL, MVT::i8));
19156 }
19157
19158 SmallVector<int, 4> RepeatedMask;
19159 if (is256BitLaneRepeatedShuffleMask(MVT::v8f64, Mask, RepeatedMask))
19160 return DAG.getNode(X86ISD::VPERMI, DL, MVT::v8f64, V1,
19161 getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
19162 }
19163
19164 if (SDValue Shuf128 = lowerV4X128Shuffle(DL, MVT::v8f64, Mask, Zeroable, V1,
19165 V2, Subtarget, DAG))
19166 return Shuf128;
19167
19168 if (SDValue Unpck = lowerShuffleWithUNPCK(DL, MVT::v8f64, Mask, V1, V2, DAG))
19169 return Unpck;
19170
19171 // Check if the blend happens to exactly fit that of SHUFPD.
19172 if (SDValue Op = lowerShuffleWithSHUFPD(DL, MVT::v8f64, V1, V2, Mask,
19173 Zeroable, Subtarget, DAG))
19174 return Op;
19175
19176 if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v8f64, Zeroable, Mask, V1, V2,
19177 DAG, Subtarget))
19178 return V;
19179
19180 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v8f64, V1, V2, Mask,
19181 Zeroable, Subtarget, DAG))
19182 return Blend;
19183
19184 return lowerShuffleWithPERMV(DL, MVT::v8f64, Mask, V1, V2, Subtarget, DAG);
19185}
19186
19187/// Handle lowering of 16-lane 32-bit floating point shuffles.
19188static SDValue lowerV16F32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
19189 const APInt &Zeroable, SDValue V1, SDValue V2,
19190 const X86Subtarget &Subtarget,
19191 SelectionDAG &DAG) {
19192  assert(V1.getSimpleValueType() == MVT::v16f32 && "Bad operand type!");
19193  assert(V2.getSimpleValueType() == MVT::v16f32 && "Bad operand type!");
19194  assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
19195
19196 // If the shuffle mask is repeated in each 128-bit lane, we have many more
19197 // options to efficiently lower the shuffle.
19198 SmallVector<int, 4> RepeatedMask;
19199 if (is128BitLaneRepeatedShuffleMask(MVT::v16f32, Mask, RepeatedMask)) {
19200    assert(RepeatedMask.size() == 4 && "Unexpected repeated mask size!");
19201
19202 // Use even/odd duplicate instructions for masks that match their pattern.
19203 if (isShuffleEquivalent(RepeatedMask, {0, 0, 2, 2}, V1, V2))
19204 return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v16f32, V1);
19205 if (isShuffleEquivalent(RepeatedMask, {1, 1, 3, 3}, V1, V2))
19206 return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v16f32, V1);
19207
19208 if (V2.isUndef())
19209 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v16f32, V1,
19210 getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
19211
19212 // Use dedicated unpack instructions for masks that match their pattern.
19213 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v16f32, Mask, V1, V2, DAG))
19214 return V;
19215
19216 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v16f32, V1, V2, Mask,
19217 Zeroable, Subtarget, DAG))
19218 return Blend;
19219
19220 // Otherwise, fall back to a SHUFPS sequence.
19221 return lowerShuffleWithSHUFPS(DL, MVT::v16f32, RepeatedMask, V1, V2, DAG);
19222 }
19223
19224 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v16f32, V1, V2, Mask,
19225 Zeroable, Subtarget, DAG))
19226 return Blend;
19227
19228 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(
19229 DL, MVT::v16i32, V1, V2, Mask, Zeroable, Subtarget, DAG))
19230 return DAG.getBitcast(MVT::v16f32, ZExt);
19231
19232 // Try to create an in-lane repeating shuffle mask and then shuffle the
19233 // results into the target lanes.
19234 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
19235 DL, MVT::v16f32, V1, V2, Mask, Subtarget, DAG))
19236 return V;
19237
19238 // If we have a single input shuffle with different shuffle patterns in the
19239 // 128-bit lanes and don't lane cross, use variable mask VPERMILPS.
19240 if (V2.isUndef() &&
19241 !is128BitLaneCrossingShuffleMask(MVT::v16f32, Mask)) {
19242 SDValue VPermMask = getConstVector(Mask, MVT::v16i32, DAG, DL, true);
19243 return DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v16f32, V1, VPermMask);
19244 }
19245
19246 // If we have AVX512F support, we can use VEXPAND.
19247 if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v16f32, Zeroable, Mask,
19248 V1, V2, DAG, Subtarget))
19249 return V;
19250
19251 return lowerShuffleWithPERMV(DL, MVT::v16f32, Mask, V1, V2, Subtarget, DAG);
19252}
19253
19254/// Handle lowering of 8-lane 64-bit integer shuffles.
19255static SDValue lowerV8I64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
19256 const APInt &Zeroable, SDValue V1, SDValue V2,
19257 const X86Subtarget &Subtarget,
19258 SelectionDAG &DAG) {
19259  assert(V1.getSimpleValueType() == MVT::v8i64 && "Bad operand type!");
19260  assert(V2.getSimpleValueType() == MVT::v8i64 && "Bad operand type!");
19261  assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
19262
19263 // Try to use shift instructions if fast.
19264 if (Subtarget.preferLowerShuffleAsShift())
19265 if (SDValue Shift =
19266 lowerShuffleAsShift(DL, MVT::v8i64, V1, V2, Mask, Zeroable,
19267 Subtarget, DAG, /*BitwiseOnly*/ true))
19268 return Shift;
19269
19270 if (V2.isUndef()) {
19271    // When the shuffle is repeated in each 128-bit lane, we can use
19272    // lower-latency instructions that operate uniformly on all four
19273    // 128-bit lanes.
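    // For example, the v8i64 mask {1, 0, 3, 2, 5, 4, 7, 6} repeats {1, 0} in
    // every 128-bit lane and lowers to a v16i32 PSHUFD with mask {2, 3, 0, 1}
    // (immediate 0x4E) on the bitcast input.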
19274 SmallVector<int, 2> Repeated128Mask;
19275 if (is128BitLaneRepeatedShuffleMask(MVT::v8i64, Mask, Repeated128Mask)) {
19276 SmallVector<int, 4> PSHUFDMask;
19277 narrowShuffleMaskElts(2, Repeated128Mask, PSHUFDMask);
19278 return DAG.getBitcast(
19279 MVT::v8i64,
19280 DAG.getNode(X86ISD::PSHUFD, DL, MVT::v16i32,
19281 DAG.getBitcast(MVT::v16i32, V1),
19282 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
19283 }
19284
19285 SmallVector<int, 4> Repeated256Mask;
19286 if (is256BitLaneRepeatedShuffleMask(MVT::v8i64, Mask, Repeated256Mask))
19287 return DAG.getNode(X86ISD::VPERMI, DL, MVT::v8i64, V1,
19288 getV4X86ShuffleImm8ForMask(Repeated256Mask, DL, DAG));
19289 }
19290
19291 if (SDValue Shuf128 = lowerV4X128Shuffle(DL, MVT::v8i64, Mask, Zeroable, V1,
19292 V2, Subtarget, DAG))
19293 return Shuf128;
19294
19295 // Try to use shift instructions.
19296 if (SDValue Shift =
19297 lowerShuffleAsShift(DL, MVT::v8i64, V1, V2, Mask, Zeroable, Subtarget,
19298 DAG, /*BitwiseOnly*/ false))
19299 return Shift;
19300
19301 // Try to use VALIGN.
19302 if (SDValue Rotate = lowerShuffleAsVALIGN(DL, MVT::v8i64, V1, V2, Mask,
19303 Subtarget, DAG))
19304 return Rotate;
19305
19306 // Try to use PALIGNR.
19307 if (Subtarget.hasBWI())
19308 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v8i64, V1, V2, Mask,
19309 Subtarget, DAG))
19310 return Rotate;
19311
19312 if (SDValue Unpck = lowerShuffleWithUNPCK(DL, MVT::v8i64, Mask, V1, V2, DAG))
19313 return Unpck;
19314
19315 // If we have AVX512F support, we can use VEXPAND.
19316 if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v8i64, Zeroable, Mask, V1, V2,
19317 DAG, Subtarget))
19318 return V;
19319
19320 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v8i64, V1, V2, Mask,
19321 Zeroable, Subtarget, DAG))
19322 return Blend;
19323
19324 return lowerShuffleWithPERMV(DL, MVT::v8i64, Mask, V1, V2, Subtarget, DAG);
19325}
19326
19327/// Handle lowering of 16-lane 32-bit integer shuffles.
19328static SDValue lowerV16I32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
19329 const APInt &Zeroable, SDValue V1, SDValue V2,
19330 const X86Subtarget &Subtarget,
19331 SelectionDAG &DAG) {
19332  assert(V1.getSimpleValueType() == MVT::v16i32 && "Bad operand type!");
19333  assert(V2.getSimpleValueType() == MVT::v16i32 && "Bad operand type!");
19334  assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
19335
19336 int NumV2Elements = count_if(Mask, [](int M) { return M >= 16; });
19337
19338 // Whenever we can lower this as a zext, that instruction is strictly faster
19339 // than any alternative. It also allows us to fold memory operands into the
19340 // shuffle in many cases.
19341 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(
19342 DL, MVT::v16i32, V1, V2, Mask, Zeroable, Subtarget, DAG))
19343 return ZExt;
19344
19345 // Try to use shift instructions if fast.
19346 if (Subtarget.preferLowerShuffleAsShift()) {
19347 if (SDValue Shift =
19348 lowerShuffleAsShift(DL, MVT::v16i32, V1, V2, Mask, Zeroable,
19349 Subtarget, DAG, /*BitwiseOnly*/ true))
19350 return Shift;
19351 if (NumV2Elements == 0)
19352 if (SDValue Rotate = lowerShuffleAsBitRotate(DL, MVT::v16i32, V1, Mask,
19353 Subtarget, DAG))
19354 return Rotate;
19355 }
19356
19357 // If the shuffle mask is repeated in each 128-bit lane we can use more
19358 // efficient instructions that mirror the shuffles across the four 128-bit
19359 // lanes.
19360 SmallVector<int, 4> RepeatedMask;
19361 bool Is128BitLaneRepeatedShuffle =
19362 is128BitLaneRepeatedShuffleMask(MVT::v16i32, Mask, RepeatedMask);
19363 if (Is128BitLaneRepeatedShuffle) {
19364    assert(RepeatedMask.size() == 4 && "Unexpected repeated mask size!");
19365 if (V2.isUndef())
19366 return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v16i32, V1,
19367 getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
19368
19369 // Use dedicated unpack instructions for masks that match their pattern.
19370 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v16i32, Mask, V1, V2, DAG))
19371 return V;
19372 }
19373
19374 // Try to use shift instructions.
19375 if (SDValue Shift =
19376 lowerShuffleAsShift(DL, MVT::v16i32, V1, V2, Mask, Zeroable,
19377 Subtarget, DAG, /*BitwiseOnly*/ false))
19378 return Shift;
19379
19380 if (!Subtarget.preferLowerShuffleAsShift() && NumV2Elements != 0)
19381 if (SDValue Rotate =
19382 lowerShuffleAsBitRotate(DL, MVT::v16i32, V1, Mask, Subtarget, DAG))
19383 return Rotate;
19384
19385 // Try to use VALIGN.
19386 if (SDValue Rotate = lowerShuffleAsVALIGN(DL, MVT::v16i32, V1, V2, Mask,
19387 Subtarget, DAG))
19388 return Rotate;
19389
19390 // Try to use byte rotation instructions.
19391 if (Subtarget.hasBWI())
19392 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v16i32, V1, V2, Mask,
19393 Subtarget, DAG))
19394 return Rotate;
19395
19396 // Assume that a single SHUFPS is faster than using a permv shuffle.
19397 // If some CPU is harmed by the domain switch, we can fix it in a later pass.
19398 if (Is128BitLaneRepeatedShuffle && isSingleSHUFPSMask(RepeatedMask)) {
19399 SDValue CastV1 = DAG.getBitcast(MVT::v16f32, V1);
19400 SDValue CastV2 = DAG.getBitcast(MVT::v16f32, V2);
19401 SDValue ShufPS = lowerShuffleWithSHUFPS(DL, MVT::v16f32, RepeatedMask,
19402 CastV1, CastV2, DAG);
19403 return DAG.getBitcast(MVT::v16i32, ShufPS);
19404 }
19405
19406 // Try to create an in-lane repeating shuffle mask and then shuffle the
19407 // results into the target lanes.
19408 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
19409 DL, MVT::v16i32, V1, V2, Mask, Subtarget, DAG))
19410 return V;
19411
19412 // If we have AVX512F support, we can use VEXPAND.
19413 if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v16i32, Zeroable, Mask, V1, V2,
19414 DAG, Subtarget))
19415 return V;
19416
19417 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v16i32, V1, V2, Mask,
19418 Zeroable, Subtarget, DAG))
19419 return Blend;
19420
19421 return lowerShuffleWithPERMV(DL, MVT::v16i32, Mask, V1, V2, Subtarget, DAG);
19422}
19423
19424/// Handle lowering of 32-lane 16-bit integer shuffles.
19425static SDValue lowerV32I16Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
19426 const APInt &Zeroable, SDValue V1, SDValue V2,
19427 const X86Subtarget &Subtarget,
19428 SelectionDAG &DAG) {
19429  assert(V1.getSimpleValueType() == MVT::v32i16 && "Bad operand type!");
19430  assert(V2.getSimpleValueType() == MVT::v32i16 && "Bad operand type!");
19431  assert(Mask.size() == 32 && "Unexpected mask size for v32 shuffle!");
19432  assert(Subtarget.hasBWI() && "We can only lower v32i16 with AVX-512-BWI!");
19433
19434 // Whenever we can lower this as a zext, that instruction is strictly faster
19435 // than any alternative. It also allows us to fold memory operands into the
19436 // shuffle in many cases.
19437 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(
19438 DL, MVT::v32i16, V1, V2, Mask, Zeroable, Subtarget, DAG))
19439 return ZExt;
19440
19441 // Use dedicated unpack instructions for masks that match their pattern.
19442 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v32i16, Mask, V1, V2, DAG))
19443 return V;
19444
19445 // Use dedicated pack instructions for masks that match their pattern.
19446 if (SDValue V =
19447 lowerShuffleWithPACK(DL, MVT::v32i16, Mask, V1, V2, DAG, Subtarget))
19448 return V;
19449
19450 // Try to use shift instructions.
19451 if (SDValue Shift =
19452 lowerShuffleAsShift(DL, MVT::v32i16, V1, V2, Mask, Zeroable,
19453 Subtarget, DAG, /*BitwiseOnly*/ false))
19454 return Shift;
19455
19456 // Try to use byte rotation instructions.
19457 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v32i16, V1, V2, Mask,
19458 Subtarget, DAG))
19459 return Rotate;
19460
19461 if (V2.isUndef()) {
19462 // Try to use bit rotation instructions.
19463 if (SDValue Rotate =
19464 lowerShuffleAsBitRotate(DL, MVT::v32i16, V1, Mask, Subtarget, DAG))
19465 return Rotate;
19466
19467 SmallVector<int, 8> RepeatedMask;
19468 if (is128BitLaneRepeatedShuffleMask(MVT::v32i16, Mask, RepeatedMask)) {
19469 // As this is a single-input shuffle, the repeated mask should be
19470 // a strictly valid v8i16 mask that we can pass through to the v8i16
19471 // lowering to handle even the v32 case.
19472 return lowerV8I16GeneralSingleInputShuffle(DL, MVT::v32i16, V1,
19473 RepeatedMask, Subtarget, DAG);
19474 }
19475 }
19476
19477 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v32i16, V1, V2, Mask,
19478 Zeroable, Subtarget, DAG))
19479 return Blend;
19480
19481 if (SDValue PSHUFB = lowerShuffleWithPSHUFB(DL, MVT::v32i16, Mask, V1, V2,
19482 Zeroable, Subtarget, DAG))
19483 return PSHUFB;
19484
19485 return lowerShuffleWithPERMV(DL, MVT::v32i16, Mask, V1, V2, Subtarget, DAG);
19486}
19487
19488/// Handle lowering of 64-lane 8-bit integer shuffles.
19489static SDValue lowerV64I8Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
19490 const APInt &Zeroable, SDValue V1, SDValue V2,
19491 const X86Subtarget &Subtarget,
19492 SelectionDAG &DAG) {
19493  assert(V1.getSimpleValueType() == MVT::v64i8 && "Bad operand type!");
19494  assert(V2.getSimpleValueType() == MVT::v64i8 && "Bad operand type!");
19495  assert(Mask.size() == 64 && "Unexpected mask size for v64 shuffle!");
19496  assert(Subtarget.hasBWI() && "We can only lower v64i8 with AVX-512-BWI!");
19497
19498 // Whenever we can lower this as a zext, that instruction is strictly faster
19499 // than any alternative. It also allows us to fold memory operands into the
19500 // shuffle in many cases.
19501 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(
19502 DL, MVT::v64i8, V1, V2, Mask, Zeroable, Subtarget, DAG))
19503 return ZExt;
19504
19505 // Use dedicated unpack instructions for masks that match their pattern.
19506 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v64i8, Mask, V1, V2, DAG))
19507 return V;
19508
19509 // Use dedicated pack instructions for masks that match their pattern.
19510 if (SDValue V = lowerShuffleWithPACK(DL, MVT::v64i8, Mask, V1, V2, DAG,
19511 Subtarget))
19512 return V;
19513
19514 // Try to use shift instructions.
19515 if (SDValue Shift =
19516 lowerShuffleAsShift(DL, MVT::v64i8, V1, V2, Mask, Zeroable, Subtarget,
19517 DAG, /*BitwiseOnly*/ false))
19518 return Shift;
19519
19520 // Try to use byte rotation instructions.
19521 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v64i8, V1, V2, Mask,
19522 Subtarget, DAG))
19523 return Rotate;
19524
19525 // Try to use bit rotation instructions.
19526 if (V2.isUndef())
19527 if (SDValue Rotate =
19528 lowerShuffleAsBitRotate(DL, MVT::v64i8, V1, Mask, Subtarget, DAG))
19529 return Rotate;
19530
19531 // Lower as AND if possible.
19532 if (SDValue Masked = lowerShuffleAsBitMask(DL, MVT::v64i8, V1, V2, Mask,
19533 Zeroable, Subtarget, DAG))
19534 return Masked;
19535
19536 if (SDValue PSHUFB = lowerShuffleWithPSHUFB(DL, MVT::v64i8, Mask, V1, V2,
19537 Zeroable, Subtarget, DAG))
19538 return PSHUFB;
19539
19540 // Try to create an in-lane repeating shuffle mask and then shuffle the
19541 // results into the target lanes.
19542 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
19543 DL, MVT::v64i8, V1, V2, Mask, Subtarget, DAG))
19544 return V;
19545
19546 if (SDValue Result = lowerShuffleAsLanePermuteAndPermute(
19547 DL, MVT::v64i8, V1, V2, Mask, DAG, Subtarget))
19548 return Result;
19549
19550 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v64i8, V1, V2, Mask,
19551 Zeroable, Subtarget, DAG))
19552 return Blend;
19553
19554 if (!is128BitLaneCrossingShuffleMask(MVT::v64i8, Mask)) {
19555 // Use PALIGNR+Permute if possible - permute might become PSHUFB but the
19556 // PALIGNR will be cheaper than the second PSHUFB+OR.
19557 if (SDValue V = lowerShuffleAsByteRotateAndPermute(DL, MVT::v64i8, V1, V2,
19558 Mask, Subtarget, DAG))
19559 return V;
19560
19561 // If we can't directly blend but can use PSHUFB, that will be better as it
19562 // can both shuffle and set up the inefficient blend.
19563 bool V1InUse, V2InUse;
19564 return lowerShuffleAsBlendOfPSHUFBs(DL, MVT::v64i8, V1, V2, Mask, Zeroable,
19565 DAG, V1InUse, V2InUse);
19566 }
19567
19568 // Try to simplify this by merging 128-bit lanes to enable a lane-based
19569 // shuffle.
19570 if (!V2.isUndef())
19571 if (SDValue Result = lowerShuffleAsLanePermuteAndRepeatedMask(
19572 DL, MVT::v64i8, V1, V2, Mask, Subtarget, DAG))
19573 return Result;
19574
19575 // VBMI can use VPERMV/VPERMV3 byte shuffles.
19576 if (Subtarget.hasVBMI())
19577 return lowerShuffleWithPERMV(DL, MVT::v64i8, Mask, V1, V2, Subtarget, DAG);
19578
19579 return splitAndLowerShuffle(DL, MVT::v64i8, V1, V2, Mask, DAG, /*SimpleOnly*/ false);
19580}
19581
19582/// High-level routine to lower various 512-bit x86 vector shuffles.
19583///
19584/// This routine either breaks down the specific type of a 512-bit x86 vector
19585/// shuffle or splits it into two 256-bit shuffles and fuses the results back
19586/// together based on the available instructions.
19587static SDValue lower512BitShuffle(const SDLoc &DL, ArrayRef<int> Mask,
19588 MVT VT, SDValue V1, SDValue V2,
19589 const APInt &Zeroable,
19590 const X86Subtarget &Subtarget,
19591 SelectionDAG &DAG) {
19592  assert(Subtarget.hasAVX512() &&
19593         "Cannot lower 512-bit vectors w/ basic ISA!");
19594
19595 // If we have a single input to the zero element, insert that into V1 if we
19596 // can do so cheaply.
19597 int NumElts = Mask.size();
19598 int NumV2Elements = count_if(Mask, [NumElts](int M) { return M >= NumElts; });
19599
19600 if (NumV2Elements == 1 && Mask[0] >= NumElts)
19601 if (SDValue Insertion = lowerShuffleAsElementInsertion(
19602 DL, VT, V1, V2, Mask, Zeroable, Subtarget, DAG))
19603 return Insertion;
19604
19605 // Handle special cases where the lower or upper half is UNDEF.
19606 if (SDValue V =
19607 lowerShuffleWithUndefHalf(DL, VT, V1, V2, Mask, Subtarget, DAG))
19608 return V;
19609
19610 // Check for being able to broadcast a single element.
19611 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, VT, V1, V2, Mask,
19612 Subtarget, DAG))
19613 return Broadcast;
19614
19615 if ((VT == MVT::v32i16 || VT == MVT::v64i8) && !Subtarget.hasBWI()) {
19616 // Try using bit ops for masking and blending before falling back to
19617 // splitting.
19618 if (SDValue V = lowerShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable,
19619 Subtarget, DAG))
19620 return V;
19621 if (SDValue V = lowerShuffleAsBitBlend(DL, VT, V1, V2, Mask, DAG))
19622 return V;
19623
19624 return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG, /*SimpleOnly*/ false);
19625 }
19626
19627 if (VT == MVT::v32f16) {
19628 V1 = DAG.getBitcast(MVT::v32i16, V1);
19629 V2 = DAG.getBitcast(MVT::v32i16, V2);
19630 return DAG.getBitcast(MVT::v32f16,
19631 DAG.getVectorShuffle(MVT::v32i16, DL, V1, V2, Mask));
19632 }
19633
19634 // Dispatch to each element type for lowering. If we don't have support for
19635 // specific element type shuffles at 512 bits, immediately split them and
19636 // lower them. Each lowering routine of a given type is allowed to assume that
19637 // the requisite ISA extensions for that element type are available.
19638 switch (VT.SimpleTy) {
19639 case MVT::v8f64:
19640 return lowerV8F64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
19641 case MVT::v16f32:
19642 return lowerV16F32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
19643 case MVT::v8i64:
19644 return lowerV8I64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
19645 case MVT::v16i32:
19646 return lowerV16I32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
19647 case MVT::v32i16:
19648 return lowerV32I16Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
19649 case MVT::v64i8:
19650 return lowerV64I8Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
19651
19652 default:
19653    llvm_unreachable("Not a valid 512-bit x86 vector type!");
19654 }
19655}
19656
19657static SDValue lower1BitShuffleAsKSHIFTR(const SDLoc &DL, ArrayRef<int> Mask,
19658 MVT VT, SDValue V1, SDValue V2,
19659 const X86Subtarget &Subtarget,
19660 SelectionDAG &DAG) {
19661 // Shuffle should be unary.
19662 if (!V2.isUndef())
19663 return SDValue();
19664
19665 int ShiftAmt = -1;
19666 int NumElts = Mask.size();
19667 for (int i = 0; i != NumElts; ++i) {
19668 int M = Mask[i];
19669    assert((M == SM_SentinelUndef || (0 <= M && M < NumElts)) &&
19670           "Unexpected mask index.");
19671 if (M < 0)
19672 continue;
19673
19674 // The first non-undef element determines our shift amount.
19675 if (ShiftAmt < 0) {
19676 ShiftAmt = M - i;
19677 // Need to be shifting right.
19678 if (ShiftAmt <= 0)
19679 return SDValue();
19680 }
19681 // All non-undef elements must shift by the same amount.
19682 if (ShiftAmt != M - i)
19683 return SDValue();
19684 }
19685  assert(ShiftAmt >= 0 && "All undef?");
19686
19687  // Great, we found a shift right.
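  // For example, a v8i1 mask {2, 3, 4, 5, 6, 7, -1, -1} gives ShiftAmt == 2;
  // without DQI the mask is widened to v16i1, shifted right by 2, and the low
  // v8i1 is extracted again.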
19688 MVT WideVT = VT;
19689 if ((!Subtarget.hasDQI() && NumElts == 8) || NumElts < 8)
19690 WideVT = Subtarget.hasDQI() ? MVT::v8i1 : MVT::v16i1;
19691 SDValue Res = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, WideVT,
19692 DAG.getUNDEF(WideVT), V1,
19693 DAG.getIntPtrConstant(0, DL));
19694 Res = DAG.getNode(X86ISD::KSHIFTR, DL, WideVT, Res,
19695 DAG.getTargetConstant(ShiftAmt, DL, MVT::i8));
19696 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
19697 DAG.getIntPtrConstant(0, DL));
19698}
19699
19700// Determine if this shuffle can be implemented with a KSHIFT instruction.
19701// Returns the shift amount if possible or -1 if not. This is a simplified
19702// version of matchShuffleAsShift.
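// For example, with MaskOffset 0 and Size 8, a mask {-1, -1, 0, 1, 2, 3, 4, 5}
// whose first two elements are zeroable matches KSHIFTL with shift amount 2,
// while {2, 3, 4, 5, 6, 7, -1, -1} with the top two elements zeroable matches
// KSHIFTR with shift amount 2.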
19703static int match1BitShuffleAsKSHIFT(unsigned &Opcode, ArrayRef<int> Mask,
19704 int MaskOffset, const APInt &Zeroable) {
19705 int Size = Mask.size();
19706
19707 auto CheckZeros = [&](int Shift, bool Left) {
19708 for (int j = 0; j < Shift; ++j)
19709 if (!Zeroable[j + (Left ? 0 : (Size - Shift))])
19710 return false;
19711
19712 return true;
19713 };
19714
19715 auto MatchShift = [&](int Shift, bool Left) {
19716 unsigned Pos = Left ? Shift : 0;
19717 unsigned Low = Left ? 0 : Shift;
19718 unsigned Len = Size - Shift;
19719 return isSequentialOrUndefInRange(Mask, Pos, Len, Low + MaskOffset);
19720 };
19721
19722 for (int Shift = 1; Shift != Size; ++Shift)
19723 for (bool Left : {true, false})
19724 if (CheckZeros(Shift, Left) && MatchShift(Shift, Left)) {
19725 Opcode = Left ? X86ISD::KSHIFTL : X86ISD::KSHIFTR;
19726 return Shift;
19727 }
19728
19729 return -1;
19730}
19731
19732
19733// Lower vXi1 vector shuffles.
19734// There is no dedicated instruction on AVX-512 that shuffles the masks.
19735// The only way to shuffle bits is to sign-extend the mask vector to a SIMD
19736// vector, shuffle it, and then truncate it back to a mask.
19737static SDValue lower1BitShuffle(const SDLoc &DL, ArrayRef<int> Mask,
19738 MVT VT, SDValue V1, SDValue V2,
19739 const APInt &Zeroable,
19740 const X86Subtarget &Subtarget,
19741 SelectionDAG &DAG) {
19742  assert(Subtarget.hasAVX512() &&
19743         "Cannot lower 512-bit vectors w/o basic ISA!");
19744
19745 int NumElts = Mask.size();
19746
19747 // Try to recognize shuffles that are just padding a subvector with zeros.
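  // For example, shuffling a v8i1 V1 with an all-zero V2 using the mask
  // {0, 1, 2, 3, 8, 8, 8, 8} keeps V1's low four elements and zeroes the rest,
  // so we can extract the low v4i1 of V1 and insert it into a zero vector.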
19748 int SubvecElts = 0;
19749 int Src = -1;
19750 for (int i = 0; i != NumElts; ++i) {
19751 if (Mask[i] >= 0) {
19752      // Grab the source from the first valid mask element. All subsequent
19753      // elements need to use this same source.
19754 if (Src < 0)
19755 Src = Mask[i] / NumElts;
19756 if (Src != (Mask[i] / NumElts) || (Mask[i] % NumElts) != i)
19757 break;
19758 }
19759
19760 ++SubvecElts;
19761 }
19762  assert(SubvecElts != NumElts && "Identity shuffle?");
19763
19764  // Clip to a power of 2.
19765 SubvecElts = llvm::bit_floor<uint32_t>(SubvecElts);
19766
19767  // Make sure the number of zeroable elements at the top at least covers the
19768  // elements not covered by the subvector.
19769 if ((int)Zeroable.countl_one() >= (NumElts - SubvecElts)) {
19770    assert(Src >= 0 && "Expected a source!");
19771 MVT ExtractVT = MVT::getVectorVT(MVT::i1, SubvecElts);
19772 SDValue Extract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, ExtractVT,
19773 Src == 0 ? V1 : V2,
19774 DAG.getIntPtrConstant(0, DL));
19775 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT,
19776 DAG.getConstant(0, DL, VT),
19777 Extract, DAG.getIntPtrConstant(0, DL));
19778 }
19779
19780 // Try a simple shift right with undef elements. Later we'll try with zeros.
19781 if (SDValue Shift = lower1BitShuffleAsKSHIFTR(DL, Mask, VT, V1, V2, Subtarget,
19782 DAG))
19783 return Shift;
19784
19785 // Try to match KSHIFTs.
19786 unsigned Offset = 0;
19787 for (SDValue V : { V1, V2 }) {
19788 unsigned Opcode;
19789 int ShiftAmt = match1BitShuffleAsKSHIFT(Opcode, Mask, Offset, Zeroable);
19790 if (ShiftAmt >= 0) {
19791 MVT WideVT = VT;
19792 if ((!Subtarget.hasDQI() && NumElts == 8) || NumElts < 8)
19793 WideVT = Subtarget.hasDQI() ? MVT::v8i1 : MVT::v16i1;
19794 SDValue Res = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, WideVT,
19795 DAG.getUNDEF(WideVT), V,
19796 DAG.getIntPtrConstant(0, DL));
19797 // Widened right shifts need two shifts to ensure we shift in zeroes.
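      // For example, a v8i1 shift right by 2 performed in a v16i1 register
      // first does KSHIFTL by 16 - 8 = 8 to park the valid bits at the top,
      // then KSHIFTR by 2 + 8 = 10, so zeroes rather than stale upper bits
      // land in the low lanes.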
19798 if (Opcode == X86ISD::KSHIFTR && WideVT != VT) {
19799 int WideElts = WideVT.getVectorNumElements();
19800 // Shift left to put the original vector in the MSBs of the new size.
19801 Res = DAG.getNode(X86ISD::KSHIFTL, DL, WideVT, Res,
19802 DAG.getTargetConstant(WideElts - NumElts, DL, MVT::i8));
19803 // Increase the shift amount to account for the left shift.
19804 ShiftAmt += WideElts - NumElts;
19805 }
19806
19807 Res = DAG.getNode(Opcode, DL, WideVT, Res,
19808 DAG.getTargetConstant(ShiftAmt, DL, MVT::i8));
19809 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
19810 DAG.getIntPtrConstant(0, DL));
19811 }
19812 Offset += NumElts; // Increment for next iteration.
19813 }
19814
19815 // If we're broadcasting a SETCC result, try to broadcast the ops instead.
19816 // TODO: What other unary shuffles would benefit from this?
19817 if (isBroadcastShuffleMask(Mask) && V1.getOpcode() == ISD::SETCC &&
19818 V1->hasOneUse()) {
19819 SDValue Op0 = V1.getOperand(0);
19820 SDValue Op1 = V1.getOperand(1);
19821 ISD::CondCode CC = cast<CondCodeSDNode>(V1.getOperand(2))->get();
19822 EVT OpVT = Op0.getValueType();
19823 return DAG.getSetCC(
19824 DL, VT, DAG.getVectorShuffle(OpVT, DL, Op0, DAG.getUNDEF(OpVT), Mask),
19825 DAG.getVectorShuffle(OpVT, DL, Op1, DAG.getUNDEF(OpVT), Mask), CC);
19826 }
19827
19828 MVT ExtVT;
19829 switch (VT.SimpleTy) {
19830 default:
19831    llvm_unreachable("Expected a vector of i1 elements");
19832 case MVT::v2i1:
19833 ExtVT = MVT::v2i64;
19834 break;
19835 case MVT::v4i1:
19836 ExtVT = MVT::v4i32;
19837 break;
19838 case MVT::v8i1:
19839    // Take a 512-bit type; there are more shuffle options on KNL. If we have
19840    // VLX, use a 256-bit shuffle.
19841 ExtVT = Subtarget.hasVLX() ? MVT::v8i32 : MVT::v8i64;
19842 break;
19843 case MVT::v16i1:
19844 // Take 512-bit type, unless we are avoiding 512-bit types and have the
19845 // 256-bit operation available.
19846 ExtVT = Subtarget.canExtendTo512DQ() ? MVT::v16i32 : MVT::v16i16;
19847 break;
19848 case MVT::v32i1:
19849 // Take 512-bit type, unless we are avoiding 512-bit types and have the
19850 // 256-bit operation available.
19851    assert(Subtarget.hasBWI() && "Expected AVX512BW support");
19852 ExtVT = Subtarget.canExtendTo512BW() ? MVT::v32i16 : MVT::v32i8;
19853 break;
19854 case MVT::v64i1:
19855 // Fall back to scalarization. FIXME: We can do better if the shuffle
19856 // can be partitioned cleanly.
19857 if (!Subtarget.useBWIRegs())
19858 return SDValue();
19859 ExtVT = MVT::v64i8;
19860 break;
19861 }
19862
19863 V1 = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, V1);
19864 V2 = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, V2);
19865
19866 SDValue Shuffle = DAG.getVectorShuffle(ExtVT, DL, V1, V2, Mask);
19867  // i1 was sign-extended, so we can recover the mask by comparing against zero.
19868 int NumElems = VT.getVectorNumElements();
19869 if ((Subtarget.hasBWI() && (NumElems >= 32)) ||
19870 (Subtarget.hasDQI() && (NumElems < 32)))
19871 return DAG.getSetCC(DL, VT, DAG.getConstant(0, DL, ExtVT),
19872 Shuffle, ISD::SETGT);
19873
19874 return DAG.getNode(ISD::TRUNCATE, DL, VT, Shuffle);
19875}
19876
19877/// Helper function that returns true if the shuffle mask should be
19878/// commuted to improve canonicalization.
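/// For example, the v4 mask {4, 5, 0, -1} takes two elements from V2 and only
/// one from V1, so commuting it (to {0, 1, 4, -1} with the operands swapped)
/// lets the lowering assume most elements come from the first operand.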
19879static bool canonicalizeShuffleMaskWithCommute(ArrayRef<int> Mask) {
19880 int NumElements = Mask.size();
19881
19882 int NumV1Elements = 0, NumV2Elements = 0;
19883 for (int M : Mask)
19884 if (M < 0)
19885 continue;
19886 else if (M < NumElements)
19887 ++NumV1Elements;
19888 else
19889 ++NumV2Elements;
19890
19891 // Commute the shuffle as needed such that more elements come from V1 than
19892 // V2. This allows us to match the shuffle pattern strictly on how many
19893 // elements come from V1 without handling the symmetric cases.
19894 if (NumV2Elements > NumV1Elements)
19895 return true;
19896
19897  assert(NumV1Elements > 0 && "No V1 indices");
19898
19899 if (NumV2Elements == 0)
19900 return false;
19901
19902 // When the number of V1 and V2 elements are the same, try to minimize the
19903 // number of uses of V2 in the low half of the vector. When that is tied,
19904  // ensure that the sum of indices for V1 is equal to or lower than the sum of
19905 // indices for V2. When those are equal, try to ensure that the number of odd
19906 // indices for V1 is lower than the number of odd indices for V2.
19907 if (NumV1Elements == NumV2Elements) {
19908 int LowV1Elements = 0, LowV2Elements = 0;
19909 for (int M : Mask.slice(0, NumElements / 2))
19910 if (M >= NumElements)
19911 ++LowV2Elements;
19912 else if (M >= 0)
19913 ++LowV1Elements;
19914 if (LowV2Elements > LowV1Elements)
19915 return true;
19916 if (LowV2Elements == LowV1Elements) {
19917 int SumV1Indices = 0, SumV2Indices = 0;
19918 for (int i = 0, Size = Mask.size(); i < Size; ++i)
19919 if (Mask[i] >= NumElements)
19920 SumV2Indices += i;
19921 else if (Mask[i] >= 0)
19922 SumV1Indices += i;
19923 if (SumV2Indices < SumV1Indices)
19924 return true;
19925 if (SumV2Indices == SumV1Indices) {
19926 int NumV1OddIndices = 0, NumV2OddIndices = 0;
19927 for (int i = 0, Size = Mask.size(); i < Size; ++i)
19928 if (Mask[i] >= NumElements)
19929 NumV2OddIndices += i % 2;
19930 else if (Mask[i] >= 0)
19931 NumV1OddIndices += i % 2;
19932 if (NumV2OddIndices < NumV1OddIndices)
19933 return true;
19934 }
19935 }
19936 }
19937
19938 return false;
19939}
19940
19941static bool canCombineAsMaskOperation(SDValue V1, SDValue V2,
19942 const X86Subtarget &Subtarget) {
19943 if (!Subtarget.hasAVX512())
19944 return false;
19945
19946 MVT VT = V1.getSimpleValueType().getScalarType();
19947 if ((VT == MVT::i16 || VT == MVT::i8) && !Subtarget.hasBWI())
19948 return false;
19949
19950  // If the vector width is < 512, widen i8/i16 even with BWI, as
19951  // blendd/blendps/blendpd are preferable to blendw/blendvb/masked-mov.
19952 if ((VT == MVT::i16 || VT == MVT::i8) &&
19953 V1.getSimpleValueType().getSizeInBits() < 512)
19954 return false;
19955
19956 auto HasMaskOperation = [&](SDValue V) {
19957    // TODO: Currently we only check a limited set of opcodes. We could
19958    // probably extend this to all binary operations by checking TLI.isBinOp().
19959 switch (V->getOpcode()) {
19960 default:
19961 return false;
19962 case ISD::ADD:
19963 case ISD::SUB:
19964 case ISD::AND:
19965 case ISD::XOR:
19966 case ISD::OR:
19967 case ISD::SMAX:
19968 case ISD::SMIN:
19969 case ISD::UMAX:
19970 case ISD::UMIN:
19971 case ISD::ABS:
19972 case ISD::SHL:
19973 case ISD::SRL:
19974 case ISD::SRA:
19975 case ISD::MUL:
19976 break;
19977 }
19978 if (!V->hasOneUse())
19979 return false;
19980
19981 return true;
19982 };
19983
19984 if (HasMaskOperation(V1) || HasMaskOperation(V2))
19985 return true;
19986
19987 return false;
19988}
19989
19990// Forward declaration.
19991static SDValue canonicalizeShuffleMaskWithHorizOp(
19992 MutableArrayRef<SDValue> Ops, MutableArrayRef<int> Mask,
19993 unsigned RootSizeInBits, const SDLoc &DL, SelectionDAG &DAG,
19994 const X86Subtarget &Subtarget);
19995
19996 /// Top-level lowering for x86 vector shuffles.
19997///
19998/// This handles decomposition, canonicalization, and lowering of all x86
19999/// vector shuffles. Most of the specific lowering strategies are encapsulated
20000/// above in helper routines. The canonicalization attempts to widen shuffles
20001/// to involve fewer lanes of wider elements, consolidate symmetric patterns
20002/// s.t. only one of the two inputs needs to be tested, etc.
20003static SDValue lowerVECTOR_SHUFFLE(SDValue Op, const X86Subtarget &Subtarget,
20004 SelectionDAG &DAG) {
20005 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
20006 ArrayRef<int> OrigMask = SVOp->getMask();
20007 SDValue V1 = Op.getOperand(0);
20008 SDValue V2 = Op.getOperand(1);
20009 MVT VT = Op.getSimpleValueType();
20010 int NumElements = VT.getVectorNumElements();
20011 SDLoc DL(Op);
20012 bool Is1BitVector = (VT.getVectorElementType() == MVT::i1);
20013
20014  assert((VT.getSizeInBits() != 64 || Is1BitVector) &&
20015         "Can't lower MMX shuffles");
20016
20017 bool V1IsUndef = V1.isUndef();
20018 bool V2IsUndef = V2.isUndef();
20019 if (V1IsUndef && V2IsUndef)
20020 return DAG.getUNDEF(VT);
20021
20022  // When we create a shuffle node we put the UNDEF node in the second operand,
20023  // but in some cases the first operand may be transformed to UNDEF.
20024  // In that case we should just commute the node.
20025 if (V1IsUndef)
20026 return DAG.getCommutedVectorShuffle(*SVOp);
20027
20028 // Check for non-undef masks pointing at an undef vector and make the masks
20029 // undef as well. This makes it easier to match the shuffle based solely on
20030 // the mask.
20031 if (V2IsUndef &&
20032 any_of(OrigMask, [NumElements](int M) { return M >= NumElements; })) {
20033 SmallVector<int, 8> NewMask(OrigMask);
20034 for (int &M : NewMask)
20035 if (M >= NumElements)
20036 M = -1;
20037 return DAG.getVectorShuffle(VT, DL, V1, V2, NewMask);
20038 }
20039
20040 // Check for illegal shuffle mask element index values.
20041 int MaskUpperLimit = OrigMask.size() * (V2IsUndef ? 1 : 2);
20042 (void)MaskUpperLimit;
20043  assert(llvm::all_of(OrigMask,
20044                      [&](int M) { return -1 <= M && M < MaskUpperLimit; }) &&
20045         "Out of bounds shuffle index");
20046
20047 // We actually see shuffles that are entirely re-arrangements of a set of
20048 // zero inputs. This mostly happens while decomposing complex shuffles into
20049 // simple ones. Directly lower these as a buildvector of zeros.
20050 APInt KnownUndef, KnownZero;
20051 computeZeroableShuffleElements(OrigMask, V1, V2, KnownUndef, KnownZero);
20052
20053 APInt Zeroable = KnownUndef | KnownZero;
20054 if (Zeroable.isAllOnes())
20055 return getZeroVector(VT, Subtarget, DAG, DL);
20056
20057 bool V2IsZero = !V2IsUndef && ISD::isBuildVectorAllZeros(V2.getNode());
20058
20059 // Try to collapse shuffles into using a vector type with fewer elements but
20060 // wider element types. We cap this to not form integers or floating point
20061 // elements wider than 64 bits. It does not seem beneficial to form i128
20062 // integers to handle flipping the low and high halves of AVX 256-bit vectors.
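  // For example, a v8i32 shuffle with mask {0, 1, 2, 3, 8, 9, 10, 11} widens
  // to a v4i64 shuffle with mask {0, 1, 4, 5}, which the 64-bit element
  // lowering can often match with a single subvector insert or permute.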
20063 SmallVector<int, 16> WidenedMask;
20064 if (VT.getScalarSizeInBits() < 64 && !Is1BitVector &&
20065 !canCombineAsMaskOperation(V1, V2, Subtarget) &&
20066 canWidenShuffleElements(OrigMask, Zeroable, V2IsZero, WidenedMask)) {
20067 // Shuffle mask widening should not interfere with a broadcast opportunity
20068 // by obfuscating the operands with bitcasts.
20069 // TODO: Avoid lowering directly from this top-level function: make this
20070 // a query (canLowerAsBroadcast) and defer lowering to the type-based calls.
20071 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, VT, V1, V2, OrigMask,
20072 Subtarget, DAG))
20073 return Broadcast;
20074
20075 MVT NewEltVT = VT.isFloatingPoint()
20076 ? MVT::getFloatingPointVT(VT.getScalarSizeInBits() * 2)
20077 : MVT::getIntegerVT(VT.getScalarSizeInBits() * 2);
20078 int NewNumElts = NumElements / 2;
20079 MVT NewVT = MVT::getVectorVT(NewEltVT, NewNumElts);
20080 // Make sure that the new vector type is legal. For example, v2f64 isn't
20081 // legal on SSE1.
20082 if (DAG.getTargetLoweringInfo().isTypeLegal(NewVT)) {
20083 if (V2IsZero) {
20084 // Modify the new Mask to take all zeros from the all-zero vector.
20085 // Choose indices that are blend-friendly.
20086 bool UsedZeroVector = false;
20087        assert(is_contained(WidenedMask, SM_SentinelZero) &&
20088               "V2's non-undef elements are used?!");
20089 for (int i = 0; i != NewNumElts; ++i)
20090 if (WidenedMask[i] == SM_SentinelZero) {
20091 WidenedMask[i] = i + NewNumElts;
20092 UsedZeroVector = true;
20093 }
20094 // Ensure all elements of V2 are zero - isBuildVectorAllZeros permits
20095 // some elements to be undef.
20096 if (UsedZeroVector)
20097 V2 = getZeroVector(NewVT, Subtarget, DAG, DL);
20098 }
20099 V1 = DAG.getBitcast(NewVT, V1);
20100 V2 = DAG.getBitcast(NewVT, V2);
20101 return DAG.getBitcast(
20102 VT, DAG.getVectorShuffle(NewVT, DL, V1, V2, WidenedMask));
20103 }
20104 }
20105
20106 SmallVector<SDValue> Ops = {V1, V2};
20107 SmallVector<int> Mask(OrigMask);
20108
20109 // Canonicalize the shuffle with any horizontal ops inputs.
20110 // NOTE: This may update Ops and Mask.
20111 if (SDValue HOp = canonicalizeShuffleMaskWithHorizOp(
20112 Ops, Mask, VT.getSizeInBits(), DL, DAG, Subtarget))
20113 return DAG.getBitcast(VT, HOp);
20114
20115 V1 = DAG.getBitcast(VT, Ops[0]);
20116 V2 = DAG.getBitcast(VT, Ops[1]);
20117  assert(NumElements == (int)Mask.size() &&
20118         "canonicalizeShuffleMaskWithHorizOp "
20119         "shouldn't alter the shuffle mask size");
20120
20121 // Commute the shuffle if it will improve canonicalization.
20122 if (canonicalizeShuffleMaskWithCommute(Mask)) {
20123 ShuffleVectorSDNode::commuteMask(Mask);
20124 std::swap(V1, V2);
20125 }
20126
20127 // For each vector width, delegate to a specialized lowering routine.
20128 if (VT.is128BitVector())
20129 return lower128BitShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget, DAG);
20130
20131 if (VT.is256BitVector())
20132 return lower256BitShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget, DAG);
20133
20134 if (VT.is512BitVector())
20135 return lower512BitShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget, DAG);
20136
20137 if (Is1BitVector)
20138 return lower1BitShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget, DAG);
20139
20140  llvm_unreachable("Unimplemented!");
20141}
20142
20143/// Try to lower a VSELECT instruction to a vector shuffle.
20144static SDValue lowerVSELECTtoVectorShuffle(SDValue Op,
20145 const X86Subtarget &Subtarget,
20146 SelectionDAG &DAG) {
20147 SDValue Cond = Op.getOperand(0);
20148 SDValue LHS = Op.getOperand(1);
20149 SDValue RHS = Op.getOperand(2);
20150 MVT VT = Op.getSimpleValueType();
20151
20152 // Only non-legal VSELECTs reach this lowering; convert those into generic
20153 // shuffles and reuse the shuffle lowering path for blends.
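// For example, a v4i32 vselect with the constant condition <-1,0,0,-1> becomes
// the shuffle mask {0,5,6,3}: true lanes read from LHS, false lanes from RHS.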
20154 if (ISD::isBuildVectorOfConstantSDNodes(Cond.getNode())) {
20155 SmallVector<int, 32> Mask;
20156 if (createShuffleMaskFromVSELECT(Mask, Cond))
20157 return DAG.getVectorShuffle(VT, SDLoc(Op), LHS, RHS, Mask);
20158 }
20159
20160 return SDValue();
20161}
20162
20163SDValue X86TargetLowering::LowerVSELECT(SDValue Op, SelectionDAG &DAG) const {
20164 SDValue Cond = Op.getOperand(0);
20165 SDValue LHS = Op.getOperand(1);
20166 SDValue RHS = Op.getOperand(2);
20167
20168 SDLoc dl(Op);
20169 MVT VT = Op.getSimpleValueType();
20170 if (isSoftFP16(VT)) {
20171 MVT NVT = VT.changeVectorElementTypeToInteger();
20172 return DAG.getBitcast(VT, DAG.getNode(ISD::VSELECT, dl, NVT, Cond,
20173 DAG.getBitcast(NVT, LHS),
20174 DAG.getBitcast(NVT, RHS)));
20175 }
20176
20177 // A vselect where all conditions and data are constants can be optimized into
20178 // a single vector load by SelectionDAGLegalize::ExpandBUILD_VECTOR().
20179 if (ISD::isBuildVectorOfConstantSDNodes(Cond.getNode()) &&
20180 ISD::isBuildVectorOfConstantSDNodes(LHS.getNode()) &&
20181 ISD::isBuildVectorOfConstantSDNodes(RHS.getNode()))
20182 return SDValue();
20183
20184 // Try to lower this to a blend-style vector shuffle. This can handle all
20185 // constant condition cases.
20186 if (SDValue BlendOp = lowerVSELECTtoVectorShuffle(Op, Subtarget, DAG))
20187 return BlendOp;
20188
20189 // If this VSELECT has a vector of i1 as a mask, it will be directly matched
20190 // with patterns on the mask registers on AVX-512.
20191 MVT CondVT = Cond.getSimpleValueType();
20192 unsigned CondEltSize = Cond.getScalarValueSizeInBits();
20193 if (CondEltSize == 1)
20194 return Op;
20195
20196 // Variable blends are only legal from SSE4.1 onward.
20197 if (!Subtarget.hasSSE41())
20198 return SDValue();
20199
20200 unsigned EltSize = VT.getScalarSizeInBits();
20201 unsigned NumElts = VT.getVectorNumElements();
20202
20203 // Expand v32i16/v64i8 without BWI.
20204 if ((VT == MVT::v32i16 || VT == MVT::v64i8) && !Subtarget.hasBWI())
20205 return SDValue();
20206
20207 // If the VSELECT is on a 512-bit type, we have to convert a non-i1 condition
20208 // into an i1 condition so that we can use the mask-based 512-bit blend
20209 // instructions.
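// For example, a v16i32 condition is turned into a v16i1 mask by the
// SETNE-against-zero below, and the select is re-issued with that i1 mask.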
20210 if (VT.getSizeInBits() == 512) {
20211 // Build a mask by testing the condition against zero.
20212 MVT MaskVT = MVT::getVectorVT(MVT::i1, NumElts);
20213 SDValue Mask = DAG.getSetCC(dl, MaskVT, Cond,
20214 DAG.getConstant(0, dl, CondVT),
20215 ISD::SETNE);
20216 // Now return a new VSELECT using the mask.
20217 return DAG.getSelect(dl, VT, Mask, LHS, RHS);
20218 }
20219
20220 // SEXT/TRUNC cases where the mask doesn't match the destination size.
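// For example, a v4i32 condition whose elements are known sign splats is
// sign-extended to v4i64 below to drive a select with 64-bit data elements.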
20221 if (CondEltSize != EltSize) {
20222 // If we don't have a sign splat, rely on the expansion.
20223 if (CondEltSize != DAG.ComputeNumSignBits(Cond))
20224 return SDValue();
20225
20226 MVT NewCondSVT = MVT::getIntegerVT(EltSize);
20227 MVT NewCondVT = MVT::getVectorVT(NewCondSVT, NumElts);
20228 Cond = DAG.getSExtOrTrunc(Cond, dl, NewCondVT);
20229 return DAG.getNode(ISD::VSELECT, dl, VT, Cond, LHS, RHS);
20230 }
20231
20232 // Only some types will be legal on some subtargets. If we can emit a legal
20233 // VSELECT-matching blend, return Op, but if we need to expand, return
20234 // a null value.
20235 switch (VT.SimpleTy) {
20236 default:
20237 // Most of the vector types have blends past SSE4.1.
20238 return Op;
20239
20240 case MVT::v32i8:
20241 // The byte blends for AVX vectors were introduced only in AVX2.
20242 if (Subtarget.hasAVX2())
20243 return Op;
20244
20245 return SDValue();
20246
20247 case MVT::v8i16:
20248 case MVT::v16i16: {
20249 // Bitcast everything to the vXi8 type and use a vXi8 vselect.
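// The condition is assumed to be all-zeros or all-ones per element (vector
// boolean contents), so after the bitcast both bytes of each i16 lane carry
// the same mask value and the vXi8 select still picks whole 16-bit elements.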
20250 MVT CastVT = MVT::getVectorVT(MVT::i8, NumElts * 2);
20251 Cond = DAG.getBitcast(CastVT, Cond);
20252 LHS = DAG.getBitcast(CastVT, LHS);
20253 RHS = DAG.getBitcast(CastVT, RHS);
20254 SDValue Select = DAG.getNode(ISD::VSELECT, dl, CastVT, Cond, LHS, RHS);
20255 return DAG.getBitcast(VT, Select);
20256 }
20257 }
20258}
20259
20260static SDValue LowerEXTRACT_VECTOR_ELT_SSE4(SDValue Op, SelectionDAG &DAG) {
20261 MVT VT = Op.getSimpleValueType();
20262 SDValue Vec = Op.getOperand(0);
20263 SDValue Idx = Op.getOperand(1);
20264 assert(isa<ConstantSDNode>(Idx) && "Constant index expected");
20265 SDLoc dl(Op);
20266
20267 if (!Vec.getSimpleValueType().is128BitVector())
20268 return SDValue();
20269
20270 if (VT.getSizeInBits() == 8) {
20271 // If IdxVal is 0, it's cheaper to do a move instead of a pextrb, unless
20272 // we're going to zero extend the register or fold the store.
20273 if (llvm::isNullConstant(Idx) && !X86::mayFoldIntoZeroExtend(Op) &&
20274 !X86::mayFoldIntoStore(Op))
20275 return DAG.getNode(ISD::TRUNCATE, dl, MVT::i8,
20276 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
20277 DAG.getBitcast(MVT::v4i32, Vec), Idx));
20278
20279 unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
20280 SDValue Extract = DAG.getNode(X86ISD::PEXTRB, dl, MVT::i32, Vec,
20281 DAG.getTargetConstant(IdxVal, dl, MVT::i8));
20282 return DAG.getNode(ISD::TRUNCATE, dl, VT, Extract);
20283 }
20284
20285 if (VT == MVT::f32) {
20286 // EXTRACTPS outputs to a GPR32 register, which will require a movd to copy
20287 // the result back to an FR32 register. It's only worth matching if the
20288 // result has a single use which is a store or a bitcast to i32. And in
20289 // the case of a store, it's not worth it if the index is a constant 0,
20290 // because a MOVSSmr can be used instead, which is smaller and faster.
20291 if (!Op.hasOneUse())
20292 return SDValue();
20293 SDNode *User = *Op.getNode()->use_begin();
20294 if ((User->getOpcode() != ISD::STORE || isNullConstant(Idx)) &&
20295 (User->getOpcode() != ISD::BITCAST ||
20296 User->getValueType(0) != MVT::i32))
20297 return SDValue();
20298 SDValue Extract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
20299 DAG.getBitcast(MVT::v4i32, Vec), Idx);
20300 return DAG.getBitcast(MVT::f32, Extract);
20301 }
20302
20303 if (VT == MVT::i32 || VT == MVT::i64)
20304 return Op;
20305
20306 return SDValue();
20307}
20308
20309/// Extract one bit from mask vector, like v16i1 or v8i1.
20310/// AVX-512 feature.
20311static SDValue ExtractBitFromMaskVector(SDValue Op, SelectionDAG &DAG,
20312 const X86Subtarget &Subtarget) {
20313 SDValue Vec = Op.getOperand(0);
20314 SDLoc dl(Vec);
20315 MVT VecVT = Vec.getSimpleValueType();
20316 SDValue Idx = Op.getOperand(1);
20317 auto* IdxC = dyn_cast<ConstantSDNode>(Idx);
20318 MVT EltVT = Op.getSimpleValueType();
20319
20320 assert((VecVT.getVectorNumElements() <= 16 || Subtarget.hasBWI()) &&
20321 "Unexpected vector type in ExtractBitFromMaskVector");
20322
20323 // A variable index can't be handled in mask registers,
20324 // so extend the vector to VR512/128.
20325 if (!IdxC) {
20326 unsigned NumElts = VecVT.getVectorNumElements();
20327 // Extending v8i1/v16i1 to 512-bit gets better performance on KNL
20328 // than extending to 128/256-bit.
20329 MVT ExtEltVT = (NumElts <= 8) ? MVT::getIntegerVT(128 / NumElts) : MVT::i8;
20330 MVT ExtVecVT = MVT::getVectorVT(ExtEltVT, NumElts);
20331 SDValue Ext = DAG.getNode(ISD::SIGN_EXTEND, dl, ExtVecVT, Vec);
20332 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ExtEltVT, Ext, Idx);
20333 return DAG.getNode(ISD::TRUNCATE, dl, EltVT, Elt);
20334 }
20335
20336 unsigned IdxVal = IdxC->getZExtValue();
20337 if (IdxVal == 0) // the operation is legal
20338 return Op;
20339
20340 // Extend to natively supported kshift.
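// For example, extracting bit 3 of a v8i1 mask without DQI widens the mask to
// v16i1, shifts it right by 3 with KSHIFTR, and then extracts element 0.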
20341 unsigned NumElems = VecVT.getVectorNumElements();
20342 MVT WideVecVT = VecVT;
20343 if ((!Subtarget.hasDQI() && NumElems == 8) || NumElems < 8) {
20344 WideVecVT = Subtarget.hasDQI() ? MVT::v8i1 : MVT::v16i1;
20345 Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideVecVT,
20346 DAG.getUNDEF(WideVecVT), Vec,
20347 DAG.getIntPtrConstant(0, dl));
20348 }
20349
20350 // Use kshiftr instruction to move to the lower element.
20351 Vec = DAG.getNode(X86ISD::KSHIFTR, dl, WideVecVT, Vec,
20352 DAG.getTargetConstant(IdxVal, dl, MVT::i8));
20353
20354 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, Op.getValueType(), Vec,
20355 DAG.getIntPtrConstant(0, dl));
20356}
20357
20358SDValue
20359X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
20360 SelectionDAG &DAG) const {
20361 SDLoc dl(Op);
20362 SDValue Vec = Op.getOperand(0);
20363 MVT VecVT = Vec.getSimpleValueType();
20364 SDValue Idx = Op.getOperand(1);
20365 auto* IdxC = dyn_cast<ConstantSDNode>(Idx);
20366
20367 if (VecVT.getVectorElementType() == MVT::i1)
20368 return ExtractBitFromMaskVector(Op, DAG, Subtarget);
20369
20370 if (!IdxC) {
20371 // It's more profitable to go through memory (1 cycle throughput)
20372 // than to use a VMOVD + VPERMV/PSHUFB sequence (2/3 cycles throughput).
20373 // The IACA tool was used to get the performance estimates
20374 // (https://software.intel.com/en-us/articles/intel-architecture-code-analyzer)
20375 //
20376 // example : extractelement <16 x i8> %a, i32 %i
20377 //
20378 // Block Throughput: 3.00 Cycles
20379 // Throughput Bottleneck: Port5
20380 //
20381 // | Num Of | Ports pressure in cycles | |
20382 // | Uops | 0 - DV | 5 | 6 | 7 | |
20383 // ---------------------------------------------
20384 // | 1 | | 1.0 | | | CP | vmovd xmm1, edi
20385 // | 1 | | 1.0 | | | CP | vpshufb xmm0, xmm0, xmm1
20386 // | 2 | 1.0 | 1.0 | | | CP | vpextrb eax, xmm0, 0x0
20387 // Total Num Of Uops: 4
20388 //
20389 //
20390 // Block Throughput: 1.00 Cycles
20391 // Throughput Bottleneck: PORT2_AGU, PORT3_AGU, Port4
20392 //
20393 // | | Ports pressure in cycles | |
20394 // |Uops| 1 | 2 - D |3 - D | 4 | 5 | |
20395 // ---------------------------------------------------------
20396 // |2^ | | 0.5 | 0.5 |1.0| |CP| vmovaps xmmword ptr [rsp-0x18], xmm0
20397 // |1 |0.5| | | |0.5| | lea rax, ptr [rsp-0x18]
20398 // |1 | |0.5, 0.5|0.5, 0.5| | |CP| mov al, byte ptr [rdi+rax*1]
20399 // Total Num Of Uops: 4
20400
20401 return SDValue();
20402 }
20403
20404 unsigned IdxVal = IdxC->getZExtValue();
20405
20406 // If this is a 256-bit or 512-bit vector result, first extract the 128-bit
20407 // subvector and then extract the element from that 128-bit vector.
20408 if (VecVT.is256BitVector() || VecVT.is512BitVector()) {
20409 // Get the 128-bit vector.
20410 Vec = extract128BitVector(Vec, IdxVal, DAG, dl);
20411 MVT EltVT = VecVT.getVectorElementType();
20412
20413 unsigned ElemsPerChunk = 128 / EltVT.getSizeInBits();
20414 assert(isPowerOf2_32(ElemsPerChunk) && "Elements per chunk not power of 2");
20415
20416 // Find IdxVal modulo ElemsPerChunk. Since ElemsPerChunk is a power of 2
20417 // this can be done with a mask.
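// For example, for v8i32 (ElemsPerChunk == 4), element 6 lives in the upper
// 128-bit chunk extracted above and is re-indexed as 6 & 3 == 2 within it.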
20418 IdxVal &= ElemsPerChunk - 1;
20419 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, Op.getValueType(), Vec,
20420 DAG.getIntPtrConstant(IdxVal, dl));
20421 }
20422
20423 assert(VecVT.is128BitVector() && "Unexpected vector length");
20424
20425 MVT VT = Op.getSimpleValueType();
20426
20427 if (VT == MVT::i16) {
20428 // If IdxVal is 0, it's cheaper to do a move instead of a pextrw, unless
20429 // we're going to zero extend the register or fold the store (SSE41 only).
20430 if (IdxVal == 0 && !X86::mayFoldIntoZeroExtend(Op) &&
20431 !(Subtarget.hasSSE41() && X86::mayFoldIntoStore(Op))) {
20432 if (Subtarget.hasFP16())
20433 return Op;
20434
20435 return DAG.getNode(ISD::TRUNCATE, dl, MVT::i16,
20436 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
20437 DAG.getBitcast(MVT::v4i32, Vec), Idx));
20438 }
20439
20440 SDValue Extract = DAG.getNode(X86ISD::PEXTRW, dl, MVT::i32, Vec,
20441 DAG.getTargetConstant(IdxVal, dl, MVT::i8));
20442 return DAG.getNode(ISD::TRUNCATE, dl, VT, Extract);
20443 }
20444
20445 if (Subtarget.hasSSE41())
20446 if (SDValue Res = LowerEXTRACT_VECTOR_ELT_SSE4(Op, DAG))
20447 return Res;
20448
20449 // TODO: We only extract a single element from v16i8; we can probably afford
20450 // to be more aggressive here before using the default approach of spilling to
20451 // the stack.
20452 if (VT.getSizeInBits() == 8 && Op->isOnlyUserOf(Vec.getNode())) {
20453 // Extract either the lowest i32 or any i16, and extract the sub-byte.
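// For example, byte 5 is read by extracting word 2 of the v8i16 bitcast,
// shifting right by (5 % 2) * 8 == 8 bits, and truncating to i8.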
20454 int DWordIdx = IdxVal / 4;
20455 if (DWordIdx == 0) {
20456 SDValue Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
20457 DAG.getBitcast(MVT::v4i32, Vec),
20458 DAG.getIntPtrConstant(DWordIdx, dl));
20459 int ShiftVal = (IdxVal % 4) * 8;
20460 if (ShiftVal != 0)
20461 Res = DAG.getNode(ISD::SRL, dl, MVT::i32, Res,
20462 DAG.getConstant(ShiftVal, dl, MVT::i8));
20463 return DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
20464 }
20465
20466 int WordIdx = IdxVal / 2;
20467 SDValue Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16,
20468 DAG.getBitcast(MVT::v8i16, Vec),
20469 DAG.getIntPtrConstant(WordIdx, dl));
20470 int ShiftVal = (IdxVal % 2) * 8;
20471 if (ShiftVal != 0)
20472 Res = DAG.getNode(ISD::SRL, dl, MVT::i16, Res,
20473 DAG.getConstant(ShiftVal, dl, MVT::i8));
20474 return DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
20475 }
20476
20477 if (VT == MVT::f16 || VT.getSizeInBits() == 32) {
20478 if (IdxVal == 0)
20479 return Op;
20480
20481 // Shuffle the element to the lowest element, then movss or movsh.
20482 SmallVector<int, 8> Mask(VecVT.getVectorNumElements(), -1);
20483 Mask[0] = static_cast<int>(IdxVal);
20484 Vec = DAG.getVectorShuffle(VecVT, dl, Vec, DAG.getUNDEF(VecVT), Mask);
20485 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec,
20486 DAG.getIntPtrConstant(0, dl));
20487 }
20488
20489 if (VT.getSizeInBits() == 64) {
20490 // FIXME: .td only matches this for <2 x f64>, not <2 x i64> on 32b
20491 // FIXME: seems like this should be unnecessary if mov{h,l}pd were taught
20492 // to match extract_elt for f64.
20493 if (IdxVal == 0)
20494 return Op;
20495
20496 // UNPCKHPD the element to the lowest double word, then movsd.
20497 // Note if the lower 64 bits of the result of the UNPCKHPD is then stored
20498 // to a f64mem, the whole operation is folded into a single MOVHPDmr.
20499 int Mask[2] = { 1, -1 };
20500 Vec = DAG.getVectorShuffle(VecVT, dl, Vec, DAG.getUNDEF(VecVT), Mask);
20501 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec,
20502 DAG.getIntPtrConstant(0, dl));
20503 }
20504
20505 return SDValue();
20506}
20507
20508/// Insert one bit to mask vector, like v16i1 or v8i1.
20509/// AVX-512 feature.
20510static SDValue InsertBitToMaskVector(SDValue Op, SelectionDAG &DAG,
20511 const X86Subtarget &Subtarget) {
20512 SDLoc dl(Op);
20513 SDValue Vec = Op.getOperand(0);
20514 SDValue Elt = Op.getOperand(1);
20515 SDValue Idx = Op.getOperand(2);
20516 MVT VecVT = Vec.getSimpleValueType();
20517
20518 if (!isa<ConstantSDNode>(Idx)) {
20519 // Non-constant index: extend the source and destination,
20520 // insert the element, and then truncate the result.
20521 unsigned NumElts = VecVT.getVectorNumElements();
20522 MVT ExtEltVT = (NumElts <= 8) ? MVT::getIntegerVT(128 / NumElts) : MVT::i8;
20523 MVT ExtVecVT = MVT::getVectorVT(ExtEltVT, NumElts);
20524 SDValue ExtOp = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, ExtVecVT,
20525 DAG.getNode(ISD::SIGN_EXTEND, dl, ExtVecVT, Vec),
20526 DAG.getNode(ISD::SIGN_EXTEND, dl, ExtEltVT, Elt), Idx);
20527 return DAG.getNode(ISD::TRUNCATE, dl, VecVT, ExtOp);
20528 }
20529
20530 // Copy into a k-register, extract to v1i1 and insert_subvector.
20531 SDValue EltInVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v1i1, Elt);
20532 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, VecVT, Vec, EltInVec, Idx);
20533}
20534
20535SDValue X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
20536 SelectionDAG &DAG) const {
20537 MVT VT = Op.getSimpleValueType();
20538 MVT EltVT = VT.getVectorElementType();
20539 unsigned NumElts = VT.getVectorNumElements();
20540 unsigned EltSizeInBits = EltVT.getScalarSizeInBits();
20541
20542 if (EltVT == MVT::i1)
20543 return InsertBitToMaskVector(Op, DAG, Subtarget);
20544
20545 SDLoc dl(Op);
20546 SDValue N0 = Op.getOperand(0);
20547 SDValue N1 = Op.getOperand(1);
20548 SDValue N2 = Op.getOperand(2);
20549 auto *N2C = dyn_cast<ConstantSDNode>(N2);
20550
20551 if (!N2C) {
20552 // With variable insertion indices we're usually better off spilling to the
20553 // stack, but AVX512 can use a variable compare+select by comparing against
20554 // all possible vector indices, and FP insertion has less gpr->simd traffic.
20555 if (!(Subtarget.hasBWI() ||
20556 (Subtarget.hasAVX512() && EltSizeInBits >= 32) ||
20557 (Subtarget.hasSSE41() && (EltVT == MVT::f32 || EltVT == MVT::f64))))
20558 return SDValue();
20559
20560 MVT IdxSVT = MVT::getIntegerVT(EltSizeInBits);
20561 MVT IdxVT = MVT::getVectorVT(IdxSVT, NumElts);
20562 if (!isTypeLegal(IdxSVT) || !isTypeLegal(IdxVT))
20563 return SDValue();
20564
20565 SDValue IdxExt = DAG.getZExtOrTrunc(N2, dl, IdxSVT);
20566 SDValue IdxSplat = DAG.getSplatBuildVector(IdxVT, dl, IdxExt);
20567 SDValue EltSplat = DAG.getSplatBuildVector(VT, dl, N1);
20568
20569 SmallVector<SDValue, 16> RawIndices;
20570 for (unsigned I = 0; I != NumElts; ++I)
20571 RawIndices.push_back(DAG.getConstant(I, dl, IdxSVT));
20572 SDValue Indices = DAG.getBuildVector(IdxVT, dl, RawIndices);
20573
20574 // inselt N0, N1, N2 --> select (SplatN2 == {0,1,2...}) ? SplatN1 : N0.
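// For example, for v4i32 with N2 == 2 the splatted index {2,2,2,2} is compared
// against {0,1,2,3}; only lane 2 matches and takes the splatted element, while
// the remaining lanes keep the corresponding elements of N0.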
20575 return DAG.getSelectCC(dl, IdxSplat, Indices, EltSplat, N0,
20576 ISD::CondCode::SETEQ);
20577 }
20578
20579 if (N2C->getAPIntValue().uge(NumElts))
20580 return SDValue();
20581 uint64_t IdxVal = N2C->getZExtValue();
20582
20583 bool IsZeroElt = X86::isZeroNode(N1);
20584 bool IsAllOnesElt = VT.isInteger() && llvm::isAllOnesConstant(N1);
20585
20586 if (IsZeroElt || IsAllOnesElt) {
20587 // Lower insertion of v16i8/v32i8/v16i16 -1 elts as an 'OR' blend.
20588 // We don't deal with i8 0 since it appears to be handled elsewhere.
20589 if (IsAllOnesElt &&
20590 ((VT == MVT::v16i8 && !Subtarget.hasSSE41()) ||
20591 ((VT == MVT::v32i8 || VT == MVT::v16i16) && !Subtarget.hasInt256()))) {
20592 SDValue ZeroCst = DAG.getConstant(0, dl, VT.getScalarType());
20593 SDValue OnesCst = DAG.getAllOnesConstant(dl, VT.getScalarType());
20594 SmallVector<SDValue, 8> CstVectorElts(NumElts, ZeroCst);
20595 CstVectorElts[IdxVal] = OnesCst;
20596 SDValue CstVector = DAG.getBuildVector(VT, dl, CstVectorElts);
20597 return DAG.getNode(ISD::OR, dl, VT, N0, CstVector);
20598 }
20599 // See if we can do this more efficiently with a blend shuffle with a
20600 // rematerializable vector.
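// For example, inserting zero into lane 1 of a v8i16 builds the blend mask
// {0,9,2,3,4,5,6,7}, taking lane 1 from the all-zeros vector and every other
// lane from N0.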
20601 if (Subtarget.hasSSE41() &&
20602 (EltSizeInBits >= 16 || (IsZeroElt && !VT.is128BitVector()))) {
20603 SmallVector<int, 8> BlendMask;
20604 for (unsigned i = 0; i != NumElts; ++i)
20605 BlendMask.push_back(i == IdxVal ? i + NumElts : i);
20606 SDValue CstVector = IsZeroElt ? getZeroVector(VT, Subtarget, DAG, dl)
20607 : getOnesVector(VT, DAG, dl);
20608 return DAG.getVectorShuffle(VT, dl, N0, CstVector, BlendMask);
20609 }
20610 }
20611
20612 // If the vector is wider than 128 bits, extract the 128-bit subvector, insert
20613 // into that, and then insert the subvector back into the result.
20614 if (VT.is256BitVector() || VT.is512BitVector()) {
20615 // With a 256-bit vector, we can insert into the zero element efficiently
20616 // using a blend if we have AVX or AVX2 and the right data type.
20617 if (VT.is256BitVector() && IdxVal == 0) {
20618 // TODO: It is worthwhile to cast integer to floating point and back
20619 // and incur a domain crossing penalty if that's what we'll end up
20620 // doing anyway after extracting to a 128-bit vector.
20621 if ((Subtarget.hasAVX() && (EltVT == MVT::f64 || EltVT == MVT::f32)) ||
20622 (Subtarget.hasAVX2() && (EltVT == MVT::i32 || EltVT == MVT::i64))) {
20623 SDValue N1Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, N1);
20624 return DAG.getNode(X86ISD::BLENDI, dl, VT, N0, N1Vec,
20625 DAG.getTargetConstant(1, dl, MVT::i8));
20626 }
20627 }
20628
20629 unsigned NumEltsIn128 = 128 / EltSizeInBits;
20630 assert(isPowerOf2_32(NumEltsIn128) &&
20631 "Vectors will always have power-of-two number of elements.");
20632
20633 // If we are not inserting into the low 128-bit vector chunk,
20634 // then prefer the broadcast+blend sequence.
20635 // FIXME: relax the profitability check iff all N1 uses are insertions.
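// For example, inserting into lane 5 of a v8i32 splats N1 across the vector
// and blends with mask {0,1,2,3,4,13,6,7}, avoiding an extract/insert of the
// upper 128-bit chunk.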
20636 if (IdxVal >= NumEltsIn128 &&
20637 ((Subtarget.hasAVX2() && EltSizeInBits != 8) ||
20638 (Subtarget.hasAVX() && (EltSizeInBits >= 32) &&
20639 X86::mayFoldLoad(N1, Subtarget)))) {
20640 SDValue N1SplatVec = DAG.getSplatBuildVector(VT, dl, N1);
20641 SmallVector<int, 8> BlendMask;
20642 for (unsigned i = 0; i != NumElts; ++i)
20643 BlendMask.push_back(i == IdxVal ? i + NumElts : i);
20644 return DAG.getVectorShuffle(VT, dl, N0, N1SplatVec, BlendMask);
20645 }
20646
20647 // Get the desired 128-bit vector chunk.
20648 SDValue V = extract128BitVector(N0, IdxVal, DAG, dl);
20649
20650 // Insert the element into the desired chunk.
20651 // Since NumEltsIn128 is a power of 2 we can use a mask instead of modulo.
20652 unsigned IdxIn128 = IdxVal & (NumEltsIn128 - 1);
20653
20654 V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, V.getValueType(), V, N1,
20655 DAG.getIntPtrConstant(IdxIn128, dl));
20656
20657 // Insert the changed part back into the bigger vector
20658 return insert128BitVector(N0, V, IdxVal, DAG, dl);
20659 }
20660 assert(VT.is128BitVector() && "Only 128-bit vector types should be left!");
20661
20662 // This will be just movw/movd/movq/movsh/movss/movsd.
20663 if (IdxVal == 0 && ISD::isBuildVectorAllZeros(N0.getNode())) {
20664 if (EltVT == MVT::i32 || EltVT == MVT::f32 || EltVT == MVT::f64 ||
20665 EltVT == MVT::f16 || EltVT == MVT::i64) {
20666 N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, N1);
20667 return getShuffleVectorZeroOrUndef(N1, 0, true, Subtarget, DAG);
20668 }
20669
20670 // We can't directly insert an i8 or i16 into a vector, so zero extend
20671 // it to i32 first.
20672 if (EltVT == MVT::i16 || EltVT == MVT::i8) {
20673 N1 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, N1);
20674 MVT ShufVT = MVT::getVectorVT(MVT::i32, VT.getSizeInBits() / 32);
20675 N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, ShufVT, N1);
20676 N1 = getShuffleVectorZeroOrUndef(N1, 0, true, Subtarget, DAG);
20677 return DAG.getBitcast(VT, N1);
20678 }
20679 }
20680
20681 // Transform it so it matches pinsr{b,w}, which expects a GR32 as its second
20682 // argument. SSE41 is required for pinsrb.
20683 if (VT == MVT::v8i16 || (VT == MVT::v16i8 && Subtarget.hasSSE41())) {
20684 unsigned Opc;
20685 if (VT == MVT::v8i16) {
20686 assert(Subtarget.hasSSE2() && "SSE2 required for PINSRW");
20687 Opc = X86ISD::PINSRW;
20688 } else {
20689 assert(VT == MVT::v16i8 && "PINSRB requires v16i8 vector");
20690 assert(Subtarget.hasSSE41() && "SSE41 required for PINSRB");
20691 Opc = X86ISD::PINSRB;
20692 }
20693
20694 assert(N1.getValueType() != MVT::i32 && "Unexpected VT");
20695 N1 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, N1);
20696 N2 = DAG.getTargetConstant(IdxVal, dl, MVT::i8);
20697 return DAG.getNode(Opc, dl, VT, N0, N1, N2);
20698 }
20699
20700 if (Subtarget.hasSSE41()) {
20701 if (EltVT == MVT::f32) {
20702 // Bits [7:6] of the constant are the source select. This will always be
20703 // zero here. The DAG Combiner may combine an extract_elt index into
20704 // these bits. For example (insert (extract, 3), 2) could be matched by
20705 // putting the '3' into bits [7:6] of X86ISD::INSERTPS.
20706 // Bits [5:4] of the constant are the destination select. This is the
20707 // value of the incoming immediate.
20708 // Bits [3:0] of the constant are the zero mask. The DAG Combiner may
20709 // combine either bitwise AND or insert of float 0.0 to set these bits.
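// For example, inserting into destination lane 2 with source lane 0 and no
// zeroing uses the immediate (2 << 4) == 0x20, which is exactly what the
// IdxVal << 4 encoding further down produces.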
20710
20711 bool MinSize = DAG.getMachineFunction().getFunction().hasMinSize();
20712 if (IdxVal == 0 && (!MinSize || !X86::mayFoldLoad(N1, Subtarget))) {
20713 // If this is an insertion of 32-bits into the low 32-bits of
20714 // a vector, we prefer to generate a blend with immediate rather
20715 // than an insertps. Blends are simpler operations in hardware and so
20716 // will always have equal or better performance than insertps.
20717 // But if optimizing for size and there's a load folding opportunity,
20718 // generate insertps because blendps does not have a 32-bit memory
20719 // operand form.
20720 N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, N1);
20721 return DAG.getNode(X86ISD::BLENDI, dl, VT, N0, N1,
20722 DAG.getTargetConstant(1, dl, MVT::i8));
20723 }
20724 // Create this as a scalar-to-vector.
20725 N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, N1);
20726 return DAG.getNode(X86ISD::INSERTPS, dl, VT, N0, N1,
20727 DAG.getTargetConstant(IdxVal << 4, dl, MVT::i8));
20728 }
20729
20730 // PINSR* works with constant index.
20731 if (EltVT == MVT::i32 || EltVT == MVT::i64)
20732 return Op;
20733 }
20734
20735 return SDValue();
20736}
20737
20738static SDValue LowerSCALAR_TO_VECTOR(SDValue Op, const X86Subtarget &Subtarget,
20739 SelectionDAG &DAG) {
20740 SDLoc dl(Op);
20741 MVT OpVT = Op.getSimpleValueType();
20742
20743 // It's always cheaper to replace a xor+movd with xorps, and it simplifies
20744 // further combines.
20745 if (X86::isZeroNode(Op.getOperand(0)))
20746 return getZeroVector(OpVT, Subtarget, DAG, dl);
20747
20748 // If this is a wider than 128-bit vector result, first insert into a 128-bit
20749 // vector and then insert that into the full-width vector.
20750 if (!OpVT.is128BitVector()) {
20751 // Insert into a 128-bit vector.
20752 unsigned SizeFactor = OpVT.getSizeInBits() / 128;
20753 MVT VT128 = MVT::getVectorVT(OpVT.getVectorElementType(),
20754 OpVT.getVectorNumElements() / SizeFactor);
20755
20756 Op = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT128, Op.getOperand(0));
20757
20758 // Insert the 128-bit vector.
20759 return insert128BitVector(DAG.getUNDEF(OpVT), Op, 0, DAG, dl);
20760 }
20761 assert(OpVT.is128BitVector() && OpVT.isInteger() && OpVT != MVT::v2i64 &&
20762 "Expected an SSE type!");
20763
20764 // Pass through a v4i32 or v8i16 SCALAR_TO_VECTOR as that's what we use in
20765 // tblgen.
20766 if (OpVT == MVT::v4i32 || (OpVT == MVT::v8i16 && Subtarget.hasFP16()))
20767 return Op;
20768
20769 SDValue AnyExt = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, Op.getOperand(0));
20770 return DAG.getBitcast(
20771 OpVT, DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, AnyExt));
20772}
20773
20774// Lower a node with an INSERT_SUBVECTOR opcode. This may result in a
20775// simple superregister reference or explicit instructions to insert
20776// the upper bits of a vector.
20777static SDValue LowerINSERT_SUBVECTOR(SDValue Op, const X86Subtarget &Subtarget,
20778 SelectionDAG &DAG) {
20779 assert(Op.getSimpleValueType().getVectorElementType() == MVT::i1);
20780
20781 return insert1BitVector(Op, DAG, Subtarget);
20782}
20783
20784static SDValue LowerEXTRACT_SUBVECTOR(SDValue Op, const X86Subtarget &Subtarget,
20785 SelectionDAG &DAG) {
20786 assert(Op.getSimpleValueType().getVectorElementType() == MVT::i1 &&
20787 "Only vXi1 extract_subvectors need custom lowering");
20788
20789 SDLoc dl(Op);
20790 SDValue Vec = Op.getOperand(0);
20791 uint64_t IdxVal = Op.getConstantOperandVal(1);
20792
20793 if (IdxVal == 0) // the operation is legal
20794 return Op;
20795
20796 MVT VecVT = Vec.getSimpleValueType();
20797 unsigned NumElems = VecVT.getVectorNumElements();
20798
20799 // Extend to natively supported kshift.
20800 MVT WideVecVT = VecVT;
20801 if ((!Subtarget.hasDQI() && NumElems == 8) || NumElems < 8) {
20802 WideVecVT = Subtarget.hasDQI() ? MVT::v8i1 : MVT::v16i1;
20803 Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideVecVT,
20804 DAG.getUNDEF(WideVecVT), Vec,
20805 DAG.getIntPtrConstant(0, dl));
20806 }
20807
20808 // Shift to the LSB.
20809 Vec = DAG.getNode(X86ISD::KSHIFTR, dl, WideVecVT, Vec,
20810 DAG.getTargetConstant(IdxVal, dl, MVT::i8));
20811
20812 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, Op.getValueType(), Vec,
20813 DAG.getIntPtrConstant(0, dl));
20814}
20815
20816// Returns the appropriate wrapper opcode for a global reference.
20817unsigned X86TargetLowering::getGlobalWrapperKind(
20818 const GlobalValue *GV, const unsigned char OpFlags) const {
20819 // References to absolute symbols are never PC-relative.
20820 if (GV && GV->isAbsoluteSymbolRef())
20821 return X86ISD::Wrapper;
20822
20823 CodeModel::Model M = getTargetMachine().getCodeModel();
20824 if (Subtarget.isPICStyleRIPRel() &&
20825 (M == CodeModel::Small || M == CodeModel::Kernel))
20826 return X86ISD::WrapperRIP;
20827
20828 // In the medium model, functions can always be referenced RIP-relatively,
20829 // since they must be within 2GiB. This is also possible in non-PIC mode, and
20830 // shorter than the 64-bit absolute immediate that would otherwise be emitted.
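// For example, under the medium code model a reference to a Function takes the
// WrapperRIP path here, while a global-variable reference keeps falling through
// (and ends up as a plain Wrapper unless it needs a GOTPCREL reference).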
20831 if (M == CodeModel::Medium && isa_and_nonnull<Function>(GV))
20832 return X86ISD::WrapperRIP;
20833
20834 // GOTPCREL references must always use RIP.
20835 if (OpFlags == X86II::MO_GOTPCREL || OpFlags == X86II::MO_GOTPCREL_NORELAX)
20836 return X86ISD::WrapperRIP;
20837
20838 return X86ISD::Wrapper;
20839}
20840
20841 // ConstantPool, JumpTable, GlobalAddress, and ExternalSymbol are lowered as
20842 // their target counterpart wrapped in the X86ISD::Wrapper node. Suppose N is
20843 // one of the above-mentioned nodes. It has to be wrapped because otherwise
20844 // Select(N) returns N. So the raw TargetGlobalAddress nodes, etc. can only
20845 // be used to form an addressing mode. These wrapped nodes will be selected
20846 // into MOV32ri.
20847SDValue
20848X86TargetLowering::LowerConstantPool(SDValue Op, SelectionDAG &DAG) const {
20849 ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op);
20850
20851 // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
20852 // global base reg.
20853 unsigned char OpFlag = Subtarget.classifyLocalReference(nullptr);
20854
20855 auto PtrVT = getPointerTy(DAG.getDataLayout());
20856 SDValue Result = DAG.getTargetConstantPool(
20857 CP->getConstVal(), PtrVT, CP->getAlign(), CP->getOffset(), OpFlag);
20858 SDLoc DL(CP);
20859 Result = DAG.getNode(getGlobalWrapperKind(), DL, PtrVT, Result);
20860 // With PIC, the address is actually $g + Offset.
20861 if (OpFlag) {
20862 Result =
20863 DAG.getNode(ISD::ADD, DL, PtrVT,
20864 DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), Result);
20865 }
20866
20867 return Result;
20868}
20869
20870SDValue X86TargetLowering::LowerJumpTable(SDValue Op, SelectionDAG &DAG) const {
20871 JumpTableSDNode *JT = cast<JumpTableSDNode>(Op);
20872
20873 // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
20874 // global base reg.
20875 unsigned char OpFlag = Subtarget.classifyLocalReference(nullptr);
20876
20877 auto PtrVT = getPointerTy(DAG.getDataLayout());
20878 SDValue Result = DAG.getTargetJumpTable(JT->getIndex(), PtrVT, OpFlag);
20879 SDLoc DL(JT);
20880 Result = DAG.getNode(getGlobalWrapperKind(), DL, PtrVT, Result);
20881
20882 // With PIC, the address is actually $g + Offset.
20883 if (OpFlag)
20884 Result =
20885 DAG.getNode(ISD::ADD, DL, PtrVT,
20886 DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), Result);
20887
20888 return Result;
20889}
20890
20891SDValue X86TargetLowering::LowerExternalSymbol(SDValue Op,
20892 SelectionDAG &DAG) const {
20893 return LowerGlobalOrExternal(Op, DAG, /*ForCall=*/false);
20894}
20895
20896SDValue
20897X86TargetLowering::LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const {
20898 // Create the TargetBlockAddressAddress node.
20899 unsigned char OpFlags =
20900 Subtarget.classifyBlockAddressReference();
20901 const BlockAddress *BA = cast<BlockAddressSDNode>(Op)->getBlockAddress();
20902 int64_t Offset = cast<BlockAddressSDNode>(Op)->getOffset();
20903 SDLoc dl(Op);
20904 auto PtrVT = getPointerTy(DAG.getDataLayout());
20905 SDValue Result = DAG.getTargetBlockAddress(BA, PtrVT, Offset, OpFlags);
20906 Result = DAG.getNode(getGlobalWrapperKind(), dl, PtrVT, Result);
20907
20908 // With PIC, the address is actually $g + Offset.
20909 if (isGlobalRelativeToPICBase(OpFlags)) {
20910 Result = DAG.getNode(ISD::ADD, dl, PtrVT,
20911 DAG.getNode(X86ISD::GlobalBaseReg, dl, PtrVT), Result);
20912 }
20913
20914 return Result;
20915}
20916
20917/// Creates target global address or external symbol nodes for calls or
20918/// other uses.
20919SDValue X86TargetLowering::LowerGlobalOrExternal(SDValue Op, SelectionDAG &DAG,
20920 bool ForCall) const {
20921 // Unpack the global address or external symbol.
20922 const SDLoc &dl = SDLoc(Op);
20923 const GlobalValue *GV = nullptr;
20924 int64_t Offset = 0;
20925 const char *ExternalSym = nullptr;
20926 if (const auto *G = dyn_cast<GlobalAddressSDNode>(Op)) {
20927 GV = G->getGlobal();
20928 Offset = G->getOffset();
20929 } else {
20930 const auto *ES = cast<ExternalSymbolSDNode>(Op);
20931 ExternalSym = ES->getSymbol();
20932 }
20933
20934 // Calculate some flags for address lowering.
20935 const Module &Mod = *DAG.getMachineFunction().getFunction().getParent();
20936 unsigned char OpFlags;
20937 if (ForCall)
20938 OpFlags = Subtarget.classifyGlobalFunctionReference(GV, Mod);
20939 else
20940 OpFlags = Subtarget.classifyGlobalReference(GV, Mod);
20941 bool HasPICReg = isGlobalRelativeToPICBase(OpFlags);
20942 bool NeedsLoad = isGlobalStubReference(OpFlags);
20943
20944 CodeModel::Model M = DAG.getTarget().getCodeModel();
20945 auto PtrVT = getPointerTy(DAG.getDataLayout());
20946 SDValue Result;
20947
20948 if (GV) {
20949 // Create a target global address if this is a global. If possible, fold the
20950 // offset into the global address reference. Otherwise, ADD it on later.
20951 // Suppress the folding if Offset is negative: movl foo-1, %eax is not
20952 // allowed because if the address of foo is 0, the ELF R_X86_64_32
20953 // relocation will compute to a negative value, which is invalid.
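// For example, a reference to foo+8 with no extra flags and a suitable code
// model folds the 8 into the TargetGlobalAddress (GlobalOffset becomes 8 and
// Offset becomes 0), whereas foo-1 keeps Offset for the explicit ADD below.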
20954 int64_t GlobalOffset = 0;
20955 if (OpFlags == X86II::MO_NO_FLAG && Offset >= 0 &&
20956 X86::isOffsetSuitableForCodeModel(Offset, M, true)) {
20957 std::swap(GlobalOffset, Offset);
20958 }
20959 Result = DAG.getTargetGlobalAddress(GV, dl, PtrVT, GlobalOffset, OpFlags);
20960 } else {
20961 // If this is not a global address, this must be an external symbol.
20962 Result = DAG.getTargetExternalSymbol(ExternalSym, PtrVT, OpFlags);
20963 }
20964
20965 // If this is a direct call, avoid the wrapper if we don't need to do any
20966 // loads or adds. This allows SDAG ISel to match direct calls.
20967 if (ForCall && !NeedsLoad && !HasPICReg && Offset == 0)
20968 return Result;
20969
20970 Result = DAG.getNode(getGlobalWrapperKind(GV, OpFlags), dl, PtrVT, Result);
20971
20972 // With PIC, the address is actually $g + Offset.
20973 if (HasPICReg) {
20974 Result = DAG.getNode(ISD::ADD, dl, PtrVT,
20975 DAG.getNode(X86ISD::GlobalBaseReg, dl, PtrVT), Result);
20976 }
20977
20978 // For globals that require a load from a stub to get the address, emit the
20979 // load.
20980 if (NeedsLoad)
20981 Result = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Result,
20982 MachinePointerInfo::getGOT(DAG.getMachineFunction()));
20983
20984 // If there was a non-zero offset that we didn't fold, create an explicit
20985 // addition for it.
20986 if (Offset != 0)
20987 Result = DAG.getNode(ISD::ADD, dl, PtrVT, Result,
20988 DAG.getConstant(Offset, dl, PtrVT));
20989
20990 return Result;
20991}
20992
20993SDValue
20994X86TargetLowering::LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const {
20995 return LowerGlobalOrExternal(Op, DAG, /*ForCall=*/false);
20996}
20997
20998static SDValue
20999GetTLSADDR(SelectionDAG &DAG, SDValue Chain, GlobalAddressSDNode *GA,
21000 SDValue *InGlue, const EVT PtrVT, unsigned ReturnReg,
21001 unsigned char OperandFlags, bool LocalDynamic = false) {
21002 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
21003 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
21004 SDLoc dl(GA);
21005 SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
21006 GA->getValueType(0),
21007 GA->getOffset(),
21008 OperandFlags);
21009
21010 X86ISD::NodeType CallType = LocalDynamic ? X86ISD::TLSBASEADDR
21011 : X86ISD::TLSADDR;
21012
21013 if (InGlue) {
21014 SDValue Ops[] = { Chain, TGA, *InGlue };
21015 Chain = DAG.getNode(CallType, dl, NodeTys, Ops);
21016 } else {
21017 SDValue Ops[] = { Chain, TGA };
21018 Chain = DAG.getNode(CallType, dl, NodeTys, Ops);
21019 }
21020
21021 // TLSADDR will be codegen'ed as call. Inform MFI that function has calls.
21022 MFI.setAdjustsStack(true);
21023 MFI.setHasCalls(true);
21024
21025 SDValue Glue = Chain.getValue(1);
21026 return DAG.getCopyFromReg(Chain, dl, ReturnReg, PtrVT, Glue);
21027}
21028
21029// Lower ISD::GlobalTLSAddress using the "general dynamic" model, 32 bit
21030static SDValue
21031LowerToTLSGeneralDynamicModel32(GlobalAddressSDNode *GA, SelectionDAG &DAG,
21032 const EVT PtrVT) {
21033 SDValue InGlue;
21034 SDLoc dl(GA); // ? function entry point might be better
21035 SDValue Chain = DAG.getCopyToReg(DAG.getEntryNode(), dl, X86::EBX,
21036 DAG.getNode(X86ISD::GlobalBaseReg,
21037 SDLoc(), PtrVT), InGlue);
21038 InGlue = Chain.getValue(1);
21039
21040 return GetTLSADDR(DAG, Chain, GA, &InGlue, PtrVT, X86::EAX, X86II::MO_TLSGD);
21041}
21042
21043// Lower ISD::GlobalTLSAddress using the "general dynamic" model, 64 bit LP64
21044static SDValue
21045LowerToTLSGeneralDynamicModel64(GlobalAddressSDNode *GA, SelectionDAG &DAG,
21046 const EVT PtrVT) {
21047 return GetTLSADDR(DAG, DAG.getEntryNode(), GA, nullptr, PtrVT,
21048 X86::RAX, X86II::MO_TLSGD);
21049}
21050
21051// Lower ISD::GlobalTLSAddress using the "general dynamic" model, 64 bit ILP32
21052static SDValue
21053LowerToTLSGeneralDynamicModelX32(GlobalAddressSDNode *GA, SelectionDAG &DAG,
21054 const EVT PtrVT) {
21055 return GetTLSADDR(DAG, DAG.getEntryNode(), GA, nullptr, PtrVT,
21056 X86::EAX, X86II::MO_TLSGD);
21057}
21058
21059static SDValue LowerToTLSLocalDynamicModel(GlobalAddressSDNode *GA,
21060 SelectionDAG &DAG, const EVT PtrVT,
21061 bool Is64Bit, bool Is64BitLP64) {
21062 SDLoc dl(GA);
21063
21064 // Get the start address of the TLS block for this module.
21065 X86MachineFunctionInfo *MFI = DAG.getMachineFunction()
21066 .getInfo<X86MachineFunctionInfo>();
21067 MFI->incNumLocalDynamicTLSAccesses();
21068
21069 SDValue Base;
21070 if (Is64Bit) {
21071 unsigned ReturnReg = Is64BitLP64 ? X86::RAX : X86::EAX;
21072 Base = GetTLSADDR(DAG, DAG.getEntryNode(), GA, nullptr, PtrVT, ReturnReg,
21073 X86II::MO_TLSLD, /*LocalDynamic=*/true);
21074 } else {
21075 SDValue InGlue;
21076 SDValue Chain = DAG.getCopyToReg(DAG.getEntryNode(), dl, X86::EBX,
21077 DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), InGlue);
21078 InGlue = Chain.getValue(1);
21079 Base = GetTLSADDR(DAG, Chain, GA, &InGlue, PtrVT, X86::EAX,
21080 X86II::MO_TLSLDM, /*LocalDynamic=*/true);
21081 }
21082
21083 // Note: the CleanupLocalDynamicTLSPass will remove redundant computations
21084 // of Base.
21085
21086 // Build x@dtpoff.
21087 unsigned char OperandFlags = X86II::MO_DTPOFF;
21088 unsigned WrapperKind = X86ISD::Wrapper;
21089 SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
21090 GA->getValueType(0),
21091 GA->getOffset(), OperandFlags);
21092 SDValue Offset = DAG.getNode(WrapperKind, dl, PtrVT, TGA);
21093
21094 // Add x@dtpoff with the base.
21095 return DAG.getNode(ISD::ADD, dl, PtrVT, Offset, Base);
21096}
21097
21098// Lower ISD::GlobalTLSAddress using the "initial exec" or "local exec" model.
21099static SDValue LowerToTLSExecModel(GlobalAddressSDNode *GA, SelectionDAG &DAG,
21100 const EVT PtrVT, TLSModel::Model model,
21101 bool is64Bit, bool isPIC) {
21102 SDLoc dl(GA);
21103
21104 // Get the Thread Pointer, which is %gs:0 (32-bit) or %fs:0 (64-bit).
21105 Value *Ptr = Constant::getNullValue(Type::getInt8PtrTy(*DAG.getContext(),
21106 is64Bit ? 257 : 256));
21107
21108 SDValue ThreadPointer =
21109 DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), DAG.getIntPtrConstant(0, dl),
21110 MachinePointerInfo(Ptr));
21111
21112 unsigned char OperandFlags = 0;
21113 // Most TLS accesses are not RIP relative, even on x86-64. One exception is
21114 // initialexec.
21115 unsigned WrapperKind = X86ISD::Wrapper;
21116 if (model == TLSModel::LocalExec) {
21117 OperandFlags = is64Bit ? X86II::MO_TPOFF : X86II::MO_NTPOFF;
21118 } else if (model == TLSModel::InitialExec) {
21119 if (is64Bit) {
21120 OperandFlags = X86II::MO_GOTTPOFF;
21121 WrapperKind = X86ISD::WrapperRIP;
21122 } else {
21123 OperandFlags = isPIC ? X86II::MO_GOTNTPOFF : X86II::MO_INDNTPOFF;
21124 }
21125 } else {
21126 llvm_unreachable("Unexpected model")::llvm::llvm_unreachable_internal("Unexpected model", "llvm/lib/Target/X86/X86ISelLowering.cpp"
, 21126)
;
21127 }
21128
21129 // emit "addl x@ntpoff,%eax" (local exec)
21130 // or "addl x@indntpoff,%eax" (initial exec)
21131 // or "addl x@gotntpoff(%ebx) ,%eax" (initial exec, 32-bit pic)
21132 SDValue TGA =
21133 DAG.getTargetGlobalAddress(GA->getGlobal(), dl, GA->getValueType(0),
21134 GA->getOffset(), OperandFlags);
21135 SDValue Offset = DAG.getNode(WrapperKind, dl, PtrVT, TGA);
21136
21137 if (model == TLSModel::InitialExec) {
21138 if (isPIC && !is64Bit) {
21139 Offset = DAG.getNode(ISD::ADD, dl, PtrVT,
21140 DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT),
21141 Offset);
21142 }
21143
21144 Offset = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Offset,
21145 MachinePointerInfo::getGOT(DAG.getMachineFunction()));
21146 }
21147
21148 // The address of the thread local variable is the add of the thread
21149 // pointer with the offset of the variable.
21150 return DAG.getNode(ISD::ADD, dl, PtrVT, ThreadPointer, Offset);
21151}
21152
21153SDValue
21154X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const {
21155
21156 GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
21157
21158 if (DAG.getTarget().useEmulatedTLS())
21159 return LowerToTLSEmulatedModel(GA, DAG);
21160
21161 const GlobalValue *GV = GA->getGlobal();
21162 auto PtrVT = getPointerTy(DAG.getDataLayout());
21163 bool PositionIndependent = isPositionIndependent();
21164
21165 if (Subtarget.isTargetELF()) {
21166 TLSModel::Model model = DAG.getTarget().getTLSModel(GV);
21167 switch (model) {
21168 case TLSModel::GeneralDynamic:
21169 if (Subtarget.is64Bit()) {
21170 if (Subtarget.isTarget64BitLP64())
21171 return LowerToTLSGeneralDynamicModel64(GA, DAG, PtrVT);
21172 return LowerToTLSGeneralDynamicModelX32(GA, DAG, PtrVT);
21173 }
21174 return LowerToTLSGeneralDynamicModel32(GA, DAG, PtrVT);
21175 case TLSModel::LocalDynamic:
21176 return LowerToTLSLocalDynamicModel(GA, DAG, PtrVT, Subtarget.is64Bit(),
21177 Subtarget.isTarget64BitLP64());
21178 case TLSModel::InitialExec:
21179 case TLSModel::LocalExec:
21180 return LowerToTLSExecModel(GA, DAG, PtrVT, model, Subtarget.is64Bit(),
21181 PositionIndependent);
21182 }
21183 llvm_unreachable("Unknown TLS model.")::llvm::llvm_unreachable_internal("Unknown TLS model.", "llvm/lib/Target/X86/X86ISelLowering.cpp"
, 21183)
;
21184 }
21185
21186 if (Subtarget.isTargetDarwin()) {
21187 // Darwin only has one model of TLS. Lower to that.
21188 unsigned char OpFlag = 0;
21189 unsigned WrapperKind = Subtarget.isPICStyleRIPRel() ?
21190 X86ISD::WrapperRIP : X86ISD::Wrapper;
21191
21192 // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
21193 // global base reg.
21194 bool PIC32 = PositionIndependent && !Subtarget.is64Bit();
21195 if (PIC32)
21196 OpFlag = X86II::MO_TLVP_PIC_BASE;
21197 else
21198 OpFlag = X86II::MO_TLVP;
21199 SDLoc DL(Op);
21200 SDValue Result = DAG.getTargetGlobalAddress(GA->getGlobal(), DL,
21201 GA->getValueType(0),
21202 GA->getOffset(), OpFlag);
21203 SDValue Offset = DAG.getNode(WrapperKind, DL, PtrVT, Result);
21204
21205 // With PIC32, the address is actually $g + Offset.
21206 if (PIC32)
21207 Offset = DAG.getNode(ISD::ADD, DL, PtrVT,
21208 DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT),
21209 Offset);
21210
21211 // Lowering the machine isd will make sure everything is in the right
21212 // location.
21213 SDValue Chain = DAG.getEntryNode();
21214 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
21215 Chain = DAG.getCALLSEQ_START(Chain, 0, 0, DL);
21216 SDValue Args[] = { Chain, Offset };
21217 Chain = DAG.getNode(X86ISD::TLSCALL, DL, NodeTys, Args);
21218 Chain = DAG.getCALLSEQ_END(Chain, 0, 0, Chain.getValue(1), DL);
21219
21220 // TLSCALL will be codegen'ed as call. Inform MFI that function has calls.
21221 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
21222 MFI.setAdjustsStack(true);
21223
21224 // And our return value (tls address) is in the standard call return value
21225 // location.
21226 unsigned Reg = Subtarget.is64Bit() ? X86::RAX : X86::EAX;
21227 return DAG.getCopyFromReg(Chain, DL, Reg, PtrVT, Chain.getValue(1));
21228 }
21229
21230 if (Subtarget.isOSWindows()) {
21231 // Just use the implicit TLS architecture
21232 // Need to generate something similar to:
21233 // mov rdx, qword [gs:abs 58H]; Load pointer to ThreadLocalStorage
21234 // ; from TEB
21235 // mov ecx, dword [rel _tls_index]: Load index (from C runtime)
21236 // mov rcx, qword [rdx+rcx*8]
21237 // mov eax, .tls$:tlsvar
21238 // [rax+rcx] contains the address
21239 // Windows 64bit: gs:0x58
21240 // Windows 32bit: fs:__tls_array
21241
21242 SDLoc dl(GA);
21243 SDValue Chain = DAG.getEntryNode();
21244
21245 // Get the Thread Pointer, which is %fs:__tls_array (32-bit) or
21246 // %gs:0x58 (64-bit). On MinGW, __tls_array is not available, so directly
21247 // use its literal value of 0x2C.
21248 Value *Ptr = Constant::getNullValue(Subtarget.is64Bit()
21249 ? Type::getInt8PtrTy(*DAG.getContext(),
21250 256)
21251 : Type::getInt32PtrTy(*DAG.getContext(),
21252 257));
21253
21254 SDValue TlsArray = Subtarget.is64Bit()
21255 ? DAG.getIntPtrConstant(0x58, dl)
21256 : (Subtarget.isTargetWindowsGNU()
21257 ? DAG.getIntPtrConstant(0x2C, dl)
21258 : DAG.getExternalSymbol("_tls_array", PtrVT));
21259
21260 SDValue ThreadPointer =
21261 DAG.getLoad(PtrVT, dl, Chain, TlsArray, MachinePointerInfo(Ptr));
21262
21263 SDValue res;
21264 if (GV->getThreadLocalMode() == GlobalVariable::LocalExecTLSModel) {
21265 res = ThreadPointer;
21266 } else {
21267 // Load the _tls_index variable
21268 SDValue IDX = DAG.getExternalSymbol("_tls_index", PtrVT);
21269 if (Subtarget.is64Bit())
21270 IDX = DAG.getExtLoad(ISD::ZEXTLOAD, dl, PtrVT, Chain, IDX,
21271 MachinePointerInfo(), MVT::i32);
21272 else
21273 IDX = DAG.getLoad(PtrVT, dl, Chain, IDX, MachinePointerInfo());
21274
21275 const DataLayout &DL = DAG.getDataLayout();
21276 SDValue Scale =
21277 DAG.getConstant(Log2_64_Ceil(DL.getPointerSize()), dl, MVT::i8);
21278 IDX = DAG.getNode(ISD::SHL, dl, PtrVT, IDX, Scale);
21279
21280 res = DAG.getNode(ISD::ADD, dl, PtrVT, ThreadPointer, IDX);
21281 }
21282
21283 res = DAG.getLoad(PtrVT, dl, Chain, res, MachinePointerInfo());
21284
21285 // Get the offset of start of .tls section
21286 SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
21287 GA->getValueType(0),
21288 GA->getOffset(), X86II::MO_SECREL);
21289 SDValue Offset = DAG.getNode(X86ISD::Wrapper, dl, PtrVT, TGA);
21290
21291 // The address of the thread local variable is the add of the thread
21292 // pointer with the offset of the variable.
21293 return DAG.getNode(ISD::ADD, dl, PtrVT, res, Offset);
21294 }
21295
21296 llvm_unreachable("TLS not implemented for this target.")::llvm::llvm_unreachable_internal("TLS not implemented for this target."
, "llvm/lib/Target/X86/X86ISelLowering.cpp", 21296)
;
21297}
21298
21299/// Lower SRA_PARTS and friends, which return two i32 values
21300/// and take a 2 x i32 value to shift plus a shift amount.
21301/// TODO: Can this be moved to general expansion code?
21302static SDValue LowerShiftParts(SDValue Op, SelectionDAG &DAG) {
21303 SDValue Lo, Hi;
21304 DAG.getTargetLoweringInfo().expandShiftParts(Op.getNode(), Lo, Hi, DAG);
21305 return DAG.getMergeValues({Lo, Hi}, SDLoc(Op));
21306}
21307
21308// Try to use a packed vector operation to handle i64 on 32-bit targets when
21309// AVX512DQ is enabled.
21310static SDValue LowerI64IntToFP_AVX512DQ(SDValue Op, SelectionDAG &DAG,
21311 const X86Subtarget &Subtarget) {
21312 assert((Op.getOpcode() == ISD::SINT_TO_FP ||
21313 Op.getOpcode() == ISD::STRICT_SINT_TO_FP ||
21314 Op.getOpcode() == ISD::STRICT_UINT_TO_FP ||
21315 Op.getOpcode() == ISD::UINT_TO_FP) &&
21316 "Unexpected opcode!");
21317 bool IsStrict = Op->isStrictFPOpcode();
21318 unsigned OpNo = IsStrict ? 1 : 0;
21319 SDValue Src = Op.getOperand(OpNo);
21320 MVT SrcVT = Src.getSimpleValueType();
21321 MVT VT = Op.getSimpleValueType();
21322
21323 if (!Subtarget.hasDQI() || SrcVT != MVT::i64 || Subtarget.is64Bit() ||
21324 (VT != MVT::f32 && VT != MVT::f64))
21325 return SDValue();
21326
21327 // Pack the i64 into a vector, do the operation and extract.
21328
21329 // Using 256-bit to ensure result is 128-bits for f32 case.
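// For example, without VLX the i64 source is placed in element 0 of a v8i64,
// converted to v8f32/v8f64, and element 0 of the result is extracted.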
21330 unsigned NumElts = Subtarget.hasVLX() ? 4 : 8;
21331 MVT VecInVT = MVT::getVectorVT(MVT::i64, NumElts);
21332 MVT VecVT = MVT::getVectorVT(VT, NumElts);
21333
21334 SDLoc dl(Op);
21335 SDValue InVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecInVT, Src);
21336 if (IsStrict) {
21337 SDValue CvtVec = DAG.getNode(Op.getOpcode(), dl, {VecVT, MVT::Other},
21338 {Op.getOperand(0), InVec});
21339 SDValue Chain = CvtVec.getValue(1);
21340 SDValue Value = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, CvtVec,
21341 DAG.getIntPtrConstant(0, dl));
21342 return DAG.getMergeValues({Value, Chain}, dl);
21343 }
21344
21345 SDValue CvtVec = DAG.getNode(Op.getOpcode(), dl, VecVT, InVec);
21346
21347 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, CvtVec,
21348 DAG.getIntPtrConstant(0, dl));
21349}
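
As an illustration only (not part of X86ISelLowering.cpp), the same idea can be written with AVX512DQ intrinsics; the helper name and the use of a broadcast are assumptions made for the sketch, and the lowering above prefers 256-bit vectors when VLX is available while this uses the 512-bit form:

#include <immintrin.h>

// Put the scalar i64 into a vector register, use the packed VCVTQQ2PD
// conversion, and read element 0 back out (build with -mavx512dq).
double sitofp_i64_via_vector(long long x) {
  __m512i v = _mm512_set1_epi64(x);                  // stands in for SCALAR_TO_VECTOR
  __m512d c = _mm512_cvtepi64_pd(v);                 // packed i64 -> f64
  return _mm_cvtsd_f64(_mm512_castpd512_pd128(c));   // extract lane 0
}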
21350
21351// Try to use a packed vector operation to handle i64 on 32-bit targets.
21352static SDValue LowerI64IntToFP16(SDValue Op, SelectionDAG &DAG,
21353 const X86Subtarget &Subtarget) {
21354 assert((Op.getOpcode() == ISD::SINT_TO_FP ||
21355 Op.getOpcode() == ISD::STRICT_SINT_TO_FP ||
21356 Op.getOpcode() == ISD::STRICT_UINT_TO_FP ||
21357 Op.getOpcode() == ISD::UINT_TO_FP) &&
21358 "Unexpected opcode!");
21359 bool IsStrict = Op->isStrictFPOpcode();
21360 SDValue Src = Op.getOperand(IsStrict ? 1 : 0);
21361 MVT SrcVT = Src.getSimpleValueType();
21362 MVT VT = Op.getSimpleValueType();
21363
21364 if (SrcVT != MVT::i64 || Subtarget.is64Bit() || VT != MVT::f16)
21365 return SDValue();
21366
21367 // Pack the i64 into a vector, do the operation and extract.
21368
21369 assert(Subtarget.hasFP16() && "Expected FP16");
21370
21371 SDLoc dl(Op);
21372 SDValue InVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Src);
21373 if (IsStrict) {
21374 SDValue CvtVec = DAG.getNode(Op.getOpcode(), dl, {MVT::v2f16, MVT::Other},
21375 {Op.getOperand(0), InVec});
21376 SDValue Chain = CvtVec.getValue(1);
21377 SDValue Value = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, CvtVec,
21378 DAG.getIntPtrConstant(0, dl));
21379 return DAG.getMergeValues({Value, Chain}, dl);
21380 }
21381
21382 SDValue CvtVec = DAG.getNode(Op.getOpcode(), dl, MVT::v2f16, InVec);
21383
21384 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, CvtVec,
21385 DAG.getIntPtrConstant(0, dl));
21386}
21387
21388static bool useVectorCast(unsigned Opcode, MVT FromVT, MVT ToVT,
21389 const X86Subtarget &Subtarget) {
21390 switch (Opcode) {
21391 case ISD::SINT_TO_FP:
21392 // TODO: Handle wider types with AVX/AVX512.
21393 if (!Subtarget.hasSSE2() || FromVT != MVT::v4i32)
21394 return false;
21395 // CVTDQ2PS or (V)CVTDQ2PD
21396 return ToVT == MVT::v4f32 || (Subtarget.hasAVX() && ToVT == MVT::v4f64);
21397
21398 case ISD::UINT_TO_FP:
21399 // TODO: Handle wider types and i64 elements.
21400 if (!Subtarget.hasAVX512() || FromVT != MVT::v4i32)
21401 return false;
21402 // VCVTUDQ2PS or VCVTUDQ2PD
21403 return ToVT == MVT::v4f32 || ToVT == MVT::v4f64;
21404
21405 default:
21406 return false;
21407 }
21408}
21409
21410/// Given a scalar cast operation that is extracted from a vector, try to
21411/// vectorize the cast op followed by extraction. This will avoid an expensive
21412/// round-trip between XMM and GPR.
21413static SDValue vectorizeExtractedCast(SDValue Cast, SelectionDAG &DAG,
21414 const X86Subtarget &Subtarget) {
21415 // TODO: This could be enhanced to handle smaller integer types by peeking
21416 // through an extend.
21417 SDValue Extract = Cast.getOperand(0);
21418 MVT DestVT = Cast.getSimpleValueType();
21419 if (Extract.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
21420 !isa<ConstantSDNode>(Extract.getOperand(1)))
21421 return SDValue();
21422
21423 // See if we have a 128-bit vector cast op for this type of cast.
21424 SDValue VecOp = Extract.getOperand(0);
21425 MVT FromVT = VecOp.getSimpleValueType();
21426 unsigned NumEltsInXMM = 128 / FromVT.getScalarSizeInBits();
21427 MVT Vec128VT = MVT::getVectorVT(FromVT.getScalarType(), NumEltsInXMM);
21428 MVT ToVT = MVT::getVectorVT(DestVT, NumEltsInXMM);
21429 if (!useVectorCast(Cast.getOpcode(), Vec128VT, ToVT, Subtarget))
21430 return SDValue();
21431
21432 // If we are extracting from a non-zero element, first shuffle the source
21433 // vector to allow extracting from element zero.
21434 SDLoc DL(Cast);
21435 if (!isNullConstant(Extract.getOperand(1))) {
21436 SmallVector<int, 16> Mask(FromVT.getVectorNumElements(), -1);
21437 Mask[0] = Extract.getConstantOperandVal(1);
21438 VecOp = DAG.getVectorShuffle(FromVT, DL, VecOp, DAG.getUNDEF(FromVT), Mask);
21439 }
21440 // If the source vector is wider than 128-bits, extract the low part. Do not
21441 // create an unnecessarily wide vector cast op.
21442 if (FromVT != Vec128VT)
21443 VecOp = extract128BitVector(VecOp, 0, DAG, DL);
21444
21445 // cast (extelt V, 0) --> extelt (cast (extract_subv V)), 0
21446 // cast (extelt V, C) --> extelt (cast (extract_subv (shuffle V, [C...]))), 0
21447 SDValue VCast = DAG.getNode(Cast.getOpcode(), DL, ToVT, VecOp);
21448 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, DestVT, VCast,
21449 DAG.getIntPtrConstant(0, DL));
21450}
21451
21452/// Given a scalar cast to FP with a cast to integer operand (almost an ftrunc),
21453/// try to vectorize the cast ops. This will avoid an expensive round-trip
21454/// between XMM and GPR.
21455static SDValue lowerFPToIntToFP(SDValue CastToFP, SelectionDAG &DAG,
21456 const X86Subtarget &Subtarget) {
21457 // TODO: Allow FP_TO_UINT.
21458 SDValue CastToInt = CastToFP.getOperand(0);
21459 MVT VT = CastToFP.getSimpleValueType();
21460 if (CastToInt.getOpcode() != ISD::FP_TO_SINT || VT.isVector())
21461 return SDValue();
21462
21463 MVT IntVT = CastToInt.getSimpleValueType();
21464 SDValue X = CastToInt.getOperand(0);
21465 MVT SrcVT = X.getSimpleValueType();
21466 if (SrcVT != MVT::f32 && SrcVT != MVT::f64)
21467 return SDValue();
21468
21469 // See if we have 128-bit vector cast instructions for this type of cast.
21470 // We need cvttps2dq/cvttpd2dq and cvtdq2ps/cvtdq2pd.
21471 if (!Subtarget.hasSSE2() || (VT != MVT::f32 && VT != MVT::f64) ||
21472 IntVT != MVT::i32)
21473 return SDValue();
21474
21475 unsigned SrcSize = SrcVT.getSizeInBits();
21476 unsigned IntSize = IntVT.getSizeInBits();
21477 unsigned VTSize = VT.getSizeInBits();
21478 MVT VecSrcVT = MVT::getVectorVT(SrcVT, 128 / SrcSize);
21479 MVT VecIntVT = MVT::getVectorVT(IntVT, 128 / IntSize);
21480 MVT VecVT = MVT::getVectorVT(VT, 128 / VTSize);
21481
21482 // We need target-specific opcodes if this is v2f64 -> v4i32 -> v2f64.
21483 unsigned ToIntOpcode =
21484 SrcSize != IntSize ? X86ISD::CVTTP2SI : (unsigned)ISD::FP_TO_SINT;
21485 unsigned ToFPOpcode =
21486 IntSize != VTSize ? X86ISD::CVTSI2P : (unsigned)ISD::SINT_TO_FP;
21487
21488 // sint_to_fp (fp_to_sint X) --> extelt (sint_to_fp (fp_to_sint (s2v X))), 0
21489 //
21490 // We are not defining the high elements (for example, zero them) because
21491 // that could nullify any performance advantage that we hoped to gain from
21492 // this vector op hack. We do not expect any adverse effects (like denorm
21493 // penalties) with cast ops.
21494 SDLoc DL(CastToFP);
21495 SDValue ZeroIdx = DAG.getIntPtrConstant(0, DL);
21496 SDValue VecX = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecSrcVT, X);
21497 SDValue VCastToInt = DAG.getNode(ToIntOpcode, DL, VecIntVT, VecX);
21498 SDValue VCastToFP = DAG.getNode(ToFPOpcode, DL, VecVT, VCastToInt);
21499 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, VCastToFP, ZeroIdx);
21500}
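
A minimal standalone sketch of the pattern this lowering targets, assuming SSE2 and <immintrin.h> (illustration only, not part of the LLVM source; the function names are made up for the example):

#include <immintrin.h>

// Scalar form: (float)(int)x bounces through a GPR (cvttss2si + cvtsi2ss).
float trunc_via_gpr(float x) { return (float)(int)x; }

// Vectorized form: both conversions stay in an XMM register
// (cvttps2dq + cvtdq2ps). Note _mm_set_ss zeroes the upper lanes here,
// whereas the lowering above deliberately leaves them undefined.
float trunc_via_xmm(float x) {
  __m128  v = _mm_set_ss(x);            // scalar placed into a vector
  __m128i i = _mm_cvttps_epi32(v);      // fp_to_sint on the whole vector
  __m128  r = _mm_cvtepi32_ps(i);       // sint_to_fp on the whole vector
  return _mm_cvtss_f32(r);              // extract element 0
}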
21501
21502static SDValue lowerINT_TO_FP_vXi64(SDValue Op, SelectionDAG &DAG,
21503 const X86Subtarget &Subtarget) {
21504 SDLoc DL(Op);
21505 bool IsStrict = Op->isStrictFPOpcode();
21506 MVT VT = Op->getSimpleValueType(0);
21507 SDValue Src = Op->getOperand(IsStrict ? 1 : 0);
21508
21509 if (Subtarget.hasDQI()) {
21510 assert(!Subtarget.hasVLX() && "Unexpected features");
21511
21512 assert((Src.getSimpleValueType() == MVT::v2i64 ||
21513 Src.getSimpleValueType() == MVT::v4i64) &&
21514 "Unsupported custom type");
21515
21516 // With AVX512DQ, but not VLX we need to widen to get a 512-bit result type.
21517 assert((VT == MVT::v4f32 || VT == MVT::v2f64 || VT == MVT::v4f64) &&
21518 "Unexpected VT!");
21519 MVT WideVT = VT == MVT::v4f32 ? MVT::v8f32 : MVT::v8f64;
21520
21521 // Need to concat with zero vector for strict fp to avoid spurious
21522 // exceptions.
21523 SDValue Tmp = IsStrict ? DAG.getConstant(0, DL, MVT::v8i64)
21524 : DAG.getUNDEF(MVT::v8i64);
21525 Src = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, MVT::v8i64, Tmp, Src,
21526 DAG.getIntPtrConstant(0, DL));
21527 SDValue Res, Chain;
21528 if (IsStrict) {
21529 Res = DAG.getNode(Op.getOpcode(), DL, {WideVT, MVT::Other},
21530 {Op->getOperand(0), Src});
21531 Chain = Res.getValue(1);
21532 } else {
21533 Res = DAG.getNode(Op.getOpcode(), DL, WideVT, Src);
21534 }
21535
21536 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
21537 DAG.getIntPtrConstant(0, DL));
21538
21539 if (IsStrict)
21540 return DAG.getMergeValues({Res, Chain}, DL);
21541 return Res;
21542 }
21543
21544 bool IsSigned = Op->getOpcode() == ISD::SINT_TO_FP ||
21545 Op->getOpcode() == ISD::STRICT_SINT_TO_FP;
21546 if (VT != MVT::v4f32 || IsSigned)
21547 return SDValue();
21548
21549 SDValue Zero = DAG.getConstant(0, DL, MVT::v4i64);
21550 SDValue One = DAG.getConstant(1, DL, MVT::v4i64);
21551 SDValue Sign = DAG.getNode(ISD::OR, DL, MVT::v4i64,
21552 DAG.getNode(ISD::SRL, DL, MVT::v4i64, Src, One),
21553 DAG.getNode(ISD::AND, DL, MVT::v4i64, Src, One));
21554 SDValue IsNeg = DAG.getSetCC(DL, MVT::v4i64, Src, Zero, ISD::SETLT);
21555 SDValue SignSrc = DAG.getSelect(DL, MVT::v4i64, IsNeg, Sign, Src);
21556 SmallVector<SDValue, 4> SignCvts(4);
21557 SmallVector<SDValue, 4> Chains(4);
21558 for (int i = 0; i != 4; ++i) {
21559 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i64, SignSrc,
21560 DAG.getIntPtrConstant(i, DL));
21561 if (IsStrict) {
21562 SignCvts[i] =
21563 DAG.getNode(ISD::STRICT_SINT_TO_FP, DL, {MVT::f32, MVT::Other},
21564 {Op.getOperand(0), Elt});
21565 Chains[i] = SignCvts[i].getValue(1);
21566 } else {
21567 SignCvts[i] = DAG.getNode(ISD::SINT_TO_FP, DL, MVT::f32, Elt);
21568 }
21569 }
21570 SDValue SignCvt = DAG.getBuildVector(VT, DL, SignCvts);
21571
21572 SDValue Slow, Chain;
21573 if (IsStrict) {
21574 Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains);
21575 Slow = DAG.getNode(ISD::STRICT_FADD, DL, {MVT::v4f32, MVT::Other},
21576 {Chain, SignCvt, SignCvt});
21577 Chain = Slow.getValue(1);
21578 } else {
21579 Slow = DAG.getNode(ISD::FADD, DL, MVT::v4f32, SignCvt, SignCvt);
21580 }
21581
21582 IsNeg = DAG.getNode(ISD::TRUNCATE, DL, MVT::v4i32, IsNeg);
21583 SDValue Cvt = DAG.getSelect(DL, MVT::v4f32, IsNeg, Slow, SignCvt);
21584
21585 if (IsStrict)
21586 return DAG.getMergeValues({Cvt, Chain}, DL);
21587
21588 return Cvt;
21589}
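
For reference, a scalar C++ model of the unsigned slow path above (standard C++ only, not part of the LLVM source; the helper name is invented for the example). The halving uses "round to odd" so that doubling the converted value still rounds correctly:

#include <cstdint>
#include <cstdio>

// When the top bit is set, halve with round-to-odd (SRL by 1, OR in the low
// bit), convert as signed, then double the result (the FADD in the lowering).
static float u64_to_f32_model(uint64_t u) {
  if ((int64_t)u >= 0)
    return (float)(int64_t)u;              // already fits the signed convert
  uint64_t half = (u >> 1) | (u & 1);      // Sign = (Src >> 1) | (Src & 1)
  float f = (float)(int64_t)half;          // per-element SINT_TO_FP
  return f + f;                            // Slow = FADD SignCvt, SignCvt
}

int main() {
  const uint64_t tests[] = {0u, 1u, 0x8000000000000000ULL,
                            0x8000000000000001ULL, 0xFFFFFFFFFFFFFFFFULL};
  for (uint64_t t : tests)
    std::printf("%llu -> %a (reference %a)\n", (unsigned long long)t,
                (double)u64_to_f32_model(t), (double)(float)t);
  return 0;
}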
21590
21591static SDValue promoteXINT_TO_FP(SDValue Op, SelectionDAG &DAG) {
21592 bool IsStrict = Op->isStrictFPOpcode();
21593 SDValue Src = Op.getOperand(IsStrict ? 1 : 0);
21594 SDValue Chain = IsStrict ? Op->getOperand(0) : DAG.getEntryNode();
21595 MVT VT = Op.getSimpleValueType();
21596 MVT NVT = VT.isVector() ? VT.changeVectorElementType(MVT::f32) : MVT::f32;
21597 SDLoc dl(Op);
21598
21599 SDValue Rnd = DAG.getIntPtrConstant(0, dl);
21600 if (IsStrict)
21601 return DAG.getNode(
21602 ISD::STRICT_FP_ROUND, dl, {VT, MVT::Other},
21603 {Chain,
21604 DAG.getNode(Op.getOpcode(), dl, {NVT, MVT::Other}, {Chain, Src}),
21605 Rnd});
21606 return DAG.getNode(ISD::FP_ROUND, dl, VT,
21607 DAG.getNode(Op.getOpcode(), dl, NVT, Src), Rnd);
21608}
21609
21610static bool isLegalConversion(MVT VT, bool IsSigned,
21611 const X86Subtarget &Subtarget) {
21612 if (VT == MVT::v4i32 && Subtarget.hasSSE2() && IsSigned)
21613 return true;
21614 if (VT == MVT::v8i32 && Subtarget.hasAVX() && IsSigned)
21615 return true;
21616 if (Subtarget.hasVLX() && (VT == MVT::v4i32 || VT == MVT::v8i32))
21617 return true;
21618 if (Subtarget.useAVX512Regs()) {
21619 if (VT == MVT::v16i32)
21620 return true;
21621 if (VT == MVT::v8i64 && Subtarget.hasDQI())
21622 return true;
21623 }
21624 if (Subtarget.hasDQI() && Subtarget.hasVLX() &&
21625 (VT == MVT::v2i64 || VT == MVT::v4i64))
21626 return true;
21627 return false;
21628}
21629
21630SDValue X86TargetLowering::LowerSINT_TO_FP(SDValue Op,
21631 SelectionDAG &DAG) const {
21632 bool IsStrict = Op->isStrictFPOpcode();
21633 unsigned OpNo = IsStrict ? 1 : 0;
21634 SDValue Src = Op.getOperand(OpNo);
21635 SDValue Chain = IsStrict ? Op->getOperand(0) : DAG.getEntryNode();
21636 MVT SrcVT = Src.getSimpleValueType();
21637 MVT VT = Op.getSimpleValueType();
21638 SDLoc dl(Op);
21639
21640 if (isSoftFP16(VT))
21641 return promoteXINT_TO_FP(Op, DAG);
21642 else if (isLegalConversion(SrcVT, true, Subtarget))
21643 return Op;
21644
21645 if (Subtarget.isTargetWin64() && SrcVT == MVT::i128)
21646 return LowerWin64_INT128_TO_FP(Op, DAG);
21647
21648 if (SDValue Extract = vectorizeExtractedCast(Op, DAG, Subtarget))
21649 return Extract;
21650
21651 if (SDValue R = lowerFPToIntToFP(Op, DAG, Subtarget))
21652 return R;
21653
21654 if (SrcVT.isVector()) {
21655 if (SrcVT == MVT::v2i32 && VT == MVT::v2f64) {
21656 // Note: Since v2f64 is a legal type, we don't need to zero extend the
21657 // source for strict FP.
21658 if (IsStrict)
21659 return DAG.getNode(
21660 X86ISD::STRICT_CVTSI2P, dl, {VT, MVT::Other},
21661 {Chain, DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, Src,
21662 DAG.getUNDEF(SrcVT))});
21663 return DAG.getNode(X86ISD::CVTSI2P, dl, VT,
21664 DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, Src,
21665 DAG.getUNDEF(SrcVT)));
21666 }
21667 if (SrcVT == MVT::v2i64 || SrcVT == MVT::v4i64)
21668 return lowerINT_TO_FP_vXi64(Op, DAG, Subtarget);
21669
21670 return SDValue();
21671 }
21672
21673 assert(SrcVT <= MVT::i64 && SrcVT >= MVT::i16 &&
21674 "Unknown SINT_TO_FP to lower!");
21675
21676 bool UseSSEReg = isScalarFPTypeInSSEReg(VT);
21677
21678 // These are really Legal; return the operand so the caller accepts it as
21679 // Legal.
21680 if (SrcVT == MVT::i32 && UseSSEReg)
21681 return Op;
21682 if (SrcVT == MVT::i64 && UseSSEReg && Subtarget.is64Bit())
21683 return Op;
21684
21685 if (SDValue V = LowerI64IntToFP_AVX512DQ(Op, DAG, Subtarget))
21686 return V;
21687 if (SDValue V = LowerI64IntToFP16(Op, DAG, Subtarget))
21688 return V;
21689
21690 // SSE doesn't have an i16 conversion so we need to promote.
21691 if (SrcVT == MVT::i16 && (UseSSEReg || VT == MVT::f128)) {
21692 SDValue Ext = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::i32, Src);
21693 if (IsStrict)
21694 return DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {VT, MVT::Other},
21695 {Chain, Ext});
21696
21697 return DAG.getNode(ISD::SINT_TO_FP, dl, VT, Ext);
21698 }
21699
21700 if (VT == MVT::f128 || !Subtarget.hasX87())
21701 return SDValue();
21702
21703 SDValue ValueToStore = Src;
21704 if (SrcVT == MVT::i64 && Subtarget.hasSSE2() && !Subtarget.is64Bit())
21705 // Bitcasting to f64 here allows us to do a single 64-bit store from
21706 // an SSE register, avoiding the store forwarding penalty that would come
21707 // with two 32-bit stores.
21708 ValueToStore = DAG.getBitcast(MVT::f64, ValueToStore);
21709
21710 unsigned Size = SrcVT.getStoreSize();
21711 Align Alignment(Size);
21712 MachineFunction &MF = DAG.getMachineFunction();
21713 auto PtrVT = getPointerTy(MF.getDataLayout());
21714 int SSFI = MF.getFrameInfo().CreateStackObject(Size, Alignment, false);
21715 MachinePointerInfo MPI =
21716 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI);
21717 SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT);
21718 Chain = DAG.getStore(Chain, dl, ValueToStore, StackSlot, MPI, Alignment);
21719 std::pair<SDValue, SDValue> Tmp =
21720 BuildFILD(VT, SrcVT, dl, Chain, StackSlot, MPI, Alignment, DAG);
21721
21722 if (IsStrict)
21723 return DAG.getMergeValues({Tmp.first, Tmp.second}, dl);
21724
21725 return Tmp.first;
21726}
21727
21728std::pair<SDValue, SDValue> X86TargetLowering::BuildFILD(
21729 EVT DstVT, EVT SrcVT, const SDLoc &DL, SDValue Chain, SDValue Pointer,
21730 MachinePointerInfo PtrInfo, Align Alignment, SelectionDAG &DAG) const {
21731 // Build the FILD
21732 SDVTList Tys;
21733 bool useSSE = isScalarFPTypeInSSEReg(DstVT);
21734 if (useSSE)
21735 Tys = DAG.getVTList(MVT::f80, MVT::Other);
21736 else
21737 Tys = DAG.getVTList(DstVT, MVT::Other);
21738
21739 SDValue FILDOps[] = {Chain, Pointer};
21740 SDValue Result =
21741 DAG.getMemIntrinsicNode(X86ISD::FILD, DL, Tys, FILDOps, SrcVT, PtrInfo,
21742 Alignment, MachineMemOperand::MOLoad);
21743 Chain = Result.getValue(1);
21744
21745 if (useSSE) {
21746 MachineFunction &MF = DAG.getMachineFunction();
21747 unsigned SSFISize = DstVT.getStoreSize();
21748 int SSFI =
21749 MF.getFrameInfo().CreateStackObject(SSFISize, Align(SSFISize), false);
21750 auto PtrVT = getPointerTy(MF.getDataLayout());
21751 SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT);
21752 Tys = DAG.getVTList(MVT::Other);
21753 SDValue FSTOps[] = {Chain, Result, StackSlot};
21754 MachineMemOperand *StoreMMO = DAG.getMachineFunction().getMachineMemOperand(
21755 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI),
21756 MachineMemOperand::MOStore, SSFISize, Align(SSFISize));
21757
21758 Chain =
21759 DAG.getMemIntrinsicNode(X86ISD::FST, DL, Tys, FSTOps, DstVT, StoreMMO);
21760 Result = DAG.getLoad(
21761 DstVT, DL, Chain, StackSlot,
21762 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI));
21763 Chain = Result.getValue(1);
21764 }
21765
21766 return { Result, Chain };
21767}
21768
21769/// Horizontal vector math instructions may be slower than normal math with
21770/// shuffles. Limit horizontal op codegen based on size/speed trade-offs, uarch
21771/// implementation, and likely shuffle complexity of the alternate sequence.
21772static bool shouldUseHorizontalOp(bool IsSingleSource, SelectionDAG &DAG,
21773 const X86Subtarget &Subtarget) {
21774 bool IsOptimizingSize = DAG.shouldOptForSize();
21775 bool HasFastHOps = Subtarget.hasFastHorizontalOps();
21776 return !IsSingleSource || IsOptimizingSize || HasFastHOps;
21777}
21778
21779/// 64-bit unsigned integer to double expansion.
21780static SDValue LowerUINT_TO_FP_i64(SDValue Op, SelectionDAG &DAG,
21781 const X86Subtarget &Subtarget) {
21782 // We can't use this algorithm for strict FP: it produces -0.0 instead of
21783 // +0.0 when converting 0 while rounding toward negative infinity. The caller
21784 // will fall back to Expand when i64 is legal, or use FILD in 32-bit mode.
21785 assert(!Op->isStrictFPOpcode() && "Expected non-strict uint_to_fp!");
21786 // This algorithm is not obvious. Here is what we're trying to output:
21787 /*
21788 movq %rax, %xmm0
21789 punpckldq (c0), %xmm0 // c0: (uint4){ 0x43300000U, 0x45300000U, 0U, 0U }
21790 subpd (c1), %xmm0 // c1: (double2){ 0x1.0p52, 0x1.0p52 * 0x1.0p32 }
21791 #ifdef __SSE3__
21792 haddpd %xmm0, %xmm0
21793 #else
21794 pshufd $0x4e, %xmm0, %xmm1
21795 addpd %xmm1, %xmm0
21796 #endif
21797 */
21798
21799 SDLoc dl(Op);
21800 LLVMContext *Context = DAG.getContext();
21801
21802 // Build some magic constants.
21803 static const uint32_t CV0[] = { 0x43300000, 0x45300000, 0, 0 };
21804 Constant *C0 = ConstantDataVector::get(*Context, CV0);
21805 auto PtrVT = DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout());
21806 SDValue CPIdx0 = DAG.getConstantPool(C0, PtrVT, Align(16));
21807
21808 SmallVector<Constant*,2> CV1;
21809 CV1.push_back(
21810 ConstantFP::get(*Context, APFloat(APFloat::IEEEdouble(),
21811 APInt(64, 0x4330000000000000ULL))));
21812 CV1.push_back(
21813 ConstantFP::get(*Context, APFloat(APFloat::IEEEdouble(),
21814 APInt(64, 0x4530000000000000ULL))));
21815 Constant *C1 = ConstantVector::get(CV1);
21816 SDValue CPIdx1 = DAG.getConstantPool(C1, PtrVT, Align(16));
21817
21818 // Load the 64-bit value into an XMM register.
21819 SDValue XR1 =
21820 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Op.getOperand(0));
21821 SDValue CLod0 = DAG.getLoad(
21822 MVT::v4i32, dl, DAG.getEntryNode(), CPIdx0,
21823 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), Align(16));
21824 SDValue Unpck1 =
21825 getUnpackl(DAG, dl, MVT::v4i32, DAG.getBitcast(MVT::v4i32, XR1), CLod0);
21826
21827 SDValue CLod1 = DAG.getLoad(
21828 MVT::v2f64, dl, CLod0.getValue(1), CPIdx1,
21829 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), Align(16));
21830 SDValue XR2F = DAG.getBitcast(MVT::v2f64, Unpck1);
21831 // TODO: Are there any fast-math-flags to propagate here?
21832 SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, XR2F, CLod1);
21833 SDValue Result;
21834
21835 if (Subtarget.hasSSE3() &&
21836 shouldUseHorizontalOp(true, DAG, Subtarget)) {
21837 Result = DAG.getNode(X86ISD::FHADD, dl, MVT::v2f64, Sub, Sub);
21838 } else {
21839 SDValue Shuffle = DAG.getVectorShuffle(MVT::v2f64, dl, Sub, Sub, {1,-1});
21840 Result = DAG.getNode(ISD::FADD, dl, MVT::v2f64, Shuffle, Sub);
21841 }
21842 Result = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Result,
21843 DAG.getIntPtrConstant(0, dl));
21844 return Result;
21845}
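
For reference only (not part of X86ISelLowering.cpp, helper name invented), a scalar C++ model of the constant trick sketched in the comment above, using std::memcpy for the bitcasts:

#include <cstdint>
#include <cstring>

// Glue the low 32 bits onto the exponent of 2^52 and the high 32 bits onto
// the exponent of 2^84, subtract those powers of two, and add the two halves
// (the haddpd/addpd at the end of the sequence).
static double u64_to_f64_model(uint64_t u) {
  uint64_t lo_bits = 0x4330000000000000ULL | (u & 0xffffffffULL); // 2^52 + lo
  uint64_t hi_bits = 0x4530000000000000ULL | (u >> 32);           // 2^84 + hi*2^32
  double lo, hi;
  std::memcpy(&lo, &lo_bits, sizeof(double));
  std::memcpy(&hi, &hi_bits, sizeof(double));
  lo -= 0x1.0p52;          // c1[0]
  hi -= 0x1.0p84;          // c1[1] == 0x1.0p52 * 0x1.0p32
  return lo + hi;          // the single rounding happens here
}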
21846
21847/// 32-bit unsigned integer to float expansion.
21848static SDValue LowerUINT_TO_FP_i32(SDValue Op, SelectionDAG &DAG,
21849 const X86Subtarget &Subtarget) {
21850 unsigned OpNo = Op.getNode()->isStrictFPOpcode() ? 1 : 0;
21851 SDLoc dl(Op);
21852 // FP constant to bias correct the final result.
21853 SDValue Bias = DAG.getConstantFP(
21854 llvm::bit_cast<double>(0x4330000000000000ULL), dl, MVT::f64);
21855
21856 // Load the 32-bit value into an XMM register.
21857 SDValue Load =
21858 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, Op.getOperand(OpNo));
21859
21860 // Zero out the upper parts of the register.
21861 Load = getShuffleVectorZeroOrUndef(Load, 0, true, Subtarget, DAG);
21862
21863 // Or the load with the bias.
21864 SDValue Or = DAG.getNode(
21865 ISD::OR, dl, MVT::v2i64,
21866 DAG.getBitcast(MVT::v2i64, Load),
21867 DAG.getBitcast(MVT::v2i64,
21868 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f64, Bias)));
21869 Or =
21870 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
21871 DAG.getBitcast(MVT::v2f64, Or), DAG.getIntPtrConstant(0, dl));
21872
21873 if (Op.getNode()->isStrictFPOpcode()) {
21874 // Subtract the bias.
21875 // TODO: Are there any fast-math-flags to propagate here?
21876 SDValue Chain = Op.getOperand(0);
21877 SDValue Sub = DAG.getNode(ISD::STRICT_FSUB, dl, {MVT::f64, MVT::Other},
21878 {Chain, Or, Bias});
21879
21880 if (Op.getValueType() == Sub.getValueType())
21881 return Sub;
21882
21883 // Handle final rounding.
21884 std::pair<SDValue, SDValue> ResultPair = DAG.getStrictFPExtendOrRound(
21885 Sub, Sub.getValue(1), dl, Op.getSimpleValueType());
21886
21887 return DAG.getMergeValues({ResultPair.first, ResultPair.second}, dl);
21888 }
21889
21890 // Subtract the bias.
21891 // TODO: Are there any fast-math-flags to propagate here?
21892 SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::f64, Or, Bias);
21893
21894 // Handle final rounding.
21895 return DAG.getFPExtendOrRound(Sub, dl, Op.getSimpleValueType());
21896}
21897
21898static SDValue lowerUINT_TO_FP_v2i32(SDValue Op, SelectionDAG &DAG,
21899 const X86Subtarget &Subtarget,
21900 const SDLoc &DL) {
21901 if (Op.getSimpleValueType() != MVT::v2f64)
21902 return SDValue();
21903
21904 bool IsStrict = Op->isStrictFPOpcode();
21905
21906 SDValue N0 = Op.getOperand(IsStrict ? 1 : 0);
21907 assert(N0.getSimpleValueType() == MVT::v2i32 && "Unexpected input type");
21908
21909 if (Subtarget.hasAVX512()) {
21910 if (!Subtarget.hasVLX()) {
21911 // Let generic type legalization widen this.
21912 if (!IsStrict)
21913 return SDValue();
21914 // Otherwise pad the integer input with 0s and widen the operation.
21915 N0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4i32, N0,
21916 DAG.getConstant(0, DL, MVT::v2i32));
21917 SDValue Res = DAG.getNode(Op->getOpcode(), DL, {MVT::v4f64, MVT::Other},
21918 {Op.getOperand(0), N0});
21919 SDValue Chain = Res.getValue(1);
21920 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2f64, Res,
21921 DAG.getIntPtrConstant(0, DL));
21922 return DAG.getMergeValues({Res, Chain}, DL);
21923 }
21924
21925 // Legalize to v4i32 type.
21926 N0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4i32, N0,
21927 DAG.getUNDEF(MVT::v2i32));
21928 if (IsStrict)
21929 return DAG.getNode(X86ISD::STRICT_CVTUI2P, DL, {MVT::v2f64, MVT::Other},
21930 {Op.getOperand(0), N0});
21931 return DAG.getNode(X86ISD::CVTUI2P, DL, MVT::v2f64, N0);
21932 }
21933
21934 // Zero extend to 2i64, OR with the floating point representation of 2^52.
21935 // This gives us the floating point equivalent of 2^52 + the i32 integer
21936 // since double has 52-bits of mantissa. Then subtract 2^52 in floating
21937 // point leaving just our i32 integers in double format.
21938 SDValue ZExtIn = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::v2i64, N0);
21939 SDValue VBias = DAG.getConstantFP(
21940 llvm::bit_cast<double>(0x4330000000000000ULL), DL, MVT::v2f64);
21941 SDValue Or = DAG.getNode(ISD::OR, DL, MVT::v2i64, ZExtIn,
21942 DAG.getBitcast(MVT::v2i64, VBias));
21943 Or = DAG.getBitcast(MVT::v2f64, Or);
21944
21945 if (IsStrict)
21946 return DAG.getNode(ISD::STRICT_FSUB, DL, {MVT::v2f64, MVT::Other},
21947 {Op.getOperand(0), Or, VBias});
21948 return DAG.getNode(ISD::FSUB, DL, MVT::v2f64, Or, VBias);
21949}
21950
21951static SDValue lowerUINT_TO_FP_vXi32(SDValue Op, SelectionDAG &DAG,
21952 const X86Subtarget &Subtarget) {
21953 SDLoc DL(Op);
21954 bool IsStrict = Op->isStrictFPOpcode();
21955 SDValue V = Op->getOperand(IsStrict ? 1 : 0);
21956 MVT VecIntVT = V.getSimpleValueType();
21957 assert((VecIntVT == MVT::v4i32 || VecIntVT == MVT::v8i32) &&
21958 "Unsupported custom type");
21959
21960 if (Subtarget.hasAVX512()) {
21961 // With AVX512, but not VLX we need to widen to get a 512-bit result type.
21962 assert(!Subtarget.hasVLX() && "Unexpected features");
21963 MVT VT = Op->getSimpleValueType(0);
21964
21965 // v8i32->v8f64 is legal with AVX512 so just return it.
21966 if (VT == MVT::v8f64)
21967 return Op;
21968
21969 assert((VT == MVT::v4f32 || VT == MVT::v8f32 || VT == MVT::v4f64) &&
21970 "Unexpected VT!");
21971 MVT WideVT = VT == MVT::v4f64 ? MVT::v8f64 : MVT::v16f32;
21972 MVT WideIntVT = VT == MVT::v4f64 ? MVT::v8i32 : MVT::v16i32;
21973 // Need to concat with zero vector for strict fp to avoid spurious
21974 // exceptions.
21975 SDValue Tmp =
21976 IsStrict ? DAG.getConstant(0, DL, WideIntVT) : DAG.getUNDEF(WideIntVT);
21977 V = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, WideIntVT, Tmp, V,
21978 DAG.getIntPtrConstant(0, DL));
21979 SDValue Res, Chain;
21980 if (IsStrict) {
21981 Res = DAG.getNode(ISD::STRICT_UINT_TO_FP, DL, {WideVT, MVT::Other},
21982 {Op->getOperand(0), V});
21983 Chain = Res.getValue(1);
21984 } else {
21985 Res = DAG.getNode(ISD::UINT_TO_FP, DL, WideVT, V);
21986 }
21987
21988 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
21989 DAG.getIntPtrConstant(0, DL));
21990
21991 if (IsStrict)
21992 return DAG.getMergeValues({Res, Chain}, DL);
21993 return Res;
21994 }
21995
21996 if (Subtarget.hasAVX() && VecIntVT == MVT::v4i32 &&
21997 Op->getSimpleValueType(0) == MVT::v4f64) {
21998 SDValue ZExtIn = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::v4i64, V);
21999 Constant *Bias = ConstantFP::get(
22000 *DAG.getContext(),
22001 APFloat(APFloat::IEEEdouble(), APInt(64, 0x4330000000000000ULL)));
22002 auto PtrVT = DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout());
22003 SDValue CPIdx = DAG.getConstantPool(Bias, PtrVT, Align(8));
22004 SDVTList Tys = DAG.getVTList(MVT::v4f64, MVT::Other);
22005 SDValue Ops[] = {DAG.getEntryNode(), CPIdx};
22006 SDValue VBias = DAG.getMemIntrinsicNode(
22007 X86ISD::VBROADCAST_LOAD, DL, Tys, Ops, MVT::f64,
22008 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), Align(8),
22009 MachineMemOperand::MOLoad);
22010
22011 SDValue Or = DAG.getNode(ISD::OR, DL, MVT::v4i64, ZExtIn,
22012 DAG.getBitcast(MVT::v4i64, VBias));
22013 Or = DAG.getBitcast(MVT::v4f64, Or);
22014
22015 if (IsStrict)
22016 return DAG.getNode(ISD::STRICT_FSUB, DL, {MVT::v4f64, MVT::Other},
22017 {Op.getOperand(0), Or, VBias});
22018 return DAG.getNode(ISD::FSUB, DL, MVT::v4f64, Or, VBias);
22019 }
22020
22021 // The algorithm is the following:
22022 // #ifdef __SSE4_1__
22023 // uint4 lo = _mm_blend_epi16( v, (uint4) 0x4b000000, 0xaa);
22024 // uint4 hi = _mm_blend_epi16( _mm_srli_epi32(v,16),
22025 // (uint4) 0x53000000, 0xaa);
22026 // #else
22027 // uint4 lo = (v & (uint4) 0xffff) | (uint4) 0x4b000000;
22028 // uint4 hi = (v >> 16) | (uint4) 0x53000000;
22029 // #endif
22030 // float4 fhi = (float4) hi - (0x1.0p39f + 0x1.0p23f);
22031 // return (float4) lo + fhi;
22032
22033 bool Is128 = VecIntVT == MVT::v4i32;
22034 MVT VecFloatVT = Is128 ? MVT::v4f32 : MVT::v8f32;
22035 // If we convert to something other than the supported type, e.g., to v4f64,
22036 // abort early.
22037 if (VecFloatVT != Op->getSimpleValueType(0))
22038 return SDValue();
22039
22040 // In the #ifdef/#else code, we have in common:
22041 // - The vector of constants:
22042 // -- 0x4b000000
22043 // -- 0x53000000
22044 // - A shift:
22045 // -- v >> 16
22046
22047 // Create the splat vector for 0x4b000000.
22048 SDValue VecCstLow = DAG.getConstant(0x4b000000, DL, VecIntVT);
22049 // Create the splat vector for 0x53000000.
22050 SDValue VecCstHigh = DAG.getConstant(0x53000000, DL, VecIntVT);
22051
22052 // Create the right shift.
22053 SDValue VecCstShift = DAG.getConstant(16, DL, VecIntVT);
22054 SDValue HighShift = DAG.getNode(ISD::SRL, DL, VecIntVT, V, VecCstShift);
22055
22056 SDValue Low, High;
22057 if (Subtarget.hasSSE41()) {
22058 MVT VecI16VT = Is128 ? MVT::v8i16 : MVT::v16i16;
22059 // uint4 lo = _mm_blend_epi16( v, (uint4) 0x4b000000, 0xaa);
22060 SDValue VecCstLowBitcast = DAG.getBitcast(VecI16VT, VecCstLow);
22061 SDValue VecBitcast = DAG.getBitcast(VecI16VT, V);
22062 // Low will be bitcasted right away, so do not bother bitcasting back to its
22063 // original type.
22064 Low = DAG.getNode(X86ISD::BLENDI, DL, VecI16VT, VecBitcast,
22065 VecCstLowBitcast, DAG.getTargetConstant(0xaa, DL, MVT::i8));
22066 // uint4 hi = _mm_blend_epi16( _mm_srli_epi32(v,16),
22067 // (uint4) 0x53000000, 0xaa);
22068 SDValue VecCstHighBitcast = DAG.getBitcast(VecI16VT, VecCstHigh);
22069 SDValue VecShiftBitcast = DAG.getBitcast(VecI16VT, HighShift);
22070 // High will be bitcasted right away, so do not bother bitcasting back to
22071 // its original type.
22072 High = DAG.getNode(X86ISD::BLENDI, DL, VecI16VT, VecShiftBitcast,
22073 VecCstHighBitcast, DAG.getTargetConstant(0xaa, DL, MVT::i8));
22074 } else {
22075 SDValue VecCstMask = DAG.getConstant(0xffff, DL, VecIntVT);
22076 // uint4 lo = (v & (uint4) 0xffff) | (uint4) 0x4b000000;
22077 SDValue LowAnd = DAG.getNode(ISD::AND, DL, VecIntVT, V, VecCstMask);
22078 Low = DAG.getNode(ISD::OR, DL, VecIntVT, LowAnd, VecCstLow);
22079
22080 // uint4 hi = (v >> 16) | (uint4) 0x53000000;
22081 High = DAG.getNode(ISD::OR, DL, VecIntVT, HighShift, VecCstHigh);
22082 }
22083
22084 // Create the vector constant for (0x1.0p39f + 0x1.0p23f).
22085 SDValue VecCstFSub = DAG.getConstantFP(
22086 APFloat(APFloat::IEEEsingle(), APInt(32, 0x53000080)), DL, VecFloatVT);
22087
22088 // float4 fhi = (float4) hi - (0x1.0p39f + 0x1.0p23f);
22089 // NOTE: By using fsub of a positive constant instead of fadd of a negative
22090 // constant, we avoid reassociation in MachineCombiner when unsafe-fp-math is
22091 // enabled. See PR24512.
22092 SDValue HighBitcast = DAG.getBitcast(VecFloatVT, High);
22093 // TODO: Are there any fast-math-flags to propagate here?
22094 // (float4) lo;
22095 SDValue LowBitcast = DAG.getBitcast(VecFloatVT, Low);
22096 // return (float4) lo + fhi;
22097 if (IsStrict) {
22098 SDValue FHigh = DAG.getNode(ISD::STRICT_FSUB, DL, {VecFloatVT, MVT::Other},
22099 {Op.getOperand(0), HighBitcast, VecCstFSub});
22100 return DAG.getNode(ISD::STRICT_FADD, DL, {VecFloatVT, MVT::Other},
22101 {FHigh.getValue(1), LowBitcast, FHigh});
22102 }
22103
22104 SDValue FHigh =
22105 DAG.getNode(ISD::FSUB, DL, VecFloatVT, HighBitcast, VecCstFSub);
22106 return DAG.getNode(ISD::FADD, DL, VecFloatVT, LowBitcast, FHigh);
22107}
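
A scalar C++ model of the algorithm in the comment block above (illustration only, standard C++, not part of the LLVM source; the helper name is hypothetical):

#include <cstdint>
#include <cstring>

// The low 16 bits ride on the exponent of 2^23 (0x4b000000) and the high
// 16 bits on the exponent of 2^39 (0x53000000); subtracting (2^39 + 2^23)
// from the high half cancels both biases once the halves are added back.
static float u32_to_f32_model(uint32_t v) {
  uint32_t lo_bits = (v & 0xffffu) | 0x4b000000u;   // 2^23 + low16
  uint32_t hi_bits = (v >> 16)     | 0x53000000u;   // 2^39 + high16 * 2^16
  float lo, hi;
  std::memcpy(&lo, &lo_bits, sizeof(float));
  std::memcpy(&hi, &hi_bits, sizeof(float));
  float fhi = hi - (0x1.0p39f + 0x1.0p23f);         // VecCstFSub (0x53000080)
  return lo + fhi;                                  // lo's 2^23 bias cancels here
}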
22108
22109static SDValue lowerUINT_TO_FP_vec(SDValue Op, SelectionDAG &DAG,
22110 const X86Subtarget &Subtarget) {
22111 unsigned OpNo = Op.getNode()->isStrictFPOpcode() ? 1 : 0;
22112 SDValue N0 = Op.getOperand(OpNo);
22113 MVT SrcVT = N0.getSimpleValueType();
22114 SDLoc dl(Op);
22115
22116 switch (SrcVT.SimpleTy) {
22117 default:
22118 llvm_unreachable("Custom UINT_TO_FP is not supported!")::llvm::llvm_unreachable_internal("Custom UINT_TO_FP is not supported!"
, "llvm/lib/Target/X86/X86ISelLowering.cpp", 22118)
;
22119 case MVT::v2i32:
22120 return lowerUINT_TO_FP_v2i32(Op, DAG, Subtarget, dl);
22121 case MVT::v4i32:
22122 case MVT::v8i32:
22123 return lowerUINT_TO_FP_vXi32(Op, DAG, Subtarget);
22124 case MVT::v2i64:
22125 case MVT::v4i64:
22126 return lowerINT_TO_FP_vXi64(Op, DAG, Subtarget);
22127 }
22128}
22129
22130SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op,
22131 SelectionDAG &DAG) const {
22132 bool IsStrict = Op->isStrictFPOpcode();
22133 unsigned OpNo = IsStrict ? 1 : 0;
22134 SDValue Src = Op.getOperand(OpNo);
22135 SDLoc dl(Op);
22136 auto PtrVT = getPointerTy(DAG.getDataLayout());
22137 MVT SrcVT = Src.getSimpleValueType();
22138 MVT DstVT = Op->getSimpleValueType(0);
22139 SDValue Chain = IsStrict ? Op.getOperand(0) : DAG.getEntryNode();
22140
22141 // Bail out when we don't have native conversion instructions.
22142 if (DstVT == MVT::f128)
22143 return SDValue();
22144
22145 if (isSoftFP16(DstVT))
22146 return promoteXINT_TO_FP(Op, DAG);
22147 else if (isLegalConversion(SrcVT, false, Subtarget))
22148 return Op;
22149
22150 if (DstVT.isVector())
22151 return lowerUINT_TO_FP_vec(Op, DAG, Subtarget);
22152
22153 if (Subtarget.isTargetWin64() && SrcVT == MVT::i128)
22154 return LowerWin64_INT128_TO_FP(Op, DAG);
22155
22156 if (SDValue Extract = vectorizeExtractedCast(Op, DAG, Subtarget))
22157 return Extract;
22158
22159 if (Subtarget.hasAVX512() && isScalarFPTypeInSSEReg(DstVT) &&
22160 (SrcVT == MVT::i32 || (SrcVT == MVT::i64 && Subtarget.is64Bit()))) {
22161 // Conversions from unsigned i32 to f32/f64 are legal,
22162 // using VCVTUSI2SS/SD. Same for i64 in 64-bit mode.
22163 return Op;
22164 }
22165
22166 // Promote i32 to i64 and use a signed conversion on 64-bit targets.
22167 if (SrcVT == MVT::i32 && Subtarget.is64Bit()) {
22168 Src = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, Src);
22169 if (IsStrict)
22170 return DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {DstVT, MVT::Other},
22171 {Chain, Src});
22172 return DAG.getNode(ISD::SINT_TO_FP, dl, DstVT, Src);
22173 }
22174
22175 if (SDValue V = LowerI64IntToFP_AVX512DQ(Op, DAG, Subtarget))
22176 return V;
22177 if (SDValue V = LowerI64IntToFP16(Op, DAG, Subtarget))
22178 return V;
22179
22180 // The transform for i64->f64 isn't correct for 0 when rounding to negative
22181 // infinity. It produces -0.0, so disable under strictfp.
22182 if (SrcVT == MVT::i64 && DstVT == MVT::f64 && Subtarget.hasSSE2() &&
22183 !IsStrict)
22184 return LowerUINT_TO_FP_i64(Op, DAG, Subtarget);
22185 // The transform for i32->f64/f32 isn't correct for 0 when rounding to
22186 // negative infinity, so disable it under strictfp and use FILD instead.
22187 if (SrcVT == MVT::i32 && Subtarget.hasSSE2() && DstVT != MVT::f80 &&
22188 !IsStrict)
22189 return LowerUINT_TO_FP_i32(Op, DAG, Subtarget);
22190 if (Subtarget.is64Bit() && SrcVT == MVT::i64 &&
22191 (DstVT == MVT::f32 || DstVT == MVT::f64))
22192 return SDValue();
22193
22194 // Make a 64-bit buffer, and use it to build an FILD.
22195 SDValue StackSlot = DAG.CreateStackTemporary(MVT::i64, 8);
22196 int SSFI = cast<FrameIndexSDNode>(StackSlot)->getIndex();
22197 Align SlotAlign(8);
22198 MachinePointerInfo MPI =
22199 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI);
22200 if (SrcVT == MVT::i32) {
22201 SDValue OffsetSlot =
22202 DAG.getMemBasePlusOffset(StackSlot, TypeSize::Fixed(4), dl);
22203 SDValue Store1 = DAG.getStore(Chain, dl, Src, StackSlot, MPI, SlotAlign);
22204 SDValue Store2 = DAG.getStore(Store1, dl, DAG.getConstant(0, dl, MVT::i32),
22205 OffsetSlot, MPI.getWithOffset(4), SlotAlign);
22206 std::pair<SDValue, SDValue> Tmp =
22207 BuildFILD(DstVT, MVT::i64, dl, Store2, StackSlot, MPI, SlotAlign, DAG);
22208 if (IsStrict)
22209 return DAG.getMergeValues({Tmp.first, Tmp.second}, dl);
22210
22211 return Tmp.first;
22212 }
22213
22214 assert(SrcVT == MVT::i64 && "Unexpected type in UINT_TO_FP");
22215 SDValue ValueToStore = Src;
22216 if (isScalarFPTypeInSSEReg(Op.getValueType()) && !Subtarget.is64Bit()) {
22217 // Bitcasting to f64 here allows us to do a single 64-bit store from
22218 // an SSE register, avoiding the store forwarding penalty that would come
22219 // with two 32-bit stores.
22220 ValueToStore = DAG.getBitcast(MVT::f64, ValueToStore);
22221 }
22222 SDValue Store =
22223 DAG.getStore(Chain, dl, ValueToStore, StackSlot, MPI, SlotAlign);
22224 // For i64 source, we need to add the appropriate power of 2 if the input
22225 // was negative. We must be careful to do the computation in x87 extended
22226 // precision, not in SSE.
22227 SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other);
22228 SDValue Ops[] = { Store, StackSlot };
22229 SDValue Fild =
22230 DAG.getMemIntrinsicNode(X86ISD::FILD, dl, Tys, Ops, MVT::i64, MPI,
22231 SlotAlign, MachineMemOperand::MOLoad);
22232 Chain = Fild.getValue(1);
22233
22234
22235 // Check whether the sign bit is set.
22236 SDValue SignSet = DAG.getSetCC(
22237 dl, getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::i64),
22238 Op.getOperand(OpNo), DAG.getConstant(0, dl, MVT::i64), ISD::SETLT);
22239
22240 // Build a 64 bit pair (FF, 0) in the constant pool, with FF in the hi bits.
22241 APInt FF(64, 0x5F80000000000000ULL);
22242 SDValue FudgePtr = DAG.getConstantPool(
22243 ConstantInt::get(*DAG.getContext(), FF), PtrVT);
22244 Align CPAlignment = cast<ConstantPoolSDNode>(FudgePtr)->getAlign();
22245
22246 // Get a pointer to FF if the sign bit was set, or to 0 otherwise.
22247 SDValue Zero = DAG.getIntPtrConstant(0, dl);
22248 SDValue Four = DAG.getIntPtrConstant(4, dl);
22249 SDValue Offset = DAG.getSelect(dl, Zero.getValueType(), SignSet, Four, Zero);
22250 FudgePtr = DAG.getNode(ISD::ADD, dl, PtrVT, FudgePtr, Offset);
22251
22252 // Load the value out, extending it from f32 to f80.
22253 SDValue Fudge = DAG.getExtLoad(
22254 ISD::EXTLOAD, dl, MVT::f80, Chain, FudgePtr,
22255 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), MVT::f32,
22256 CPAlignment);
22257 Chain = Fudge.getValue(1);
22258 // Extend everything to 80 bits to force it to be done on x87.
22259 // TODO: Are there any fast-math-flags to propagate here?
22260 if (IsStrict) {
22261 unsigned Opc = ISD::STRICT_FADD;
22262 // Windows needs the precision control changed to 80bits around this add.
22263 if (Subtarget.isOSWindows() && DstVT == MVT::f32)
22264 Opc = X86ISD::STRICT_FP80_ADD;
22265
22266 SDValue Add =
22267 DAG.getNode(Opc, dl, {MVT::f80, MVT::Other}, {Chain, Fild, Fudge});
22268 // STRICT_FP_ROUND can't handle equal types.
22269 if (DstVT == MVT::f80)
22270 return Add;
22271 return DAG.getNode(ISD::STRICT_FP_ROUND, dl, {DstVT, MVT::Other},
22272 {Add.getValue(1), Add, DAG.getIntPtrConstant(0, dl)});
22273 }
22274 unsigned Opc = ISD::FADD;
22275 // Windows needs the precision control changed to 80bits around this add.
22276 if (Subtarget.isOSWindows() && DstVT == MVT::f32)
22277 Opc = X86ISD::FP80_ADD;
22278
22279 SDValue Add = DAG.getNode(Opc, dl, MVT::f80, Fild, Fudge);
22280 return DAG.getNode(ISD::FP_ROUND, dl, DstVT, Add,
22281 DAG.getIntPtrConstant(0, dl, /*isTarget=*/true));
22282}
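
For reference, a scalar model of the x87 tail of this function (illustration only, not part of the LLVM source; it assumes long double is the 80-bit x87 type, as on x86 Linux, and the helper name is invented):

#include <cstdint>

// FILD loads the 64 bits as a signed integer into an 80-bit register
// (exact), and the 2^64 "fudge" constant (f32 bit pattern 0x5F800000) is
// added back when the sign bit was set, before the final rounding.
static double u64_to_f64_x87_model(uint64_t u) {
  long double x = (long double)(int64_t)u;   // FILD: exact for all 64 bits
  if ((int64_t)u < 0)                        // SignSet = (i64)u < 0
    x += 0x1.0p64L;                          // Fudge loaded from the pool
  return (double)x;                          // final FP_ROUND to DstVT
}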
22283
22284// If the given FP_TO_SINT (IsSigned) or FP_TO_UINT (!IsSigned) operation
22285// is legal, or has an fp128 or f16 source (which needs to be promoted to f32),
22286// just return an SDValue().
22287// Otherwise it is assumed to be a conversion from one of f32, f64 or f80
22288// to i16, i32 or i64, and we lower it to a legal sequence and return the
22289// result.
22290SDValue
22291X86TargetLowering::FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG,
22292 bool IsSigned, SDValue &Chain) const {
22293 bool IsStrict = Op->isStrictFPOpcode();
22294 SDLoc DL(Op);
22295
22296 EVT DstTy = Op.getValueType();
22297 SDValue Value = Op.getOperand(IsStrict ? 1 : 0);
22298 EVT TheVT = Value.getValueType();
22299 auto PtrVT = getPointerTy(DAG.getDataLayout());
22300
22301 if (TheVT != MVT::f32 && TheVT != MVT::f64 && TheVT != MVT::f80) {
22302 // f16 must be promoted before using the lowering in this routine.
22303 // fp128 does not use this lowering.
22304 return SDValue();
22305 }
22306
22307 // If using FIST to compute an unsigned i64, we'll need some fixup
22308 // to handle values above the maximum signed i64. A FIST is always
22309 // used for the 32-bit subtarget, but also for f80 on a 64-bit target.
22310 bool UnsignedFixup = !IsSigned && DstTy == MVT::i64;
22311
22312 // FIXME: This does not generate an invalid exception if the input does not
22313 // fit in i32. PR44019
22314 if (!IsSigned && DstTy != MVT::i64) {
22315 // Replace the fp-to-uint32 operation with an fp-to-sint64 FIST.
22316 // The low 32 bits of the fist result will have the correct uint32 result.
22317 assert(DstTy == MVT::i32 && "Unexpected FP_TO_UINT");
22318 DstTy = MVT::i64;
22319 }
22320
22321 assert(DstTy.getSimpleVT() <= MVT::i64 &&
22322 DstTy.getSimpleVT() >= MVT::i16 &&
22323 "Unknown FP_TO_INT to lower!");
22324
22325 // We lower FP->int64 into FISTP64 followed by a load from a temporary
22326 // stack slot.
22327 MachineFunction &MF = DAG.getMachineFunction();
22328 unsigned MemSize = DstTy.getStoreSize();
22329 int SSFI =
22330 MF.getFrameInfo().CreateStackObject(MemSize, Align(MemSize), false);
22331 SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT);
22332
22333 Chain = IsStrict ? Op.getOperand(0) : DAG.getEntryNode();
22334
22335 SDValue Adjust; // 0x0 or 0x80000000, for result sign bit adjustment.
22336
22337 if (UnsignedFixup) {
22338 //
22339 // Conversion to unsigned i64 is implemented with a select,
22340 // depending on whether the source value fits in the range
22341 // of a signed i64. Let Thresh be the FP equivalent of
22342 // 0x8000000000000000ULL.
22343 //
22344 // Adjust = (Value >= Thresh) ? 0x80000000 : 0;
22345 // FltOfs = (Value >= Thresh) ? 0x80000000 : 0;
22346 // FistSrc = (Value - FltOfs);
22347 // Fist-to-mem64 FistSrc
22348 // Add 0 or 0x800...0ULL to the 64-bit result, which is equivalent
22349 // to XOR'ing the high 32 bits with Adjust.
22350 //
22351 // Being a power of 2, Thresh is exactly representable in all FP formats.
22352 // For X87 we'd like to use the smallest FP type for this constant, but
22353 // for DAG type consistency we have to match the FP operand type.
22354
22355 APFloat Thresh(APFloat::IEEEsingle(), APInt(32, 0x5f000000));
22356 LLVM_ATTRIBUTE_UNUSED APFloat::opStatus Status = APFloat::opOK;
22357 bool LosesInfo = false;
22358 if (TheVT == MVT::f64)
22359 // The rounding mode is irrelevant as the conversion should be exact.
22360 Status = Thresh.convert(APFloat::IEEEdouble(), APFloat::rmNearestTiesToEven,
22361 &LosesInfo);
22362 else if (TheVT == MVT::f80)
22363 Status = Thresh.convert(APFloat::x87DoubleExtended(),
22364 APFloat::rmNearestTiesToEven, &LosesInfo);
22365
22366 assert(Status == APFloat::opOK && !LosesInfo &&
22367 "FP conversion should have been exact");
22368
22369 SDValue ThreshVal = DAG.getConstantFP(Thresh, DL, TheVT);
22370
22371 EVT ResVT = getSetCCResultType(DAG.getDataLayout(),
22372 *DAG.getContext(), TheVT);
22373 SDValue Cmp;
22374 if (IsStrict) {
22375 Cmp = DAG.getSetCC(DL, ResVT, Value, ThreshVal, ISD::SETGE, Chain,
22376 /*IsSignaling*/ true);
22377 Chain = Cmp.getValue(1);
22378 } else {
22379 Cmp = DAG.getSetCC(DL, ResVT, Value, ThreshVal, ISD::SETGE);
22380 }
22381
22382 // Our preferred lowering of
22383 //
22384 // (Value >= Thresh) ? 0x8000000000000000ULL : 0
22385 //
22386 // is
22387 //
22388 // (Value >= Thresh) << 63
22389 //
22390 // but since we can get here after LegalOperations, DAGCombine might do the
22391 // wrong thing if we create a select. So, directly create the preferred
22392 // version.
22393 SDValue Zext = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, Cmp);
22394 SDValue Const63 = DAG.getConstant(63, DL, MVT::i8);
22395 Adjust = DAG.getNode(ISD::SHL, DL, MVT::i64, Zext, Const63);
22396
22397 SDValue FltOfs = DAG.getSelect(DL, TheVT, Cmp, ThreshVal,
22398 DAG.getConstantFP(0.0, DL, TheVT));
22399
22400 if (IsStrict) {
22401 Value = DAG.getNode(ISD::STRICT_FSUB, DL, { TheVT, MVT::Other},
22402 { Chain, Value, FltOfs });
22403 Chain = Value.getValue(1);
22404 } else
22405 Value = DAG.getNode(ISD::FSUB, DL, TheVT, Value, FltOfs);
22406 }
22407
22408 MachinePointerInfo MPI = MachinePointerInfo::getFixedStack(MF, SSFI);
22409
22410 // FIXME This causes a redundant load/store if the SSE-class value is already
22411 // in memory, such as if it is on the callstack.
22412 if (isScalarFPTypeInSSEReg(TheVT)) {
22413 assert(DstTy == MVT::i64 && "Invalid FP_TO_SINT to lower!");
22414 Chain = DAG.getStore(Chain, DL, Value, StackSlot, MPI);
22415 SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other);
22416 SDValue Ops[] = { Chain, StackSlot };
22417
22418 unsigned FLDSize = TheVT.getStoreSize();
22419 assert(FLDSize <= MemSize && "Stack slot not big enough");
22420 MachineMemOperand *MMO = MF.getMachineMemOperand(
22421 MPI, MachineMemOperand::MOLoad, FLDSize, Align(FLDSize));
22422 Value = DAG.getMemIntrinsicNode(X86ISD::FLD, DL, Tys, Ops, TheVT, MMO);
22423 Chain = Value.getValue(1);
22424 }
22425
22426 // Build the FP_TO_INT*_IN_MEM
22427 MachineMemOperand *MMO = MF.getMachineMemOperand(
22428 MPI, MachineMemOperand::MOStore, MemSize, Align(MemSize));
22429 SDValue Ops[] = { Chain, Value, StackSlot };
22430 SDValue FIST = DAG.getMemIntrinsicNode(X86ISD::FP_TO_INT_IN_MEM, DL,
22431 DAG.getVTList(MVT::Other),
22432 Ops, DstTy, MMO);
22433
22434 SDValue Res = DAG.getLoad(Op.getValueType(), SDLoc(Op), FIST, StackSlot, MPI);
22435 Chain = Res.getValue(1);
22436
22437 // If we need an unsigned fixup, XOR the result with adjust.
22438 if (UnsignedFixup)
22439 Res = DAG.getNode(ISD::XOR, DL, MVT::i64, Res, Adjust);
22440
22441 return Res;
22442}
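
A scalar C++ model of the UnsignedFixup sequence above (illustration only, not part of the LLVM source; only in-range inputs are modeled and the helper name is invented):

#include <cstdint>

// Inputs at or above 2^63 are shifted into signed range before the
// FIST-style signed conversion, and the high bit is restored afterwards by
// XORing Adjust into the integer result. Out-of-range or NaN inputs are
// undefined here, as in C++.
static uint64_t f64_to_u64_model(double value) {
  const double Thresh = 0x1.0p63;                        // exactly representable
  uint64_t Adjust = value >= Thresh ? 0x8000000000000000ULL : 0;
  double FltOfs   = value >= Thresh ? Thresh : 0.0;      // the select on Cmp
  int64_t Fist    = (int64_t)(value - FltOfs);           // FP_TO_INT*_IN_MEM
  return (uint64_t)Fist ^ Adjust;                        // XOR the sign back in
}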
22443
22444static SDValue LowerAVXExtend(SDValue Op, SelectionDAG &DAG,
22445 const X86Subtarget &Subtarget) {
22446 MVT VT = Op.getSimpleValueType();
22447 SDValue In = Op.getOperand(0);
22448 MVT InVT = In.getSimpleValueType();
22449 SDLoc dl(Op);
22450 unsigned Opc = Op.getOpcode();
22451
22452 assert(VT.isVector() && InVT.isVector() && "Expected vector type");
22453 assert((Opc == ISD::ANY_EXTEND || Opc == ISD::ZERO_EXTEND) &&
22454 "Unexpected extension opcode");
22455 assert(VT.getVectorNumElements() == InVT.getVectorNumElements() &&
22456 "Expected same number of elements");
22457 assert((VT.getVectorElementType() == MVT::i16 ||
22458 VT.getVectorElementType() == MVT::i32 ||
22459 VT.getVectorElementType() == MVT::i64) &&
22460 "Unexpected element type");
22461 assert((InVT.getVectorElementType() == MVT::i8 ||
22462 InVT.getVectorElementType() == MVT::i16 ||
22463 InVT.getVectorElementType() == MVT::i32) &&
22464 "Unexpected element type");
22465
22466 unsigned ExtendInVecOpc = DAG.getOpcode_EXTEND_VECTOR_INREG(Opc);
22467
22468 if (VT == MVT::v32i16 && !Subtarget.hasBWI()) {
22469 assert(InVT == MVT::v32i8 && "Unexpected VT!");
22470 return splitVectorIntUnary(Op, DAG);
22471 }
22472
22473 if (Subtarget.hasInt256())
22474 return Op;
22475
22476 // Optimize vectors in AVX mode:
22477 //
22478 // v8i16 -> v8i32
22479 // Use vpmovzxwd for 4 lower elements v8i16 -> v4i32.
22480 // Use vpunpckhwd for 4 upper elements v8i16 -> v4i32.
22481 // Concat upper and lower parts.
22482 //
22483 // v4i32 -> v4i64
22484 // Use vpmovzxdq for 2 lower elements v4i32 -> v2i64.
22485 // Use vpunpckhdq for 2 upper elements v4i32 -> v2i64.
22486 // Concat upper and lower parts.
22487 //
22488 MVT HalfVT = VT.getHalfNumVectorElementsVT();
22489 SDValue OpLo = DAG.getNode(ExtendInVecOpc, dl, HalfVT, In);
22490
22491 // Short-circuit if we can determine that each 128-bit half is the same value.
22492 // Otherwise, this is difficult to match and optimize.
22493 if (auto *Shuf = dyn_cast<ShuffleVectorSDNode>(In))
22494 if (hasIdenticalHalvesShuffleMask(Shuf->getMask()))
22495 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpLo);
22496
22497 SDValue ZeroVec = DAG.getConstant(0, dl, InVT);
22498 SDValue Undef = DAG.getUNDEF(InVT);
22499 bool NeedZero = Opc == ISD::ZERO_EXTEND;
22500 SDValue OpHi = getUnpackh(DAG, dl, InVT, In, NeedZero ? ZeroVec : Undef);
22501 OpHi = DAG.getBitcast(HalfVT, OpHi);
22502
22503 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpHi);
22504}
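
// Editor's sketch (intrinsics, not part of this file): the AVX1 zero-extend shape
// described in the comments above for v8i16 -> v8i32, assuming a hypothetical helper
// name and compilation with -mavx (SSE4.1 provides PMOVZXWD).
#include <immintrin.h>
static __m256i zext_v8i16_to_v8i32(__m128i In) {
  __m128i Zero = _mm_setzero_si128();
  __m128i OpLo = _mm_cvtepu16_epi32(In);        // vpmovzxwd: 4 lower elements
  __m128i OpHi = _mm_unpackhi_epi16(In, Zero);  // vpunpckhwd with zero: 4 upper elements
  return _mm256_set_m128i(OpHi, OpLo);          // concat upper and lower parts
}
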
22505
22506// Helper to split and extend a v16i1 mask to v16i8 or v16i16.
22507static SDValue SplitAndExtendv16i1(unsigned ExtOpc, MVT VT, SDValue In,
22508 const SDLoc &dl, SelectionDAG &DAG) {
22509 assert((VT == MVT::v16i8 || VT == MVT::v16i16) && "Unexpected VT.");
22510 SDValue Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v8i1, In,
22511 DAG.getIntPtrConstant(0, dl));
22512 SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v8i1, In,
22513 DAG.getIntPtrConstant(8, dl));
22514 Lo = DAG.getNode(ExtOpc, dl, MVT::v8i16, Lo);
22515 Hi = DAG.getNode(ExtOpc, dl, MVT::v8i16, Hi);
22516 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v16i16, Lo, Hi);
22517 return DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
22518}
22519
22520static SDValue LowerZERO_EXTEND_Mask(SDValue Op,
22521 const X86Subtarget &Subtarget,
22522 SelectionDAG &DAG) {
22523 MVT VT = Op->getSimpleValueType(0);
22524 SDValue In = Op->getOperand(0);
22525 MVT InVT = In.getSimpleValueType();
22526 assert(InVT.getVectorElementType() == MVT::i1 && "Unexpected input type!");
22527 SDLoc DL(Op);
22528 unsigned NumElts = VT.getVectorNumElements();
22529
22530 // For all vectors but vXi8, we can just emit a sign_extend and a shift. This
22531 // avoids a constant pool load.
22532 if (VT.getVectorElementType() != MVT::i8) {
22533 SDValue Extend = DAG.getNode(ISD::SIGN_EXTEND, DL, VT, In);
22534 return DAG.getNode(ISD::SRL, DL, VT, Extend,
22535 DAG.getConstant(VT.getScalarSizeInBits() - 1, DL, VT));
22536 }
22537
22538 // Extend VT if BWI is not supported.
22539 MVT ExtVT = VT;
22540 if (!Subtarget.hasBWI()) {
22541 // If v16i32 is to be avoided, we'll need to split and concatenate.
22542 if (NumElts == 16 && !Subtarget.canExtendTo512DQ())
22543 return SplitAndExtendv16i1(ISD::ZERO_EXTEND, VT, In, DL, DAG);
22544
22545 ExtVT = MVT::getVectorVT(MVT::i32, NumElts);
22546 }
22547
22548 // Widen to 512-bits if VLX is not supported.
22549 MVT WideVT = ExtVT;
22550 if (!ExtVT.is512BitVector() && !Subtarget.hasVLX()) {
22551 NumElts *= 512 / ExtVT.getSizeInBits();
22552 InVT = MVT::getVectorVT(MVT::i1, NumElts);
22553 In = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, InVT, DAG.getUNDEF(InVT),
22554 In, DAG.getIntPtrConstant(0, DL));
22555 WideVT = MVT::getVectorVT(ExtVT.getVectorElementType(),
22556 NumElts);
22557 }
22558
22559 SDValue One = DAG.getConstant(1, DL, WideVT);
22560 SDValue Zero = DAG.getConstant(0, DL, WideVT);
22561
22562 SDValue SelectedVal = DAG.getSelect(DL, WideVT, In, One, Zero);
22563
22564 // Truncate if we had to extend above.
22565 if (VT != ExtVT) {
22566 WideVT = MVT::getVectorVT(MVT::i8, NumElts);
22567 SelectedVal = DAG.getNode(ISD::TRUNCATE, DL, WideVT, SelectedVal);
22568 }
22569
22570 // Extract back to 128/256-bit if we widened.
22571 if (WideVT != VT)
22572 SelectedVal = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, SelectedVal,
22573 DAG.getIntPtrConstant(0, DL));
22574
22575 return SelectedVal;
22576}
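
// Editor's sketch (scalar model, not part of this file): the non-vXi8 path above,
// which zero-extends an i1 mask lane as "sign_extend then logical shift right by
// (bits - 1)" instead of materializing a constant vector of ones.
#include <cstdint>
static uint32_t zext_mask_lane_to_i32(bool MaskBit) {
  int32_t Sext = MaskBit ? -1 : 0;             // SIGN_EXTEND i1 -> i32
  return static_cast<uint32_t>(Sext) >> 31;    // SRL by ScalarSizeInBits - 1
}
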
22577
22578static SDValue LowerZERO_EXTEND(SDValue Op, const X86Subtarget &Subtarget,
22579 SelectionDAG &DAG) {
22580 SDValue In = Op.getOperand(0);
22581 MVT SVT = In.getSimpleValueType();
22582
22583 if (SVT.getVectorElementType() == MVT::i1)
22584 return LowerZERO_EXTEND_Mask(Op, Subtarget, DAG);
22585
22586 assert(Subtarget.hasAVX() && "Expected AVX support");
22587 return LowerAVXExtend(Op, DAG, Subtarget);
22588}
22589
22590/// Helper to recursively truncate vector elements in half with PACKSS/PACKUS.
22591/// It makes use of the fact that vectors with enough leading sign/zero bits
22592/// prevent the PACKSS/PACKUS from saturating the results.
22593/// AVX2 (Int256) sub-targets require extra shuffling as the PACK*S operates
22594/// within each 128-bit lane.
22595static SDValue truncateVectorWithPACK(unsigned Opcode, EVT DstVT, SDValue In,
22596 const SDLoc &DL, SelectionDAG &DAG,
22597 const X86Subtarget &Subtarget) {
22598 assert((Opcode == X86ISD::PACKSS || Opcode == X86ISD::PACKUS) &&
22599 "Unexpected PACK opcode");
22600 assert(DstVT.isVector() && "VT not a vector?");
22601
22602 // Requires SSE2 for PACKSS (SSE41 PACKUSDW is handled below).
22603 if (!Subtarget.hasSSE2())
22604 return SDValue();
22605
22606 EVT SrcVT = In.getValueType();
22607
22608 // No truncation required, we might get here due to recursive calls.
22609 if (SrcVT == DstVT)
22610 return In;
22611
22612 // We only support vector truncation to 64bits or greater from a
22613 // 128bits or greater source.
22614 unsigned DstSizeInBits = DstVT.getSizeInBits();
22615 unsigned SrcSizeInBits = SrcVT.getSizeInBits();
22616 if ((DstSizeInBits % 64) != 0 || (SrcSizeInBits % 128) != 0)
22617 return SDValue();
22618
22619 unsigned NumElems = SrcVT.getVectorNumElements();
22620 if (!isPowerOf2_32(NumElems))
22621 return SDValue();
22622
22623 LLVMContext &Ctx = *DAG.getContext();
22624 assert(DstVT.getVectorNumElements() == NumElems && "Illegal truncation");
22625 assert(SrcSizeInBits > DstSizeInBits && "Illegal truncation");
22626
22627 EVT PackedSVT = EVT::getIntegerVT(Ctx, SrcVT.getScalarSizeInBits() / 2);
22628
22629 // Pack to the largest type possible:
22630 // vXi64/vXi32 -> PACK*SDW and vXi16 -> PACK*SWB.
22631 EVT InVT = MVT::i16, OutVT = MVT::i8;
22632 if (SrcVT.getScalarSizeInBits() > 16 &&
22633 (Opcode == X86ISD::PACKSS || Subtarget.hasSSE41())) {
22634 InVT = MVT::i32;
22635 OutVT = MVT::i16;
22636 }
22637
22638 // 128bit -> 64bit truncate - PACK 128-bit src in the lower subvector.
22639 if (SrcVT.is128BitVector()) {
22640 InVT = EVT::getVectorVT(Ctx, InVT, 128 / InVT.getSizeInBits());
22641 OutVT = EVT::getVectorVT(Ctx, OutVT, 128 / OutVT.getSizeInBits());
22642 In = DAG.getBitcast(InVT, In);
22643 SDValue Res = DAG.getNode(Opcode, DL, OutVT, In, DAG.getUNDEF(InVT));
22644 Res = extractSubVector(Res, 0, DAG, DL, 64);
22645 return DAG.getBitcast(DstVT, Res);
22646 }
22647
22648 // Split lower/upper subvectors.
22649 SDValue Lo, Hi;
22650 std::tie(Lo, Hi) = splitVector(In, DAG, DL);
22651
22652 unsigned SubSizeInBits = SrcSizeInBits / 2;
22653 InVT = EVT::getVectorVT(Ctx, InVT, SubSizeInBits / InVT.getSizeInBits());
22654 OutVT = EVT::getVectorVT(Ctx, OutVT, SubSizeInBits / OutVT.getSizeInBits());
22655
22656 // 256bit -> 128bit truncate - PACK lower/upper 128-bit subvectors.
22657 if (SrcVT.is256BitVector() && DstVT.is128BitVector()) {
22658 Lo = DAG.getBitcast(InVT, Lo);
22659 Hi = DAG.getBitcast(InVT, Hi);
22660 SDValue Res = DAG.getNode(Opcode, DL, OutVT, Lo, Hi);
22661 return DAG.getBitcast(DstVT, Res);
22662 }
22663
22664 // AVX2: 512bit -> 256bit truncate - PACK lower/upper 256-bit subvectors.
22665 // AVX2: 512bit -> 128bit truncate - PACK(PACK, PACK).
22666 if (SrcVT.is512BitVector() && Subtarget.hasInt256()) {
22667 Lo = DAG.getBitcast(InVT, Lo);
22668 Hi = DAG.getBitcast(InVT, Hi);
22669 SDValue Res = DAG.getNode(Opcode, DL, OutVT, Lo, Hi);
22670
22671 // 256-bit PACK(ARG0, ARG1) leaves us with ((LO0,LO1),(HI0,HI1)),
22672 // so we need to shuffle to get ((LO0,HI0),(LO1,HI1)).
22673 // Scale shuffle mask to avoid bitcasts and help ComputeNumSignBits.
22674 SmallVector<int, 64> Mask;
22675 int Scale = 64 / OutVT.getScalarSizeInBits();
22676 narrowShuffleMaskElts(Scale, { 0, 2, 1, 3 }, Mask);
22677 Res = DAG.getVectorShuffle(OutVT, DL, Res, Res, Mask);
22678
22679 if (DstVT.is256BitVector())
22680 return DAG.getBitcast(DstVT, Res);
22681
22682 // If 512bit -> 128bit truncate another stage.
22683 EVT PackedVT = EVT::getVectorVT(Ctx, PackedSVT, NumElems);
22684 Res = DAG.getBitcast(PackedVT, Res);
22685 return truncateVectorWithPACK(Opcode, DstVT, Res, DL, DAG, Subtarget);
22686 }
22687
22688 // Recursively pack lower/upper subvectors, concat result and pack again.
22689 assert(SrcSizeInBits >= 256 && "Expected 256-bit vector or greater");
22690 EVT PackedVT = EVT::getVectorVT(Ctx, PackedSVT, NumElems / 2);
22691 Lo = truncateVectorWithPACK(Opcode, PackedVT, Lo, DL, DAG, Subtarget);
22692 Hi = truncateVectorWithPACK(Opcode, PackedVT, Hi, DL, DAG, Subtarget);
22693
22694 PackedVT = EVT::getVectorVT(Ctx, PackedSVT, NumElems);
22695 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, DL, PackedVT, Lo, Hi);
22696 return truncateVectorWithPACK(Opcode, DstVT, Res, DL, DAG, Subtarget);
22697}
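
// Editor's sketch (intrinsics, not part of this file): why the callers above must
// guarantee enough leading sign bits. PACKSSWB saturates each i16 lane to [-128, 127],
// so it only acts as a truncation when the inputs already fit in the narrow type.
#include <emmintrin.h>
static __m128i pack_trunc_2x_v8i16(__m128i Lo, __m128i Hi) {
  // Assumes every element of Lo/Hi is already in [-128, 127]; otherwise the
  // result is saturated rather than truncated.
  return _mm_packs_epi16(Lo, Hi);              // PACKSSWB: two v8i16 -> one v16i8
}
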
22698
22699static SDValue LowerTruncateVecI1(SDValue Op, SelectionDAG &DAG,
22700 const X86Subtarget &Subtarget) {
22701
22702 SDLoc DL(Op);
22703 MVT VT = Op.getSimpleValueType();
22704 SDValue In = Op.getOperand(0);
22705 MVT InVT = In.getSimpleValueType();
22706
22707 assert(VT.getVectorElementType() == MVT::i1 && "Unexpected vector type.");
22708
22709 // Shift LSB to MSB and use VPMOVB/W2M or TESTD/Q.
22710 unsigned ShiftInx = InVT.getScalarSizeInBits() - 1;
22711 if (InVT.getScalarSizeInBits() <= 16) {
22712 if (Subtarget.hasBWI()) {
22713 // legal, will go to VPMOVB2M, VPMOVW2M
22714 if (DAG.ComputeNumSignBits(In) < InVT.getScalarSizeInBits()) {
22715 // We need to shift to get the lsb into sign position.
22716 // Shift packed bytes not supported natively, bitcast to word
22717 MVT ExtVT = MVT::getVectorVT(MVT::i16, InVT.getSizeInBits()/16);
22718 In = DAG.getNode(ISD::SHL, DL, ExtVT,
22719 DAG.getBitcast(ExtVT, In),
22720 DAG.getConstant(ShiftInx, DL, ExtVT));
22721 In = DAG.getBitcast(InVT, In);
22722 }
22723 return DAG.getSetCC(DL, VT, DAG.getConstant(0, DL, InVT),
22724 In, ISD::SETGT);
22725 }
22726 // Use TESTD/Q, extended vector to packed dword/qword.
22727 assert((InVT.is256BitVector() || InVT.is128BitVector()) &&
22728 "Unexpected vector type.");
22729 unsigned NumElts = InVT.getVectorNumElements();
22730 assert((NumElts == 8 || NumElts == 16) && "Unexpected number of elements");
22731 // We need to change to a wider element type that we have support for.
22732 // For 8 element vectors this is easy, we either extend to v8i32 or v8i64.
22733 // For 16 element vectors we extend to v16i32 unless we are explicitly
22734 // trying to avoid 512-bit vectors. If we are avoiding 512-bit vectors
22735 // we need to split into two 8 element vectors which we can extend to v8i32,
22736 // truncate and concat the results. There's an additional complication if
22737 // the original type is v16i8. In that case we can't split the v16i8
22738 // directly, so we need to shuffle high elements to low and use
22739 // sign_extend_vector_inreg.
22740 if (NumElts == 16 && !Subtarget.canExtendTo512DQ()) {
22741 SDValue Lo, Hi;
22742 if (InVT == MVT::v16i8) {
22743 Lo = DAG.getNode(ISD::SIGN_EXTEND_VECTOR_INREG, DL, MVT::v8i32, In);
22744 Hi = DAG.getVectorShuffle(
22745 InVT, DL, In, In,
22746 {8, 9, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1});
22747 Hi = DAG.getNode(ISD::SIGN_EXTEND_VECTOR_INREG, DL, MVT::v8i32, Hi);
22748 } else {
22749 assert(InVT == MVT::v16i16 && "Unexpected VT!");
22750 Lo = extract128BitVector(In, 0, DAG, DL);
22751 Hi = extract128BitVector(In, 8, DAG, DL);
22752 }
22753 // We're split now, just emit two truncates and a concat. The two
22754 // truncates will trigger legalization to come back to this function.
22755 Lo = DAG.getNode(ISD::TRUNCATE, DL, MVT::v8i1, Lo);
22756 Hi = DAG.getNode(ISD::TRUNCATE, DL, MVT::v8i1, Hi);
22757 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi);
22758 }
22759 // We either have 8 elements or we're allowed to use 512-bit vectors.
22760 // If we have VLX, we want to use the narrowest vector that can get the
22761 // job done so we use vXi32.
22762 MVT EltVT = Subtarget.hasVLX() ? MVT::i32 : MVT::getIntegerVT(512/NumElts);
22763 MVT ExtVT = MVT::getVectorVT(EltVT, NumElts);
22764 In = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, In);
22765 InVT = ExtVT;
22766 ShiftInx = InVT.getScalarSizeInBits() - 1;
22767 }
22768
22769 if (DAG.ComputeNumSignBits(In) < InVT.getScalarSizeInBits()) {
22770 // We need to shift to get the lsb into sign position.
22771 In = DAG.getNode(ISD::SHL, DL, InVT, In,
22772 DAG.getConstant(ShiftInx, DL, InVT));
22773 }
22774 // If we have DQI, emit a pattern that will be iseled as vpmovq2m/vpmovd2m.
22775 if (Subtarget.hasDQI())
22776 return DAG.getSetCC(DL, VT, DAG.getConstant(0, DL, InVT), In, ISD::SETGT);
22777 return DAG.getSetCC(DL, VT, In, DAG.getConstant(0, DL, InVT), ISD::SETNE);
22778}
22779
22780SDValue X86TargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const {
22781 SDLoc DL(Op);
22782 MVT VT = Op.getSimpleValueType();
22783 SDValue In = Op.getOperand(0);
22784 MVT InVT = In.getSimpleValueType();
22785 unsigned InNumEltBits = InVT.getScalarSizeInBits();
22786
22787 assert(VT.getVectorNumElements() == InVT.getVectorNumElements() &&
22788 "Invalid TRUNCATE operation");
22789
22790 // If we're called by the type legalizer, handle a few cases.
22791 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
22792 if (!TLI.isTypeLegal(InVT)) {
22793 if ((InVT == MVT::v8i64 || InVT == MVT::v16i32 || InVT == MVT::v16i64) &&
22794 VT.is128BitVector()) {
22795 assert((InVT == MVT::v16i64 || Subtarget.hasVLX()) &&
22796 "Unexpected subtarget!");
22797 // The default behavior is to truncate one step, concatenate, and then
22798 // truncate the remainder. We'd rather produce two 64-bit results and
22799 // concatenate those.
22800 SDValue Lo, Hi;
22801 std::tie(Lo, Hi) = DAG.SplitVector(In, DL);
22802
22803 EVT LoVT, HiVT;
22804 std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
22805
22806 Lo = DAG.getNode(ISD::TRUNCATE, DL, LoVT, Lo);
22807 Hi = DAG.getNode(ISD::TRUNCATE, DL, HiVT, Hi);
22808 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi);
22809 }
22810
22811 // Otherwise let default legalization handle it.
22812 return SDValue();
22813 }
22814
22815 if (VT.getVectorElementType() == MVT::i1)
22816 return LowerTruncateVecI1(Op, DAG, Subtarget);
22817
22818 // vpmovqb/w/d, vpmovdb/w, vpmovwb
22819 if (Subtarget.hasAVX512()) {
22820 if (InVT == MVT::v32i16 && !Subtarget.hasBWI()) {
22821 assert(VT == MVT::v32i8 && "Unexpected VT!");
22822 return splitVectorIntUnary(Op, DAG);
22823 }
22824
22825 // Word to byte only under BWI. Otherwise we have to promote to v16i32
22826 // and then truncate that. But we should only do that if we haven't been
22827 // asked to avoid 512-bit vectors. The actual promotion to v16i32 will be
22828 // handled by isel patterns.
22829 if (InVT != MVT::v16i16 || Subtarget.hasBWI() ||
22830 Subtarget.canExtendTo512DQ())
22831 return Op;
22832 }
22833
22834 unsigned NumPackedSignBits = std::min<unsigned>(VT.getScalarSizeInBits(), 16);
22835 unsigned NumPackedZeroBits = Subtarget.hasSSE41() ? NumPackedSignBits : 8;
22836
22837 // Truncate with PACKUS if we are truncating a vector with leading zero bits
22838 // that extend all the way to the packed/truncated value.
22839 // Pre-SSE41 we can only use PACKUSWB.
22840 KnownBits Known = DAG.computeKnownBits(In);
22841 if ((InNumEltBits - NumPackedZeroBits) <= Known.countMinLeadingZeros())
22842 if (SDValue V =
22843 truncateVectorWithPACK(X86ISD::PACKUS, VT, In, DL, DAG, Subtarget))
22844 return V;
22845
22846 // Truncate with PACKSS if we are truncating a vector with sign-bits that
22847 // extend all the way to the packed/truncated value.
22848 if ((InNumEltBits - NumPackedSignBits) < DAG.ComputeNumSignBits(In))
22849 if (SDValue V =
22850 truncateVectorWithPACK(X86ISD::PACKSS, VT, In, DL, DAG, Subtarget))
22851 return V;
22852
22853 // Handle truncation of V256 to V128 using shuffles.
22854 assert(VT.is128BitVector() && InVT.is256BitVector() && "Unexpected types!");
22855
22856 if ((VT == MVT::v4i32) && (InVT == MVT::v4i64)) {
22857 // On AVX2, v4i64 -> v4i32 becomes VPERMD.
22858 if (Subtarget.hasInt256()) {
22859 static const int ShufMask[] = {0, 2, 4, 6, -1, -1, -1, -1};
22860 In = DAG.getBitcast(MVT::v8i32, In);
22861 In = DAG.getVectorShuffle(MVT::v8i32, DL, In, In, ShufMask);
22862 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, In,
22863 DAG.getIntPtrConstant(0, DL));
22864 }
22865
22866 SDValue OpLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In,
22867 DAG.getIntPtrConstant(0, DL));
22868 SDValue OpHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In,
22869 DAG.getIntPtrConstant(2, DL));
22870 static const int ShufMask[] = {0, 2, 4, 6};
22871 return DAG.getVectorShuffle(VT, DL, DAG.getBitcast(MVT::v4i32, OpLo),
22872 DAG.getBitcast(MVT::v4i32, OpHi), ShufMask);
22873 }
22874
22875 if ((VT == MVT::v8i16) && (InVT == MVT::v8i32)) {
22876 // On AVX2, v8i32 -> v8i16 becomes PSHUFB.
22877 if (Subtarget.hasInt256()) {
22878 // The PSHUFB mask:
22879 static const int ShufMask1[] = { 0, 1, 4, 5, 8, 9, 12, 13,
22880 -1, -1, -1, -1, -1, -1, -1, -1,
22881 16, 17, 20, 21, 24, 25, 28, 29,
22882 -1, -1, -1, -1, -1, -1, -1, -1 };
22883 In = DAG.getBitcast(MVT::v32i8, In);
22884 In = DAG.getVectorShuffle(MVT::v32i8, DL, In, In, ShufMask1);
22885 In = DAG.getBitcast(MVT::v4i64, In);
22886
22887 static const int ShufMask2[] = {0, 2, -1, -1};
22888 In = DAG.getVectorShuffle(MVT::v4i64, DL, In, In, ShufMask2);
22889 In = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In,
22890 DAG.getIntPtrConstant(0, DL));
22891 return DAG.getBitcast(MVT::v8i16, In);
22892 }
22893
22894 SDValue OpLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i32, In,
22895 DAG.getIntPtrConstant(0, DL));
22896 SDValue OpHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i32, In,
22897 DAG.getIntPtrConstant(4, DL));
22898
22899 // The PSHUFB mask:
22900 static const int ShufMask1[] = {0, 2, 4, 6, -1, -1, -1, -1};
22901
22902 OpLo = DAG.getBitcast(MVT::v8i16, OpLo);
22903 OpHi = DAG.getBitcast(MVT::v8i16, OpHi);
22904
22905 OpLo = DAG.getVectorShuffle(MVT::v8i16, DL, OpLo, OpLo, ShufMask1);
22906 OpHi = DAG.getVectorShuffle(MVT::v8i16, DL, OpHi, OpHi, ShufMask1);
22907
22908 OpLo = DAG.getBitcast(MVT::v4i32, OpLo);
22909 OpHi = DAG.getBitcast(MVT::v4i32, OpHi);
22910
22911 // The MOVLHPS Mask:
22912 static const int ShufMask2[] = {0, 1, 4, 5};
22913 SDValue res = DAG.getVectorShuffle(MVT::v4i32, DL, OpLo, OpHi, ShufMask2);
22914 return DAG.getBitcast(MVT::v8i16, res);
22915 }
22916
22917 if (VT == MVT::v16i8 && InVT == MVT::v16i16) {
22918 // Use an AND to zero the upper bits for PACKUS.
22919 In = DAG.getNode(ISD::AND, DL, InVT, In, DAG.getConstant(255, DL, InVT));
22920
22921 SDValue InLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v8i16, In,
22922 DAG.getIntPtrConstant(0, DL));
22923 SDValue InHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v8i16, In,
22924 DAG.getIntPtrConstant(8, DL));
22925 return DAG.getNode(X86ISD::PACKUS, DL, VT, InLo, InHi);
22926 }
22927
22928 llvm_unreachable("All 256->128 cases should have been handled above!")::llvm::llvm_unreachable_internal("All 256->128 cases should have been handled above!"
, "llvm/lib/Target/X86/X86ISelLowering.cpp", 22928)
;
22929}
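
// Editor's sketch (intrinsics, not part of this file): the final v16i16 -> v16i8 case
// above, masking the upper byte of each word so PACKUSWB cannot saturate.
#include <emmintrin.h>
static __m128i trunc_v16i16_to_v16i8(__m128i InLo, __m128i InHi) {
  __m128i Mask = _mm_set1_epi16(255);          // zero the upper bits for PACKUS
  InLo = _mm_and_si128(InLo, Mask);
  InHi = _mm_and_si128(InHi, Mask);
  return _mm_packus_epi16(InLo, InHi);         // PACKUSWB on the two 128-bit halves
}
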
22930
22931// We can leverage the specific way the "cvttps2dq/cvttpd2dq" instruction
22932// behaves on out of range inputs to generate optimized conversions.
22933static SDValue expandFP_TO_UINT_SSE(MVT VT, SDValue Src, const SDLoc &dl,
22934 SelectionDAG &DAG,
22935 const X86Subtarget &Subtarget) {
22936 MVT SrcVT = Src.getSimpleValueType();
22937 unsigned DstBits = VT.getScalarSizeInBits();
22938 assert(DstBits == 32 && "expandFP_TO_UINT_SSE - only vXi32 supported");
22939
22940 // Calculate the converted result for values in the range 0 to
22941 // 2^31-1 ("Small") and from 2^31 to 2^32-1 ("Big").
22942 SDValue Small = DAG.getNode(X86ISD::CVTTP2SI, dl, VT, Src);
22943 SDValue Big =
22944 DAG.getNode(X86ISD::CVTTP2SI, dl, VT,
22945 DAG.getNode(ISD::FSUB, dl, SrcVT, Src,
22946 DAG.getConstantFP(2147483648.0f, dl, SrcVT)));
22947
22948 // The "CVTTP2SI" instruction conveniently sets the sign bit if
22949 // and only if the value was out of range. So we can use that
22950 // as our indicator that we should use "Big" instead of "Small".
22951 //
22952 // Use "Small" if "IsOverflown" has all bits cleared
22953 // and "0x80000000 | Big" if all bits in "IsOverflown" are set.
22954
22955 // AVX1 can't use the signsplat masking for 256-bit vectors - we have to
22956 // use the slightly slower blendv select instead.
22957 if (VT == MVT::v8i32 && !Subtarget.hasAVX2()) {
22958 SDValue Overflow = DAG.getNode(ISD::OR, dl, VT, Small, Big);
22959 return DAG.getNode(X86ISD::BLENDV, dl, VT, Small, Overflow, Small);
22960 }
22961
22962 SDValue IsOverflown =
22963 DAG.getNode(X86ISD::VSRAI, dl, VT, Small,
22964 DAG.getTargetConstant(DstBits - 1, dl, MVT::i8));
22965 return DAG.getNode(ISD::OR, dl, VT, Small,
22966 DAG.getNode(ISD::AND, dl, VT, Big, IsOverflown));
22967}
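
// Editor's sketch (intrinsics, not part of this file): the 128-bit form of the
// CVTTP2SI trick above. The signed conversion yields 0x80000000 for out-of-range
// inputs, so its sign bit doubles as the "use Big" mask.
#include <emmintrin.h>
static __m128i fptoui_v4f32_sse2(__m128 Src) {
  __m128 Offset = _mm_set1_ps(2147483648.0f);                  // 2^31
  __m128i Small = _mm_cvttps_epi32(Src);                       // valid for [0, 2^31)
  __m128i Big = _mm_cvttps_epi32(_mm_sub_ps(Src, Offset));     // valid for [2^31, 2^32)
  __m128i IsOverflown = _mm_srai_epi32(Small, 31);             // all ones iff Small overflowed
  return _mm_or_si128(Small, _mm_and_si128(Big, IsOverflown)); // Small or (0x80000000 | Big)
}
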
22968
22969SDValue X86TargetLowering::LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const {
22970 bool IsStrict = Op->isStrictFPOpcode();
22971 bool IsSigned = Op.getOpcode() == ISD::FP_TO_SINT ||
22972 Op.getOpcode() == ISD::STRICT_FP_TO_SINT;
22973 MVT VT = Op->getSimpleValueType(0);
22974 SDValue Src = Op.getOperand(IsStrict ? 1 : 0);
22975 SDValue Chain = IsStrict ? Op->getOperand(0) : SDValue();
22976 MVT SrcVT = Src.getSimpleValueType();
22977 SDLoc dl(Op);
22978
22979 SDValue Res;
22980 if (isSoftFP16(SrcVT)) {
22981 MVT NVT = VT.isVector() ? VT.changeVectorElementType(MVT::f32) : MVT::f32;
22982 if (IsStrict)
22983 return DAG.getNode(Op.getOpcode(), dl, {VT, MVT::Other},
22984 {Chain, DAG.getNode(ISD::STRICT_FP_EXTEND, dl,
22985 {NVT, MVT::Other}, {Chain, Src})});
22986 return DAG.getNode(Op.getOpcode(), dl, VT,
22987 DAG.getNode(ISD::FP_EXTEND, dl, NVT, Src));
22988 } else if (isTypeLegal(SrcVT) && isLegalConversion(VT, IsSigned, Subtarget)) {
22989 return Op;
22990 }
22991
22992 if (VT.isVector()) {
22993 if (VT == MVT::v2i1 && SrcVT == MVT::v2f64) {
22994 MVT ResVT = MVT::v4i32;
22995 MVT TruncVT = MVT::v4i1;
22996 unsigned Opc;
22997 if (IsStrict)
22998 Opc = IsSigned ? X86ISD::STRICT_CVTTP2SI : X86ISD::STRICT_CVTTP2UI;
22999 else
23000 Opc = IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI;
23001
23002 if (!IsSigned && !Subtarget.hasVLX()) {
23003 assert(Subtarget.useAVX512Regs() && "Unexpected features!");
23004 // Widen to 512-bits.
23005 ResVT = MVT::v8i32;
23006 TruncVT = MVT::v8i1;
23007 Opc = Op.getOpcode();
23008 // Need to concat with zero vector for strict fp to avoid spurious
23009 // exceptions.
23010 // TODO: Should we just do this for non-strict as well?
23011 SDValue Tmp = IsStrict ? DAG.getConstantFP(0.0, dl, MVT::v8f64)
23012 : DAG.getUNDEF(MVT::v8f64);
23013 Src = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v8f64, Tmp, Src,
23014 DAG.getIntPtrConstant(0, dl));
23015 }
23016 if (IsStrict) {
23017 Res = DAG.getNode(Opc, dl, {ResVT, MVT::Other}, {Chain, Src});
23018 Chain = Res.getValue(1);
23019 } else {
23020 Res = DAG.getNode(Opc, dl, ResVT, Src);
23021 }
23022
23023 Res = DAG.getNode(ISD::TRUNCATE, dl, TruncVT, Res);
23024 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i1, Res,
23025 DAG.getIntPtrConstant(0, dl));
23026 if (IsStrict)
23027 return DAG.getMergeValues({Res, Chain}, dl);
23028 return Res;
23029 }
23030
23031 if (Subtarget.hasFP16() && SrcVT.getVectorElementType() == MVT::f16) {
23032 if (VT == MVT::v8i16 || VT == MVT::v16i16 || VT == MVT::v32i16)
23033 return Op;
23034
23035 MVT ResVT = VT;
23036 MVT EleVT = VT.getVectorElementType();
23037 if (EleVT != MVT::i64)
23038 ResVT = EleVT == MVT::i32 ? MVT::v4i32 : MVT::v8i16;
23039
23040 if (SrcVT != MVT::v8f16) {
23041 SDValue Tmp =
23042 IsStrict ? DAG.getConstantFP(0.0, dl, SrcVT) : DAG.getUNDEF(SrcVT);
23043 SmallVector<SDValue, 4> Ops(SrcVT == MVT::v2f16 ? 4 : 2, Tmp);
23044 Ops[0] = Src;
23045 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8f16, Ops);
23046 }
23047
23048 if (IsStrict) {
23049 Res = DAG.getNode(IsSigned ? X86ISD::STRICT_CVTTP2SI
23050 : X86ISD::STRICT_CVTTP2UI,
23051 dl, {ResVT, MVT::Other}, {Chain, Src});
23052 Chain = Res.getValue(1);
23053 } else {
23054 Res = DAG.getNode(IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI, dl,
23055 ResVT, Src);
23056 }
23057
23058 // TODO: Need to add exception check code for strict FP.
23059 if (EleVT.getSizeInBits() < 16) {
23060 ResVT = MVT::getVectorVT(EleVT, 8);
23061 Res = DAG.getNode(ISD::TRUNCATE, dl, ResVT, Res);
23062 }
23063
23064 if (ResVT != VT)
23065 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, Res,
23066 DAG.getIntPtrConstant(0, dl));
23067
23068 if (IsStrict)
23069 return DAG.getMergeValues({Res, Chain}, dl);
23070 return Res;
23071 }
23072
23073 // v8f32/v16f32/v8f64->v8i16/v16i16 need to widen first.
23074 if (VT.getVectorElementType() == MVT::i16) {
23075 assert((SrcVT.getVectorElementType() == MVT::f32 ||
23076 SrcVT.getVectorElementType() == MVT::f64) &&
23077 "Expected f32/f64 vector!");
23078 MVT NVT = VT.changeVectorElementType(MVT::i32);
23079 if (IsStrict) {
23080 Res = DAG.getNode(IsSigned ? ISD::STRICT_FP_TO_SINT
23081 : ISD::STRICT_FP_TO_UINT,
23082 dl, {NVT, MVT::Other}, {Chain, Src});
23083 Chain = Res.getValue(1);
23084 } else {
23085 Res = DAG.getNode(IsSigned ? ISD::FP_TO_SINT : ISD::FP_TO_UINT, dl,
23086 NVT, Src);
23087 }
23088
23089 // TODO: Need to add exception check code for strict FP.
23090 Res = DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
23091
23092 if (IsStrict)
23093 return DAG.getMergeValues({Res, Chain}, dl);
23094 return Res;
23095 }
23096
23097 // v8f64->v8i32 is legal, but we need v8i32 to be custom for v8f32.
23098 if (VT == MVT::v8i32 && SrcVT == MVT::v8f64) {
23099 assert(!IsSigned && "Expected unsigned conversion!");
23100 assert(Subtarget.useAVX512Regs() && "Requires avx512f");
23101 return Op;
23102 }
23103
23104 // Widen vXi32 fp_to_uint with avx512f to 512-bit source.
23105 if ((VT == MVT::v4i32 || VT == MVT::v8i32) &&
23106 (SrcVT == MVT::v4f64 || SrcVT == MVT::v4f32 || SrcVT == MVT::v8f32) &&
23107 Subtarget.useAVX512Regs()) {
23108 assert(!IsSigned && "Expected unsigned conversion!");
23109 assert(!Subtarget.hasVLX() && "Unexpected features!");
23110 MVT WideVT = SrcVT == MVT::v4f64 ? MVT::v8f64 : MVT::v16f32;
23111 MVT ResVT = SrcVT == MVT::v4f64 ? MVT::v8i32 : MVT::v16i32;
23112 // Need to concat with zero vector for strict fp to avoid spurious
23113 // exceptions.
23114 // TODO: Should we just do this for non-strict as well?
23115 SDValue Tmp =
23116 IsStrict ? DAG.getConstantFP(0.0, dl, WideVT) : DAG.getUNDEF(WideVT);
23117 Src = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideVT, Tmp, Src,
23118 DAG.getIntPtrConstant(0, dl));
23119
23120 if (IsStrict) {
23121 Res = DAG.getNode(ISD::STRICT_FP_TO_UINT, dl, {ResVT, MVT::Other},
23122 {Chain, Src});
23123 Chain = Res.getValue(1);
23124 } else {
23125 Res = DAG.getNode(ISD::FP_TO_UINT, dl, ResVT, Src);
23126 }
23127
23128 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, Res,
23129 DAG.getIntPtrConstant(0, dl));
23130
23131 if (IsStrict)
23132 return DAG.getMergeValues({Res, Chain}, dl);
23133 return Res;
23134 }
23135
23136 // Widen vXi64 fp_to_uint/fp_to_sint with avx512dq to 512-bit source.
23137 if ((VT == MVT::v2i64 || VT == MVT::v4i64) &&
23138 (SrcVT == MVT::v2f64 || SrcVT == MVT::v4f64 || SrcVT == MVT::v4f32) &&
23139 Subtarget.useAVX512Regs() && Subtarget.hasDQI()) {
23140 assert(!Subtarget.hasVLX() && "Unexpected features!");
23141 MVT WideVT = SrcVT == MVT::v4f32 ? MVT::v8f32 : MVT::v8f64;
23142 // Need to concat with zero vector for strict fp to avoid spurious
23143 // exceptions.
23144 // TODO: Should we just do this for non-strict as well?
23145 SDValue Tmp =
23146 IsStrict ? DAG.getConstantFP(0.0, dl, WideVT) : DAG.getUNDEF(WideVT);
23147 Src = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideVT, Tmp, Src,
23148 DAG.getIntPtrConstant(0, dl));
23149
23150 if (IsStrict) {
23151 Res = DAG.getNode(Op.getOpcode(), dl, {MVT::v8i64, MVT::Other},
23152 {Chain, Src});
23153 Chain = Res.getValue(1);
23154 } else {
23155 Res = DAG.getNode(Op.getOpcode(), dl, MVT::v8i64, Src);
23156 }
23157
23158 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, Res,
23159 DAG.getIntPtrConstant(0, dl));
23160
23161 if (IsStrict)
23162 return DAG.getMergeValues({Res, Chain}, dl);
23163 return Res;
23164 }
23165
23166 if (VT == MVT::v2i64 && SrcVT == MVT::v2f32) {
23167 if (!Subtarget.hasVLX()) {
23168 // Non-strict nodes without VLX can be widened to v4f32->v4i64 by the type
23169 // legalizer and then widened again by vector op legalization.
23170 if (!IsStrict)
23171 return SDValue();
23172
23173 SDValue Zero = DAG.getConstantFP(0.0, dl, MVT::v2f32);
23174 SDValue Tmp = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8f32,
23175 {Src, Zero, Zero, Zero});
23176 Tmp = DAG.getNode(Op.getOpcode(), dl, {MVT::v8i64, MVT::Other},
23177 {Chain, Tmp});
23178 SDValue Chain = Tmp.getValue(1);
23179 Tmp = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i64, Tmp,
23180 DAG.getIntPtrConstant(0, dl));
23181 return DAG.getMergeValues({Tmp, Chain}, dl);
23182 }
23183
23184 assert(Subtarget.hasDQI() && Subtarget.hasVLX() && "Requires AVX512DQVL");
23185 SDValue Tmp = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, Src,
23186 DAG.getUNDEF(MVT::v2f32));
23187 if (IsStrict) {
23188 unsigned Opc = IsSigned ? X86ISD::STRICT_CVTTP2SI
23189 : X86ISD::STRICT_CVTTP2UI;
23190 return DAG.getNode(Opc, dl, {VT, MVT::Other}, {Op->getOperand(0), Tmp});
23191 }
23192 unsigned Opc = IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI;
23193 return DAG.getNode(Opc, dl, VT, Tmp);
23194 }
23195
23196 // Generate optimized instructions for pre AVX512 unsigned conversions from
23197 // vXf32 to vXi32.
23198 if ((VT == MVT::v4i32 && SrcVT == MVT::v4f32) ||
23199 (VT == MVT::v4i32 && SrcVT == MVT::v4f64) ||
23200 (VT == MVT::v8i32 && SrcVT == MVT::v8f32)) {
23201 assert(!IsSigned && "Expected unsigned conversion!");
23202 return expandFP_TO_UINT_SSE(VT, Src, dl, DAG, Subtarget);
23203 }
23204
23205 return SDValue();
23206 }
23207
23208 assert(!VT.isVector());
23209
23210 bool UseSSEReg = isScalarFPTypeInSSEReg(SrcVT);
23211
23212 if (!IsSigned && UseSSEReg) {
23213 // Conversions from f32/f64 with AVX512 should be legal.
23214 if (Subtarget.hasAVX512())
23215 return Op;
23216
23217 // We can leverage the specific way the "cvttss2si/cvttsd2si" instruction
23218 // behaves on out of range inputs to generate optimized conversions.
23219 if (!IsStrict && ((VT == MVT::i32 && !Subtarget.is64Bit()) ||
23220 (VT == MVT::i64 && Subtarget.is64Bit()))) {
23221 unsigned DstBits = VT.getScalarSizeInBits();
23222 APInt UIntLimit = APInt::getSignMask(DstBits);
23223 SDValue FloatOffset = DAG.getNode(ISD::UINT_TO_FP, dl, SrcVT,
23224 DAG.getConstant(UIntLimit, dl, VT));
23225 MVT SrcVecVT = MVT::getVectorVT(SrcVT, 128 / SrcVT.getScalarSizeInBits());
23226
23227 // Calculate the converted result for values in the range:
23228 // (i32) 0 to 2^31-1 ("Small") and from 2^31 to 2^32-1 ("Big").
23229 // (i64) 0 to 2^63-1 ("Small") and from 2^63 to 2^64-1 ("Big").
23230 SDValue Small =
23231 DAG.getNode(X86ISD::CVTTS2SI, dl, VT,
23232 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, SrcVecVT, Src));
23233 SDValue Big = DAG.getNode(
23234 X86ISD::CVTTS2SI, dl, VT,
23235 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, SrcVecVT,
23236 DAG.getNode(ISD::FSUB, dl, SrcVT, Src, FloatOffset)));
23237
23238 // The "CVTTS2SI" instruction conveniently sets the sign bit if
23239 // and only if the value was out of range. So we can use that
23240 // as our indicator that we should use "Big" instead of "Small".
23241 //
23242 // Use "Small" if "IsOverflown" has all bits cleared
23243 // and "0x80000000 | Big" if all bits in "IsOverflown" are set.
23244 SDValue IsOverflown = DAG.getNode(
23245 ISD::SRA, dl, VT, Small, DAG.getConstant(DstBits - 1, dl, MVT::i8));
23246 return DAG.getNode(ISD::OR, dl, VT, Small,
23247 DAG.getNode(ISD::AND, dl, VT, Big, IsOverflown));
23248 }
23249
23250 // Use default expansion for i64.
23251 if (VT == MVT::i64)
23252 return SDValue();
23253
23254 assert(VT == MVT::i32 && "Unexpected VT!");
23255
23256 // Promote i32 to i64 and use a signed operation on 64-bit targets.
23257 // FIXME: This does not generate an invalid exception if the input does not
23258 // fit in i32. PR44019
23259 if (Subtarget.is64Bit()) {
23260 if (IsStrict) {
23261 Res = DAG.getNode(ISD::STRICT_FP_TO_SINT, dl, {MVT::i64, MVT::Other},
23262 {Chain, Src});
23263 Chain = Res.getValue(1);
23264 } else
23265 Res = DAG.getNode(ISD::FP_TO_SINT, dl, MVT::i64, Src);
23266
23267 Res = DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
23268 if (IsStrict)
23269 return DAG.getMergeValues({Res, Chain}, dl);
23270 return Res;
23271 }
23272
23273 // Use default expansion for SSE1/2 targets without SSE3. With SSE3 we can
23274 // use fisttp which will be handled later.
23275 if (!Subtarget.hasSSE3())
23276 return SDValue();
23277 }
23278
23279 // Promote i16 to i32 if we can use a SSE operation or the type is f128.
23280 // FIXME: This does not generate an invalid exception if the input does not
23281 // fit in i16. PR44019
23282 if (VT == MVT::i16 && (UseSSEReg || SrcVT == MVT::f128)) {
23283 assert(IsSigned && "Expected i16 FP_TO_UINT to have been promoted!");
23284 if (IsStrict) {
23285 Res = DAG.getNode(ISD::STRICT_FP_TO_SINT, dl, {MVT::i32, MVT::Other},
23286 {Chain, Src});
23287 Chain = Res.getValue(1);
23288 } else
23289 Res = DAG.getNode(ISD::FP_TO_SINT, dl, MVT::i32, Src);
23290
23291 Res = DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
23292 if (IsStrict)
23293 return DAG.getMergeValues({Res, Chain}, dl);
23294 return Res;
23295 }
23296
23297 // If this is a FP_TO_SINT using SSEReg we're done.
23298 if (UseSSEReg && IsSigned)
23299 return Op;
23300
23301 // fp128 needs to use a libcall.
23302 if (SrcVT == MVT::f128) {
23303 RTLIB::Libcall LC;
23304 if (IsSigned)
23305 LC = RTLIB::getFPTOSINT(SrcVT, VT);
23306 else
23307 LC = RTLIB::getFPTOUINT(SrcVT, VT);
23308
23309 MakeLibCallOptions CallOptions;
23310 std::pair<SDValue, SDValue> Tmp = makeLibCall(DAG, LC, VT, Src, CallOptions,
23311 SDLoc(Op), Chain);
23312
23313 if (IsStrict)
23314 return DAG.getMergeValues({ Tmp.first, Tmp.second }, dl);
23315
23316 return Tmp.first;
23317 }
23318
23319 // Fall back to X87.
23320 if (SDValue V = FP_TO_INTHelper(Op, DAG, IsSigned, Chain)) {
23321 if (IsStrict)
23322 return DAG.getMergeValues({V, Chain}, dl);
23323 return V;
23324 }
23325
23326 llvm_unreachable("Expected FP_TO_INTHelper to handle all remaining cases.")::llvm::llvm_unreachable_internal("Expected FP_TO_INTHelper to handle all remaining cases."
, "llvm/lib/Target/X86/X86ISelLowering.cpp", 23326)
;
23327}
23328
23329SDValue X86TargetLowering::LowerLRINT_LLRINT(SDValue Op,
23330 SelectionDAG &DAG) const {
23331 SDValue Src = Op.getOperand(0);
23332 MVT SrcVT = Src.getSimpleValueType();
23333
23334 if (SrcVT == MVT::f16)
23335 return SDValue();
23336
23337 // If the source is in an SSE register, the node is Legal.
23338 if (isScalarFPTypeInSSEReg(SrcVT))
23339 return Op;
23340
23341 return LRINT_LLRINTHelper(Op.getNode(), DAG);
23342}
23343
23344SDValue X86TargetLowering::LRINT_LLRINTHelper(SDNode *N,
23345 SelectionDAG &DAG) const {
23346 EVT DstVT = N->getValueType(0);
23347 SDValue Src = N->getOperand(0);
23348 EVT SrcVT = Src.getValueType();
23349
23350 if (SrcVT != MVT::f32 && SrcVT != MVT::f64 && SrcVT != MVT::f80) {
23351 // f16 must be promoted before using the lowering in this routine.
23352 // fp128 does not use this lowering.
23353 return SDValue();
23354 }
23355
23356 SDLoc DL(N);
23357 SDValue Chain = DAG.getEntryNode();
23358
23359 bool UseSSE = isScalarFPTypeInSSEReg(SrcVT);
23360
23361 // If we're converting from SSE, the stack slot needs to hold both types.
23362 // Otherwise it only needs to hold the DstVT.
23363 EVT OtherVT = UseSSE ? SrcVT : DstVT;
23364 SDValue StackPtr = DAG.CreateStackTemporary(DstVT, OtherVT);
23365 int SPFI = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
23366 MachinePointerInfo MPI =
23367 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SPFI);
23368
23369 if (UseSSE) {
23370 assert(DstVT == MVT::i64 && "Invalid LRINT/LLRINT to lower!");
23371 Chain = DAG.getStore(Chain, DL, Src, StackPtr, MPI);
23372 SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other);
23373 SDValue Ops[] = { Chain, StackPtr };
23374
23375 Src = DAG.getMemIntrinsicNode(X86ISD::FLD, DL, Tys, Ops, SrcVT, MPI,
23376 /*Align*/ std::nullopt,
23377 MachineMemOperand::MOLoad);
23378 Chain = Src.getValue(1);
23379 }
23380
23381 SDValue StoreOps[] = { Chain, Src, StackPtr };
23382 Chain = DAG.getMemIntrinsicNode(X86ISD::FIST, DL, DAG.getVTList(MVT::Other),
23383 StoreOps, DstVT, MPI, /*Align*/ std::nullopt,
23384 MachineMemOperand::MOStore);
23385
23386 return DAG.getLoad(DstVT, DL, Chain, StackPtr, MPI);
23387}
23388
23389SDValue
23390X86TargetLowering::LowerFP_TO_INT_SAT(SDValue Op, SelectionDAG &DAG) const {
23391 // This is based on the TargetLowering::expandFP_TO_INT_SAT implementation,
23392 // but making use of X86 specifics to produce better instruction sequences.
23393 SDNode *Node = Op.getNode();
23394 bool IsSigned = Node->getOpcode() == ISD::FP_TO_SINT_SAT;
23395 unsigned FpToIntOpcode = IsSigned ? ISD::FP_TO_SINT : ISD::FP_TO_UINT;
23396 SDLoc dl(SDValue(Node, 0));
23397 SDValue Src = Node->getOperand(0);
23398
23399 // There are three types involved here: SrcVT is the source floating point
23400 // type, DstVT is the type of the result, and TmpVT is the result of the
23401 // intermediate FP_TO_*INT operation we'll use (which may be a promotion of
23402 // DstVT).
23403 EVT SrcVT = Src.getValueType();
23404 EVT DstVT = Node->getValueType(0);
23405 EVT TmpVT = DstVT;
23406
23407 // This code is only for floats and doubles. Fall back to generic code for
23408 // anything else.
23409 if (!isScalarFPTypeInSSEReg(SrcVT) || isSoftFP16(SrcVT))
23410 return SDValue();
23411
23412 EVT SatVT = cast<VTSDNode>(Node->getOperand(1))->getVT();
23413 unsigned SatWidth = SatVT.getScalarSizeInBits();
23414 unsigned DstWidth = DstVT.getScalarSizeInBits();
23415 unsigned TmpWidth = TmpVT.getScalarSizeInBits();
23416 assert(SatWidth <= DstWidth && SatWidth <= TmpWidth &&
23417 "Expected saturation width smaller than result width");
23418
23419 // Promote result of FP_TO_*INT to at least 32 bits.
23420 if (TmpWidth < 32) {
23421 TmpVT = MVT::i32;
23422 TmpWidth = 32;
23423 }
23424
23425 // Promote conversions to unsigned 32-bit to 64-bit, because it will allow
23426 // us to use a native signed conversion instead.
23427 if (SatWidth == 32 && !IsSigned && Subtarget.is64Bit()) {
23428 TmpVT = MVT::i64;
23429 TmpWidth = 64;
23430 }
23431
23432 // If the saturation width is smaller than the size of the temporary result,
23433 // we can always use signed conversion, which is native.
23434 if (SatWidth < TmpWidth)
23435 FpToIntOpcode = ISD::FP_TO_SINT;
23436
23437 // Determine minimum and maximum integer values and their corresponding
23438 // floating-point values.
23439 APInt MinInt, MaxInt;
23440 if (IsSigned) {
23441 MinInt = APInt::getSignedMinValue(SatWidth).sext(DstWidth);
23442 MaxInt = APInt::getSignedMaxValue(SatWidth).sext(DstWidth);
23443 } else {
23444 MinInt = APInt::getMinValue(SatWidth).zext(DstWidth);
23445 MaxInt = APInt::getMaxValue(SatWidth).zext(DstWidth);
23446 }
23447
23448 APFloat MinFloat(DAG.EVTToAPFloatSemantics(SrcVT));
23449 APFloat MaxFloat(DAG.EVTToAPFloatSemantics(SrcVT));
23450
23451 APFloat::opStatus MinStatus = MinFloat.convertFromAPInt(
23452 MinInt, IsSigned, APFloat::rmTowardZero);
23453 APFloat::opStatus MaxStatus = MaxFloat.convertFromAPInt(
23454 MaxInt, IsSigned, APFloat::rmTowardZero);
23455 bool AreExactFloatBounds = !(MinStatus & APFloat::opStatus::opInexact)
23456 && !(MaxStatus & APFloat::opStatus::opInexact);
23457
23458 SDValue MinFloatNode = DAG.getConstantFP(MinFloat, dl, SrcVT);
23459 SDValue MaxFloatNode = DAG.getConstantFP(MaxFloat, dl, SrcVT);
23460
23461 // If the integer bounds are exactly representable as floats, emit a
23462 // min+max+fptoi sequence. Otherwise use comparisons and selects.
23463 if (AreExactFloatBounds) {
23464 if (DstVT != TmpVT) {
23465 // Clamp by MinFloat from below. If Src is NaN, propagate NaN.
23466 SDValue MinClamped = DAG.getNode(
23467 X86ISD::FMAX, dl, SrcVT, MinFloatNode, Src);
23468 // Clamp by MaxFloat from above. If Src is NaN, propagate NaN.
23469 SDValue BothClamped = DAG.getNode(
23470 X86ISD::FMIN, dl, SrcVT, MaxFloatNode, MinClamped);
23471 // Convert clamped value to integer.
23472 SDValue FpToInt = DAG.getNode(FpToIntOpcode, dl, TmpVT, BothClamped);
23473
23474 // NaN will become INDVAL, with the top bit set and the rest zero.
23475 // Truncation will discard the top bit, resulting in zero.
23476 return DAG.getNode(ISD::TRUNCATE, dl, DstVT, FpToInt);
23477 }
23478
23479 // Clamp by MinFloat from below. If Src is NaN, the result is MinFloat.
23480 SDValue MinClamped = DAG.getNode(
23481 X86ISD::FMAX, dl, SrcVT, Src, MinFloatNode);
23482 // Clamp by MaxFloat from above. NaN cannot occur.
23483 SDValue BothClamped = DAG.getNode(
23484 X86ISD::FMINC, dl, SrcVT, MinClamped, MaxFloatNode);
23485 // Convert clamped value to integer.
23486 SDValue FpToInt = DAG.getNode(FpToIntOpcode, dl, DstVT, BothClamped);
23487
23488 if (!IsSigned) {
23489 // In the unsigned case we're done, because we mapped NaN to MinFloat,
23490 // which is zero.
23491 return FpToInt;
23492 }
23493
23494 // Otherwise, select zero if Src is NaN.
23495 SDValue ZeroInt = DAG.getConstant(0, dl, DstVT);
23496 return DAG.getSelectCC(
23497 dl, Src, Src, ZeroInt, FpToInt, ISD::CondCode::SETUO);
23498 }
23499
23500 SDValue MinIntNode = DAG.getConstant(MinInt, dl, DstVT);
23501 SDValue MaxIntNode = DAG.getConstant(MaxInt, dl, DstVT);
23502
23503 // Result of direct conversion, which may be selected away.
23504 SDValue FpToInt = DAG.getNode(FpToIntOpcode, dl, TmpVT, Src);
23505
23506 if (DstVT != TmpVT) {
23507 // NaN will become INDVAL, with the top bit set and the rest zero.
23508 // Truncation will discard the top bit, resulting in zero.
23509 FpToInt = DAG.getNode(ISD::TRUNCATE, dl, DstVT, FpToInt);
23510 }
23511
23512 SDValue Select = FpToInt;
23513 // For signed conversions where we saturate to the same size as the
23514 // result type of the fptoi instructions, INDVAL coincides with integer
23515 // minimum, so we don't need to explicitly check it.
23516 if (!IsSigned || SatWidth != TmpVT.getScalarSizeInBits()) {
23517 // If Src ULT MinFloat, select MinInt. In particular, this also selects
23518 // MinInt if Src is NaN.
23519 Select = DAG.getSelectCC(
23520 dl, Src, MinFloatNode, MinIntNode, Select, ISD::CondCode::SETULT);
23521 }
23522
23523 // If Src OGT MaxFloat, select MaxInt.
23524 Select = DAG.getSelectCC(
23525 dl, Src, MaxFloatNode, MaxIntNode, Select, ISD::CondCode::SETOGT);
23526
23527 // In the unsigned case we are done, because we mapped NaN to MinInt, which
23528 // is already zero. The promoted case was already handled above.
23529 if (!IsSigned || DstVT != TmpVT) {
23530 return Select;
23531 }
23532
23533 // Otherwise, select 0 if Src is NaN.
23534 SDValue ZeroInt = DAG.getConstant(0, dl, DstVT);
23535 return DAG.getSelectCC(
23536 dl, Src, Src, ZeroInt, Select, ISD::CondCode::SETUO);
23537}
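// A standalone illustrative sketch (not part of this file): the same
// clamp-then-convert idea for a scalar double -> int32_t saturating conversion,
// with NaN mapped to zero as in the signed path above. Both i32 bounds are
// exactly representable in double, mirroring the AreExactFloatBounds case.
// The helper name fptosi_sat_i32 is hypothetical.
#include <cmath>
#include <cstdint>
#include <limits>

static int32_t fptosi_sat_i32(double X) {
  const double MinFloat = static_cast<double>(std::numeric_limits<int32_t>::min());
  const double MaxFloat = static_cast<double>(std::numeric_limits<int32_t>::max());
  if (std::isnan(X))
    return 0;                                   // NaN -> 0
  if (X < MinFloat)
    return std::numeric_limits<int32_t>::min(); // saturate from below
  if (X > MaxFloat)
    return std::numeric_limits<int32_t>::max(); // saturate from above
  return static_cast<int32_t>(X);               // in range, converts like FP_TO_SINT
}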
23538
23539SDValue X86TargetLowering::LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const {
23540 bool IsStrict = Op->isStrictFPOpcode();
23541
23542 SDLoc DL(Op);
23543 MVT VT = Op.getSimpleValueType();
23544 SDValue Chain = IsStrict ? Op.getOperand(0) : SDValue();
23545 SDValue In = Op.getOperand(IsStrict ? 1 : 0);
23546 MVT SVT = In.getSimpleValueType();
23547
23548 // Let f16->f80 get lowered to a libcall, except for darwin, where we should
23549 // lower it to an fp_extend via f32 (as only f16<>f32 libcalls are available)
23550 if (VT == MVT::f128 || (SVT == MVT::f16 && VT == MVT::f80 &&
23551 !Subtarget.getTargetTriple().isOSDarwin()))
23552 return SDValue();
23553
23554 if (SVT == MVT::f16) {
23555 if (Subtarget.hasFP16())
23556 return Op;
23557
23558 if (VT != MVT::f32) {
23559 if (IsStrict)
23560 return DAG.getNode(
23561 ISD::STRICT_FP_EXTEND, DL, {VT, MVT::Other},
23562 {Chain, DAG.getNode(ISD::STRICT_FP_EXTEND, DL,
23563 {MVT::f32, MVT::Other}, {Chain, In})});
23564
23565 return DAG.getNode(ISD::FP_EXTEND, DL, VT,
23566 DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, In));
23567 }
23568
23569 if (!Subtarget.hasF16C()) {
23570 if (!Subtarget.getTargetTriple().isOSDarwin())
23571 return SDValue();
23572
23573       assert(VT == MVT::f32 && SVT == MVT::f16 && "unexpected extend libcall");
23574
23575 // Need a libcall, but ABI for f16 is soft-float on MacOS.
23576 TargetLowering::CallLoweringInfo CLI(DAG);
23577 Chain = IsStrict ? Op.getOperand(0) : DAG.getEntryNode();
23578
23579 In = DAG.getBitcast(MVT::i16, In);
23580 TargetLowering::ArgListTy Args;
23581 TargetLowering::ArgListEntry Entry;
23582 Entry.Node = In;
23583 Entry.Ty = EVT(MVT::i16).getTypeForEVT(*DAG.getContext());
23584 Entry.IsSExt = false;
23585 Entry.IsZExt = true;
23586 Args.push_back(Entry);
23587
23588 SDValue Callee = DAG.getExternalSymbol(
23589 getLibcallName(RTLIB::FPEXT_F16_F32),
23590 getPointerTy(DAG.getDataLayout()));
23591 CLI.setDebugLoc(DL).setChain(Chain).setLibCallee(
23592 CallingConv::C, EVT(VT).getTypeForEVT(*DAG.getContext()), Callee,
23593 std::move(Args));
23594
23595 SDValue Res;
23596 std::tie(Res,Chain) = LowerCallTo(CLI);
23597 if (IsStrict)
23598 Res = DAG.getMergeValues({Res, Chain}, DL);
23599
23600 return Res;
23601 }
23602
23603 In = DAG.getBitcast(MVT::i16, In);
23604 In = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, MVT::v8i16,
23605 getZeroVector(MVT::v8i16, Subtarget, DAG, DL), In,
23606 DAG.getIntPtrConstant(0, DL));
23607 SDValue Res;
23608 if (IsStrict) {
23609 Res = DAG.getNode(X86ISD::STRICT_CVTPH2PS, DL, {MVT::v4f32, MVT::Other},
23610 {Chain, In});
23611 Chain = Res.getValue(1);
23612 } else {
23613 Res = DAG.getNode(X86ISD::CVTPH2PS, DL, MVT::v4f32, In,
23614 DAG.getTargetConstant(4, DL, MVT::i32));
23615 }
23616 Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Res,
23617 DAG.getIntPtrConstant(0, DL));
23618 if (IsStrict)
23619 return DAG.getMergeValues({Res, Chain}, DL);
23620 return Res;
23621 }
23622
23623 if (!SVT.isVector())
23624 return Op;
23625
23626 if (SVT.getVectorElementType() == MVT::f16) {
23627     assert(Subtarget.hasF16C() && "Unexpected features!");
23628 if (SVT == MVT::v2f16)
23629 In = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4f16, In,
23630 DAG.getUNDEF(MVT::v2f16));
23631 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v8f16, In,
23632 DAG.getUNDEF(MVT::v4f16));
23633 if (IsStrict)
23634 return DAG.getNode(X86ISD::STRICT_VFPEXT, DL, {VT, MVT::Other},
23635 {Op->getOperand(0), Res});
23636 return DAG.getNode(X86ISD::VFPEXT, DL, VT, Res);
23637 } else if (VT == MVT::v4f64 || VT == MVT::v8f64) {
23638 return Op;
23639 }
23640
23641   assert(SVT == MVT::v2f32 && "Only customize MVT::v2f32 type legalization!");
23642
23643 SDValue Res =
23644 DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4f32, In, DAG.getUNDEF(SVT));
23645 if (IsStrict)
23646 return DAG.getNode(X86ISD::STRICT_VFPEXT, DL, {VT, MVT::Other},
23647 {Op->getOperand(0), Res});
23648 return DAG.getNode(X86ISD::VFPEXT, DL, VT, Res);
23649}
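// A standalone illustrative sketch (not part of this file): the scalar f16 -> f32
// path above, spelled with the F16C intrinsics -- place the half bits in lane 0,
// run VCVTPH2PS, and take lane 0 of the result. The helper name fp16_to_fp32 is
// hypothetical and assumes a host compiled with F16C support (e.g. -mf16c).
#include <immintrin.h>
#include <cstdint>

static float fp16_to_fp32(uint16_t HalfBits) {
  __m128i V = _mm_cvtsi32_si128(HalfBits); // half bits in lane 0, upper lanes zero
  __m128 F = _mm_cvtph_ps(V);              // VCVTPH2PS: low 4 halves -> 4 floats
  return _mm_cvtss_f32(F);                 // extract lane 0
}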
23650
23651SDValue X86TargetLowering::LowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const {
23652 bool IsStrict = Op->isStrictFPOpcode();
23653
23654 SDLoc DL(Op);
23655 SDValue Chain = IsStrict ? Op.getOperand(0) : SDValue();
23656 SDValue In = Op.getOperand(IsStrict ? 1 : 0);
23657 MVT VT = Op.getSimpleValueType();
23658 MVT SVT = In.getSimpleValueType();
23659
23660 if (SVT == MVT::f128 || (VT == MVT::f16 && SVT == MVT::f80))
23661 return SDValue();
23662
23663 if (VT == MVT::f16 && (SVT == MVT::f64 || SVT == MVT::f32) &&
23664 !Subtarget.hasFP16() && (SVT == MVT::f64 || !Subtarget.hasF16C())) {
23665 if (!Subtarget.getTargetTriple().isOSDarwin())
23666 return SDValue();
23667
23668 // We need a libcall but the ABI for f16 libcalls on MacOS is soft.
23669 TargetLowering::CallLoweringInfo CLI(DAG);
23670 Chain = IsStrict ? Op.getOperand(0) : DAG.getEntryNode();
23671
23672 TargetLowering::ArgListTy Args;
23673 TargetLowering::ArgListEntry Entry;
23674 Entry.Node = In;
23675 Entry.Ty = EVT(SVT).getTypeForEVT(*DAG.getContext());
23676 Entry.IsSExt = false;
23677 Entry.IsZExt = true;
23678 Args.push_back(Entry);
23679
23680 SDValue Callee = DAG.getExternalSymbol(
23681 getLibcallName(SVT == MVT::f64 ? RTLIB::FPROUND_F64_F16
23682 : RTLIB::FPROUND_F32_F16),
23683 getPointerTy(DAG.getDataLayout()));
23684 CLI.setDebugLoc(DL).setChain(Chain).setLibCallee(
23685 CallingConv::C, EVT(MVT::i16).getTypeForEVT(*DAG.getContext()), Callee,
23686 std::move(Args));
23687
23688 SDValue Res;
23689 std::tie(Res, Chain) = LowerCallTo(CLI);
23690
23691 Res = DAG.getBitcast(MVT::f16, Res);
23692
23693 if (IsStrict)
23694 Res = DAG.getMergeValues({Res, Chain}, DL);
23695
23696 return Res;
23697 }
23698
23699 if (VT.getScalarType() == MVT::f16 && !Subtarget.hasFP16()) {
23700 if (!Subtarget.hasF16C() || SVT.getScalarType() != MVT::f32)
23701 return SDValue();
23702
23703 if (VT.isVector())
23704 return Op;
23705
23706 SDValue Res;
23707 SDValue Rnd = DAG.getTargetConstant(X86::STATIC_ROUNDING::CUR_DIRECTION, DL,
23708 MVT::i32);
23709 if (IsStrict) {
23710 Res = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, MVT::v4f32,
23711 DAG.getConstantFP(0, DL, MVT::v4f32), In,
23712 DAG.getIntPtrConstant(0, DL));
23713 Res = DAG.getNode(X86ISD::STRICT_CVTPS2PH, DL, {MVT::v8i16, MVT::Other},
23714 {Chain, Res, Rnd});
23715 Chain = Res.getValue(1);
23716 } else {
23717 // FIXME: Should we use zeros for upper elements for non-strict?
23718 Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v4f32, In);
23719 Res = DAG.getNode(X86ISD::CVTPS2PH, DL, MVT::v8i16, Res, Rnd);
23720 }
23721
23722 Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i16, Res,
23723 DAG.getIntPtrConstant(0, DL));
23724 Res = DAG.getBitcast(MVT::f16, Res);
23725
23726 if (IsStrict)
23727 return DAG.getMergeValues({Res, Chain}, DL);
23728
23729 return Res;
23730 }
23731
23732 return Op;
23733}
23734
23735static SDValue LowerFP16_TO_FP(SDValue Op, SelectionDAG &DAG) {
23736 bool IsStrict = Op->isStrictFPOpcode();
23737 SDValue Src = Op.getOperand(IsStrict ? 1 : 0);
23738   assert(Src.getValueType() == MVT::i16 && Op.getValueType() == MVT::f32 &&
23739          "Unexpected VT!");
23740
23741 SDLoc dl(Op);
23742 SDValue Res = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16,
23743 DAG.getConstant(0, dl, MVT::v8i16), Src,
23744 DAG.getIntPtrConstant(0, dl));
23745
23746 SDValue Chain;
23747 if (IsStrict) {
23748 Res = DAG.getNode(X86ISD::STRICT_CVTPH2PS, dl, {MVT::v4f32, MVT::Other},
23749 {Op.getOperand(0), Res});
23750 Chain = Res.getValue(1);
23751 } else {
23752 Res = DAG.getNode(X86ISD::CVTPH2PS, dl, MVT::v4f32, Res);
23753 }
23754
23755 Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f32, Res,
23756 DAG.getIntPtrConstant(0, dl));
23757
23758 if (IsStrict)
23759 return DAG.getMergeValues({Res, Chain}, dl);
23760
23761 return Res;
23762}
23763
23764static SDValue LowerFP_TO_FP16(SDValue Op, SelectionDAG &DAG) {
23765 bool IsStrict = Op->isStrictFPOpcode();
23766 SDValue Src = Op.getOperand(IsStrict ? 1 : 0);
23767   assert(Src.getValueType() == MVT::f32 && Op.getValueType() == MVT::i16 &&
23768          "Unexpected VT!");
23769
23770 SDLoc dl(Op);
23771 SDValue Res, Chain;
23772 if (IsStrict) {
23773 Res = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v4f32,
23774 DAG.getConstantFP(0, dl, MVT::v4f32), Src,
23775 DAG.getIntPtrConstant(0, dl));
23776 Res = DAG.getNode(
23777 X86ISD::STRICT_CVTPS2PH, dl, {MVT::v8i16, MVT::Other},
23778 {Op.getOperand(0), Res, DAG.getTargetConstant(4, dl, MVT::i32)});
23779 Chain = Res.getValue(1);
23780 } else {
23781 // FIXME: Should we use zeros for upper elements for non-strict?
23782 Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, Src);
23783 Res = DAG.getNode(X86ISD::CVTPS2PH, dl, MVT::v8i16, Res,
23784 DAG.getTargetConstant(4, dl, MVT::i32));
23785 }
23786
23787 Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, Res,
23788 DAG.getIntPtrConstant(0, dl));
23789
23790 if (IsStrict)
23791 return DAG.getMergeValues({Res, Chain}, dl);
23792
23793 return Res;
23794}
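// A standalone illustrative sketch (not part of this file): the reverse direction
// used by LowerFP_TO_FP16 above, via VCVTPS2PH with the current-rounding immediate
// (the same value 4 that appears as the target constant in the code). The helper
// name fp32_to_fp16 is hypothetical and assumes F16C support.
#include <immintrin.h>
#include <cstdint>

static uint16_t fp32_to_fp16(float F) {
  __m128 V = _mm_set_ss(F);                               // value in lane 0
  __m128i H = _mm_cvtps_ph(V, _MM_FROUND_CUR_DIRECTION);  // 4 floats -> 4 halves
  return static_cast<uint16_t>(_mm_extract_epi16(H, 0));  // half bits from lane 0
}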
23795
23796SDValue X86TargetLowering::LowerFP_TO_BF16(SDValue Op,
23797 SelectionDAG &DAG) const {
23798 SDLoc DL(Op);
23799 MakeLibCallOptions CallOptions;
23800 RTLIB::Libcall LC =
23801 RTLIB::getFPROUND(Op.getOperand(0).getValueType(), MVT::bf16);
23802 SDValue Res =
23803 makeLibCall(DAG, LC, MVT::f32, Op.getOperand(0), CallOptions, DL).first;
23804 return DAG.getNode(ISD::TRUNCATE, DL, MVT::i16,
23805 DAG.getBitcast(MVT::i32, Res));
23806}
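// A standalone illustrative sketch (not part of this file): the f32 -> bf16
// rounding the FPROUND libcall above is asked to perform can be pictured as
// round-to-nearest-even on the f32 bit pattern followed by keeping the top
// 16 bits. NaN quieting is ignored here; the helper name f32_to_bf16_bits is
// hypothetical.
#include <cstdint>
#include <cstring>

static uint16_t f32_to_bf16_bits(float F) {
  uint32_t Bits;
  std::memcpy(&Bits, &F, sizeof(Bits));
  uint32_t Bias = 0x7FFFu + ((Bits >> 16) & 1u);    // ties-to-even rounding bias
  return static_cast<uint16_t>((Bits + Bias) >> 16);
}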
23807
23808/// Depending on uarch and/or optimizing for size, we might prefer to use a
23809/// vector operation in place of the typical scalar operation.
23810static SDValue lowerAddSubToHorizontalOp(SDValue Op, SelectionDAG &DAG,
23811 const X86Subtarget &Subtarget) {
23812 // If both operands have other uses, this is probably not profitable.
23813 SDValue LHS = Op.getOperand(0);
23814 SDValue RHS = Op.getOperand(1);
23815 if (!LHS.hasOneUse() && !RHS.hasOneUse())
23816 return Op;
23817
23818 // FP horizontal add/sub were added with SSE3. Integer with SSSE3.
23819 bool IsFP = Op.getSimpleValueType().isFloatingPoint();
23820 if (IsFP && !Subtarget.hasSSE3())
23821 return Op;
23822 if (!IsFP && !Subtarget.hasSSSE3())
23823 return Op;
23824
23825 // Extract from a common vector.
23826 if (LHS.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
23827 RHS.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
23828 LHS.getOperand(0) != RHS.getOperand(0) ||
23829 !isa<ConstantSDNode>(LHS.getOperand(1)) ||
23830 !isa<ConstantSDNode>(RHS.getOperand(1)) ||
23831 !shouldUseHorizontalOp(true, DAG, Subtarget))
23832 return Op;
23833
23834 // Allow commuted 'hadd' ops.
23835 // TODO: Allow commuted (f)sub by negating the result of (F)HSUB?
23836 unsigned HOpcode;
23837 switch (Op.getOpcode()) {
23838 case ISD::ADD: HOpcode = X86ISD::HADD; break;
23839 case ISD::SUB: HOpcode = X86ISD::HSUB; break;
23840 case ISD::FADD: HOpcode = X86ISD::FHADD; break;
23841 case ISD::FSUB: HOpcode = X86ISD::FHSUB; break;
23842 default:
23843     llvm_unreachable("Trying to lower unsupported opcode to horizontal op");
23844 }
23845 unsigned LExtIndex = LHS.getConstantOperandVal(1);
23846 unsigned RExtIndex = RHS.getConstantOperandVal(1);
23847 if ((LExtIndex & 1) == 1 && (RExtIndex & 1) == 0 &&
23848 (HOpcode == X86ISD::HADD || HOpcode == X86ISD::FHADD))
23849 std::swap(LExtIndex, RExtIndex);
23850
23851 if ((LExtIndex & 1) != 0 || RExtIndex != (LExtIndex + 1))
23852 return Op;
23853
23854 SDValue X = LHS.getOperand(0);
23855 EVT VecVT = X.getValueType();
23856 unsigned BitWidth = VecVT.getSizeInBits();
23857 unsigned NumLanes = BitWidth / 128;
23858 unsigned NumEltsPerLane = VecVT.getVectorNumElements() / NumLanes;
23859   assert((BitWidth == 128 || BitWidth == 256 || BitWidth == 512) &&
23860          "Not expecting illegal vector widths here");
23861
23862 // Creating a 256-bit horizontal op would be wasteful, and there is no 512-bit
23863 // equivalent, so extract the 256/512-bit source op to 128-bit if we can.
23864 SDLoc DL(Op);
23865 if (BitWidth == 256 || BitWidth == 512) {
23866 unsigned LaneIdx = LExtIndex / NumEltsPerLane;
23867 X = extract128BitVector(X, LaneIdx * NumEltsPerLane, DAG, DL);
23868 LExtIndex %= NumEltsPerLane;
23869 }
23870
23871 // add (extractelt (X, 0), extractelt (X, 1)) --> extractelt (hadd X, X), 0
23872 // add (extractelt (X, 1), extractelt (X, 0)) --> extractelt (hadd X, X), 0
23873 // add (extractelt (X, 2), extractelt (X, 3)) --> extractelt (hadd X, X), 1
23874 // sub (extractelt (X, 0), extractelt (X, 1)) --> extractelt (hsub X, X), 0
23875 SDValue HOp = DAG.getNode(HOpcode, DL, X.getValueType(), X, X);
23876 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, Op.getSimpleValueType(), HOp,
23877 DAG.getIntPtrConstant(LExtIndex / 2, DL));
23878}
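// A standalone illustrative sketch (not part of this file): the rewrite shown in
// the comments above, spelled with intrinsics -- x[0] + x[1] of a 4 x f32 vector
// is lane 0 of HADDPS(x, x). The helper name sum_lo2 is hypothetical and assumes
// SSE3 support (e.g. -msse3).
#include <pmmintrin.h>

static float sum_lo2(__m128 X) {
  __m128 H = _mm_hadd_ps(X, X);  // lanes: {x0+x1, x2+x3, x0+x1, x2+x3}
  return _mm_cvtss_f32(H);       // lane 0 == x[0] + x[1]
}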
23879
23880/// Depending on uarch and/or optimizing for size, we might prefer to use a
23881/// vector operation in place of the typical scalar operation.
23882SDValue X86TargetLowering::lowerFaddFsub(SDValue Op, SelectionDAG &DAG) const {
23883   assert((Op.getValueType() == MVT::f32 || Op.getValueType() == MVT::f64) &&
23884          "Only expecting float/double");
23885 return lowerAddSubToHorizontalOp(Op, DAG, Subtarget);
23886}
23887
23888/// ISD::FROUND is defined to round to nearest with ties rounding away from 0.
23889/// This mode isn't supported in hardware on X86. But as long as we aren't
23890/// compiling with trapping math, we can emulate this with
23891/// trunc(X + copysign(nextafter(0.5, 0.0), X)).
23892static SDValue LowerFROUND(SDValue Op, SelectionDAG &DAG) {
23893 SDValue N0 = Op.getOperand(0);
23894 SDLoc dl(Op);
23895 MVT VT = Op.getSimpleValueType();
23896
23897 // N0 += copysign(nextafter(0.5, 0.0), N0)
23898 const fltSemantics &Sem = SelectionDAG::EVTToAPFloatSemantics(VT);
23899 bool Ignored;
23900 APFloat Point5Pred = APFloat(0.5f);
23901 Point5Pred.convert(Sem, APFloat::rmNearestTiesToEven, &Ignored);
23902 Point5Pred.next(/*nextDown*/true);
23903
23904 SDValue Adder = DAG.getNode(ISD::FCOPYSIGN, dl, VT,
23905 DAG.getConstantFP(Point5Pred, dl, VT), N0);
23906 N0 = DAG.getNode(ISD::FADD, dl, VT, N0, Adder);
23907
23908 // Truncate the result to remove fraction.
23909 return DAG.getNode(ISD::FTRUNC, dl, VT, N0);
23910}
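// A standalone illustrative sketch (not part of this file): the emulation
// described in the comment above, for scalar double and assuming non-trapping
// math. The helper name round_half_away_from_zero is hypothetical.
#include <cmath>

static double round_half_away_from_zero(double X) {
  double Pred = std::nextafter(0.5, 0.0);         // largest double below 0.5
  return std::trunc(X + std::copysign(Pred, X));  // trunc(X + copysign(Pred, X))
}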
23911
23912/// The only differences between FABS and FNEG are the mask and the logic op.
23913/// FNEG also has a folding opportunity for FNEG(FABS(x)).
23914static SDValue LowerFABSorFNEG(SDValue Op, SelectionDAG &DAG) {
23915   assert((Op.getOpcode() == ISD::FABS || Op.getOpcode() == ISD::FNEG) &&
23916          "Wrong opcode for lowering FABS or FNEG.");
23917
23918 bool IsFABS = (Op.getOpcode() == ISD::FABS);
23919
23920 // If this is a FABS and it has an FNEG user, bail out to fold the combination
23921 // into an FNABS. We'll lower the FABS after that if it is still in use.
23922 if (IsFABS)
23923 for (SDNode *User : Op->uses())
23924 if (User->getOpcode() == ISD::FNEG)
23925 return Op;
23926
23927 SDLoc dl(Op);
23928 MVT VT = Op.getSimpleValueType();
23929
23930 bool IsF128 = (VT == MVT::f128);
23931   assert(VT.isFloatingPoint() && VT != MVT::f80 &&
23932          DAG.getTargetLoweringInfo().isTypeLegal(VT) &&
23933          "Unexpected type in LowerFABSorFNEG");
23934
23935 // FIXME: Use function attribute "OptimizeForSize" and/or CodeGenOpt::Level to
23936 // decide if we should generate a 16-byte constant mask when we only need 4 or
23937 // 8 bytes for the scalar case.
23938
23939 // There are no scalar bitwise logical SSE/AVX instructions, so we
23940 // generate a 16-byte vector constant and logic op even for the scalar case.
23941 // Using a 16-byte mask allows folding the load of the mask with
23942 // the logic op, so it can save (~4 bytes) on code size.
23943 bool IsFakeVector = !VT.isVector() && !IsF128;
23944 MVT LogicVT = VT;
23945 if (IsFakeVector)
23946 LogicVT = (VT == MVT::f64) ? MVT::v2f64
23947 : (VT == MVT::f32) ? MVT::v4f32
23948 : MVT::v8f16;
23949
23950 unsigned EltBits = VT.getScalarSizeInBits();
23951 // For FABS, mask is 0x7f...; for FNEG, mask is 0x80...
23952 APInt MaskElt = IsFABS ? APInt::getSignedMaxValue(EltBits) :
23953 APInt::getSignMask(EltBits);
23954 const fltSemantics &Sem = SelectionDAG::EVTToAPFloatSemantics(VT);
23955 SDValue Mask = DAG.getConstantFP(APFloat(Sem, MaskElt), dl, LogicVT);
23956
23957 SDValue Op0 = Op.getOperand(0);
23958 bool IsFNABS = !IsFABS && (Op0.getOpcode() == ISD::FABS);
23959 unsigned LogicOp = IsFABS ? X86ISD::FAND :
23960 IsFNABS ? X86ISD::FOR :
23961 X86ISD::FXOR;
23962 SDValue Operand = IsFNABS ? Op0.getOperand(0) : Op0;
23963
23964 if (VT.isVector() || IsF128)
23965 return DAG.getNode(LogicOp, dl, LogicVT, Operand, Mask);
23966
23967 // For the scalar case extend to a 128-bit vector, perform the logic op,
23968 // and extract the scalar result back out.
23969 Operand = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LogicVT, Operand);
23970 SDValue LogicNode = DAG.getNode(LogicOp, dl, LogicVT, Operand, Mask);
23971 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, LogicNode,
23972 DAG.getIntPtrConstant(0, dl));
23973}
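// A standalone illustrative sketch (not part of this file): the two masks used
// above, applied to a scalar f64 bit pattern -- FABS clears the sign bit with
// 0x7FF..., FNEG flips it with 0x800.... The helper names are hypothetical.
#include <cstdint>
#include <cstring>

static double fabs_bits(double X) {
  uint64_t B;
  std::memcpy(&B, &X, sizeof(B));
  B &= 0x7FFFFFFFFFFFFFFFull;      // AND with the sign-cleared mask (FAND)
  std::memcpy(&X, &B, sizeof(B));
  return X;
}

static double fneg_bits(double X) {
  uint64_t B;
  std::memcpy(&B, &X, sizeof(B));
  B ^= 0x8000000000000000ull;      // XOR with the sign mask (FXOR)
  std::memcpy(&X, &B, sizeof(B));
  return X;
}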
23974
23975static SDValue LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) {
23976 SDValue Mag = Op.getOperand(0);
23977 SDValue Sign = Op.getOperand(1);
23978 SDLoc dl(Op);
23979
23980 // If the sign operand is smaller, extend it first.
23981 MVT VT = Op.getSimpleValueType();
23982 if (Sign.getSimpleValueType().bitsLT(VT))
23983 Sign = DAG.getNode(ISD::FP_EXTEND, dl, VT, Sign);
23984
23985 // And if it is bigger, shrink it first.
23986 if (Sign.getSimpleValueType().bitsGT(VT))
23987 Sign = DAG.getNode(ISD::FP_ROUND, dl, VT, Sign,
23988 DAG.getIntPtrConstant(0, dl, /*isTarget=*/true));
23989
23990 // At this point the operands and the result should have the same
23991 // type, and that won't be f80 since that is not custom lowered.
23992 bool IsF128 = (VT == MVT::f128);
23993   assert(VT.isFloatingPoint() && VT != MVT::f80 &&
23994          DAG.getTargetLoweringInfo().isTypeLegal(VT) &&
23995          "Unexpected type in LowerFCOPYSIGN");
23996
23997 const fltSemantics &Sem = SelectionDAG::EVTToAPFloatSemantics(VT);
23998
23999 // Perform all scalar logic operations as 16-byte vectors because there are no
24000 // scalar FP logic instructions in SSE.
24001 // TODO: This isn't necessary. If we used scalar types, we might avoid some
24002 // unnecessary splats, but we might miss load folding opportunities. Should
24003 // this decision be based on OptimizeForSize?
24004 bool IsFakeVector = !VT.isVector() && !IsF128;
24005 MVT LogicVT = VT;
24006 if (IsFakeVector)
24007 LogicVT = (VT == MVT::f64) ? MVT::v2f64
24008 : (VT == MVT::f32) ? MVT::v4f32
24009 : MVT::v8f16;
24010
24011 // The mask constants are automatically splatted for vector types.
24012 unsigned EltSizeInBits = VT.getScalarSizeInBits();
24013 SDValue SignMask = DAG.getConstantFP(
24014 APFloat(Sem, APInt::getSignMask(EltSizeInBits)), dl, LogicVT);
24015 SDValue MagMask = DAG.getConstantFP(
24016 APFloat(Sem, APInt::getSignedMaxValue(EltSizeInBits)), dl, LogicVT);
24017
24018 // First, clear all bits but the sign bit from the second operand (sign).
24019 if (IsFakeVector)
24020 Sign = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LogicVT, Sign);
24021 SDValue SignBit = DAG.getNode(X86ISD::FAND, dl, LogicVT, Sign, SignMask);
24022
24023 // Next, clear the sign bit from the first operand (magnitude).
24024 // TODO: If we had general constant folding for FP logic ops, this check
24025 // wouldn't be necessary.
24026 SDValue MagBits;
24027 if (ConstantFPSDNode *Op0CN = isConstOrConstSplatFP(Mag)) {
24028 APFloat APF = Op0CN->getValueAPF();
24029 APF.clearSign();
24030 MagBits = DAG.getConstantFP(APF, dl, LogicVT);
24031 } else {
24032 // If the magnitude operand wasn't a constant, we need to AND out the sign.
24033 if (IsFakeVector)
24034 Mag = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LogicVT, Mag);
24035 MagBits = DAG.getNode(X86ISD::FAND, dl, LogicVT, Mag, MagMask);
24036 }
24037
24038 // OR the magnitude value with the sign bit.
24039 SDValue Or = DAG.getNode(X86ISD::FOR, dl, LogicVT, MagBits, SignBit);
24040 return !IsFakeVector ? Or : DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Or,
24041 DAG.getIntPtrConstant(0, dl));
24042}
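// A standalone illustrative sketch (not part of this file): the AND/AND/OR
// sequence above, applied to scalar f64 bit patterns. The helper name
// copysign_bits is hypothetical.
#include <cstdint>
#include <cstring>

static double copysign_bits(double Mag, double Sign) {
  uint64_t M, S;
  std::memcpy(&M, &Mag, sizeof(M));
  std::memcpy(&S, &Sign, sizeof(S));
  uint64_t R = (M & 0x7FFFFFFFFFFFFFFFull)   // clear the magnitude's sign bit
             | (S & 0x8000000000000000ull);  // keep only the sign operand's sign bit
  std::memcpy(&Mag, &R, sizeof(R));
  return Mag;
}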
24043
24044static SDValue LowerFGETSIGN(SDValue Op, SelectionDAG &DAG) {
24045 SDValue N0 = Op.getOperand(0);
24046 SDLoc dl(Op);
24047 MVT VT = Op.getSimpleValueType();
24048
24049 MVT OpVT = N0.getSimpleValueType();
24050   assert((OpVT == MVT::f32 || OpVT == MVT::f64) &&
24051          "Unexpected type for FGETSIGN");
24052
24053 // Lower ISD::FGETSIGN to (AND (X86ISD::MOVMSK ...) 1).
24054 MVT VecVT = (OpVT == MVT::f32 ? MVT::v4f32 : MVT::v2f64);
24055 SDValue Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, N0);
24056 Res = DAG.getNode(X86ISD::MOVMSK, dl, MVT::i32, Res);
24057 Res = DAG.getZExtOrTrunc(Res, dl, VT);
24058 Res = DAG.getNode(ISD::AND, dl, VT, Res, DAG.getConstant(1, dl, VT));
24059 return Res;
24060}
24061
24062/// Helper for attempting to create a X86ISD::BT node.
24063static SDValue getBT(SDValue Src, SDValue BitNo, const SDLoc &DL, SelectionDAG &DAG) {
24064 // If Src is i8, promote it to i32 with any_extend. There is no i8 BT
24065 // instruction. Since the shift amount is in-range-or-undefined, we know
24066 // that doing a bittest on the i32 value is ok. We extend to i32 because
24067 // the encoding for the i16 version is larger than the i32 version.
24068 // Also promote i16 to i32 for performance / code size reason.
24069 if (Src.getValueType().getScalarSizeInBits() < 32)
24070 Src = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Src);
24071
24072 // No legal type found, give up.
24073 if (!DAG.getTargetLoweringInfo().isTypeLegal(Src.getValueType()))
24074 return SDValue();
24075
24076 // See if we can use the 32-bit instruction instead of the 64-bit one for a
24077 // shorter encoding. Since the former takes the modulo 32 of BitNo and the
24078 // latter takes the modulo 64, this is only valid if the 5th bit of BitNo is
24079 // known to be zero.
24080 if (Src.getValueType() == MVT::i64 &&
24081 DAG.MaskedValueIsZero(BitNo, APInt(BitNo.getValueSizeInBits(), 32)))
24082 Src = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, Src);
24083
24084 // If the operand types disagree, extend the shift amount to match. Since
24085 // BT ignores high bits (like shifts) we can use anyextend.
24086 if (Src.getValueType() != BitNo.getValueType()) {
24087 // Peek through a mask/modulo operation.
24088 // TODO: DAGCombine fails to do this as it just checks isTruncateFree, but
24089 // we probably need a better IsDesirableToPromoteOp to handle this as well.
24090 if (BitNo.getOpcode() == ISD::AND && BitNo->hasOneUse())
24091 BitNo = DAG.getNode(ISD::AND, DL, Src.getValueType(),
24092 DAG.getNode(ISD::ANY_EXTEND, DL, Src.getValueType(),
24093 BitNo.getOperand(0)),
24094 DAG.getNode(ISD::ANY_EXTEND, DL, Src.getValueType(),
24095 BitNo.getOperand(1)));
24096 else
24097 BitNo = DAG.getNode(ISD::ANY_EXTEND, DL, Src.getValueType(), BitNo);
24098 }
24099
24100 return DAG.getNode(X86ISD::BT, DL, MVT::i32, Src, BitNo);
24101}
24102
24103/// Helper for creating a X86ISD::SETCC node.
24104static SDValue getSETCC(X86::CondCode Cond, SDValue EFLAGS, const SDLoc &dl,
24105 SelectionDAG &DAG) {
24106 return DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
24107 DAG.getTargetConstant(Cond, dl, MVT::i8), EFLAGS);
24108}
24109
24110/// Recursive helper for combineVectorSizedSetCCEquality() to see if we have a
24111/// recognizable memcmp expansion.
24112static bool isOrXorXorTree(SDValue X, bool Root = true) {
24113 if (X.getOpcode() == ISD::OR)
24114 return isOrXorXorTree(X.getOperand(0), false) &&
24115 isOrXorXorTree(X.getOperand(1), false);
24116 if (Root)
24117 return false;
24118 return X.getOpcode() == ISD::XOR;
24119}
24120
24121/// Recursive helper for combineVectorSizedSetCCEquality() to emit the memcmp
24122/// expansion.
24123template <typename F>
24124static SDValue emitOrXorXorTree(SDValue X, const SDLoc &DL, SelectionDAG &DAG,
24125 EVT VecVT, EVT CmpVT, bool HasPT, F SToV) {
24126 SDValue Op0 = X.getOperand(0);
24127 SDValue Op1 = X.getOperand(1);
24128 if (X.getOpcode() == ISD::OR) {
24129 SDValue A = emitOrXorXorTree(Op0, DL, DAG, VecVT, CmpVT, HasPT, SToV);
24130 SDValue B = emitOrXorXorTree(Op1, DL, DAG, VecVT, CmpVT, HasPT, SToV);
24131 if (VecVT != CmpVT)
24132 return DAG.getNode(ISD::OR, DL, CmpVT, A, B);
24133 if (HasPT)
24134 return DAG.getNode(ISD::OR, DL, VecVT, A, B);
24135 return DAG.getNode(ISD::AND, DL, CmpVT, A, B);
24136 }
24137 if (X.getOpcode() == ISD::XOR) {
24138 SDValue A = SToV(Op0);
24139 SDValue B = SToV(Op1);
24140 if (VecVT != CmpVT)
24141 return DAG.getSetCC(DL, CmpVT, A, B, ISD::SETNE);
24142 if (HasPT)
24143 return DAG.getNode(ISD::XOR, DL, VecVT, A, B);
24144 return DAG.getSetCC(DL, CmpVT, A, B, ISD::SETEQ);
24145 }
24146   llvm_unreachable("Impossible");
24147}
24148
24149/// Try to map a 128-bit or larger integer comparison to vector instructions
24150/// before type legalization splits it up into chunks.
24151static SDValue combineVectorSizedSetCCEquality(EVT VT, SDValue X, SDValue Y,
24152 ISD::CondCode CC,
24153 const SDLoc &DL,
24154 SelectionDAG &DAG,
24155 const X86Subtarget &Subtarget) {
24156   assert((CC == ISD::SETNE || CC == ISD::SETEQ) && "Bad comparison predicate");
24157
24158 // We're looking for an oversized integer equality comparison.
24159 EVT OpVT = X.getValueType();
24160 unsigned OpSize = OpVT.getSizeInBits();
24161 if (!OpVT.isScalarInteger() || OpSize < 128)
24162 return SDValue();
24163
24164 // Ignore a comparison with zero because that gets special treatment in
24165 // EmitTest(). But make an exception for the special case of a pair of
24166 // logically-combined vector-sized operands compared to zero. This pattern may
24167 // be generated by the memcmp expansion pass with oversized integer compares
24168 // (see PR33325).
24169 bool IsOrXorXorTreeCCZero = isNullConstant(Y) && isOrXorXorTree(X);
24170 if (isNullConstant(Y) && !IsOrXorXorTreeCCZero)
24171 return SDValue();
24172
24173 // Don't perform this combine if constructing the vector will be expensive.
24174 auto IsVectorBitCastCheap = [](SDValue X) {
24175 X = peekThroughBitcasts(X);
24176 return isa<ConstantSDNode>(X) || X.getValueType().isVector() ||
24177 X.getOpcode() == ISD::LOAD;
24178 };
24179 if ((!IsVectorBitCastCheap(X) || !IsVectorBitCastCheap(Y)) &&
24180 !IsOrXorXorTreeCCZero)
24181 return SDValue();
24182
24183 // Use XOR (plus OR) and PTEST after SSE4.1 for 128/256-bit operands.
24184 // Use PCMPNEQ (plus OR) and KORTEST for 512-bit operands.
24185 // Otherwise use PCMPEQ (plus AND) and mask testing.
24186 bool NoImplicitFloatOps =
24187 DAG.getMachineFunction().getFunction().hasFnAttribute(
24188 Attribute::NoImplicitFloat);
24189 if (!Subtarget.useSoftFloat() && !NoImplicitFloatOps &&
24190 ((OpSize == 128 && Subtarget.hasSSE2()) ||
24191 (OpSize == 256 && Subtarget.hasAVX()) ||
24192 (OpSize == 512 && Subtarget.useAVX512Regs()))) {
24193 bool HasPT = Subtarget.hasSSE41();
24194
24195 // PTEST and MOVMSK are slow on Knights Landing and Knights Mill and widened
24196 // vector registers are essentially free. (Technically, widening registers
24197 // prevents load folding, but the tradeoff is worth it.)
24198 bool PreferKOT = Subtarget.preferMaskRegisters();
24199 bool NeedZExt = PreferKOT && !Subtarget.hasVLX() && OpSize != 512;
24200
24201 EVT VecVT = MVT::v16i8;
24202 EVT CmpVT = PreferKOT ? MVT::v16i1 : VecVT;
24203 if (OpSize == 256) {
24204 VecVT = MVT::v32i8;
24205 CmpVT = PreferKOT ? MVT::v32i1 : VecVT;
24206 }
24207 EVT CastVT = VecVT;
24208 bool NeedsAVX512FCast = false;
24209 if (OpSize == 512 || NeedZExt) {
24210 if (Subtarget.hasBWI()) {
24211 VecVT = MVT::v64i8;
24212 CmpVT = MVT::v64i1;
24213 if (OpSize == 512)
24214 CastVT = VecVT;
24215 } else {
24216 VecVT = MVT::v16i32;
24217 CmpVT = MVT::v16i1;
24218 CastVT = OpSize == 512 ? VecVT
24219 : OpSize == 256 ? MVT::v8i32
24220 : MVT::v4i32;
24221 NeedsAVX512FCast = true;
24222 }
24223 }
24224
24225 auto ScalarToVector = [&](SDValue X) -> SDValue {
24226 bool TmpZext = false;
24227 EVT TmpCastVT = CastVT;
24228 if (X.getOpcode() == ISD::ZERO_EXTEND) {
24229 SDValue OrigX = X.getOperand(0);
24230 unsigned OrigSize = OrigX.getScalarValueSizeInBits();
24231 if (OrigSize < OpSize) {
24232 if (OrigSize == 128) {
24233 TmpCastVT = NeedsAVX512FCast ? MVT::v4i32 : MVT::v16i8;
24234 X = OrigX;
24235 TmpZext = true;
24236 } else if (OrigSize == 256) {
24237 TmpCastVT = NeedsAVX512FCast ? MVT::v8i32 : MVT::v32i8;
24238 X = OrigX;
24239 TmpZext = true;
24240 }
24241 }
24242 }
24243 X = DAG.getBitcast(TmpCastVT, X);
24244 if (!NeedZExt && !TmpZext)
24245 return X;
24246 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VecVT,
24247 DAG.getConstant(0, DL, VecVT), X,
24248 DAG.getVectorIdxConstant(0, DL));
24249 };
24250
24251 SDValue Cmp;
24252 if (IsOrXorXorTreeCCZero) {
24253 // This is a bitwise-combined equality comparison of 2 pairs of vectors:
24254 // setcc i128 (or (xor A, B), (xor C, D)), 0, eq|ne
24255 // Use 2 vector equality compares and 'and' the results before doing a
24256 // MOVMSK.
24257 Cmp = emitOrXorXorTree(X, DL, DAG, VecVT, CmpVT, HasPT, ScalarToVector);
24258 } else {
24259 SDValue VecX = ScalarToVector(X);
24260 SDValue VecY = ScalarToVector(Y);
24261 if (VecVT != CmpVT) {
24262 Cmp = DAG.getSetCC(DL, CmpVT, VecX, VecY, ISD::SETNE);
24263 } else if (HasPT) {
24264 Cmp = DAG.getNode(ISD::XOR, DL, VecVT, VecX, VecY);
24265 } else {
24266 Cmp = DAG.getSetCC(DL, CmpVT, VecX, VecY, ISD::SETEQ);
24267 }
24268 }
24269 // AVX512 should emit a setcc that will lower to kortest.
24270 if (VecVT != CmpVT) {
24271 EVT KRegVT = CmpVT == MVT::v64i1 ? MVT::i64
24272 : CmpVT == MVT::v32i1 ? MVT::i32
24273 : MVT::i16;
24274 return DAG.getSetCC(DL, VT, DAG.getBitcast(KRegVT, Cmp),
24275 DAG.getConstant(0, DL, KRegVT), CC);
24276 }
24277 if (HasPT) {
24278 SDValue BCCmp =
24279 DAG.getBitcast(OpSize == 256 ? MVT::v4i64 : MVT::v2i64, Cmp);
24280 SDValue PT = DAG.getNode(X86ISD::PTEST, DL, MVT::i32, BCCmp, BCCmp);
24281 X86::CondCode X86CC = CC == ISD::SETEQ ? X86::COND_E : X86::COND_NE;
24282 SDValue X86SetCC = getSETCC(X86CC, PT, DL, DAG);
24283 return DAG.getNode(ISD::TRUNCATE, DL, VT, X86SetCC.getValue(0));
24284 }
24285 // If all bytes match (bitmask is 0x(FFFF)FFFF), that's equality.
24286 // setcc i128 X, Y, eq --> setcc (pmovmskb (pcmpeqb X, Y)), 0xFFFF, eq
24287 // setcc i128 X, Y, ne --> setcc (pmovmskb (pcmpeqb X, Y)), 0xFFFF, ne
24288     assert(Cmp.getValueType() == MVT::v16i8 &&
24289            "Non 128-bit vector on pre-SSE41 target");
24290 SDValue MovMsk = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Cmp);
24291 SDValue FFFFs = DAG.getConstant(0xFFFF, DL, MVT::i32);
24292 return DAG.getSetCC(DL, VT, MovMsk, FFFFs, CC);
24293 }
24294
24295 return SDValue();
24296}
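// A standalone illustrative sketch (not part of this file): the pre-SSE4.1 tail
// of the combine above -- a 16-byte equality test becomes PCMPEQB + PMOVMSKB
// compared against 0xFFFF. The helper name equal16 is hypothetical and assumes
// SSE2.
#include <emmintrin.h>

static bool equal16(const void *A, const void *B) {
  __m128i VA = _mm_loadu_si128(static_cast<const __m128i *>(A));
  __m128i VB = _mm_loadu_si128(static_cast<const __m128i *>(B));
  __m128i Eq = _mm_cmpeq_epi8(VA, VB);      // 0xFF in every byte that matches
  return _mm_movemask_epi8(Eq) == 0xFFFF;   // all 16 bytes matched
}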
24297
24298/// Helper for matching BINOP(EXTRACTELT(X,0),BINOP(EXTRACTELT(X,1),...))
24299/// style scalarized (associative) reduction patterns. Partial reductions
24300/// are supported when the pointer SrcMask is non-null.
24301/// TODO - move this to SelectionDAG?
24302static bool matchScalarReduction(SDValue Op, ISD::NodeType BinOp,
24303 SmallVectorImpl<SDValue> &SrcOps,
24304 SmallVectorImpl<APInt> *SrcMask = nullptr) {
24305 SmallVector<SDValue, 8> Opnds;
24306 DenseMap<SDValue, APInt> SrcOpMap;
24307 EVT VT = MVT::Other;
24308
24309 // Recognize a special case where a vector is casted into wide integer to
24310 // test all 0s.
24311   assert(Op.getOpcode() == unsigned(BinOp) &&
24312          "Unexpected bit reduction opcode");
24313 Opnds.push_back(Op.getOperand(0));
24314 Opnds.push_back(Op.getOperand(1));
24315
24316 for (unsigned Slot = 0, e = Opnds.size(); Slot < e; ++Slot) {
24317 SmallVectorImpl<SDValue>::const_iterator I = Opnds.begin() + Slot;
24318 // BFS traverse all BinOp operands.
24319 if (I->getOpcode() == unsigned(BinOp)) {
24320 Opnds.push_back(I->getOperand(0));
24321 Opnds.push_back(I->getOperand(1));
24322 // Re-evaluate the number of nodes to be traversed.
24323 e += 2; // 2 more nodes (LHS and RHS) are pushed.
24324 continue;
24325 }
24326
24327 // Quit if a non-EXTRACT_VECTOR_ELT
24328 if (I->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
24329 return false;
24330
24331 // Quit if without a constant index.
24332 auto *Idx = dyn_cast<ConstantSDNode>(I->getOperand(1));
24333 if (!Idx)
24334 return false;
24335
24336 SDValue Src = I->getOperand(0);
24337 DenseMap<SDValue, APInt>::iterator M = SrcOpMap.find(Src);
24338 if (M == SrcOpMap.end()) {
24339 VT = Src.getValueType();
24340 // Quit if not the same type.
24341 if (!SrcOpMap.empty() && VT != SrcOpMap.begin()->first.getValueType())
24342 return false;
24343 unsigned NumElts = VT.getVectorNumElements();
24344 APInt EltCount = APInt::getZero(NumElts);
24345 M = SrcOpMap.insert(std::make_pair(Src, EltCount)).first;
24346 SrcOps.push_back(Src);
24347 }
24348
24349 // Quit if element already used.
24350 unsigned CIdx = Idx->getZExtValue();
24351 if (M->second[CIdx])
24352 return false;
24353 M->second.setBit(CIdx);
24354 }
24355
24356 if (SrcMask) {
24357 // Collect the source partial masks.
24358 for (SDValue &SrcOp : SrcOps)
24359 SrcMask->push_back(SrcOpMap[SrcOp]);
24360 } else {
24361 // Quit if not all elements are used.
24362 for (const auto &I : SrcOpMap)
24363 if (!I.second.isAllOnes())
24364 return false;
24365 }
24366
24367 return true;
24368}
24369
24370// Helper function for comparing all bits of two vectors.
24371static SDValue LowerVectorAllEqual(const SDLoc &DL, SDValue LHS, SDValue RHS,
24372 ISD::CondCode CC, const APInt &OriginalMask,
24373 const X86Subtarget &Subtarget,
24374 SelectionDAG &DAG, X86::CondCode &X86CC) {
24375 EVT VT = LHS.getValueType();
24376 unsigned ScalarSize = VT.getScalarSizeInBits();
24377 if (OriginalMask.getBitWidth() != ScalarSize) {
24378     assert(ScalarSize == 1 && "Element Mask vs Vector bitwidth mismatch");
24379 return SDValue();
24380 }
24381
24382   // Quit if not convertible to legal scalar or 128/256-bit vector.
24383 if (!llvm::has_single_bit<uint32_t>(VT.getSizeInBits()))
24384 return SDValue();
24385
24386 // FCMP may use ISD::SETNE when nnan - early out if we manage to get here.
24387 if (VT.isFloatingPoint())
24388 return SDValue();
24389
24390   assert((CC == ISD::SETEQ || CC == ISD::SETNE) && "Unsupported ISD::CondCode");
24391 X86CC = (CC == ISD::SETEQ ? X86::COND_E : X86::COND_NE);
24392
24393 APInt Mask = OriginalMask;
24394
24395 auto MaskBits = [&](SDValue Src) {
24396 if (Mask.isAllOnes())
24397 return Src;
24398 EVT SrcVT = Src.getValueType();
24399 SDValue MaskValue = DAG.getConstant(Mask, DL, SrcVT);
24400 return DAG.getNode(ISD::AND, DL, SrcVT, Src, MaskValue);
24401 };
24402
24403 // For sub-128-bit vector, cast to (legal) integer and compare with zero.
24404 if (VT.getSizeInBits() < 128) {
24405 EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), VT.getSizeInBits());
24406 if (!DAG.getTargetLoweringInfo().isTypeLegal(IntVT)) {
24407 if (IntVT != MVT::i64)
24408 return SDValue();
24409 auto SplitLHS = DAG.SplitScalar(DAG.getBitcast(IntVT, MaskBits(LHS)), DL,
24410 MVT::i32, MVT::i32);
24411 auto SplitRHS = DAG.SplitScalar(DAG.getBitcast(IntVT, MaskBits(RHS)), DL,
24412 MVT::i32, MVT::i32);
24413 SDValue Lo =
24414 DAG.getNode(ISD::XOR, DL, MVT::i32, SplitLHS.first, SplitRHS.first);
24415 SDValue Hi =
24416 DAG.getNode(ISD::XOR, DL, MVT::i32, SplitLHS.second, SplitRHS.second);
24417 return DAG.getNode(X86ISD::CMP, DL, MVT::i32,
24418 DAG.getNode(ISD::OR, DL, MVT::i32, Lo, Hi),
24419 DAG.getConstant(0, DL, MVT::i32));
24420 }
24421 return DAG.getNode(X86ISD::CMP, DL, MVT::i32,
24422 DAG.getBitcast(IntVT, MaskBits(LHS)),
24423 DAG.getBitcast(IntVT, MaskBits(RHS)));
24424 }
24425
24426 // Without PTEST, a masked v2i64 or-reduction is not faster than
24427 // scalarization.
24428 bool UseKORTEST = Subtarget.useAVX512Regs();
24429 bool UsePTEST = Subtarget.hasSSE41();
24430 if (!UsePTEST && !Mask.isAllOnes() && ScalarSize > 32)
24431 return SDValue();
24432
24433 // Split down to 128/256/512-bit vector.
24434 unsigned TestSize = UseKORTEST ? 512 : (Subtarget.hasAVX() ? 256 : 128);
24435
24436 // If the input vector has vector elements wider than the target test size,
24437 // then cast to <X x i64> so it will safely split.
24438 if (ScalarSize > TestSize) {
24439 if (!Mask.isAllOnes())
24440 return SDValue();
24441 VT = EVT::getVectorVT(*DAG.getContext(), MVT::i64, VT.getSizeInBits() / 64);
24442 LHS = DAG.getBitcast(VT, LHS);
24443 RHS = DAG.getBitcast(VT, RHS);
24444 Mask = APInt::getAllOnes(64);
24445 }
24446
24447 if (VT.getSizeInBits() > TestSize) {
24448 KnownBits KnownRHS = DAG.computeKnownBits(RHS);
24449 if (KnownRHS.isConstant() && KnownRHS.getConstant() == Mask) {
24450 // If ICMP(AND(LHS,MASK),MASK) - reduce using AND splits.
24451 while (VT.getSizeInBits() > TestSize) {
24452 auto Split = DAG.SplitVector(LHS, DL);
24453 VT = Split.first.getValueType();
24454 LHS = DAG.getNode(ISD::AND, DL, VT, Split.first, Split.second);
24455 }
24456 RHS = DAG.getAllOnesConstant(DL, VT);
24457 } else if (!UsePTEST && !KnownRHS.isZero()) {
24458 // MOVMSK Special Case:
24459 // ALLOF(CMPEQ(X,Y)) -> AND(CMPEQ(X[0],Y[0]),CMPEQ(X[1],Y[1]),....)
24460 MVT SVT = ScalarSize >= 32 ? MVT::i32 : MVT::i8;
24461 VT = MVT::getVectorVT(SVT, VT.getSizeInBits() / SVT.getSizeInBits());
24462 LHS = DAG.getBitcast(VT, MaskBits(LHS));
24463 RHS = DAG.getBitcast(VT, MaskBits(RHS));
24464 EVT BoolVT = VT.changeVectorElementType(MVT::i1);
24465 SDValue V = DAG.getSetCC(DL, BoolVT, LHS, RHS, ISD::SETEQ);
24466 V = DAG.getSExtOrTrunc(V, DL, VT);
24467 while (VT.getSizeInBits() > TestSize) {
24468 auto Split = DAG.SplitVector(V, DL);
24469 VT = Split.first.getValueType();
24470 V = DAG.getNode(ISD::AND, DL, VT, Split.first, Split.second);
24471 }
24472 V = DAG.getNOT(DL, V, VT);
24473 V = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, V);
24474 return DAG.getNode(X86ISD::CMP, DL, MVT::i32, V,
24475 DAG.getConstant(0, DL, MVT::i32));
24476 } else {
24477 // Convert to a ICMP_EQ(XOR(LHS,RHS),0) pattern.
24478 SDValue V = DAG.getNode(ISD::XOR, DL, VT, LHS, RHS);
24479 while (VT.getSizeInBits() > TestSize) {
24480 auto Split = DAG.SplitVector(V, DL);
24481 VT = Split.first.getValueType();
24482 V = DAG.getNode(ISD::OR, DL, VT, Split.first, Split.second);
24483 }
24484 LHS = V;
24485 RHS = DAG.getConstant(0, DL, VT);
24486 }
24487 }
24488
24489 if (UseKORTEST && VT.is512BitVector()) {
24490 MVT TestVT = MVT::getVectorVT(MVT::i32, VT.getSizeInBits() / 32);
24491 MVT BoolVT = TestVT.changeVectorElementType(MVT::i1);
24492 LHS = DAG.getBitcast(TestVT, MaskBits(LHS));
24493 RHS = DAG.getBitcast(TestVT, MaskBits(RHS));
24494 SDValue V = DAG.getSetCC(DL, BoolVT, LHS, RHS, ISD::SETNE);
24495 return DAG.getNode(X86ISD::KORTEST, DL, MVT::i32, V, V);
24496 }
24497
24498 if (UsePTEST) {
24499 MVT TestVT = MVT::getVectorVT(MVT::i64, VT.getSizeInBits() / 64);
24500 LHS = DAG.getBitcast(TestVT, MaskBits(LHS));
24501 RHS = DAG.getBitcast(TestVT, MaskBits(RHS));
24502 SDValue V = DAG.getNode(ISD::XOR, DL, TestVT, LHS, RHS);
24503 return DAG.getNode(X86ISD::PTEST, DL, MVT::i32, V, V);
24504 }
24505
24506   assert(VT.getSizeInBits() == 128 && "Failure to split to 128-bits");
24507 MVT MaskVT = ScalarSize >= 32 ? MVT::v4i32 : MVT::v16i8;
24508 LHS = DAG.getBitcast(MaskVT, MaskBits(LHS));
24509 RHS = DAG.getBitcast(MaskVT, MaskBits(RHS));
24510 SDValue V = DAG.getNode(X86ISD::PCMPEQ, DL, MaskVT, LHS, RHS);
24511 V = DAG.getNOT(DL, V, MaskVT);
24512 V = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, V);
24513 return DAG.getNode(X86ISD::CMP, DL, MVT::i32, V,
24514 DAG.getConstant(0, DL, MVT::i32));
24515}
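// A standalone illustrative sketch (not part of this file): the UsePTEST branch
// above -- XOR the operands and let PTEST set ZF when the difference is all-zero.
// The helper name equal16_ptest is hypothetical and assumes SSE4.1.
#include <smmintrin.h>

static bool equal16_ptest(__m128i A, __m128i B) {
  __m128i Diff = _mm_xor_si128(A, B);        // all-zero iff A == B
  return _mm_testz_si128(Diff, Diff) != 0;   // PTEST: ZF <- ((Diff & Diff) == 0)
}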
24516
24517 // Check whether an AND/OR'd reduction tree is PTEST-able, or if we can fall back
24518// to CMP(MOVMSK(PCMPEQB(X,Y))).
24519static SDValue MatchVectorAllEqualTest(SDValue LHS, SDValue RHS,
24520 ISD::CondCode CC, const SDLoc &DL,
24521 const X86Subtarget &Subtarget,
24522 SelectionDAG &DAG,
24523 X86::CondCode &X86CC) {
24524   assert((CC == ISD::SETEQ || CC == ISD::SETNE) && "Unsupported ISD::CondCode");
24525
24526 bool CmpNull = isNullConstant(RHS);
24527 bool CmpAllOnes = isAllOnesConstant(RHS);
24528 if (!CmpNull && !CmpAllOnes)
24529 return SDValue();
24530
24531 SDValue Op = LHS;
24532 if (!Subtarget.hasSSE2() || !Op->hasOneUse())
24533 return SDValue();
24534
24535 // Check whether we're masking/truncating an OR-reduction result, in which
24536 // case track the masked bits.
24537 // TODO: Add CmpAllOnes support.
24538 APInt Mask = APInt::getAllOnes(Op.getScalarValueSizeInBits());
24539 if (CmpNull) {
24540 switch (Op.getOpcode()) {
24541 case ISD::TRUNCATE: {
24542 SDValue Src = Op.getOperand(0);
24543 Mask = APInt::getLowBitsSet(Src.getScalarValueSizeInBits(),
24544 Op.getScalarValueSizeInBits());
24545 Op = Src;
24546 break;
24547 }
24548 case ISD::AND: {
24549 if (auto *Cst = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
24550 Mask = Cst->getAPIntValue();
24551 Op = Op.getOperand(0);
24552 }
24553 break;
24554 }
24555 }
24556 }
24557
24558 ISD::NodeType LogicOp = CmpNull ? ISD::OR : ISD::AND;
24559
24560 // Match icmp(or(extract(X,0),extract(X,1)),0) anyof reduction patterns.
24561 // Match icmp(and(extract(X,0),extract(X,1)),-1) allof reduction patterns.
24562 SmallVector<SDValue, 8> VecIns;
24563 if (Op.getOpcode() == LogicOp && matchScalarReduction(Op, LogicOp, VecIns)) {
24564 EVT VT = VecIns[0].getValueType();
24565     assert(llvm::all_of(VecIns,
24566                         [VT](SDValue V) { return VT == V.getValueType(); }) &&
24567            "Reduction source vector mismatch");
24568
24569 // Quit if not splittable to scalar/128/256/512-bit vector.
24570 if (!llvm::has_single_bit<uint32_t>(VT.getSizeInBits()))
24571 return SDValue();
24572
24573 // If more than one full vector is evaluated, AND/OR them first before
24574 // PTEST.
24575 for (unsigned Slot = 0, e = VecIns.size(); e - Slot > 1;
24576 Slot += 2, e += 1) {
24577 // Each iteration will AND/OR 2 nodes and append the result until there is
24578 // only 1 node left, i.e. the final value of all vectors.
24579 SDValue LHS = VecIns[Slot];
24580 SDValue RHS = VecIns[Slot + 1];
24581 VecIns.push_back(DAG.getNode(LogicOp, DL, VT, LHS, RHS));
24582 }
24583
24584 return LowerVectorAllEqual(DL, VecIns.back(),
24585 CmpNull ? DAG.getConstant(0, DL, VT)
24586 : DAG.getAllOnesConstant(DL, VT),
24587 CC, Mask, Subtarget, DAG, X86CC);
24588 }
24589
24590 // Match icmp(reduce_or(X),0) anyof reduction patterns.
24591 // Match icmp(reduce_and(X),-1) allof reduction patterns.
24592 if (Op.getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
24593 ISD::NodeType BinOp;
24594 if (SDValue Match =
24595 DAG.matchBinOpReduction(Op.getNode(), BinOp, {LogicOp})) {
24596 EVT MatchVT = Match.getValueType();
24597 return LowerVectorAllEqual(DL, Match,
24598 CmpNull ? DAG.getConstant(0, DL, MatchVT)
24599 : DAG.getAllOnesConstant(DL, MatchVT),
24600 CC, Mask, Subtarget, DAG, X86CC);
24601 }
24602 }
24603
24604 if (Mask.isAllOnes()) {
24605    assert(!Op.getValueType().isVector() &&
24606           "Illegal vector type for reduction pattern");
24607 SDValue Src = peekThroughBitcasts(Op);
24608 if (Src.getValueType().isFixedLengthVector() &&
24609 Src.getValueType().getScalarType() == MVT::i1) {
24610 // Match icmp(bitcast(icmp_ne(X,Y)),0) reduction patterns.
24611 // Match icmp(bitcast(icmp_eq(X,Y)),-1) reduction patterns.
24612 if (Src.getOpcode() == ISD::SETCC) {
24613 SDValue LHS = Src.getOperand(0);
24614 SDValue RHS = Src.getOperand(1);
24615 EVT LHSVT = LHS.getValueType();
24616 ISD::CondCode SrcCC = cast<CondCodeSDNode>(Src.getOperand(2))->get();
24617 if (SrcCC == (CmpNull ? ISD::SETNE : ISD::SETEQ) &&
24618 llvm::has_single_bit<uint32_t>(LHSVT.getSizeInBits())) {
24619 APInt SrcMask = APInt::getAllOnes(LHSVT.getScalarSizeInBits());
24620 return LowerVectorAllEqual(DL, LHS, RHS, CC, SrcMask, Subtarget, DAG,
24621 X86CC);
24622 }
24623 }
24624 // Match icmp(bitcast(vXi1 trunc(Y)),0) reduction patterns.
24625 // Match icmp(bitcast(vXi1 trunc(Y)),-1) reduction patterns.
24626 // Peek through truncation, mask the LSB and compare against zero/LSB.
24627 if (Src.getOpcode() == ISD::TRUNCATE) {
24628 SDValue Inner = Src.getOperand(0);
24629 EVT InnerVT = Inner.getValueType();
24630 if (llvm::has_single_bit<uint32_t>(InnerVT.getSizeInBits())) {
24631 unsigned BW = InnerVT.getScalarSizeInBits();
24632 APInt SrcMask = APInt(BW, 1);
24633 APInt Cmp = CmpNull ? APInt::getZero(BW) : SrcMask;
24634 return LowerVectorAllEqual(DL, Inner,
24635 DAG.getConstant(Cmp, DL, InnerVT), CC,
24636 SrcMask, Subtarget, DAG, X86CC);
24637 }
24638 }
24639 }
24640 }
24641
24642 return SDValue();
24643}
24644
24645/// Return true if \c Op has a use that doesn't just read flags.
24646static bool hasNonFlagsUse(SDValue Op) {
24647 for (SDNode::use_iterator UI = Op->use_begin(), UE = Op->use_end(); UI != UE;
24648 ++UI) {
24649 SDNode *User = *UI;
24650 unsigned UOpNo = UI.getOperandNo();
24651 if (User->getOpcode() == ISD::TRUNCATE && User->hasOneUse()) {
24652      // Look past the truncate.
24653 UOpNo = User->use_begin().getOperandNo();
24654 User = *User->use_begin();
24655 }
24656
24657 if (User->getOpcode() != ISD::BRCOND && User->getOpcode() != ISD::SETCC &&
24658 !(User->getOpcode() == ISD::SELECT && UOpNo == 0))
24659 return true;
24660 }
24661 return false;
24662}
24663
24664// Transform to an x86-specific ALU node with flags if there is a chance of
24665// using an RMW op or only the flags are used. Otherwise, leave
24666// the node alone and emit a 'cmp' or 'test' instruction.
24667static bool isProfitableToUseFlagOp(SDValue Op) {
24668 for (SDNode *U : Op->uses())
24669 if (U->getOpcode() != ISD::CopyToReg &&
24670 U->getOpcode() != ISD::SETCC &&
24671 U->getOpcode() != ISD::STORE)
24672 return false;
24673
24674 return true;
24675}
24676
24677/// Emit nodes that will be selected as "test Op0,Op0", or something
24678/// equivalent.
24679static SDValue EmitTest(SDValue Op, unsigned X86CC, const SDLoc &dl,
24680 SelectionDAG &DAG, const X86Subtarget &Subtarget) {
24681 // CF and OF aren't always set the way we want. Determine which
24682 // of these we need.
24683 bool NeedCF = false;
24684 bool NeedOF = false;
24685 switch (X86CC) {
24686 default: break;
24687 case X86::COND_A: case X86::COND_AE:
24688 case X86::COND_B: case X86::COND_BE:
24689 NeedCF = true;
24690 break;
24691 case X86::COND_G: case X86::COND_GE:
24692 case X86::COND_L: case X86::COND_LE:
24693 case X86::COND_O: case X86::COND_NO: {
24694 // Check if we really need to set the
24695 // Overflow flag. If NoSignedWrap is present
24696 // that is not actually needed.
24697 switch (Op->getOpcode()) {
24698 case ISD::ADD:
24699 case ISD::SUB:
24700 case ISD::MUL:
24701 case ISD::SHL:
24702 if (Op.getNode()->getFlags().hasNoSignedWrap())
24703 break;
24704 [[fallthrough]];
24705 default:
24706 NeedOF = true;
24707 break;
24708 }
24709 break;
24710 }
24711 }
24712 // See if we can use the EFLAGS value from the operand instead of
24713 // doing a separate TEST. TEST always sets OF and CF to 0, so unless
24714 // we prove that the arithmetic won't overflow, we can't use OF or CF.
24715 if (Op.getResNo() != 0 || NeedOF || NeedCF) {
24716 // Emit a CMP with 0, which is the TEST pattern.
24717 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
24718 DAG.getConstant(0, dl, Op.getValueType()));
24719 }
24720 unsigned Opcode = 0;
24721 unsigned NumOperands = 0;
24722
24723 SDValue ArithOp = Op;
24724
24725 // NOTICE: In the code below we use ArithOp to hold the arithmetic operation
24726 // which may be the result of a CAST. We use the variable 'Op', which is the
24727 // non-casted variable when we check for possible users.
24728 switch (ArithOp.getOpcode()) {
24729 case ISD::AND:
24730 // If the primary 'and' result isn't used, don't bother using X86ISD::AND,
24731 // because a TEST instruction will be better.
24732 if (!hasNonFlagsUse(Op))
24733 break;
24734
24735 [[fallthrough]];
24736 case ISD::ADD:
24737 case ISD::SUB:
24738 case ISD::OR:
24739 case ISD::XOR:
24740 if (!isProfitableToUseFlagOp(Op))
24741 break;
24742
24743 // Otherwise use a regular EFLAGS-setting instruction.
24744 switch (ArithOp.getOpcode()) {
24745    default: llvm_unreachable("unexpected operator!");
24746 case ISD::ADD: Opcode = X86ISD::ADD; break;
24747 case ISD::SUB: Opcode = X86ISD::SUB; break;
24748 case ISD::XOR: Opcode = X86ISD::XOR; break;
24749 case ISD::AND: Opcode = X86ISD::AND; break;
24750 case ISD::OR: Opcode = X86ISD::OR; break;
24751 }
24752
24753 NumOperands = 2;
24754 break;
24755 case X86ISD::ADD:
24756 case X86ISD::SUB:
24757 case X86ISD::OR:
24758 case X86ISD::XOR:
24759 case X86ISD::AND:
24760 return SDValue(Op.getNode(), 1);
24761 case ISD::SSUBO:
24762 case ISD::USUBO: {
24763    // USUBO/SSUBO will become an X86ISD::SUB, and we can use its Z flag.
24764 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
24765 return DAG.getNode(X86ISD::SUB, dl, VTs, Op->getOperand(0),
24766 Op->getOperand(1)).getValue(1);
24767 }
24768 default:
24769 break;
24770 }
24771
24772 if (Opcode == 0) {
24773 // Emit a CMP with 0, which is the TEST pattern.
24774 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
24775 DAG.getConstant(0, dl, Op.getValueType()));
24776 }
24777 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
24778 SmallVector<SDValue, 4> Ops(Op->op_begin(), Op->op_begin() + NumOperands);
24779
24780 SDValue New = DAG.getNode(Opcode, dl, VTs, Ops);
24781 DAG.ReplaceAllUsesOfValueWith(SDValue(Op.getNode(), 0), New);
24782 return SDValue(New.getNode(), 1);
24783}
24784
24785/// Emit nodes that will be selected as "cmp Op0,Op1", or something
24786/// equivalent.
24787static SDValue EmitCmp(SDValue Op0, SDValue Op1, unsigned X86CC,
24788 const SDLoc &dl, SelectionDAG &DAG,
24789 const X86Subtarget &Subtarget) {
24790 if (isNullConstant(Op1))
24791 return EmitTest(Op0, X86CC, dl, DAG, Subtarget);
24792
24793 EVT CmpVT = Op0.getValueType();
24794
24795  assert((CmpVT == MVT::i8 || CmpVT == MVT::i16 ||
24796          CmpVT == MVT::i32 || CmpVT == MVT::i64) && "Unexpected VT!");
24797
24798 // Only promote the compare up to I32 if it is a 16 bit operation
24799 // with an immediate. 16 bit immediates are to be avoided.
24800 if (CmpVT == MVT::i16 && !Subtarget.isAtom() &&
24801 !DAG.getMachineFunction().getFunction().hasMinSize()) {
24802 ConstantSDNode *COp0 = dyn_cast<ConstantSDNode>(Op0);
24803 ConstantSDNode *COp1 = dyn_cast<ConstantSDNode>(Op1);
24804 // Don't do this if the immediate can fit in 8-bits.
24805 if ((COp0 && !COp0->getAPIntValue().isSignedIntN(8)) ||
24806 (COp1 && !COp1->getAPIntValue().isSignedIntN(8))) {
24807 unsigned ExtendOp =
24808 isX86CCSigned(X86CC) ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
24809 if (X86CC == X86::COND_E || X86CC == X86::COND_NE) {
24810 // For equality comparisons try to use SIGN_EXTEND if the input was
24811 // truncate from something with enough sign bits.
24812 if (Op0.getOpcode() == ISD::TRUNCATE) {
24813 if (DAG.ComputeMaxSignificantBits(Op0.getOperand(0)) <= 16)
24814 ExtendOp = ISD::SIGN_EXTEND;
24815 } else if (Op1.getOpcode() == ISD::TRUNCATE) {
24816 if (DAG.ComputeMaxSignificantBits(Op1.getOperand(0)) <= 16)
24817 ExtendOp = ISD::SIGN_EXTEND;
24818 }
24819 }
24820
24821 CmpVT = MVT::i32;
24822 Op0 = DAG.getNode(ExtendOp, dl, CmpVT, Op0);
24823 Op1 = DAG.getNode(ExtendOp, dl, CmpVT, Op1);
24824 }
24825 }
24826
24827 // Try to shrink i64 compares if the input has enough zero bits.
24828 // FIXME: Do this for non-constant compares for constant on LHS?
24829 if (CmpVT == MVT::i64 && isa<ConstantSDNode>(Op1) && !isX86CCSigned(X86CC) &&
24830 Op0.hasOneUse() && // Hacky way to not break CSE opportunities with sub.
24831 cast<ConstantSDNode>(Op1)->getAPIntValue().getActiveBits() <= 32 &&
24832 DAG.MaskedValueIsZero(Op0, APInt::getHighBitsSet(64, 32))) {
24833 CmpVT = MVT::i32;
24834 Op0 = DAG.getNode(ISD::TRUNCATE, dl, CmpVT, Op0);
24835 Op1 = DAG.getNode(ISD::TRUNCATE, dl, CmpVT, Op1);
24836 }
24837
24838 // 0-x == y --> x+y == 0
24839 // 0-x != y --> x+y != 0
24840 if (Op0.getOpcode() == ISD::SUB && isNullConstant(Op0.getOperand(0)) &&
24841 Op0.hasOneUse() && (X86CC == X86::COND_E || X86CC == X86::COND_NE)) {
24842 SDVTList VTs = DAG.getVTList(CmpVT, MVT::i32);
24843 SDValue Add = DAG.getNode(X86ISD::ADD, dl, VTs, Op0.getOperand(1), Op1);
24844 return Add.getValue(1);
24845 }
24846
24847 // x == 0-y --> x+y == 0
24848 // x != 0-y --> x+y != 0
24849 if (Op1.getOpcode() == ISD::SUB && isNullConstant(Op1.getOperand(0)) &&
24850 Op1.hasOneUse() && (X86CC == X86::COND_E || X86CC == X86::COND_NE)) {
24851 SDVTList VTs = DAG.getVTList(CmpVT, MVT::i32);
24852 SDValue Add = DAG.getNode(X86ISD::ADD, dl, VTs, Op0, Op1.getOperand(1));
24853 return Add.getValue(1);
24854 }
24855
24856 // Use SUB instead of CMP to enable CSE between SUB and CMP.
24857 SDVTList VTs = DAG.getVTList(CmpVT, MVT::i32);
24858 SDValue Sub = DAG.getNode(X86ISD::SUB, dl, VTs, Op0, Op1);
24859 return Sub.getValue(1);
24860}
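
// Illustrative sketch (not part of the original source): a scalar,
// two's-complement model of the "0-x == y --> x+y == 0" rewrite performed
// above. The helper name is hypothetical; it only demonstrates the identity.
static bool negCompareIdentityHolds(unsigned X, unsigned Y) {
  bool Original = (0u - X) == Y;  // compare 0-x against y
  bool Rewritten = (X + Y) == 0u; // compare x+y against 0
  return Original == Rewritten;   // holds for every X and Y (mod 2^32)
}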
24861
24862/// Check if replacement of SQRT with RSQRT should be disabled.
24863bool X86TargetLowering::isFsqrtCheap(SDValue Op, SelectionDAG &DAG) const {
24864 EVT VT = Op.getValueType();
24865
24866 // We don't need to replace SQRT with RSQRT for half type.
24867 if (VT.getScalarType() == MVT::f16)
24868 return true;
24869
24870 // We never want to use both SQRT and RSQRT instructions for the same input.
24871 if (DAG.doesNodeExist(X86ISD::FRSQRT, DAG.getVTList(VT), Op))
24872 return false;
24873
24874 if (VT.isVector())
24875 return Subtarget.hasFastVectorFSQRT();
24876 return Subtarget.hasFastScalarFSQRT();
24877}
24878
24879/// The minimum architected relative accuracy is 2^-12. We need one
24880/// Newton-Raphson step to have a good float result (24 bits of precision).
24881SDValue X86TargetLowering::getSqrtEstimate(SDValue Op,
24882 SelectionDAG &DAG, int Enabled,
24883 int &RefinementSteps,
24884 bool &UseOneConstNR,
24885 bool Reciprocal) const {
24886 SDLoc DL(Op);
24887 EVT VT = Op.getValueType();
24888
24889 // SSE1 has rsqrtss and rsqrtps. AVX adds a 256-bit variant for rsqrtps.
24890 // It is likely not profitable to do this for f64 because a double-precision
24891 // rsqrt estimate with refinement on x86 prior to FMA requires at least 16
24892 // instructions: convert to single, rsqrtss, convert back to double, refine
24893 // (3 steps = at least 13 insts). If an 'rsqrtsd' variant was added to the ISA
24894 // along with FMA, this could be a throughput win.
24895 // TODO: SQRT requires SSE2 to prevent the introduction of an illegal v4i32
24896 // after legalize types.
24897 if ((VT == MVT::f32 && Subtarget.hasSSE1()) ||
24898 (VT == MVT::v4f32 && Subtarget.hasSSE1() && Reciprocal) ||
24899 (VT == MVT::v4f32 && Subtarget.hasSSE2() && !Reciprocal) ||
24900 (VT == MVT::v8f32 && Subtarget.hasAVX()) ||
24901 (VT == MVT::v16f32 && Subtarget.useAVX512Regs())) {
24902 if (RefinementSteps == ReciprocalEstimate::Unspecified)
24903 RefinementSteps = 1;
24904
24905 UseOneConstNR = false;
24906    // There is no FRSQRT for 512-bit vectors, but there is RSQRT14.
24907 unsigned Opcode = VT == MVT::v16f32 ? X86ISD::RSQRT14 : X86ISD::FRSQRT;
24908 SDValue Estimate = DAG.getNode(Opcode, DL, VT, Op);
24909 if (RefinementSteps == 0 && !Reciprocal)
24910 Estimate = DAG.getNode(ISD::FMUL, DL, VT, Op, Estimate);
24911 return Estimate;
24912 }
24913
24914 if (VT.getScalarType() == MVT::f16 && isTypeLegal(VT) &&
24915 Subtarget.hasFP16()) {
24916    assert(Reciprocal && "Don't replace SQRT with RSQRT for half type");
24917 if (RefinementSteps == ReciprocalEstimate::Unspecified)
24918 RefinementSteps = 0;
24919
24920 if (VT == MVT::f16) {
24921 SDValue Zero = DAG.getIntPtrConstant(0, DL);
24922 SDValue Undef = DAG.getUNDEF(MVT::v8f16);
24923 Op = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v8f16, Op);
24924 Op = DAG.getNode(X86ISD::RSQRT14S, DL, MVT::v8f16, Undef, Op);
24925 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f16, Op, Zero);
24926 }
24927
24928 return DAG.getNode(X86ISD::RSQRT14, DL, VT, Op);
24929 }
24930 return SDValue();
24931}
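
// Illustrative sketch (not part of the original source): one Newton-Raphson
// refinement step for a reciprocal square-root estimate, as referenced in the
// comment above. Starting from a ~2^-12 accurate estimate E of 1/sqrt(A), one
// step is roughly enough for full float precision. The helper name is
// hypothetical.
static float refineRsqrtOnce(float A, float E) {
  // Iteration for f(x) = 1/x^2 - A:  x1 = x0 * (1.5 - 0.5 * A * x0 * x0)
  return E * (1.5f - 0.5f * A * E * E);
}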
24932
24933/// The minimum architected relative accuracy is 2^-12. We need one
24934/// Newton-Raphson step to have a good float result (24 bits of precision).
24935SDValue X86TargetLowering::getRecipEstimate(SDValue Op, SelectionDAG &DAG,
24936 int Enabled,
24937 int &RefinementSteps) const {
24938 SDLoc DL(Op);
24939 EVT VT = Op.getValueType();
24940
24941 // SSE1 has rcpss and rcpps. AVX adds a 256-bit variant for rcpps.
24942 // It is likely not profitable to do this for f64 because a double-precision
24943 // reciprocal estimate with refinement on x86 prior to FMA requires
24944 // 15 instructions: convert to single, rcpss, convert back to double, refine
24945 // (3 steps = 12 insts). If an 'rcpsd' variant was added to the ISA
24946 // along with FMA, this could be a throughput win.
24947
24948 if ((VT == MVT::f32 && Subtarget.hasSSE1()) ||
24949 (VT == MVT::v4f32 && Subtarget.hasSSE1()) ||
24950 (VT == MVT::v8f32 && Subtarget.hasAVX()) ||
24951 (VT == MVT::v16f32 && Subtarget.useAVX512Regs())) {
24952 // Enable estimate codegen with 1 refinement step for vector division.
24953 // Scalar division estimates are disabled because they break too much
24954 // real-world code. These defaults are intended to match GCC behavior.
24955 if (VT == MVT::f32 && Enabled == ReciprocalEstimate::Unspecified)
24956 return SDValue();
24957
24958 if (RefinementSteps == ReciprocalEstimate::Unspecified)
24959 RefinementSteps = 1;
24960
24961    // There is no FRCP for 512-bit vectors, but there is RCP14.
24962 unsigned Opcode = VT == MVT::v16f32 ? X86ISD::RCP14 : X86ISD::FRCP;
24963 return DAG.getNode(Opcode, DL, VT, Op);
24964 }
24965
24966 if (VT.getScalarType() == MVT::f16 && isTypeLegal(VT) &&
24967 Subtarget.hasFP16()) {
24968 if (RefinementSteps == ReciprocalEstimate::Unspecified)
24969 RefinementSteps = 0;
24970
24971 if (VT == MVT::f16) {
24972 SDValue Zero = DAG.getIntPtrConstant(0, DL);
24973 SDValue Undef = DAG.getUNDEF(MVT::v8f16);
24974 Op = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v8f16, Op);
24975 Op = DAG.getNode(X86ISD::RCP14S, DL, MVT::v8f16, Undef, Op);
24976 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f16, Op, Zero);
24977 }
24978
24979 return DAG.getNode(X86ISD::RCP14, DL, VT, Op);
24980 }
24981 return SDValue();
24982}
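
// Illustrative sketch (not part of the original source): the matching
// Newton-Raphson step for a reciprocal estimate, which is what a nonzero
// RefinementSteps value requests above. The helper name is hypothetical.
static float refineRecipOnce(float A, float E) {
  // Iteration for f(x) = 1/x - A:  x1 = x0 * (2 - A * x0)
  return E * (2.0f - A * E);
}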
24983
24984/// If we have at least two divisions that use the same divisor, convert to
24985/// multiplication by a reciprocal. This may need to be adjusted for a given
24986/// CPU if a division's cost is not at least twice the cost of a multiplication.
24987/// This is because we still need one division to calculate the reciprocal and
24988/// then we need two multiplies by that reciprocal as replacements for the
24989/// original divisions.
24990unsigned X86TargetLowering::combineRepeatedFPDivisors() const {
24991 return 2;
24992}
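
// Illustrative sketch (not part of the original source): the shape of the
// transform described above. Two divisions by the same divisor become one
// division (to form the reciprocal) plus two multiplies, which is why the
// threshold returned above is 2. Valid only under reassociation-friendly
// fast-math assumptions; the helper name is hypothetical.
static void divideTwoBySameDivisor(float X, float Y, float D,
                                   float &A, float &B) {
  // Before: A = X / D;  B = Y / D;   (two divisions)
  float Recip = 1.0f / D; // one division
  A = X * Recip;          // two multiplies replace the original divisions
  B = Y * Recip;
}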
24993
24994SDValue
24995X86TargetLowering::BuildSDIVPow2(SDNode *N, const APInt &Divisor,
24996 SelectionDAG &DAG,
24997 SmallVectorImpl<SDNode *> &Created) const {
24998 AttributeList Attr = DAG.getMachineFunction().getFunction().getAttributes();
24999 if (isIntDivCheap(N->getValueType(0), Attr))
25000 return SDValue(N,0); // Lower SDIV as SDIV
25001
25002  assert((Divisor.isPowerOf2() || Divisor.isNegatedPowerOf2()) &&
25003         "Unexpected divisor!");
25004
25005 // Only perform this transform if CMOV is supported otherwise the select
25006 // below will become a branch.
25007 if (!Subtarget.canUseCMOV())
25008 return SDValue();
25009
25010 // fold (sdiv X, pow2)
25011 EVT VT = N->getValueType(0);
25012 // FIXME: Support i8.
25013 if (VT != MVT::i16 && VT != MVT::i32 &&
25014 !(Subtarget.is64Bit() && VT == MVT::i64))
25015 return SDValue();
25016
25017 unsigned Lg2 = Divisor.countr_zero();
25018
25019 // If the divisor is 2 or -2, the default expansion is better.
25020 if (Lg2 == 1)
25021 return SDValue();
25022
25023 SDLoc DL(N);
25024 SDValue N0 = N->getOperand(0);
25025 SDValue Zero = DAG.getConstant(0, DL, VT);
25026 APInt Lg2Mask = APInt::getLowBitsSet(VT.getSizeInBits(), Lg2);
25027 SDValue Pow2MinusOne = DAG.getConstant(Lg2Mask, DL, VT);
25028
25029 // If N0 is negative, we need to add (Pow2 - 1) to it before shifting right.
25030 SDValue Cmp = DAG.getSetCC(DL, MVT::i8, N0, Zero, ISD::SETLT);
25031 SDValue Add = DAG.getNode(ISD::ADD, DL, VT, N0, Pow2MinusOne);
25032 SDValue CMov = DAG.getNode(ISD::SELECT, DL, VT, Cmp, Add, N0);
25033
25034 Created.push_back(Cmp.getNode());
25035 Created.push_back(Add.getNode());
25036 Created.push_back(CMov.getNode());
25037
25038 // Divide by pow2.
25039 SDValue SRA =
25040 DAG.getNode(ISD::SRA, DL, VT, CMov, DAG.getConstant(Lg2, DL, MVT::i8));
25041
25042 // If we're dividing by a positive value, we're done. Otherwise, we must
25043 // negate the result.
25044 if (Divisor.isNonNegative())
25045 return SRA;
25046
25047 Created.push_back(SRA.getNode());
25048 return DAG.getNode(ISD::SUB, DL, VT, Zero, SRA);
25049}
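
// Illustrative sketch (not part of the original source): the scalar form of
// the expansion built above for a positive power-of-two divisor. Assumes '>>'
// on a negative int is an arithmetic shift (true for the x86 targets this
// file serves); the helper name is hypothetical.
static int sdivByPow2(int X, unsigned Lg2) {
  int Bias = (1 << Lg2) - 1;      // Pow2MinusOne above
  int Adj = X < 0 ? X + Bias : X; // the CMOV selects X or X+Bias
  return Adj >> Lg2;              // SRA then rounds toward zero, matching sdiv
}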
25050
25051/// Result of 'and' is compared against zero. Change to a BT node if possible.
25052/// Returns the BT node and the condition code needed to use it.
25053static SDValue LowerAndToBT(SDValue And, ISD::CondCode CC, const SDLoc &dl,
25054 SelectionDAG &DAG, X86::CondCode &X86CC) {
25055  assert(And.getOpcode() == ISD::AND && "Expected AND node!");
25056 SDValue Op0 = And.getOperand(0);
25057 SDValue Op1 = And.getOperand(1);
25058 if (Op0.getOpcode() == ISD::TRUNCATE)
25059 Op0 = Op0.getOperand(0);
25060 if (Op1.getOpcode() == ISD::TRUNCATE)
25061 Op1 = Op1.getOperand(0);
25062
25063 SDValue Src, BitNo;
25064 if (Op1.getOpcode() == ISD::SHL)
25065 std::swap(Op0, Op1);
25066 if (Op0.getOpcode() == ISD::SHL) {
25067 if (isOneConstant(Op0.getOperand(0))) {
25068 // If we looked past a truncate, check that it's only truncating away
25069 // known zeros.
25070 unsigned BitWidth = Op0.getValueSizeInBits();
25071 unsigned AndBitWidth = And.getValueSizeInBits();
25072 if (BitWidth > AndBitWidth) {
25073 KnownBits Known = DAG.computeKnownBits(Op0);
25074 if (Known.countMinLeadingZeros() < BitWidth - AndBitWidth)
25075 return SDValue();
25076 }
25077 Src = Op1;
25078 BitNo = Op0.getOperand(1);
25079 }
25080 } else if (Op1.getOpcode() == ISD::Constant) {
25081 ConstantSDNode *AndRHS = cast<ConstantSDNode>(Op1);
25082 uint64_t AndRHSVal = AndRHS->getZExtValue();
25083 SDValue AndLHS = Op0;
25084
25085 if (AndRHSVal == 1 && AndLHS.getOpcode() == ISD::SRL) {
25086 Src = AndLHS.getOperand(0);
25087 BitNo = AndLHS.getOperand(1);
25088 } else {
25089 // Use BT if the immediate can't be encoded in a TEST instruction or we
25090      // are optimizing for size and the immediate won't fit in a byte.
25091 bool OptForSize = DAG.shouldOptForSize();
25092 if ((!isUInt<32>(AndRHSVal) || (OptForSize && !isUInt<8>(AndRHSVal))) &&
25093 isPowerOf2_64(AndRHSVal)) {
25094 Src = AndLHS;
25095 BitNo = DAG.getConstant(Log2_64_Ceil(AndRHSVal), dl,
25096 Src.getValueType());
25097 }
25098 }
25099 }
25100
25101 // No patterns found, give up.
25102 if (!Src.getNode())
25103 return SDValue();
25104
25105 // Remove any bit flip.
25106 if (isBitwiseNot(Src)) {
25107 Src = Src.getOperand(0);
25108 CC = CC == ISD::SETEQ ? ISD::SETNE : ISD::SETEQ;
25109 }
25110
25111 // Attempt to create the X86ISD::BT node.
25112 if (SDValue BT = getBT(Src, BitNo, dl, DAG)) {
25113 X86CC = CC == ISD::SETEQ ? X86::COND_AE : X86::COND_B;
25114 return BT;
25115 }
25116
25117 return SDValue();
25118}
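
// Illustrative sketch (not part of the original source): the scalar
// equivalences that make the BT lowering above valid; both forms test bit N
// of Src. The helper name is hypothetical.
static bool bitTestFormsAgree(unsigned Src, unsigned N) {
  bool ViaShl = (Src & (1u << N)) != 0; // and(X, shl(1, N)) != 0
  bool ViaSrl = ((Src >> N) & 1u) != 0; // and(srl(X, N), 1) != 0
  return ViaShl == ViaSrl;              // always true for N < 32
}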
25119
25120// Check if pre-AVX condcode can be performed by a single FCMP op.
25121static bool cheapX86FSETCC_SSE(ISD::CondCode SetCCOpcode) {
25122 return (SetCCOpcode != ISD::SETONE) && (SetCCOpcode != ISD::SETUEQ);
25123}
25124
25125/// Turns an ISD::CondCode into a value suitable for SSE floating-point mask
25126/// CMPs.
25127static unsigned translateX86FSETCC(ISD::CondCode SetCCOpcode, SDValue &Op0,
25128 SDValue &Op1, bool &IsAlwaysSignaling) {
25129 unsigned SSECC;
25130 bool Swap = false;
25131
25132 // SSE Condition code mapping:
25133 // 0 - EQ
25134 // 1 - LT
25135 // 2 - LE
25136 // 3 - UNORD
25137 // 4 - NEQ
25138 // 5 - NLT
25139 // 6 - NLE
25140 // 7 - ORD
25141 switch (SetCCOpcode) {
25142  default: llvm_unreachable("Unexpected SETCC condition");
25143 case ISD::SETOEQ:
25144 case ISD::SETEQ: SSECC = 0; break;
25145 case ISD::SETOGT:
25146 case ISD::SETGT: Swap = true; [[fallthrough]];
25147 case ISD::SETLT:
25148 case ISD::SETOLT: SSECC = 1; break;
25149 case ISD::SETOGE:
25150 case ISD::SETGE: Swap = true; [[fallthrough]];
25151 case ISD::SETLE:
25152 case ISD::SETOLE: SSECC = 2; break;
25153 case ISD::SETUO: SSECC = 3; break;
25154 case ISD::SETUNE:
25155 case ISD::SETNE: SSECC = 4; break;
25156 case ISD::SETULE: Swap = true; [[fallthrough]];
25157 case ISD::SETUGE: SSECC = 5; break;
25158 case ISD::SETULT: Swap = true; [[fallthrough]];
25159 case ISD::SETUGT: SSECC = 6; break;
25160 case ISD::SETO: SSECC = 7; break;
25161 case ISD::SETUEQ: SSECC = 8; break;
25162 case ISD::SETONE: SSECC = 12; break;
25163 }
25164 if (Swap)
25165 std::swap(Op0, Op1);
25166
25167 switch (SetCCOpcode) {
25168 default:
25169 IsAlwaysSignaling = true;
25170 break;
25171 case ISD::SETEQ:
25172 case ISD::SETOEQ:
25173 case ISD::SETUEQ:
25174 case ISD::SETNE:
25175 case ISD::SETONE:
25176 case ISD::SETUNE:
25177 case ISD::SETO:
25178 case ISD::SETUO:
25179 IsAlwaysSignaling = false;
25180 break;
25181 }
25182
25183 return SSECC;
25184}
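
// Illustrative sketch (not part of the original source): a scalar model of
// the eight base SSE immediates listed in the mapping comment above (8 and 12
// additionally select UEQ/ONE and require the AVX encodings). The helper name
// is hypothetical; NaN is detected via self-comparison.
static bool ssePredicate(unsigned Imm, float A, float B) {
  bool Unordered = (A != A) || (B != B); // true if either operand is NaN
  switch (Imm) {
  case 0: return A == B;      // EQ    (false on NaN)
  case 1: return A < B;       // LT
  case 2: return A <= B;      // LE
  case 3: return Unordered;   // UNORD
  case 4: return !(A == B);   // NEQ   (true on NaN)
  case 5: return !(A < B);    // NLT
  case 6: return !(A <= B);   // NLE
  default: return !Unordered; // ORD
  }
}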
25185
25186/// Break a 256-bit integer VSETCC into two new 128-bit ones and then
25187/// concatenate the result back.
25188static SDValue splitIntVSETCC(EVT VT, SDValue LHS, SDValue RHS,
25189 ISD::CondCode Cond, SelectionDAG &DAG,
25190 const SDLoc &dl) {
25191  assert(VT.isInteger() && VT == LHS.getValueType() &&
25192         VT == RHS.getValueType() && "Unsupported VTs!");
25193
25194 SDValue CC = DAG.getCondCode(Cond);
25195
25196 // Extract the LHS Lo/Hi vectors
25197 SDValue LHS1, LHS2;
25198 std::tie(LHS1, LHS2) = splitVector(LHS, DAG, dl);
25199
25200 // Extract the RHS Lo/Hi vectors
25201 SDValue RHS1, RHS2;
25202 std::tie(RHS1, RHS2) = splitVector(RHS, DAG, dl);
25203
25204 // Issue the operation on the smaller types and concatenate the result back
25205 EVT LoVT, HiVT;
25206 std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
25207 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
25208 DAG.getNode(ISD::SETCC, dl, LoVT, LHS1, RHS1, CC),
25209 DAG.getNode(ISD::SETCC, dl, HiVT, LHS2, RHS2, CC));
25210}
25211
25212static SDValue LowerIntVSETCC_AVX512(SDValue Op, SelectionDAG &DAG) {
25213
25214 SDValue Op0 = Op.getOperand(0);
25215 SDValue Op1 = Op.getOperand(1);
25216 SDValue CC = Op.getOperand(2);
25217 MVT VT = Op.getSimpleValueType();
25218 SDLoc dl(Op);
25219
25220  assert(VT.getVectorElementType() == MVT::i1 &&
25221         "Cannot set masked compare for this operation");
25222
25223 ISD::CondCode SetCCOpcode = cast<CondCodeSDNode>(CC)->get();
25224
25225 // Prefer SETGT over SETLT.
25226 if (SetCCOpcode == ISD::SETLT) {
25227 SetCCOpcode = ISD::getSetCCSwappedOperands(SetCCOpcode);
25228 std::swap(Op0, Op1);
25229 }
25230
25231 return DAG.getSetCC(dl, VT, Op0, Op1, SetCCOpcode);
25232}
25233
25234/// Given a buildvector constant, return a new vector constant with each element
25235/// incremented or decremented. If incrementing or decrementing would result in
25236/// unsigned overflow or underflow or this is not a simple vector constant,
25237/// return an empty value.
25238static SDValue incDecVectorConstant(SDValue V, SelectionDAG &DAG, bool IsInc,
25239 bool NSW) {
25240 auto *BV = dyn_cast<BuildVectorSDNode>(V.getNode());
25241 if (!BV || !V.getValueType().isSimple())
25242 return SDValue();
25243
25244 MVT VT = V.getSimpleValueType();
25245 MVT EltVT = VT.getVectorElementType();
25246 unsigned NumElts = VT.getVectorNumElements();
25247 SmallVector<SDValue, 8> NewVecC;
25248 SDLoc DL(V);
25249 for (unsigned i = 0; i < NumElts; ++i) {
25250 auto *Elt = dyn_cast<ConstantSDNode>(BV->getOperand(i));
25251 if (!Elt || Elt->isOpaque() || Elt->getSimpleValueType(0) != EltVT)
25252 return SDValue();
25253
25254 // Avoid overflow/underflow.
25255 const APInt &EltC = Elt->getAPIntValue();
25256 if ((IsInc && EltC.isMaxValue()) || (!IsInc && EltC.isZero()))
25257 return SDValue();
25258 if (NSW && ((IsInc && EltC.isMaxSignedValue()) ||
25259 (!IsInc && EltC.isMinSignedValue())))
25260 return SDValue();
25261
25262 NewVecC.push_back(DAG.getConstant(EltC + (IsInc ? 1 : -1), DL, EltVT));
25263 }
25264
25265 return DAG.getBuildVector(VT, DL, NewVecC);
25266}
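
// Illustrative sketch (not part of the original source): why the overflow
// checks above matter. For unsigned elements, X > C is the same predicate as
// X >= C+1 only when C+1 does not wrap, so a max-valued constant must not be
// incremented. The helper name is hypothetical.
static bool ugtViaConstantBump(unsigned X, unsigned C) {
  if (C == ~0u)
    return false;     // X > UINT_MAX can never be true; no rewrite possible
  return X >= C + 1u; // equals X > C whenever the increment does not wrap
}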
25267
25268/// As another special case, use PSUBUS[BW] when it's profitable. E.g. for
25269/// Op0 u<= Op1:
25270/// t = psubus Op0, Op1
25271/// pcmpeq t, <0..0>
25272static SDValue LowerVSETCCWithSUBUS(SDValue Op0, SDValue Op1, MVT VT,
25273 ISD::CondCode Cond, const SDLoc &dl,
25274 const X86Subtarget &Subtarget,
25275 SelectionDAG &DAG) {
25276 if (!Subtarget.hasSSE2())
25277 return SDValue();
25278
25279 MVT VET = VT.getVectorElementType();
25280 if (VET != MVT::i8 && VET != MVT::i16)
25281 return SDValue();
25282
25283 switch (Cond) {
25284 default:
25285 return SDValue();
25286 case ISD::SETULT: {
25287 // If the comparison is against a constant we can turn this into a
25288 // setule. With psubus, setule does not require a swap. This is
25289 // beneficial because the constant in the register is no longer
25290    // clobbered as the destination, so it can be hoisted out of a loop.
25291 // Only do this pre-AVX since vpcmp* is no longer destructive.
25292 if (Subtarget.hasAVX())
25293 return SDValue();
25294 SDValue ULEOp1 =
25295 incDecVectorConstant(Op1, DAG, /*IsInc*/ false, /*NSW*/ false);
25296 if (!ULEOp1)
25297 return SDValue();
25298 Op1 = ULEOp1;
25299 break;
25300 }
25301 case ISD::SETUGT: {
25302 // If the comparison is against a constant, we can turn this into a setuge.
25303 // This is beneficial because materializing a constant 0 for the PCMPEQ is
25304 // probably cheaper than XOR+PCMPGT using 2 different vector constants:
25305 // cmpgt (xor X, SignMaskC) CmpC --> cmpeq (usubsat (CmpC+1), X), 0
25306 SDValue UGEOp1 =
25307 incDecVectorConstant(Op1, DAG, /*IsInc*/ true, /*NSW*/ false);
25308 if (!UGEOp1)
25309 return SDValue();
25310 Op1 = Op0;
25311 Op0 = UGEOp1;
25312 break;
25313 }
25314 // Psubus is better than flip-sign because it requires no inversion.
25315 case ISD::SETUGE:
25316 std::swap(Op0, Op1);
25317 break;
25318 case ISD::SETULE:
25319 break;
25320 }
25321
25322 SDValue Result = DAG.getNode(ISD::USUBSAT, dl, VT, Op0, Op1);
25323 return DAG.getNode(X86ISD::PCMPEQ, dl, VT, Result,
25324 DAG.getConstant(0, dl, VT));
25325}
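
// Illustrative sketch (not part of the original source): the per-element
// identity behind the PSUBUS+PCMPEQ sequence above. For unsigned bytes,
// A u<= B exactly when the saturating difference A -sat B is zero. The helper
// name is hypothetical.
static bool uleViaSaturatingSub(unsigned char A, unsigned char B) {
  unsigned char Sat = A > B ? (unsigned char)(A - B) : (unsigned char)0;
  return Sat == 0; // true iff A u<= B
}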
25326
25327static SDValue LowerVSETCC(SDValue Op, const X86Subtarget &Subtarget,
25328 SelectionDAG &DAG) {
25329 bool IsStrict = Op.getOpcode() == ISD::STRICT_FSETCC ||
25330 Op.getOpcode() == ISD::STRICT_FSETCCS;
25331 SDValue Op0 = Op.getOperand(IsStrict ? 1 : 0);
25332 SDValue Op1 = Op.getOperand(IsStrict ? 2 : 1);
25333 SDValue CC = Op.getOperand(IsStrict ? 3 : 2);
25334 MVT VT = Op->getSimpleValueType(0);
25335 ISD::CondCode Cond = cast<CondCodeSDNode>(CC)->get();
25336 bool isFP = Op1.getSimpleValueType().isFloatingPoint();
25337 SDLoc dl(Op);
25338
25339 if (isFP) {
25340 MVT EltVT = Op0.getSimpleValueType().getVectorElementType();
25341    assert(EltVT == MVT::f16 || EltVT == MVT::f32 || EltVT == MVT::f64);
25342 if (isSoftFP16(EltVT, Subtarget))
25343 return SDValue();
25344
25345 bool IsSignaling = Op.getOpcode() == ISD::STRICT_FSETCCS;
25346 SDValue Chain = IsStrict ? Op.getOperand(0) : SDValue();
25347
25348 // If we have a strict compare with a vXi1 result and the input is 128/256
25349 // bits we can't use a masked compare unless we have VLX. If we use a wider
25350 // compare like we do for non-strict, we might trigger spurious exceptions
25351    // from the upper elements. Instead emit an AVX compare and convert to a mask.
25352 unsigned Opc;
25353 if (Subtarget.hasAVX512() && VT.getVectorElementType() == MVT::i1 &&
25354 (!IsStrict || Subtarget.hasVLX() ||
25355 Op0.getSimpleValueType().is512BitVector())) {
25356#ifndef NDEBUG
25357 unsigned Num = VT.getVectorNumElements();
25358      assert(Num <= 16 || (Num == 32 && EltVT == MVT::f16));
25359#endif
25360 Opc = IsStrict ? X86ISD::STRICT_CMPM : X86ISD::CMPM;
25361 } else {
25362 Opc = IsStrict ? X86ISD::STRICT_CMPP : X86ISD::CMPP;
25363 // The SSE/AVX packed FP comparison nodes are defined with a
25364 // floating-point vector result that matches the operand type. This allows
25365 // them to work with an SSE1 target (integer vector types are not legal).
25366 VT = Op0.getSimpleValueType();
25367 }
25368
25369 SDValue Cmp;
25370 bool IsAlwaysSignaling;
25371 unsigned SSECC = translateX86FSETCC(Cond, Op0, Op1, IsAlwaysSignaling);
25372 if (!Subtarget.hasAVX()) {
25373 // TODO: We could use following steps to handle a quiet compare with
25374 // signaling encodings.
25375 // 1. Get ordered masks from a quiet ISD::SETO
25376 // 2. Use the masks to mask potential unordered elements in operand A, B
25377 // 3. Get the compare results of masked A, B
25378 // 4. Calculating final result using the mask and result from 3
25379 // But currently, we just fall back to scalar operations.
25380 if (IsStrict && IsAlwaysSignaling && !IsSignaling)
25381 return SDValue();
25382
25383 // Insert an extra signaling instruction to raise exception.
25384 if (IsStrict && !IsAlwaysSignaling && IsSignaling) {
25385 SDValue SignalCmp = DAG.getNode(
25386 Opc, dl, {VT, MVT::Other},
25387 {Chain, Op0, Op1, DAG.getTargetConstant(1, dl, MVT::i8)}); // LT_OS
25388 // FIXME: It seems we need to update the flags of all new strict nodes.
25389 // Otherwise, mayRaiseFPException in MI will return false due to
25390 // NoFPExcept = false by default. However, I didn't find it in other
25391 // patches.
25392 SignalCmp->setFlags(Op->getFlags());
25393 Chain = SignalCmp.getValue(1);
25394 }
25395
25396 // In the two cases not handled by SSE compare predicates (SETUEQ/SETONE),
25397 // emit two comparisons and a logic op to tie them together.
25398 if (!cheapX86FSETCC_SSE(Cond)) {
25399 // LLVM predicate is SETUEQ or SETONE.
25400 unsigned CC0, CC1;
25401 unsigned CombineOpc;
25402 if (Cond == ISD::SETUEQ) {
25403 CC0 = 3; // UNORD
25404 CC1 = 0; // EQ
25405 CombineOpc = X86ISD::FOR;
25406 } else {
25407          assert(Cond == ISD::SETONE);
25408 CC0 = 7; // ORD
25409 CC1 = 4; // NEQ
25410 CombineOpc = X86ISD::FAND;
25411 }
25412
25413 SDValue Cmp0, Cmp1;
25414 if (IsStrict) {
25415 Cmp0 = DAG.getNode(
25416 Opc, dl, {VT, MVT::Other},
25417 {Chain, Op0, Op1, DAG.getTargetConstant(CC0, dl, MVT::i8)});
25418 Cmp1 = DAG.getNode(
25419 Opc, dl, {VT, MVT::Other},
25420 {Chain, Op0, Op1, DAG.getTargetConstant(CC1, dl, MVT::i8)});
25421 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Cmp0.getValue(1),
25422 Cmp1.getValue(1));
25423 } else {
25424 Cmp0 = DAG.getNode(
25425 Opc, dl, VT, Op0, Op1, DAG.getTargetConstant(CC0, dl, MVT::i8));
25426 Cmp1 = DAG.getNode(
25427 Opc, dl, VT, Op0, Op1, DAG.getTargetConstant(CC1, dl, MVT::i8));
25428 }
25429 Cmp = DAG.getNode(CombineOpc, dl, VT, Cmp0, Cmp1);
25430 } else {
25431 if (IsStrict) {
25432 Cmp = DAG.getNode(
25433 Opc, dl, {VT, MVT::Other},
25434 {Chain, Op0, Op1, DAG.getTargetConstant(SSECC, dl, MVT::i8)});
25435 Chain = Cmp.getValue(1);
25436 } else
25437 Cmp = DAG.getNode(
25438 Opc, dl, VT, Op0, Op1, DAG.getTargetConstant(SSECC, dl, MVT::i8));
25439 }
25440 } else {
25441 // Handle all other FP comparisons here.
25442 if (IsStrict) {
25443 // Make a flip on already signaling CCs before setting bit 4 of AVX CC.
25444 SSECC |= (IsAlwaysSignaling ^ IsSignaling) << 4;
25445 Cmp = DAG.getNode(
25446 Opc, dl, {VT, MVT::Other},
25447 {Chain, Op0, Op1, DAG.getTargetConstant(SSECC, dl, MVT::i8)});
25448 Chain = Cmp.getValue(1);
25449 } else
25450 Cmp = DAG.getNode(
25451 Opc, dl, VT, Op0, Op1, DAG.getTargetConstant(SSECC, dl, MVT::i8));
25452 }
25453
25454 if (VT.getFixedSizeInBits() >
25455 Op.getSimpleValueType().getFixedSizeInBits()) {
25456 // We emitted a compare with an XMM/YMM result. Finish converting to a
25457 // mask register using a vptestm.
25458 EVT CastVT = EVT(VT).changeVectorElementTypeToInteger();
25459 Cmp = DAG.getBitcast(CastVT, Cmp);
25460 Cmp = DAG.getSetCC(dl, Op.getSimpleValueType(), Cmp,
25461 DAG.getConstant(0, dl, CastVT), ISD::SETNE);
25462 } else {
25463 // If this is SSE/AVX CMPP, bitcast the result back to integer to match
25464 // the result type of SETCC. The bitcast is expected to be optimized
25465 // away during combining/isel.
25466 Cmp = DAG.getBitcast(Op.getSimpleValueType(), Cmp);
25467 }
25468
25469 if (IsStrict)
25470 return DAG.getMergeValues({Cmp, Chain}, dl);
25471
25472 return Cmp;
25473 }
25474
25475  assert(!IsStrict && "Strict SETCC only handles FP operands.");
25476
25477 MVT VTOp0 = Op0.getSimpleValueType();
25478 (void)VTOp0;
25479  assert(VTOp0 == Op1.getSimpleValueType() &&
25480         "Expected operands with same type!");
25481  assert(VT.getVectorNumElements() == VTOp0.getVectorNumElements() &&
25482         "Invalid number of packed elements for source and destination!");
25483
25484 // The non-AVX512 code below works under the assumption that source and
25485 // destination types are the same.
25486  assert((Subtarget.hasAVX512() || (VT == VTOp0)) &&
25487         "Value types for source and destination must be the same!");
25488
25489 // The result is boolean, but operands are int/float
25490 if (VT.getVectorElementType() == MVT::i1) {
25491 // In AVX-512 architecture setcc returns mask with i1 elements,
25492 // But there is no compare instruction for i8 and i16 elements in KNL.
25493    assert((VTOp0.getScalarSizeInBits() >= 32 || Subtarget.hasBWI()) &&
25494           "Unexpected operand type");
25495 return LowerIntVSETCC_AVX512(Op, DAG);
25496 }
25497
25498 // Lower using XOP integer comparisons.
25499 if (VT.is128BitVector() && Subtarget.hasXOP()) {
25500 // Translate compare code to XOP PCOM compare mode.
25501 unsigned CmpMode = 0;
25502 switch (Cond) {
25503    default: llvm_unreachable("Unexpected SETCC condition");
25504 case ISD::SETULT:
25505 case ISD::SETLT: CmpMode = 0x00; break;
25506 case ISD::SETULE:
25507 case ISD::SETLE: CmpMode = 0x01; break;
25508 case ISD::SETUGT:
25509 case ISD::SETGT: CmpMode = 0x02; break;
25510 case ISD::SETUGE:
25511 case ISD::SETGE: CmpMode = 0x03; break;
25512 case ISD::SETEQ: CmpMode = 0x04; break;
25513 case ISD::SETNE: CmpMode = 0x05; break;
25514 }
25515
25516 // Are we comparing unsigned or signed integers?
25517 unsigned Opc =
25518 ISD::isUnsignedIntSetCC(Cond) ? X86ISD::VPCOMU : X86ISD::VPCOM;
25519
25520 return DAG.getNode(Opc, dl, VT, Op0, Op1,
25521 DAG.getTargetConstant(CmpMode, dl, MVT::i8));
25522 }
25523
25524 // (X & Y) != 0 --> (X & Y) == Y iff Y is power-of-2.
25525 // Revert part of the simplifySetCCWithAnd combine, to avoid an invert.
25526 if (Cond == ISD::SETNE && ISD::isBuildVectorAllZeros(Op1.getNode())) {
25527 SDValue BC0 = peekThroughBitcasts(Op0);
25528 if (BC0.getOpcode() == ISD::AND) {
25529 APInt UndefElts;
25530 SmallVector<APInt, 64> EltBits;
25531 if (getTargetConstantBitsFromNode(BC0.getOperand(1),
25532 VT.getScalarSizeInBits(), UndefElts,
25533 EltBits, false, false)) {
25534 if (llvm::all_of(EltBits, [](APInt &V) { return V.isPowerOf2(); })) {
25535 Cond = ISD::SETEQ;
25536 Op1 = DAG.getBitcast(VT, BC0.getOperand(1));
25537 }
25538 }
25539 }
25540 }
25541
25542 // ICMP_EQ(AND(X,C),C) -> SRA(SHL(X,LOG2(C)),BW-1) iff C is power-of-2.
25543 if (Cond == ISD::SETEQ && Op0.getOpcode() == ISD::AND &&
25544 Op0.getOperand(1) == Op1 && Op0.hasOneUse()) {
25545 ConstantSDNode *C1 = isConstOrConstSplat(Op1);
25546 if (C1 && C1->getAPIntValue().isPowerOf2()) {
25547 unsigned BitWidth = VT.getScalarSizeInBits();
25548 unsigned ShiftAmt = BitWidth - C1->getAPIntValue().logBase2() - 1;
25549
25550 SDValue Result = Op0.getOperand(0);
25551 Result = DAG.getNode(ISD::SHL, dl, VT, Result,
25552 DAG.getConstant(ShiftAmt, dl, VT));
25553 Result = DAG.getNode(ISD::SRA, dl, VT, Result,
25554 DAG.getConstant(BitWidth - 1, dl, VT));
25555 return Result;
25556 }
25557 }
25558
25559 // Break 256-bit integer vector compare into smaller ones.
25560 if (VT.is256BitVector() && !Subtarget.hasInt256())
25561 return splitIntVSETCC(VT, Op0, Op1, Cond, DAG, dl);
25562
25563 // Break 512-bit integer vector compare into smaller ones.
25564 // TODO: Try harder to use VPCMPx + VPMOV2x?
25565 if (VT.is512BitVector())
25566 return splitIntVSETCC(VT, Op0, Op1, Cond, DAG, dl);
25567
25568 // If we have a limit constant, try to form PCMPGT (signed cmp) to avoid
25569 // not-of-PCMPEQ:
25570 // X != INT_MIN --> X >s INT_MIN
25571 // X != INT_MAX --> X <s INT_MAX --> INT_MAX >s X
25572 // +X != 0 --> +X >s 0
25573 APInt ConstValue;
25574 if (Cond == ISD::SETNE &&
25575 ISD::isConstantSplatVector(Op1.getNode(), ConstValue)) {
25576 if (ConstValue.isMinSignedValue())
25577 Cond = ISD::SETGT;
25578 else if (ConstValue.isMaxSignedValue())
25579 Cond = ISD::SETLT;
25580 else if (ConstValue.isZero() && DAG.SignBitIsZero(Op0))
25581 Cond = ISD::SETGT;
25582 }
25583
25584 // If both operands are known non-negative, then an unsigned compare is the
25585 // same as a signed compare and there's no need to flip signbits.
25586 // TODO: We could check for more general simplifications here since we're
25587 // computing known bits.
25588 bool FlipSigns = ISD::isUnsignedIntSetCC(Cond) &&
25589 !(DAG.SignBitIsZero(Op0) && DAG.SignBitIsZero(Op1));
25590
25591 // Special case: Use min/max operations for unsigned compares.
25592 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
25593 if (ISD::isUnsignedIntSetCC(Cond) &&
25594 (FlipSigns || ISD::isTrueWhenEqual(Cond)) &&
25595 TLI.isOperationLegal(ISD::UMIN, VT)) {
25596 // If we have a constant operand, increment/decrement it and change the
25597 // condition to avoid an invert.
25598 if (Cond == ISD::SETUGT) {
25599 // X > C --> X >= (C+1) --> X == umax(X, C+1)
25600 if (SDValue UGTOp1 =
25601 incDecVectorConstant(Op1, DAG, /*IsInc*/ true, /*NSW*/ false)) {
25602 Op1 = UGTOp1;
25603 Cond = ISD::SETUGE;
25604 }
25605 }
25606 if (Cond == ISD::SETULT) {
25607 // X < C --> X <= (C-1) --> X == umin(X, C-1)
25608 if (SDValue ULTOp1 =
25609 incDecVectorConstant(Op1, DAG, /*IsInc*/ false, /*NSW*/ false)) {
25610 Op1 = ULTOp1;
25611 Cond = ISD::SETULE;
25612 }
25613 }
25614 bool Invert = false;
25615 unsigned Opc;
25616 switch (Cond) {
25617    default: llvm_unreachable("Unexpected condition code");
25618 case ISD::SETUGT: Invert = true; [[fallthrough]];
25619 case ISD::SETULE: Opc = ISD::UMIN; break;
25620 case ISD::SETULT: Invert = true; [[fallthrough]];
25621 case ISD::SETUGE: Opc = ISD::UMAX; break;
25622 }
25623
25624 SDValue Result = DAG.getNode(Opc, dl, VT, Op0, Op1);
25625 Result = DAG.getNode(X86ISD::PCMPEQ, dl, VT, Op0, Result);
25626
25627 // If the logical-not of the result is required, perform that now.
25628 if (Invert)
25629 Result = DAG.getNOT(dl, Result, VT);
25630
25631 return Result;
25632 }
25633
25634 // Try to use SUBUS and PCMPEQ.
25635 if (FlipSigns)
25636 if (SDValue V =
25637 LowerVSETCCWithSUBUS(Op0, Op1, VT, Cond, dl, Subtarget, DAG))
25638 return V;
25639
25640 // We are handling one of the integer comparisons here. Since SSE only has
25641 // GT and EQ comparisons for integer, swapping operands and multiple
25642 // operations may be required for some comparisons.
25643 unsigned Opc = (Cond == ISD::SETEQ || Cond == ISD::SETNE) ? X86ISD::PCMPEQ
25644 : X86ISD::PCMPGT;
25645 bool Swap = Cond == ISD::SETLT || Cond == ISD::SETULT ||
25646 Cond == ISD::SETGE || Cond == ISD::SETUGE;
25647 bool Invert = Cond == ISD::SETNE ||
25648 (Cond != ISD::SETEQ && ISD::isTrueWhenEqual(Cond));
25649
25650 if (Swap)
25651 std::swap(Op0, Op1);
25652
25653 // Check that the operation in question is available (most are plain SSE2,
25654 // but PCMPGTQ and PCMPEQQ have different requirements).
25655 if (VT == MVT::v2i64) {
25656 if (Opc == X86ISD::PCMPGT && !Subtarget.hasSSE42()) {
25657      assert(Subtarget.hasSSE2() && "Don't know how to lower!");
25658
25659 // Special case for sign bit test. We can use a v4i32 PCMPGT and shuffle
25660 // the odd elements over the even elements.
25661 if (!FlipSigns && !Invert && ISD::isBuildVectorAllZeros(Op0.getNode())) {
25662 Op0 = DAG.getConstant(0, dl, MVT::v4i32);
25663 Op1 = DAG.getBitcast(MVT::v4i32, Op1);
25664
25665 SDValue GT = DAG.getNode(X86ISD::PCMPGT, dl, MVT::v4i32, Op0, Op1);
25666 static const int MaskHi[] = { 1, 1, 3, 3 };
25667 SDValue Result = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskHi);
25668
25669 return DAG.getBitcast(VT, Result);
25670 }
25671
25672 if (!FlipSigns && !Invert && ISD::isBuildVectorAllOnes(Op1.getNode())) {
25673 Op0 = DAG.getBitcast(MVT::v4i32, Op0);
25674 Op1 = DAG.getConstant(-1, dl, MVT::v4i32);
25675
25676 SDValue GT = DAG.getNode(X86ISD::PCMPGT, dl, MVT::v4i32, Op0, Op1);
25677 static const int MaskHi[] = { 1, 1, 3, 3 };
25678 SDValue Result = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskHi);
25679
25680 return DAG.getBitcast(VT, Result);
25681 }
25682
25683 // Since SSE has no unsigned integer comparisons, we need to flip the sign
25684 // bits of the inputs before performing those operations. The lower
25685 // compare is always unsigned.
25686 SDValue SB = DAG.getConstant(FlipSigns ? 0x8000000080000000ULL
25687 : 0x0000000080000000ULL,
25688 dl, MVT::v2i64);
25689
25690 Op0 = DAG.getNode(ISD::XOR, dl, MVT::v2i64, Op0, SB);
25691 Op1 = DAG.getNode(ISD::XOR, dl, MVT::v2i64, Op1, SB);
25692
25693 // Cast everything to the right type.
25694 Op0 = DAG.getBitcast(MVT::v4i32, Op0);
25695 Op1 = DAG.getBitcast(MVT::v4i32, Op1);
25696
25697 // Emulate PCMPGTQ with (hi1 > hi2) | ((hi1 == hi2) & (lo1 > lo2))
25698 SDValue GT = DAG.getNode(X86ISD::PCMPGT, dl, MVT::v4i32, Op0, Op1);
25699 SDValue EQ = DAG.getNode(X86ISD::PCMPEQ, dl, MVT::v4i32, Op0, Op1);
25700
25701 // Create masks for only the low parts/high parts of the 64 bit integers.
25702 static const int MaskHi[] = { 1, 1, 3, 3 };
25703 static const int MaskLo[] = { 0, 0, 2, 2 };
25704 SDValue EQHi = DAG.getVectorShuffle(MVT::v4i32, dl, EQ, EQ, MaskHi);
25705 SDValue GTLo = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskLo);
25706 SDValue GTHi = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskHi);
25707
25708 SDValue Result = DAG.getNode(ISD::AND, dl, MVT::v4i32, EQHi, GTLo);
25709 Result = DAG.getNode(ISD::OR, dl, MVT::v4i32, Result, GTHi);
25710
25711 if (Invert)
25712 Result = DAG.getNOT(dl, Result, MVT::v4i32);
25713
25714 return DAG.getBitcast(VT, Result);
25715 }
25716
25717 if (Opc == X86ISD::PCMPEQ && !Subtarget.hasSSE41()) {
25718 // If pcmpeqq is missing but pcmpeqd is available synthesize pcmpeqq with
25719 // pcmpeqd + pshufd + pand.
25720      assert(Subtarget.hasSSE2() && !FlipSigns && "Don't know how to lower!");
25721
25722 // First cast everything to the right type.
25723 Op0 = DAG.getBitcast(MVT::v4i32, Op0);
25724 Op1 = DAG.getBitcast(MVT::v4i32, Op1);
25725
25726 // Do the compare.
25727 SDValue Result = DAG.getNode(Opc, dl, MVT::v4i32, Op0, Op1);
25728
25729 // Make sure the lower and upper halves are both all-ones.
25730 static const int Mask[] = { 1, 0, 3, 2 };
25731 SDValue Shuf = DAG.getVectorShuffle(MVT::v4i32, dl, Result, Result, Mask);
25732 Result = DAG.getNode(ISD::AND, dl, MVT::v4i32, Result, Shuf);
25733
25734 if (Invert)
25735 Result = DAG.getNOT(dl, Result, MVT::v4i32);
25736
25737 return DAG.getBitcast(VT, Result);
25738 }
25739 }
25740
25741 // Since SSE has no unsigned integer comparisons, we need to flip the sign
25742 // bits of the inputs before performing those operations.
25743 if (FlipSigns) {
25744 MVT EltVT = VT.getVectorElementType();
25745 SDValue SM = DAG.getConstant(APInt::getSignMask(EltVT.getSizeInBits()), dl,
25746 VT);
25747 Op0 = DAG.getNode(ISD::XOR, dl, VT, Op0, SM);
25748 Op1 = DAG.getNode(ISD::XOR, dl, VT, Op1, SM);
25749 }
25750
25751 SDValue Result = DAG.getNode(Opc, dl, VT, Op0, Op1);
25752
25753 // If the logical-not of the result is required, perform that now.
25754 if (Invert)
25755 Result = DAG.getNOT(dl, Result, VT);
25756
25757 return Result;
25758}
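
// Illustrative sketch (not part of the original source): the scalar form of
// the PCMPGTQ emulation used above for pre-SSE4.2 v2i64 compares,
//   (hi1 > hi2) | ((hi1 == hi2) & (lo1 > lo2)),
// where the high halves compare signed and the low halves compare unsigned.
// The helper name is hypothetical; '>>' on a negative value is assumed to be
// an arithmetic shift.
static bool sgt64ViaHalves(long long A, long long B) {
  int HiA = (int)(A >> 32), HiB = (int)(B >> 32); // signed high halves
  unsigned LoA = (unsigned)A, LoB = (unsigned)B;  // unsigned low halves
  return HiA > HiB || (HiA == HiB && LoA > LoB);
}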
25759
25760// Try to select this as a KORTEST+SETCC or KTEST+SETCC if possible.
25761static SDValue EmitAVX512Test(SDValue Op0, SDValue Op1, ISD::CondCode CC,
25762 const SDLoc &dl, SelectionDAG &DAG,
25763 const X86Subtarget &Subtarget,
25764 SDValue &X86CC) {
25765  assert((CC == ISD::SETEQ || CC == ISD::SETNE) && "Unsupported ISD::CondCode");
25766
25767 // Must be a bitcast from vXi1.
25768 if (Op0.getOpcode() != ISD::BITCAST)
25769 return SDValue();
25770
25771 Op0 = Op0.getOperand(0);
25772 MVT VT = Op0.getSimpleValueType();
25773 if (!(Subtarget.hasAVX512() && VT == MVT::v16i1) &&
25774 !(Subtarget.hasDQI() && VT == MVT::v8i1) &&
25775 !(Subtarget.hasBWI() && (VT == MVT::v32i1 || VT == MVT::v64i1)))
25776 return SDValue();
25777
25778 X86::CondCode X86Cond;
25779 if (isNullConstant(Op1)) {
25780 X86Cond = CC == ISD::SETEQ ? X86::COND_E : X86::COND_NE;
25781 } else if (isAllOnesConstant(Op1)) {
25782 // C flag is set for all ones.
25783 X86Cond = CC == ISD::SETEQ ? X86::COND_B : X86::COND_AE;
25784 } else
25785 return SDValue();
25786
25787  // If the input is an AND, we can combine its operands into the KTEST.
25788 bool KTestable = false;
25789 if (Subtarget.hasDQI() && (VT == MVT::v8i1 || VT == MVT::v16i1))
25790 KTestable = true;
25791 if (Subtarget.hasBWI() && (VT == MVT::v32i1 || VT == MVT::v64i1))
25792 KTestable = true;
25793 if (!isNullConstant(Op1))
25794 KTestable = false;
25795 if (KTestable && Op0.getOpcode() == ISD::AND && Op0.hasOneUse()) {
25796 SDValue LHS = Op0.getOperand(0);
25797 SDValue RHS = Op0.getOperand(1);
25798 X86CC = DAG.getTargetConstant(X86Cond, dl, MVT::i8);
25799 return DAG.getNode(X86ISD::KTEST, dl, MVT::i32, LHS, RHS);
25800 }
25801
25802  // If the input is an OR, we can combine its operands into the KORTEST.
25803 SDValue LHS = Op0;
25804 SDValue RHS = Op0;
25805 if (Op0.getOpcode() == ISD::OR && Op0.hasOneUse()) {
25806 LHS = Op0.getOperand(0);
25807 RHS = Op0.getOperand(1);
25808 }
25809
25810 X86CC = DAG.getTargetConstant(X86Cond, dl, MVT::i8);
25811 return DAG.getNode(X86ISD::KORTEST, dl, MVT::i32, LHS, RHS);
25812}
25813
25814/// Emit flags for the given setcc condition and operands. Also returns the
25815/// corresponding X86 condition code constant in X86CC.
25816SDValue X86TargetLowering::emitFlagsForSetcc(SDValue Op0, SDValue Op1,
25817 ISD::CondCode CC, const SDLoc &dl,
25818 SelectionDAG &DAG,
25819 SDValue &X86CC) const {
25820 // Equality Combines.
25821 if (CC == ISD::SETEQ || CC == ISD::SETNE) {
25822 X86::CondCode X86CondCode;
25823
25824 // Optimize to BT if possible.
25825 // Lower (X & (1 << N)) == 0 to BT(X, N).
25826 // Lower ((X >>u N) & 1) != 0 to BT(X, N).
25827 // Lower ((X >>s N) & 1) != 0 to BT(X, N).
25828 if (Op0.getOpcode() == ISD::AND && Op0.hasOneUse() && isNullConstant(Op1)) {
25829 if (SDValue BT = LowerAndToBT(Op0, CC, dl, DAG, X86CondCode)) {
25830 X86CC = DAG.getTargetConstant(X86CondCode, dl, MVT::i8);
25831 return BT;
25832 }
25833 }
25834
25835     // Try to use PTEST/PMOVMSKB for a tree of ANDs/ORs equality-compared with -1/0.
25836 if (SDValue CmpZ = MatchVectorAllEqualTest(Op0, Op1, CC, dl, Subtarget, DAG,
25837 X86CondCode)) {
25838 X86CC = DAG.getTargetConstant(X86CondCode, dl, MVT::i8);
25839 return CmpZ;
25840 }
25841
25842 // Try to lower using KORTEST or KTEST.
25843 if (SDValue Test = EmitAVX512Test(Op0, Op1, CC, dl, DAG, Subtarget, X86CC))
25844 return Test;
25845
25846 // Look for X == 0, X == 1, X != 0, or X != 1. We can simplify some forms
25847 // of these.
25848 if (isOneConstant(Op1) || isNullConstant(Op1)) {
25849 // If the input is a setcc, then reuse the input setcc or use a new one
25850 // with the inverted condition.
25851 if (Op0.getOpcode() == X86ISD::SETCC) {
25852 bool Invert = (CC == ISD::SETNE) ^ isNullConstant(Op1);
25853
25854 X86CC = Op0.getOperand(0);
25855 if (Invert) {
25856 X86CondCode = (X86::CondCode)Op0.getConstantOperandVal(0);
25857 X86CondCode = X86::GetOppositeBranchCondition(X86CondCode);
25858 X86CC = DAG.getTargetConstant(X86CondCode, dl, MVT::i8);
25859 }
25860
25861 return Op0.getOperand(1);
25862 }
25863 }
25864
25865     // Try to use the carry flag from the add in place of a separate CMP for:
25866 // (seteq (add X, -1), -1). Similar for setne.
25867 if (isAllOnesConstant(Op1) && Op0.getOpcode() == ISD::ADD &&
25868 Op0.getOperand(1) == Op1) {
25869 if (isProfitableToUseFlagOp(Op0)) {
25870 SDVTList VTs = DAG.getVTList(Op0.getValueType(), MVT::i32);
25871
25872 SDValue New = DAG.getNode(X86ISD::ADD, dl, VTs, Op0.getOperand(0),
25873 Op0.getOperand(1));
25874 DAG.ReplaceAllUsesOfValueWith(SDValue(Op0.getNode(), 0), New);
25875 X86CondCode = CC == ISD::SETEQ ? X86::COND_AE : X86::COND_B;
25876 X86CC = DAG.getTargetConstant(X86CondCode, dl, MVT::i8);
25877 return SDValue(New.getNode(), 1);
25878 }
25879 }
25880 }
25881
25882 X86::CondCode CondCode =
25883 TranslateX86CC(CC, dl, /*IsFP*/ false, Op0, Op1, DAG);
25884   assert(CondCode != X86::COND_INVALID && "Unexpected condition code!");
25885
25886 SDValue EFLAGS = EmitCmp(Op0, Op1, CondCode, dl, DAG, Subtarget);
25887 X86CC = DAG.getTargetConstant(CondCode, dl, MVT::i8);
25888 return EFLAGS;
25889}
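
// --- Illustrative sketch, not part of the LLVM source ----------------------
// The BT lowering performed at the top of emitFlagsForSetcc relies on a
// simple identity: masking with (1 << N) and shifting-then-masking with 1
// both test whether bit N is set, which is exactly the value the x86 BT
// instruction leaves in CF. A minimal scalar check of that identity
// (hypothetical helper name, used only for this sketch):
#include <cassert>
#include <cstdint>

static bool bitIsSet(uint32_t X, unsigned N) {   // N < 32 assumed
  bool ViaMask  = (X & (1u << N)) != 0;          // (X & (1 << N)) != 0
  bool ViaShift = ((X >> N) & 1u) != 0;          // ((X >>u N) & 1) != 0
  assert(ViaMask == ViaShift && "both forms test the same bit");
  return ViaMask;                                // == CF after BT X, N
}
// ----------------------------------------------------------------------------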
25890
25891SDValue X86TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
25892
25893 bool IsStrict = Op.getOpcode() == ISD::STRICT_FSETCC ||
25894 Op.getOpcode() == ISD::STRICT_FSETCCS;
25895 MVT VT = Op->getSimpleValueType(0);
25896
25897 if (VT.isVector()) return LowerVSETCC(Op, Subtarget, DAG);
25898
25899   assert(VT == MVT::i8 && "SetCC type must be 8-bit integer");
25900 SDValue Chain = IsStrict ? Op.getOperand(0) : SDValue();
25901 SDValue Op0 = Op.getOperand(IsStrict ? 1 : 0);
25902 SDValue Op1 = Op.getOperand(IsStrict ? 2 : 1);
25903 SDLoc dl(Op);
25904 ISD::CondCode CC =
25905 cast<CondCodeSDNode>(Op.getOperand(IsStrict ? 3 : 2))->get();
25906
25907 if (isSoftFP16(Op0.getValueType()))
25908 return SDValue();
25909
25910 // Handle f128 first, since one possible outcome is a normal integer
25911 // comparison which gets handled by emitFlagsForSetcc.
25912 if (Op0.getValueType() == MVT::f128) {
25913 softenSetCCOperands(DAG, MVT::f128, Op0, Op1, CC, dl, Op0, Op1, Chain,
25914 Op.getOpcode() == ISD::STRICT_FSETCCS);
25915
25916 // If softenSetCCOperands returned a scalar, use it.
25917 if (!Op1.getNode()) {
25918       assert(Op0.getValueType() == Op.getValueType() &&
25919              "Unexpected setcc expansion!");
25920 if (IsStrict)
25921 return DAG.getMergeValues({Op0, Chain}, dl);
25922 return Op0;
25923 }
25924 }
25925
25926 if (Op0.getSimpleValueType().isInteger()) {
25927 // Attempt to canonicalize SGT/UGT -> SGE/UGE compares with constant which
25928 // reduces the number of EFLAGs bit reads (the GE conditions don't read ZF),
25929 // this may translate to less uops depending on uarch implementation. The
25930 // equivalent for SLE/ULE -> SLT/ULT isn't likely to happen as we already
25931 // canonicalize to that CondCode.
25932 // NOTE: Only do this if incrementing the constant doesn't increase the bit
25933     // encoding size - so it must either already be an i8 or i32 immediate, or it
25934     // shrinks down to that. We don't do this for any i64s to avoid additional
25935 // constant materializations.
25936 // TODO: Can we move this to TranslateX86CC to handle jumps/branches too?
25937 if (auto *Op1C = dyn_cast<ConstantSDNode>(Op1)) {
25938 const APInt &Op1Val = Op1C->getAPIntValue();
25939 if (!Op1Val.isZero()) {
25940 // Ensure the constant+1 doesn't overflow.
25941 if ((CC == ISD::CondCode::SETGT && !Op1Val.isMaxSignedValue()) ||
25942 (CC == ISD::CondCode::SETUGT && !Op1Val.isMaxValue())) {
25943 APInt Op1ValPlusOne = Op1Val + 1;
25944 if (Op1ValPlusOne.isSignedIntN(32) &&
25945 (!Op1Val.isSignedIntN(8) || Op1ValPlusOne.isSignedIntN(8))) {
25946 Op1 = DAG.getConstant(Op1ValPlusOne, dl, Op0.getValueType());
25947 CC = CC == ISD::CondCode::SETGT ? ISD::CondCode::SETGE
25948 : ISD::CondCode::SETUGE;
25949 }
25950 }
25951 }
25952 }
25953
25954 SDValue X86CC;
25955 SDValue EFLAGS = emitFlagsForSetcc(Op0, Op1, CC, dl, DAG, X86CC);
25956 SDValue Res = DAG.getNode(X86ISD::SETCC, dl, MVT::i8, X86CC, EFLAGS);
25957 return IsStrict ? DAG.getMergeValues({Res, Chain}, dl) : Res;
25958 }
25959
25960 // Handle floating point.
25961 X86::CondCode CondCode = TranslateX86CC(CC, dl, /*IsFP*/ true, Op0, Op1, DAG);
25962 if (CondCode == X86::COND_INVALID)
25963 return SDValue();
25964
25965 SDValue EFLAGS;
25966 if (IsStrict) {
25967 bool IsSignaling = Op.getOpcode() == ISD::STRICT_FSETCCS;
25968 EFLAGS =
25969 DAG.getNode(IsSignaling ? X86ISD::STRICT_FCMPS : X86ISD::STRICT_FCMP,
25970 dl, {MVT::i32, MVT::Other}, {Chain, Op0, Op1});
25971 Chain = EFLAGS.getValue(1);
25972 } else {
25973 EFLAGS = DAG.getNode(X86ISD::FCMP, dl, MVT::i32, Op0, Op1);
25974 }
25975
25976 SDValue X86CC = DAG.getTargetConstant(CondCode, dl, MVT::i8);
25977 SDValue Res = DAG.getNode(X86ISD::SETCC, dl, MVT::i8, X86CC, EFLAGS);
25978 return IsStrict ? DAG.getMergeValues({Res, Chain}, dl) : Res;
25979}
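
// --- Illustrative sketch, not part of the LLVM source ----------------------
// The canonicalization in LowerSETCC rewrites "x > c" as "x >= c + 1" (and
// "x >u c" as "x >=u c + 1") whenever c + 1 cannot overflow, because there is
// no integer strictly between c and c + 1. A scalar statement of the claim,
// with a hypothetical helper name used only for this sketch:
#include <cstdint>
#include <limits>

static bool sgtEqualsSgePlusOne(int32_t X, int32_t C) {
  if (C == std::numeric_limits<int32_t>::max())
    return true;                    // the rewrite is simply skipped here
  return (X > C) == (X >= C + 1);   // holds for every X once C + 1 is safe
}
// The code above additionally requires the new immediate to stay in the same
// i8/i32 encoding class so the rewrite never costs an extra constant.
// ----------------------------------------------------------------------------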
25980
25981SDValue X86TargetLowering::LowerSETCCCARRY(SDValue Op, SelectionDAG &DAG) const {
25982 SDValue LHS = Op.getOperand(0);
25983 SDValue RHS = Op.getOperand(1);
25984 SDValue Carry = Op.getOperand(2);
25985 SDValue Cond = Op.getOperand(3);
25986 SDLoc DL(Op);
25987
25988   assert(LHS.getSimpleValueType().isInteger() && "SETCCCARRY is integer only.");
25989 X86::CondCode CC = TranslateIntegerX86CC(cast<CondCodeSDNode>(Cond)->get());
25990
25991 // Recreate the carry if needed.
25992 EVT CarryVT = Carry.getValueType();
25993 Carry = DAG.getNode(X86ISD::ADD, DL, DAG.getVTList(CarryVT, MVT::i32),
25994 Carry, DAG.getAllOnesConstant(DL, CarryVT));
25995
25996 SDVTList VTs = DAG.getVTList(LHS.getValueType(), MVT::i32);
25997 SDValue Cmp = DAG.getNode(X86ISD::SBB, DL, VTs, LHS, RHS, Carry.getValue(1));
25998 return getSETCC(CC, Cmp.getValue(1), DL, DAG);
25999}
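
// --- Illustrative sketch, not part of the LLVM source ----------------------
// SETCCCARRY is the "upper half" of a widened compare: the low halves have
// already been subtracted, and the borrow they produced is fed into an SBB on
// the high halves; the borrow out of that SBB decides the whole comparison.
// A plain C++ model of the borrow chain for a 64-bit unsigned "less than"
// built from 32-bit pieces (the helper name is hypothetical):
#include <cstdint>

static bool wideULT(uint32_t ALo, uint32_t AHi, uint32_t BLo, uint32_t BHi) {
  unsigned BorrowLo = ALo < BLo ? 1 : 0;           // SUB lo, lo   -> CF
  // SBB hi, hi: subtract BHi plus the incoming borrow; a borrow out of this
  // step means the full 64-bit A is below the full 64-bit B.
  uint64_t HiDiff = (uint64_t)AHi - (uint64_t)BHi - BorrowLo;
  return (HiDiff >> 32) != 0;                      // borrow out  == final CF
}
// ----------------------------------------------------------------------------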
26000
26001// This function returns three things: the arithmetic computation itself
26002// (Value), an EFLAGS result (Overflow), and a condition code (Cond). The
26003// flag and the condition code define the case in which the arithmetic
26004// computation overflows.
26005static std::pair<SDValue, SDValue>
26006getX86XALUOOp(X86::CondCode &Cond, SDValue Op, SelectionDAG &DAG) {
26007   assert(Op.getResNo() == 0 && "Unexpected result number!");
26008 SDValue Value, Overflow;
26009 SDValue LHS = Op.getOperand(0);
26010 SDValue RHS = Op.getOperand(1);
26011 unsigned BaseOp = 0;
26012 SDLoc DL(Op);
26013 switch (Op.getOpcode()) {
26014   default: llvm_unreachable("Unknown ovf instruction!");
26015 case ISD::SADDO:
26016 BaseOp = X86ISD::ADD;
26017 Cond = X86::COND_O;
26018 break;
26019 case ISD::UADDO:
26020 BaseOp = X86ISD::ADD;
26021 Cond = isOneConstant(RHS) ? X86::COND_E : X86::COND_B;
26022 break;
26023 case ISD::SSUBO:
26024 BaseOp = X86ISD::SUB;
26025 Cond = X86::COND_O;
26026 break;
26027 case ISD::USUBO:
26028 BaseOp = X86ISD::SUB;
26029 Cond = X86::COND_B;
26030 break;
26031 case ISD::SMULO:
26032 BaseOp = X86ISD::SMUL;
26033 Cond = X86::COND_O;
26034 break;
26035 case ISD::UMULO:
26036 BaseOp = X86ISD::UMUL;
26037 Cond = X86::COND_O;
26038 break;
26039 }
26040
26041 if (BaseOp) {
26042 // Also sets EFLAGS.
26043 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
26044 Value = DAG.getNode(BaseOp, DL, VTs, LHS, RHS);
26045 Overflow = Value.getValue(1);
26046 }
26047
26048 return std::make_pair(Value, Overflow);
26049}
26050
26051static SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG) {
26052   // Lower the "add/sub/mul with overflow" instruction into a regular instruction plus
26053 // a "setcc" instruction that checks the overflow flag. The "brcond" lowering
26054 // looks for this combo and may remove the "setcc" instruction if the "setcc"
26055 // has only one use.
26056 SDLoc DL(Op);
26057 X86::CondCode Cond;
26058 SDValue Value, Overflow;
26059 std::tie(Value, Overflow) = getX86XALUOOp(Cond, Op, DAG);
26060
26061 SDValue SetCC = getSETCC(Cond, Overflow, DL, DAG);
26062   assert(Op->getValueType(1) == MVT::i8 && "Unexpected VT!");
26063 return DAG.getNode(ISD::MERGE_VALUES, DL, Op->getVTList(), Value, SetCC);
26064}
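
// --- Illustrative sketch, not part of the LLVM source ----------------------
// LowerXALUO reuses the flag the ADD/SUB/MUL already computes instead of
// re-deriving the overflow test. The conditions getX86XALUOOp picks map to
// these scalar facts (unsigned wraparound is well defined in C++, so the
// sketch is free of undefined behaviour; helper names are hypothetical):
#include <cstdint>

static bool uaddOverflows(uint32_t A, uint32_t B) {
  // UADDO -> COND_B: the carry flag is set exactly when the sum wrapped,
  // i.e. when the truncated result is smaller than one of the inputs.
  return (uint32_t)(A + B) < A;
}

static bool saddOverflows(int32_t A, int32_t B) {
  // SADDO -> COND_O: signed overflow happens when both inputs share a sign
  // and the wrapped sum has the opposite sign.
  uint32_t UA = (uint32_t)A, UB = (uint32_t)B, Sum = UA + UB;
  return ((~(UA ^ UB) & (UA ^ Sum)) >> 31) != 0;
}
// Note the COND_E special case above for "x + 1": an increment wraps exactly
// when the result is zero, so ZF can stand in for the carry there.
// ----------------------------------------------------------------------------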
26065
26066/// Return true if opcode is a X86 logical comparison.
26067static bool isX86LogicalCmp(SDValue Op) {
26068 unsigned Opc = Op.getOpcode();
26069 if (Opc == X86ISD::CMP || Opc == X86ISD::COMI || Opc == X86ISD::UCOMI ||
26070 Opc == X86ISD::FCMP)
26071 return true;
26072 if (Op.getResNo() == 1 &&
26073 (Opc == X86ISD::ADD || Opc == X86ISD::SUB || Opc == X86ISD::ADC ||
26074 Opc == X86ISD::SBB || Opc == X86ISD::SMUL || Opc == X86ISD::UMUL ||
26075 Opc == X86ISD::OR || Opc == X86ISD::XOR || Opc == X86ISD::AND))
26076 return true;
26077
26078 return false;
26079}
26080
26081static bool isTruncWithZeroHighBitsInput(SDValue V, SelectionDAG &DAG) {
26082 if (V.getOpcode() != ISD::TRUNCATE)
26083 return false;
26084
26085 SDValue VOp0 = V.getOperand(0);
26086 unsigned InBits = VOp0.getValueSizeInBits();
26087 unsigned Bits = V.getValueSizeInBits();
26088 return DAG.MaskedValueIsZero(VOp0, APInt::getHighBitsSet(InBits,InBits-Bits));
26089}
26090
26091SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
26092 bool AddTest = true;
26093 SDValue Cond = Op.getOperand(0);
26094 SDValue Op1 = Op.getOperand(1);
26095 SDValue Op2 = Op.getOperand(2);
26096 SDLoc DL(Op);
26097 MVT VT = Op1.getSimpleValueType();
26098 SDValue CC;
26099
26100 if (isSoftFP16(VT)) {
26101 MVT NVT = VT.changeTypeToInteger();
26102 return DAG.getBitcast(VT, DAG.getNode(ISD::SELECT, DL, NVT, Cond,
26103 DAG.getBitcast(NVT, Op1),
26104 DAG.getBitcast(NVT, Op2)));
26105 }
26106
26107 // Lower FP selects into a CMP/AND/ANDN/OR sequence when the necessary SSE ops
26108   // are available, or into VBLENDV if AVX is available.
26109 // Otherwise FP cmovs get lowered into a less efficient branch sequence later.
26110 if (Cond.getOpcode() == ISD::SETCC && isScalarFPTypeInSSEReg(VT) &&
26111 VT == Cond.getOperand(0).getSimpleValueType() && Cond->hasOneUse()) {
26112 SDValue CondOp0 = Cond.getOperand(0), CondOp1 = Cond.getOperand(1);
26113 bool IsAlwaysSignaling;
26114 unsigned SSECC =
26115 translateX86FSETCC(cast<CondCodeSDNode>(Cond.getOperand(2))->get(),
26116 CondOp0, CondOp1, IsAlwaysSignaling);
26117
26118 if (Subtarget.hasAVX512()) {
26119 SDValue Cmp =
26120 DAG.getNode(X86ISD::FSETCCM, DL, MVT::v1i1, CondOp0, CondOp1,
26121 DAG.getTargetConstant(SSECC, DL, MVT::i8));
26122       assert(!VT.isVector() && "Not a scalar type?");
26123 return DAG.getNode(X86ISD::SELECTS, DL, VT, Cmp, Op1, Op2);
26124 }
26125
26126 if (SSECC < 8 || Subtarget.hasAVX()) {
26127 SDValue Cmp = DAG.getNode(X86ISD::FSETCC, DL, VT, CondOp0, CondOp1,
26128 DAG.getTargetConstant(SSECC, DL, MVT::i8));
26129
26130 // If we have AVX, we can use a variable vector select (VBLENDV) instead
26131 // of 3 logic instructions for size savings and potentially speed.
26132 // Unfortunately, there is no scalar form of VBLENDV.
26133
26134 // If either operand is a +0.0 constant, don't try this. We can expect to
26135 // optimize away at least one of the logic instructions later in that
26136 // case, so that sequence would be faster than a variable blend.
26137
26138 // BLENDV was introduced with SSE 4.1, but the 2 register form implicitly
26139 // uses XMM0 as the selection register. That may need just as many
26140 // instructions as the AND/ANDN/OR sequence due to register moves, so
26141 // don't bother.
26142 if (Subtarget.hasAVX() && !isNullFPConstant(Op1) &&
26143 !isNullFPConstant(Op2)) {
26144 // Convert to vectors, do a VSELECT, and convert back to scalar.
26145 // All of the conversions should be optimized away.
26146 MVT VecVT = VT == MVT::f32 ? MVT::v4f32 : MVT::v2f64;
26147 SDValue VOp1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Op1);
26148 SDValue VOp2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Op2);
26149 SDValue VCmp = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Cmp);
26150
26151 MVT VCmpVT = VT == MVT::f32 ? MVT::v4i32 : MVT::v2i64;
26152 VCmp = DAG.getBitcast(VCmpVT, VCmp);
26153
26154 SDValue VSel = DAG.getSelect(DL, VecVT, VCmp, VOp1, VOp2);
26155
26156 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT,
26157 VSel, DAG.getIntPtrConstant(0, DL));
26158 }
26159 SDValue AndN = DAG.getNode(X86ISD::FANDN, DL, VT, Cmp, Op2);
26160 SDValue And = DAG.getNode(X86ISD::FAND, DL, VT, Cmp, Op1);
26161 return DAG.getNode(X86ISD::FOR, DL, VT, AndN, And);
26162 }
26163 }
26164
26165 // AVX512 fallback is to lower selects of scalar floats to masked moves.
26166 if (isScalarFPTypeInSSEReg(VT) && Subtarget.hasAVX512()) {
26167 SDValue Cmp = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v1i1, Cond);
26168 return DAG.getNode(X86ISD::SELECTS, DL, VT, Cmp, Op1, Op2);
26169 }
26170
26171 if (Cond.getOpcode() == ISD::SETCC &&
26172 !isSoftFP16(Cond.getOperand(0).getSimpleValueType())) {
26173 if (SDValue NewCond = LowerSETCC(Cond, DAG)) {
26174 Cond = NewCond;
26175 // If the condition was updated, it's possible that the operands of the
26176 // select were also updated (for example, EmitTest has a RAUW). Refresh
26177 // the local references to the select operands in case they got stale.
26178 Op1 = Op.getOperand(1);
26179 Op2 = Op.getOperand(2);
26180 }
26181 }
26182
26183 // (select (x == 0), -1, y) -> (sign_bit (x - 1)) | y
26184 // (select (x == 0), y, -1) -> ~(sign_bit (x - 1)) | y
26185 // (select (x != 0), y, -1) -> (sign_bit (x - 1)) | y
26186 // (select (x != 0), -1, y) -> ~(sign_bit (x - 1)) | y
26187 // (select (and (x , 0x1) == 0), y, (z ^ y) ) -> (-(and (x , 0x1)) & z ) ^ y
26188 // (select (and (x , 0x1) == 0), y, (z | y) ) -> (-(and (x , 0x1)) & z ) | y
26189 // (select (x > 0), x, 0) -> (~(x >> (size_in_bits(x)-1))) & x
26190 // (select (x < 0), x, 0) -> ((x >> (size_in_bits(x)-1))) & x
26191 if (Cond.getOpcode() == X86ISD::SETCC &&
26192 Cond.getOperand(1).getOpcode() == X86ISD::CMP &&
26193 isNullConstant(Cond.getOperand(1).getOperand(1))) {
26194 SDValue Cmp = Cond.getOperand(1);
26195 SDValue CmpOp0 = Cmp.getOperand(0);
26196 unsigned CondCode = Cond.getConstantOperandVal(0);
26197
26198 // Special handling for __builtin_ffs(X) - 1 pattern which looks like
26199 // (select (seteq X, 0), -1, (cttz_zero_undef X)). Disable the special
26200     // handling to keep the CMP with 0. This should be removed by
26201 // optimizeCompareInst by using the flags from the BSR/TZCNT used for the
26202 // cttz_zero_undef.
26203 auto MatchFFSMinus1 = [&](SDValue Op1, SDValue Op2) {
26204 return (Op1.getOpcode() == ISD::CTTZ_ZERO_UNDEF && Op1.hasOneUse() &&
26205 Op1.getOperand(0) == CmpOp0 && isAllOnesConstant(Op2));
26206 };
26207 if (Subtarget.canUseCMOV() && (VT == MVT::i32 || VT == MVT::i64) &&
26208 ((CondCode == X86::COND_NE && MatchFFSMinus1(Op1, Op2)) ||
26209 (CondCode == X86::COND_E && MatchFFSMinus1(Op2, Op1)))) {
26210 // Keep Cmp.
26211 } else if ((isAllOnesConstant(Op1) || isAllOnesConstant(Op2)) &&
26212 (CondCode == X86::COND_E || CondCode == X86::COND_NE)) {
26213 SDValue Y = isAllOnesConstant(Op2) ? Op1 : Op2;
26214 SDVTList CmpVTs = DAG.getVTList(CmpOp0.getValueType(), MVT::i32);
26215
26216 // 'X - 1' sets the carry flag if X == 0.
26217 // '0 - X' sets the carry flag if X != 0.
26218 // Convert the carry flag to a -1/0 mask with sbb:
26219 // select (X != 0), -1, Y --> 0 - X; or (sbb), Y
26220 // select (X == 0), Y, -1 --> 0 - X; or (sbb), Y
26221 // select (X != 0), Y, -1 --> X - 1; or (sbb), Y
26222 // select (X == 0), -1, Y --> X - 1; or (sbb), Y
26223 SDValue Sub;
26224 if (isAllOnesConstant(Op1) == (CondCode == X86::COND_NE)) {
26225 SDValue Zero = DAG.getConstant(0, DL, CmpOp0.getValueType());
26226 Sub = DAG.getNode(X86ISD::SUB, DL, CmpVTs, Zero, CmpOp0);
26227 } else {
26228 SDValue One = DAG.getConstant(1, DL, CmpOp0.getValueType());
26229 Sub = DAG.getNode(X86ISD::SUB, DL, CmpVTs, CmpOp0, One);
26230 }
26231 SDValue SBB = DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
26232 DAG.getTargetConstant(X86::COND_B, DL, MVT::i8),
26233 Sub.getValue(1));
26234 return DAG.getNode(ISD::OR, DL, VT, SBB, Y);
26235 } else if (!Subtarget.canUseCMOV() && CondCode == X86::COND_E &&
26236 CmpOp0.getOpcode() == ISD::AND &&
26237 isOneConstant(CmpOp0.getOperand(1))) {
26238 SDValue Src1, Src2;
26239       // True if Op2 is an XOR or OR operator and one of its operands
26240       // equals Op1:
26241       //   (a, a op b) || (b, a op b)
26242 auto isOrXorPattern = [&]() {
26243 if ((Op2.getOpcode() == ISD::XOR || Op2.getOpcode() == ISD::OR) &&
26244 (Op2.getOperand(0) == Op1 || Op2.getOperand(1) == Op1)) {
26245 Src1 =
26246 Op2.getOperand(0) == Op1 ? Op2.getOperand(1) : Op2.getOperand(0);
26247 Src2 = Op1;
26248 return true;
26249 }
26250 return false;
26251 };
26252
26253 if (isOrXorPattern()) {
26254 SDValue Neg;
26255 unsigned int CmpSz = CmpOp0.getSimpleValueType().getSizeInBits();
26256         // We need a mask of all zeros or all ones with the same size as
26257         // the other operands.
26258 if (CmpSz > VT.getSizeInBits())
26259 Neg = DAG.getNode(ISD::TRUNCATE, DL, VT, CmpOp0);
26260 else if (CmpSz < VT.getSizeInBits())
26261 Neg = DAG.getNode(ISD::AND, DL, VT,
26262 DAG.getNode(ISD::ANY_EXTEND, DL, VT, CmpOp0.getOperand(0)),
26263 DAG.getConstant(1, DL, VT));
26264 else
26265 Neg = CmpOp0;
26266 SDValue Mask = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT),
26267 Neg); // -(and (x, 0x1))
26268 SDValue And = DAG.getNode(ISD::AND, DL, VT, Mask, Src1); // Mask & z
26269 return DAG.getNode(Op2.getOpcode(), DL, VT, And, Src2); // And Op y
26270 }
26271 } else if ((VT == MVT::i32 || VT == MVT::i64) && isNullConstant(Op2) &&
26272 Cmp.getNode()->hasOneUse() && (CmpOp0 == Op1) &&
26273 ((CondCode == X86::COND_S) || // smin(x, 0)
26274 (CondCode == X86::COND_G && hasAndNot(Op1)))) { // smax(x, 0)
26275 // (select (x < 0), x, 0) -> ((x >> (size_in_bits(x)-1))) & x
26276 //
26277 // If the comparison is testing for a positive value, we have to invert
26278 // the sign bit mask, so only do that transform if the target has a
26279 // bitwise 'and not' instruction (the invert is free).
26280 // (select (x > 0), x, 0) -> (~(x >> (size_in_bits(x)-1))) & x
26281 unsigned ShCt = VT.getSizeInBits() - 1;
26282 SDValue ShiftAmt = DAG.getConstant(ShCt, DL, VT);
26283 SDValue Shift = DAG.getNode(ISD::SRA, DL, VT, Op1, ShiftAmt);
26284 if (CondCode == X86::COND_G)
26285 Shift = DAG.getNOT(DL, Shift, VT);
26286 return DAG.getNode(ISD::AND, DL, VT, Shift, Op1);
26287 }
26288 }
26289
26290 // Look past (and (setcc_carry (cmp ...)), 1).
26291 if (Cond.getOpcode() == ISD::AND &&
26292 Cond.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY &&
26293 isOneConstant(Cond.getOperand(1)))
26294 Cond = Cond.getOperand(0);
26295
26296 // If condition flag is set by a X86ISD::CMP, then use it as the condition
26297 // setting operand in place of the X86ISD::SETCC.
26298 unsigned CondOpcode = Cond.getOpcode();
26299 if (CondOpcode == X86ISD::SETCC ||
26300 CondOpcode == X86ISD::SETCC_CARRY) {
26301 CC = Cond.getOperand(0);
26302
26303 SDValue Cmp = Cond.getOperand(1);
26304 bool IllegalFPCMov = false;
26305 if (VT.isFloatingPoint() && !VT.isVector() &&
26306 !isScalarFPTypeInSSEReg(VT) && Subtarget.canUseCMOV()) // FPStack?
26307 IllegalFPCMov = !hasFPCMov(cast<ConstantSDNode>(CC)->getSExtValue());
26308
26309 if ((isX86LogicalCmp(Cmp) && !IllegalFPCMov) ||
26310 Cmp.getOpcode() == X86ISD::BT) { // FIXME
26311 Cond = Cmp;
26312 AddTest = false;
26313 }
26314 } else if (CondOpcode == ISD::USUBO || CondOpcode == ISD::SSUBO ||
26315 CondOpcode == ISD::UADDO || CondOpcode == ISD::SADDO ||
26316 CondOpcode == ISD::UMULO || CondOpcode == ISD::SMULO) {
26317 SDValue Value;
26318 X86::CondCode X86Cond;
26319 std::tie(Value, Cond) = getX86XALUOOp(X86Cond, Cond.getValue(0), DAG);
26320
26321 CC = DAG.getTargetConstant(X86Cond, DL, MVT::i8);
26322 AddTest = false;
26323 }
26324
26325 if (AddTest) {
26326 // Look past the truncate if the high bits are known zero.
26327 if (isTruncWithZeroHighBitsInput(Cond, DAG))
26328 Cond = Cond.getOperand(0);
26329
26330 // We know the result of AND is compared against zero. Try to match
26331 // it to BT.
26332 if (Cond.getOpcode() == ISD::AND && Cond.hasOneUse()) {
26333 X86::CondCode X86CondCode;
26334 if (SDValue BT = LowerAndToBT(Cond, ISD::SETNE, DL, DAG, X86CondCode)) {
26335 CC = DAG.getTargetConstant(X86CondCode, DL, MVT::i8);
26336 Cond = BT;
26337 AddTest = false;
26338 }
26339 }
26340 }
26341
26342 if (AddTest) {
26343 CC = DAG.getTargetConstant(X86::COND_NE, DL, MVT::i8);
26344 Cond = EmitTest(Cond, X86::COND_NE, DL, DAG, Subtarget);
26345 }
26346
26347 // a < b ? -1 : 0 -> RES = ~setcc_carry
26348 // a < b ? 0 : -1 -> RES = setcc_carry
26349 // a >= b ? -1 : 0 -> RES = setcc_carry
26350 // a >= b ? 0 : -1 -> RES = ~setcc_carry
26351 if (Cond.getOpcode() == X86ISD::SUB) {
26352 unsigned CondCode = cast<ConstantSDNode>(CC)->getZExtValue();
26353
26354 if ((CondCode == X86::COND_AE || CondCode == X86::COND_B) &&
26355 (isAllOnesConstant(Op1) || isAllOnesConstant(Op2)) &&
26356 (isNullConstant(Op1) || isNullConstant(Op2))) {
26357 SDValue Res =
26358 DAG.getNode(X86ISD::SETCC_CARRY, DL, Op.getValueType(),
26359 DAG.getTargetConstant(X86::COND_B, DL, MVT::i8), Cond);
26360 if (isAllOnesConstant(Op1) != (CondCode == X86::COND_B))
26361 return DAG.getNOT(DL, Res, Res.getValueType());
26362 return Res;
26363 }
26364 }
26365
26366   // X86 doesn't have an i8 cmov. If both operands are the result of a truncate,
26367 // widen the cmov and push the truncate through. This avoids introducing a new
26368 // branch during isel and doesn't add any extensions.
26369 if (Op.getValueType() == MVT::i8 &&
26370 Op1.getOpcode() == ISD::TRUNCATE && Op2.getOpcode() == ISD::TRUNCATE) {
26371 SDValue T1 = Op1.getOperand(0), T2 = Op2.getOperand(0);
26372 if (T1.getValueType() == T2.getValueType() &&
26373 // Exclude CopyFromReg to avoid partial register stalls.
26374 T1.getOpcode() != ISD::CopyFromReg && T2.getOpcode()!=ISD::CopyFromReg){
26375 SDValue Cmov = DAG.getNode(X86ISD::CMOV, DL, T1.getValueType(), T2, T1,
26376 CC, Cond);
26377 return DAG.getNode(ISD::TRUNCATE, DL, Op.getValueType(), Cmov);
26378 }
26379 }
26380
26381 // Or finally, promote i8 cmovs if we have CMOV,
26382 // or i16 cmovs if it won't prevent folding a load.
26383 // FIXME: we should not limit promotion of i8 case to only when the CMOV is
26384   // legal, but EmitLoweredSelect() cannot deal with these extensions
26385 // being inserted between two CMOV's. (in i16 case too TBN)
26386 // https://bugs.llvm.org/show_bug.cgi?id=40974
26387 if ((Op.getValueType() == MVT::i8 && Subtarget.canUseCMOV()) ||
26388 (Op.getValueType() == MVT::i16 && !X86::mayFoldLoad(Op1, Subtarget) &&
26389 !X86::mayFoldLoad(Op2, Subtarget))) {
26390 Op1 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Op1);
26391 Op2 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Op2);
26392 SDValue Ops[] = { Op2, Op1, CC, Cond };
26393 SDValue Cmov = DAG.getNode(X86ISD::CMOV, DL, MVT::i32, Ops);
26394 return DAG.getNode(ISD::TRUNCATE, DL, Op.getValueType(), Cmov);
26395 }
26396
26397 // X86ISD::CMOV means set the result (which is operand 1) to the RHS if
26398 // condition is true.
26399 SDValue Ops[] = { Op2, Op1, CC, Cond };
26400 return DAG.getNode(X86ISD::CMOV, DL, Op.getValueType(), Ops);
26401}
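
// --- Illustrative sketch, not part of the LLVM source ----------------------
// The "select (X != 0), -1, Y" family handled in LowerSELECT avoids a branch
// by turning the borrow of "X - 1" or "0 - X" into an all-ones/all-zeros mask
// (SBB) and OR-ing Y into it. The same identities in scalar C++, with
// hypothetical helper names and well-defined unsigned wraparound:
#include <cstdint>

static uint32_t selectAllOnesIfNonZero(uint32_t X, uint32_t Y) {
  uint32_t Borrow = (0u < X) ? 1u : 0u;  // CF after "0 - X": set iff X != 0
  uint32_t Mask = 0u - Borrow;           // SBB: 0xFFFFFFFF or 0
  return Mask | Y;                       // == (X != 0) ? 0xFFFFFFFF : Y
}

static uint32_t selectAllOnesIfZero(uint32_t X, uint32_t Y) {
  uint32_t Borrow = (X < 1u) ? 1u : 0u;  // CF after "X - 1": set iff X == 0
  uint32_t Mask = 0u - Borrow;
  return Mask | Y;                       // == (X == 0) ? 0xFFFFFFFF : Y
}
// ----------------------------------------------------------------------------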
26402
26403static SDValue LowerSIGN_EXTEND_Mask(SDValue Op,
26404 const X86Subtarget &Subtarget,
26405 SelectionDAG &DAG) {
26406 MVT VT = Op->getSimpleValueType(0);
26407 SDValue In = Op->getOperand(0);
26408 MVT InVT = In.getSimpleValueType();
26409   assert(InVT.getVectorElementType() == MVT::i1 && "Unexpected input type!");
26410 MVT VTElt = VT.getVectorElementType();
26411 SDLoc dl(Op);
26412
26413 unsigned NumElts = VT.getVectorNumElements();
26414
26415 // Extend VT if the scalar type is i8/i16 and BWI is not supported.
26416 MVT ExtVT = VT;
26417 if (!Subtarget.hasBWI() && VTElt.getSizeInBits() <= 16) {
26418 // If v16i32 is to be avoided, we'll need to split and concatenate.
26419 if (NumElts == 16 && !Subtarget.canExtendTo512DQ())
26420 return SplitAndExtendv16i1(Op.getOpcode(), VT, In, dl, DAG);
26421
26422 ExtVT = MVT::getVectorVT(MVT::i32, NumElts);
26423 }
26424
26425 // Widen to 512-bits if VLX is not supported.
26426 MVT WideVT = ExtVT;
26427 if (!ExtVT.is512BitVector() && !Subtarget.hasVLX()) {
26428 NumElts *= 512 / ExtVT.getSizeInBits();
26429 InVT = MVT::getVectorVT(MVT::i1, NumElts);
26430 In = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, InVT, DAG.getUNDEF(InVT),
26431 In, DAG.getIntPtrConstant(0, dl));
26432 WideVT = MVT::getVectorVT(ExtVT.getVectorElementType(), NumElts);
26433 }
26434
26435 SDValue V;
26436 MVT WideEltVT = WideVT.getVectorElementType();
26437 if ((Subtarget.hasDQI() && WideEltVT.getSizeInBits() >= 32) ||
26438 (Subtarget.hasBWI() && WideEltVT.getSizeInBits() <= 16)) {
26439 V = DAG.getNode(Op.getOpcode(), dl, WideVT, In);
26440 } else {
26441 SDValue NegOne = DAG.getConstant(-1, dl, WideVT);
26442 SDValue Zero = DAG.getConstant(0, dl, WideVT);
26443 V = DAG.getSelect(dl, WideVT, In, NegOne, Zero);
26444 }
26445
26446 // Truncate if we had to extend i16/i8 above.
26447 if (VT != ExtVT) {
26448 WideVT = MVT::getVectorVT(VTElt, NumElts);
26449 V = DAG.getNode(ISD::TRUNCATE, dl, WideVT, V);
26450 }
26451
26452 // Extract back to 128/256-bit if we widened.
26453 if (WideVT != VT)
26454 V = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, V,
26455 DAG.getIntPtrConstant(0, dl));
26456
26457 return V;
26458}
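
// --- Illustrative sketch, not part of the LLVM source ----------------------
// Sign-extending a mask element (i1) just replicates its single bit across
// the destination lane: true becomes all ones, false becomes zero. That is
// why the fallback above can use a per-lane select between -1 and 0. One
// scalar lane of that operation (hypothetical helper name):
#include <cstdint>

static int32_t signExtendMaskBit(bool Bit) {
  return -static_cast<int32_t>(Bit);   // 0 -> 0, 1 -> -1 (all ones)
}
// ----------------------------------------------------------------------------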
26459
26460static SDValue LowerANY_EXTEND(SDValue Op, const X86Subtarget &Subtarget,
26461 SelectionDAG &DAG) {
26462 SDValue In = Op->getOperand(0);
26463 MVT InVT = In.getSimpleValueType();
26464
26465 if (InVT.getVectorElementType() == MVT::i1)
26466 return LowerSIGN_EXTEND_Mask(Op, Subtarget, DAG);
26467
26468   assert(Subtarget.hasAVX() && "Expected AVX support");
26469 return LowerAVXExtend(Op, DAG, Subtarget);
26470}
26471
26472// Lowering for SIGN_EXTEND_VECTOR_INREG and ZERO_EXTEND_VECTOR_INREG.
26473// For sign extend this needs to handle all vector sizes and SSE4.1 and
26474// non-SSE4.1 targets. For zero extend this should only handle inputs of
26475// MVT::v64i8 when BWI is not supported, but AVX512 is.
26476static SDValue LowerEXTEND_VECTOR_INREG(SDValue Op,
26477 const X86Subtarget &Subtarget,
26478 SelectionDAG &DAG) {
26479 SDValue In = Op->getOperand(0);
26480 MVT VT = Op->getSimpleValueType(0);
26481 MVT InVT = In.getSimpleValueType();
26482
26483 MVT SVT = VT.getVectorElementType();
26484 MVT InSVT = InVT.getVectorElementType();
26485   assert(SVT.getFixedSizeInBits() > InSVT.getFixedSizeInBits());
26486
26487 if (SVT != MVT::i64 && SVT != MVT::i32 && SVT != MVT::i16)
26488 return SDValue();
26489 if (InSVT != MVT::i32 && InSVT != MVT::i16 && InSVT != MVT::i8)
26490 return SDValue();
26491 if (!(VT.is128BitVector() && Subtarget.hasSSE2()) &&
26492 !(VT.is256BitVector() && Subtarget.hasAVX()) &&
26493 !(VT.is512BitVector() && Subtarget.hasAVX512()))
26494 return SDValue();
26495
26496 SDLoc dl(Op);
26497 unsigned Opc = Op.getOpcode();
26498 unsigned NumElts = VT.getVectorNumElements();
26499
26500 // For 256-bit vectors, we only need the lower (128-bit) half of the input.
26501 // For 512-bit vectors, we need 128-bits or 256-bits.
26502 if (InVT.getSizeInBits() > 128) {
26503 // Input needs to be at least the same number of elements as output, and
26504 // at least 128-bits.
26505 int InSize = InSVT.getSizeInBits() * NumElts;
26506 In = extractSubVector(In, 0, DAG, dl, std::max(InSize, 128));
26507 InVT = In.getSimpleValueType();
26508 }
26509
26510 // SSE41 targets can use the pmov[sz]x* instructions directly for 128-bit results,
26511   // so they are legal and shouldn't occur here. AVX2/AVX512 pmovsx* instructions still
26512 // need to be handled here for 256/512-bit results.
26513 if (Subtarget.hasInt256()) {
26514     assert(VT.getSizeInBits() > 128 && "Unexpected 128-bit vector extension");
26515
26516 if (InVT.getVectorNumElements() != NumElts)
26517 return DAG.getNode(Op.getOpcode(), dl, VT, In);
26518
26519 // FIXME: Apparently we create inreg operations that could be regular
26520 // extends.
26521 unsigned ExtOpc =
26522 Opc == ISD::SIGN_EXTEND_VECTOR_INREG ? ISD::SIGN_EXTEND
26523 : ISD::ZERO_EXTEND;
26524 return DAG.getNode(ExtOpc, dl, VT, In);
26525 }
26526
26527 // pre-AVX2 256-bit extensions need to be split into 128-bit instructions.
26528 if (Subtarget.hasAVX()) {
26529     assert(VT.is256BitVector() && "256-bit vector expected");
26530 MVT HalfVT = VT.getHalfNumVectorElementsVT();
26531 int HalfNumElts = HalfVT.getVectorNumElements();
26532
26533 unsigned NumSrcElts = InVT.getVectorNumElements();
26534 SmallVector<int, 16> HiMask(NumSrcElts, SM_SentinelUndef);
26535 for (int i = 0; i != HalfNumElts; ++i)
26536 HiMask[i] = HalfNumElts + i;
26537
26538 SDValue Lo = DAG.getNode(Opc, dl, HalfVT, In);
26539 SDValue Hi = DAG.getVectorShuffle(InVT, dl, In, DAG.getUNDEF(InVT), HiMask);
26540 Hi = DAG.getNode(Opc, dl, HalfVT, Hi);
26541 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
26542 }
26543
26544 // We should only get here for sign extend.
26545   assert(Opc == ISD::SIGN_EXTEND_VECTOR_INREG && "Unexpected opcode!");
26546   assert(VT.is128BitVector() && InVT.is128BitVector() && "Unexpected VTs");
26547
26548 // pre-SSE41 targets unpack lower lanes and then sign-extend using SRAI.
26549 SDValue Curr = In;
26550 SDValue SignExt = Curr;
26551
26552 // As SRAI is only available on i16/i32 types, we expand only up to i32
26553 // and handle i64 separately.
26554 if (InVT != MVT::v4i32) {
26555 MVT DestVT = VT == MVT::v2i64 ? MVT::v4i32 : VT;
26556
26557 unsigned DestWidth = DestVT.getScalarSizeInBits();
26558 unsigned Scale = DestWidth / InSVT.getSizeInBits();
26559
26560 unsigned InNumElts = InVT.getVectorNumElements();
26561 unsigned DestElts = DestVT.getVectorNumElements();
26562
26563 // Build a shuffle mask that takes each input element and places it in the
26564 // MSBs of the new element size.
26565 SmallVector<int, 16> Mask(InNumElts, SM_SentinelUndef);
26566 for (unsigned i = 0; i != DestElts; ++i)
26567 Mask[i * Scale + (Scale - 1)] = i;
26568
26569 Curr = DAG.getVectorShuffle(InVT, dl, In, In, Mask);
26570 Curr = DAG.getBitcast(DestVT, Curr);
26571
26572 unsigned SignExtShift = DestWidth - InSVT.getSizeInBits();
26573 SignExt = DAG.getNode(X86ISD::VSRAI, dl, DestVT, Curr,
26574 DAG.getTargetConstant(SignExtShift, dl, MVT::i8));
26575 }
26576
26577 if (VT == MVT::v2i64) {
26578     assert(Curr.getValueType() == MVT::v4i32 && "Unexpected input VT");
26579 SDValue Zero = DAG.getConstant(0, dl, MVT::v4i32);
26580 SDValue Sign = DAG.getSetCC(dl, MVT::v4i32, Zero, Curr, ISD::SETGT);
26581 SignExt = DAG.getVectorShuffle(MVT::v4i32, dl, SignExt, Sign, {0, 4, 1, 5});
26582 SignExt = DAG.getBitcast(VT, SignExt);
26583 }
26584
26585 return SignExt;
26586}
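
// --- Illustrative sketch, not part of the LLVM source ----------------------
// The pre-SSE4.1 path above sign extends by shuffling each narrow element
// into the most significant bits of the wider lane and then shifting right
// arithmetically by the size difference. One scalar lane of the i8 -> i32
// case (relying on '>>' of int32_t being an arithmetic shift, which C++20
// guarantees and the targets this code supports provide; the helper name is
// hypothetical):
#include <cstdint>

static int32_t signExtendI8ViaShift(uint8_t Byte) {
  uint32_t Widened = (uint32_t)Byte << 24;   // element placed in the MSBs
  return (int32_t)Widened >> 24;             // VSRAI: copies the sign bit down
}
// signExtendI8ViaShift(0x80) == -128 and signExtendI8ViaShift(0x7F) == 127,
// matching a direct int8_t -> int32_t conversion.
// ----------------------------------------------------------------------------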
26587
26588static SDValue LowerSIGN_EXTEND(SDValue Op, const X86Subtarget &Subtarget,
26589 SelectionDAG &DAG) {
26590 MVT VT = Op->getSimpleValueType(0);
26591 SDValue In = Op->getOperand(0);
26592 MVT InVT = In.getSimpleValueType();
26593 SDLoc dl(Op);
26594
26595 if (InVT.getVectorElementType() == MVT::i1)
26596 return LowerSIGN_EXTEND_Mask(Op, Subtarget, DAG);
26597
26598   assert(VT.isVector() && InVT.isVector() && "Expected vector type");
26599   assert(VT.getVectorNumElements() == InVT.getVectorNumElements() &&
26600          "Expected same number of elements");
26601   assert((VT.getVectorElementType() == MVT::i16 ||
26602           VT.getVectorElementType() == MVT::i32 ||
26603           VT.getVectorElementType() == MVT::i64) &&
26604          "Unexpected element type");
26605   assert((InVT.getVectorElementType() == MVT::i8 ||
26606           InVT.getVectorElementType() == MVT::i16 ||
26607           InVT.getVectorElementType() == MVT::i32) &&
26608          "Unexpected element type");
26609
26610 if (VT == MVT::v32i16 && !Subtarget.hasBWI()) {
26611     assert(InVT == MVT::v32i8 && "Unexpected VT!");
26612 return splitVectorIntUnary(Op, DAG);
26613 }
26614
26615 if (Subtarget.hasInt256())
26616 return Op;
26617
26618   // Optimize vectors in AVX mode:
26619   // sign extend v8i16 to v8i32 and
26620   // v4i32 to v4i64.
26621   //
26622   // Divide the input vector into two parts;
26623   // for v4i32 the high shuffle mask will be {2, 3, -1, -1}.
26624   // Use the vpmovsx instruction to extend v4i32 -> v2i64 and v8i16 -> v4i32,
26625   // then concat the vectors back to the original VT.
26626 MVT HalfVT = VT.getHalfNumVectorElementsVT();
26627 SDValue OpLo = DAG.getNode(ISD::SIGN_EXTEND_VECTOR_INREG, dl, HalfVT, In);
26628
26629 unsigned NumElems = InVT.getVectorNumElements();
26630 SmallVector<int,8> ShufMask(NumElems, -1);
26631 for (unsigned i = 0; i != NumElems/2; ++i)
26632 ShufMask[i] = i + NumElems/2;
26633
26634 SDValue OpHi = DAG.getVectorShuffle(InVT, dl, In, In, ShufMask);
26635 OpHi = DAG.getNode(ISD::SIGN_EXTEND_VECTOR_INREG, dl, HalfVT, OpHi);
26636
26637 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpHi);
26638}
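
// --- Illustrative sketch, not part of the LLVM source ----------------------
// The split in LowerSIGN_EXTEND builds a shuffle mask that moves the high
// half of the input into the low positions and leaves the rest undef (-1),
// e.g. {2, 3, -1, -1} for v4i32 as the comment above notes. The same mask
// construction in plain C++ (hypothetical helper name):
#include <vector>

static std::vector<int> highHalfMask(unsigned NumElems) {
  std::vector<int> Mask(NumElems, -1);      // -1 plays the role of "undef"
  for (unsigned I = 0; I != NumElems / 2; ++I)
    Mask[I] = (int)(I + NumElems / 2);
  return Mask;                              // NumElems == 4 -> {2, 3, -1, -1}
}
// ----------------------------------------------------------------------------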
26639
26640/// Change a vector store into a pair of half-size vector stores.
26641static SDValue splitVectorStore(StoreSDNode *Store, SelectionDAG &DAG) {
26642 SDValue StoredVal = Store->getValue();
26643   assert((StoredVal.getValueType().is256BitVector() ||
26644           StoredVal.getValueType().is512BitVector()) &&
26645          "Expecting 256/512-bit op");
26646
26647 // Splitting volatile memory ops is not allowed unless the operation was not
26648 // legal to begin with. Assume the input store is legal (this transform is
26649 // only used for targets with AVX). Note: It is possible that we have an
26650 // illegal type like v2i128, and so we could allow splitting a volatile store
26651 // in that case if that is important.
26652 if (!Store->isSimple())
26653 return SDValue();
26654
26655 SDLoc DL(Store);
26656 SDValue Value0, Value1;
26657 std::tie(Value0, Value1) = splitVector(StoredVal, DAG, DL);
26658 unsigned HalfOffset = Value0.getValueType().getStoreSize();
26659 SDValue Ptr0 = Store->getBasePtr();
26660 SDValue Ptr1 =
26661 DAG.getMemBasePlusOffset(Ptr0, TypeSize::Fixed(HalfOffset), DL);
26662 SDValue Ch0 =
26663 DAG.getStore(Store->getChain(), DL, Value0, Ptr0, Store->getPointerInfo(),
26664 Store->getOriginalAlign(),
26665 Store->getMemOperand()->getFlags());
26666 SDValue Ch1 = DAG.getStore(Store->getChain(), DL, Value1, Ptr1,
26667 Store->getPointerInfo().getWithOffset(HalfOffset),
26668 Store->getOriginalAlign(),
26669 Store->getMemOperand()->getFlags());
26670 return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Ch0, Ch1);
26671}
26672
26673/// Scalarize a vector store, bitcasting to TargetVT to determine the scalar
26674/// type.
26675static SDValue scalarizeVectorStore(StoreSDNode *Store, MVT StoreVT,
26676 SelectionDAG &DAG) {
26677 SDValue StoredVal = Store->getValue();
26678   assert(StoreVT.is128BitVector() &&
26679          StoredVal.getValueType().is128BitVector() && "Expecting 128-bit op");
26680 StoredVal = DAG.getBitcast(StoreVT, StoredVal);
26681
26682 // Splitting volatile memory ops is not allowed unless the operation was not
26683 // legal to begin with. We are assuming the input op is legal (this transform
26684 // is only used for targets with AVX).
26685 if (!Store->isSimple())
26686 return SDValue();
26687
26688 MVT StoreSVT = StoreVT.getScalarType();
26689 unsigned NumElems = StoreVT.getVectorNumElements();
26690 unsigned ScalarSize = StoreSVT.getStoreSize();
26691
26692 SDLoc DL(Store);
26693 SmallVector<SDValue, 4> Stores;
26694 for (unsigned i = 0; i != NumElems; ++i) {
26695 unsigned Offset = i * ScalarSize;
26696 SDValue Ptr = DAG.getMemBasePlusOffset(Store->getBasePtr(),
26697 TypeSize::Fixed(Offset), DL);
26698 SDValue Scl = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, StoreSVT, StoredVal,
26699 DAG.getIntPtrConstant(i, DL));
26700 SDValue Ch = DAG.getStore(Store->getChain(), DL, Scl, Ptr,
26701 Store->getPointerInfo().getWithOffset(Offset),
26702 Store->getOriginalAlign(),
26703 Store->getMemOperand()->getFlags());
26704 Stores.push_back(Ch);
26705 }
26706 return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Stores);
26707}
26708
26709static SDValue LowerStore(SDValue Op, const X86Subtarget &Subtarget,
26710 SelectionDAG &DAG) {
26711 StoreSDNode *St = cast<StoreSDNode>(Op.getNode());
26712 SDLoc dl(St);
26713 SDValue StoredVal = St->getValue();
26714
26715 // Without AVX512DQ, we need to use a scalar type for v2i1/v4i1/v8i1 stores.
26716 if (StoredVal.getValueType().isVector() &&
26717 StoredVal.getValueType().getVectorElementType() == MVT::i1) {
26718 unsigned NumElts = StoredVal.getValueType().getVectorNumElements();
26719     assert(NumElts <= 8 && "Unexpected VT");
26720     assert(!St->isTruncatingStore() && "Expected non-truncating store");
26721     assert(Subtarget.hasAVX512() && !Subtarget.hasDQI() &&
26722            "Expected AVX512F without AVX512DQI");
26723
26724 // We must pad with zeros to ensure we store zeroes to any unused bits.
26725 StoredVal = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v16i1,
26726 DAG.getUNDEF(MVT::v16i1), StoredVal,
26727 DAG.getIntPtrConstant(0, dl));
26728 StoredVal = DAG.getBitcast(MVT::i16, StoredVal);
26729 StoredVal = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, StoredVal);
26730 // Make sure we store zeros in the extra bits.
26731 if (NumElts < 8)
26732 StoredVal = DAG.getZeroExtendInReg(
26733 StoredVal, dl, EVT::getIntegerVT(*DAG.getContext(), NumElts));
26734
26735 return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(),
26736 St->getPointerInfo(), St->getOriginalAlign(),
26737 St->getMemOperand()->getFlags());
26738 }
26739
26740 if (St->isTruncatingStore())
26741 return SDValue();
26742
26743 // If this is a 256-bit store of concatenated ops, we are better off splitting
26744 // that store into two 128-bit stores. This avoids spurious use of 256-bit ops
26745 // and each half can execute independently. Some cores would split the op into
26746 // halves anyway, so the concat (vinsertf128) is purely an extra op.
26747 MVT StoreVT = StoredVal.getSimpleValueType();
26748 if (StoreVT.is256BitVector() ||
26749 ((StoreVT == MVT::v32i16 || StoreVT == MVT::v64i8) &&
26750 !Subtarget.hasBWI())) {
26751 SmallVector<SDValue, 4> CatOps;
26752 if (StoredVal.hasOneUse() &&
26753 collectConcatOps(StoredVal.getNode(), CatOps, DAG))
26754 return splitVectorStore(St, DAG);
26755 return SDValue();
26756 }
26757
26758 if (StoreVT.is32BitVector())
26759 return SDValue();
26760
26761 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
26762   assert(StoreVT.is64BitVector() && "Unexpected VT");
26763   assert(TLI.getTypeAction(*DAG.getContext(), StoreVT) ==
26764              TargetLowering::TypeWidenVector &&
26765          "Unexpected type action!");
26766
26767 EVT WideVT = TLI.getTypeToTransformTo(*DAG.getContext(), StoreVT);
26768 StoredVal = DAG.getNode(ISD::CONCAT_VECTORS, dl, WideVT, StoredVal,
26769 DAG.getUNDEF(StoreVT));
26770
26771 if (Subtarget.hasSSE2()) {
26772 // Widen the vector, cast to a v2x64 type, extract the single 64-bit element
26773 // and store it.
26774 MVT StVT = Subtarget.is64Bit() && StoreVT.isInteger() ? MVT::i64 : MVT::f64;
26775 MVT CastVT = MVT::getVectorVT(StVT, 2);
26776 StoredVal = DAG.getBitcast(CastVT, StoredVal);
26777 StoredVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, StVT, StoredVal,
26778 DAG.getIntPtrConstant(0, dl));
26779
26780 return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(),
26781 St->getPointerInfo(), St->getOriginalAlign(),
26782 St->getMemOperand()->getFlags());
26783 }
26784   assert(Subtarget.hasSSE1() && "Expected SSE");
26785 SDVTList Tys = DAG.getVTList(MVT::Other);
26786 SDValue Ops[] = {St->getChain(), StoredVal, St->getBasePtr()};
26787 return DAG.getMemIntrinsicNode(X86ISD::VEXTRACT_STORE, dl, Tys, Ops, MVT::i64,
26788 St->getMemOperand());
26789}
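
// --- Illustrative sketch, not part of the LLVM source ----------------------
// The v2i1/v4i1/v8i1 path in LowerStore widens the mask, truncates it to a
// byte and explicitly zeroes the bits beyond NumElts, so the stored byte
// never carries stale data in its unused positions. The same packing in
// scalar C++ (NumElts <= 8 assumed, as the code asserts; the helper name is
// hypothetical):
#include <cstdint>

static uint8_t packMaskBits(const bool *Bits, unsigned NumElts) {
  uint8_t Packed = 0;
  for (unsigned I = 0; I != NumElts; ++I)
    Packed |= (uint8_t)(Bits[I] ? 1u : 0u) << I;   // element I -> bit I
  return Packed;   // bits NumElts..7 stay zero, mirroring getZeroExtendInReg
}
// ----------------------------------------------------------------------------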
26790
26791// Lower vector extended loads using a shuffle. If SSSE3 is not available we
26792// may emit an illegal shuffle but the expansion is still better than scalar
26793// code. We generate sext/sext_invec for SEXTLOADs if it's available, otherwise
26794// we'll emit a shuffle and a arithmetic shift.
26795// FIXME: Is the expansion actually better than scalar code? It doesn't seem so.
26796// TODO: It is possible to support ZExt by zeroing the undef values during
26797// the shuffle phase or after the shuffle.
26798static SDValue LowerLoad(SDValue Op, const X86Subtarget &Subtarget,
26799 SelectionDAG &DAG) {
26800 MVT RegVT = Op.getSimpleValueType();
26801   assert(RegVT.isVector() && "We only custom lower vector loads.");
26802   assert(RegVT.isInteger() &&
26803          "We only custom lower integer vector loads.");
26804
26805 LoadSDNode *Ld = cast<LoadSDNode>(Op.getNode());
26806 SDLoc dl(Ld);
26807
26808 // Without AVX512DQ, we need to use a scalar type for v2i1/v4i1/v8i1 loads.
26809 if (RegVT.getVectorElementType() == MVT::i1) {
26810     assert(EVT(RegVT) == Ld->getMemoryVT() && "Expected non-extending load");
26811     assert(RegVT.getVectorNumElements() <= 8 && "Unexpected VT");
26812     assert(Subtarget.hasAVX512() && !Subtarget.hasDQI() &&
26813            "Expected AVX512F without AVX512DQI");
26814
26815 SDValue NewLd = DAG.getLoad(MVT::i8, dl, Ld->getChain(), Ld->getBasePtr(),
26816 Ld->getPointerInfo(), Ld->getOriginalAlign(),
26817 Ld->getMemOperand()->getFlags());
26818
26819 // Replace chain users with the new chain.
26820 assert(NewLd->getNumValues() == 2 && "Loads must carry a chain!");
26821
26822 SDValue Val = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i16, NewLd);
26823 Val = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, RegVT,
26824 DAG.getBitcast(MVT::v16i1, Val),
26825 DAG.getIntPtrConstant(0, dl));
26826 return DAG.getMergeValues({Val, NewLd.getValue(1)}, dl);
26827 }
26828
26829 return SDValue();
26830}
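For intuition, here is a scalar model of the vXi1 path above, assuming the usual AVX-512 mask layout in which bit i of the loaded byte corresponds to element i (a sketch for illustration, not code from this file):

    #include <array>
    #include <cstdint>

    // Mirrors the i8 load + any_extend + bitcast-to-v16i1 + low-subvector extract.
    std::array<bool, 8> loadV8i1(const uint8_t *Ptr) {
      uint8_t Bits = *Ptr;             // the MVT::i8 scalar load
      std::array<bool, 8> Lanes{};
      for (int I = 0; I < 8; ++I)
        Lanes[I] = (Bits >> I) & 1;    // lane I comes from bit I of the byte
      return Lanes;
    }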
26831
26832/// Return true if node is an ISD::AND or ISD::OR of two X86ISD::SETCC nodes
26833/// each of which has no other use apart from the AND / OR.
26834static bool isAndOrOfSetCCs(SDValue Op, unsigned &Opc) {
26835 Opc = Op.getOpcode();
26836 if (Opc != ISD::OR && Opc != ISD::AND)
26837 return false;
26838 return (Op.getOperand(0).getOpcode() == X86ISD::SETCC &&
26839 Op.getOperand(0).hasOneUse() &&
26840 Op.getOperand(1).getOpcode() == X86ISD::SETCC &&
26841 Op.getOperand(1).hasOneUse());
26842}
26843
26844SDValue X86TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const {
26845 SDValue Chain = Op.getOperand(0);
26846 SDValue Cond = Op.getOperand(1);
26847 SDValue Dest = Op.getOperand(2);
26848 SDLoc dl(Op);
26849
26850 // Bail out when we don't have native compare instructions.
26851 if (Cond.getOpcode() == ISD::SETCC &&
26852 Cond.getOperand(0).getValueType() != MVT::f128 &&
26853 !isSoftFP16(Cond.getOperand(0).getValueType())) {
26854 SDValue LHS = Cond.getOperand(0);
26855 SDValue RHS = Cond.getOperand(1);
26856 ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
26857
26858 // Special case for
26859 // setcc([su]{add,sub,mul}o == 0)
26860 // setcc([su]{add,sub,mul}o != 1)
26861 if (ISD::isOverflowIntrOpRes(LHS) &&
26862 (CC == ISD::SETEQ || CC == ISD::SETNE) &&
26863 (isNullConstant(RHS) || isOneConstant(RHS))) {
26864 SDValue Value, Overflow;
26865 X86::CondCode X86Cond;
26866 std::tie(Value, Overflow) = getX86XALUOOp(X86Cond, LHS.getValue(0), DAG);
26867
26868 if ((CC == ISD::SETEQ) == isNullConstant(RHS))
26869 X86Cond = X86::GetOppositeBranchCondition(X86Cond);
26870
26871 SDValue CCVal = DAG.getTargetConstant(X86Cond, dl, MVT::i8);
26872 return DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
26873 Overflow);
26874 }
26875
26876 if (LHS.getSimpleValueType().isInteger()) {
26877 SDValue CCVal;
26878 SDValue EFLAGS = emitFlagsForSetcc(LHS, RHS, CC, SDLoc(Cond), DAG, CCVal);
26879 return DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
26880 EFLAGS);
26881 }
26882
26883 if (CC == ISD::SETOEQ) {
26884 // For FCMP_OEQ, we can emit
26885 // two branches instead of an explicit AND instruction with a
26886 // separate test. However, we only do this if this block doesn't
26887 // have a fall-through edge, because this requires an explicit
26888 // jmp when the condition is false.
26889 if (Op.getNode()->hasOneUse()) {
26890 SDNode *User = *Op.getNode()->use_begin();
26891 // Look for an unconditional branch following this conditional branch.
26892 // We need this because we need to reverse the successors in order
26893 // to implement FCMP_OEQ.
26894 if (User->getOpcode() == ISD::BR) {
26895 SDValue FalseBB = User->getOperand(1);
26896 SDNode *NewBR =
26897 DAG.UpdateNodeOperands(User, User->getOperand(0), Dest);
26898 assert(NewBR == User);
26899 (void)NewBR;
26900 Dest = FalseBB;
26901
26902 SDValue Cmp =
26903 DAG.getNode(X86ISD::FCMP, SDLoc(Cond), MVT::i32, LHS, RHS);
26904 SDValue CCVal = DAG.getTargetConstant(X86::COND_NE, dl, MVT::i8);
26905 Chain = DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest,
26906 CCVal, Cmp);
26907 CCVal = DAG.getTargetConstant(X86::COND_P, dl, MVT::i8);
26908 return DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
26909 Cmp);
26910 }
26911 }
26912 } else if (CC == ISD::SETUNE) {
26913 // For FCMP_UNE, we can emit
26914 // two branches instead of an explicit OR instruction with a
26915 // separate test.
26916 SDValue Cmp = DAG.getNode(X86ISD::FCMP, SDLoc(Cond), MVT::i32, LHS, RHS);
26917 SDValue CCVal = DAG.getTargetConstant(X86::COND_NE, dl, MVT::i8);
26918 Chain =
26919 DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal, Cmp);
26920 CCVal = DAG.getTargetConstant(X86::COND_P, dl, MVT::i8);
26921 return DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
26922 Cmp);
26923 } else {
26924 X86::CondCode X86Cond =
26925 TranslateX86CC(CC, dl, /*IsFP*/ true, LHS, RHS, DAG);
26926 SDValue Cmp = DAG.getNode(X86ISD::FCMP, SDLoc(Cond), MVT::i32, LHS, RHS);
26927 SDValue CCVal = DAG.getTargetConstant(X86Cond, dl, MVT::i8);
26928 return DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
26929 Cmp);
26930 }
26931 }
26932
26933 if (ISD::isOverflowIntrOpRes(Cond)) {
26934 SDValue Value, Overflow;
26935 X86::CondCode X86Cond;
26936 std::tie(Value, Overflow) = getX86XALUOOp(X86Cond, Cond.getValue(0), DAG);
26937
26938 SDValue CCVal = DAG.getTargetConstant(X86Cond, dl, MVT::i8);
26939 return DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
26940 Overflow);
26941 }
26942
26943 // Look past the truncate if the high bits are known zero.
26944 if (isTruncWithZeroHighBitsInput(Cond, DAG))
26945 Cond = Cond.getOperand(0);
26946
26947 EVT CondVT = Cond.getValueType();
26948
26949 // Add an AND with 1 if we don't already have one.
26950 if (!(Cond.getOpcode() == ISD::AND && isOneConstant(Cond.getOperand(1))))
26951 Cond =
26952 DAG.getNode(ISD::AND, dl, CondVT, Cond, DAG.getConstant(1, dl, CondVT));
26953
26954 SDValue LHS = Cond;
26955 SDValue RHS = DAG.getConstant(0, dl, CondVT);
26956
26957 SDValue CCVal;
26958 SDValue EFLAGS = emitFlagsForSetcc(LHS, RHS, ISD::SETNE, dl, DAG, CCVal);
26959 return DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
26960 EFLAGS);
26961}
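As a concrete case of the two-branch lowerings above: for SETUNE a single X86ISD::FCMP feeds two conditional branches to the same destination, one on COND_NE and one on COND_P, so the branch is taken whenever the operands are unequal or unordered. The SETOEQ path emits the same COND_NE/COND_P pair, but only after redirecting those branches to the false block and pointing the following unconditional branch at the original destination.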
26962
26963// Lower dynamic stack allocation to _alloca call for Cygwin/Mingw targets.
26964// Calls to _alloca are needed to probe the stack when allocating more than 4k
26965// bytes in one go. Touching the stack at 4K increments is necessary to ensure
26966 // that the guard pages used by the OS virtual memory manager are allocated in
26967 // the correct sequence.
26968SDValue
26969X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
26970 SelectionDAG &DAG) const {
26971 MachineFunction &MF = DAG.getMachineFunction();
26972 bool SplitStack = MF.shouldSplitStack();
26973 bool EmitStackProbeCall = hasStackProbeSymbol(MF);
26974 bool Lower = (Subtarget.isOSWindows() && !Subtarget.isTargetMachO()) ||
26975 SplitStack || EmitStackProbeCall;
26976 SDLoc dl(Op);
26977
26978 // Get the inputs.
26979 SDNode *Node = Op.getNode();
26980 SDValue Chain = Op.getOperand(0);
26981 SDValue Size = Op.getOperand(1);
26982 MaybeAlign Alignment(Op.getConstantOperandVal(2));
26983 EVT VT = Node->getValueType(0);
26984
26985 // Chain the dynamic stack allocation so that it doesn't modify the stack
26986 // pointer when other instructions are using the stack.
26987 Chain = DAG.getCALLSEQ_START(Chain, 0, 0, dl);
26988
26989 bool Is64Bit = Subtarget.is64Bit();
26990 MVT SPTy = getPointerTy(DAG.getDataLayout());
26991
26992 SDValue Result;
26993 if (!Lower) {
26994 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
26995 Register SPReg = TLI.getStackPointerRegisterToSaveRestore();
26996 assert(SPReg && "Target cannot require DYNAMIC_STACKALLOC expansion and"(static_cast <bool> (SPReg && "Target cannot require DYNAMIC_STACKALLOC expansion and"
" not tell us which reg is the stack pointer!") ? void (0) :
__assert_fail ("SPReg && \"Target cannot require DYNAMIC_STACKALLOC expansion and\" \" not tell us which reg is the stack pointer!\""
, "llvm/lib/Target/X86/X86ISelLowering.cpp", 26997, __extension__
__PRETTY_FUNCTION__))
26997 " not tell us which reg is the stack pointer!")(static_cast <bool> (SPReg && "Target cannot require DYNAMIC_STACKALLOC expansion and"
" not tell us which reg is the stack pointer!") ? void (0) :
__assert_fail ("SPReg && \"Target cannot require DYNAMIC_STACKALLOC expansion and\" \" not tell us which reg is the stack pointer!\""
, "llvm/lib/Target/X86/X86ISelLowering.cpp", 26997, __extension__
__PRETTY_FUNCTION__))
;
26998
26999 const TargetFrameLowering &TFI = *Subtarget.getFrameLowering();
27000 const Align StackAlign = TFI.getStackAlign();
27001 if (hasInlineStackProbe(MF)) {
27002 MachineRegisterInfo &MRI = MF.getRegInfo();
27003
27004 const TargetRegisterClass *AddrRegClass = getRegClassFor(SPTy);
27005 Register Vreg = MRI.createVirtualRegister(AddrRegClass);
27006 Chain = DAG.getCopyToReg(Chain, dl, Vreg, Size);
27007 Result = DAG.getNode(X86ISD::PROBED_ALLOCA, dl, SPTy, Chain,
27008 DAG.getRegister(Vreg, SPTy));
27009 } else {
27010 SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, VT);
27011 Chain = SP.getValue(1);
27012 Result = DAG.getNode(ISD::SUB, dl, VT, SP, Size); // Value
27013 }
27014 if (Alignment && *Alignment > StackAlign)
27015 Result =
27016 DAG.getNode(ISD::AND, dl, VT, Result,
27017 DAG.getConstant(~(Alignment->value() - 1ULL), dl, VT));
27018 Chain = DAG.getCopyToReg(Chain, dl, SPReg, Result); // Output chain
27019 } else if (SplitStack) {
27020 MachineRegisterInfo &MRI = MF.getRegInfo();
27021
27022 if (Is64Bit) {
27023 // The 64-bit implementation of segmented stacks needs to clobber both r10
27024 // and r11. This makes it impossible to use it along with nested parameters.
27025 const Function &F = MF.getFunction();
27026 for (const auto &A : F.args()) {
27027 if (A.hasNestAttr())
27028 report_fatal_error("Cannot use segmented stacks with functions that "
27029 "have nested arguments.");
27030 }
27031 }
27032
27033 const TargetRegisterClass *AddrRegClass = getRegClassFor(SPTy);
27034 Register Vreg = MRI.createVirtualRegister(AddrRegClass);
27035 Chain = DAG.getCopyToReg(Chain, dl, Vreg, Size);
27036 Result = DAG.getNode(X86ISD::SEG_ALLOCA, dl, SPTy, Chain,
27037 DAG.getRegister(Vreg, SPTy));
27038 } else {
27039 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
27040 Chain = DAG.getNode(X86ISD::DYN_ALLOCA, dl, NodeTys, Chain, Size);
27041 MF.getInfo<X86MachineFunctionInfo>()->setHasDynAlloca(true);
27042
27043 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
27044 Register SPReg = RegInfo->getStackRegister();
27045 SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, SPTy);
27046 Chain = SP.getValue(1);
27047
27048 if (Alignment) {
27049 SP = DAG.getNode(ISD::AND, dl, VT, SP.getValue(0),
27050 DAG.getConstant(~(Alignment->value() - 1ULL), dl, VT));
27051 Chain = DAG.getCopyToReg(Chain, dl, SPReg, SP);
27052 }
27053
27054 Result = SP;
27055 }
27056
27057 Chain = DAG.getCALLSEQ_END(Chain, 0, 0, SDValue(), dl);
27058
27059 SDValue Ops[2] = {Result, Chain};
27060 return DAG.getMergeValues(Ops, dl);
27061}
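A small worked example of the ~(Alignment - 1) mask applied to the adjusted stack pointer above (the numbers are illustrative only, not taken from any real frame):

    #include <cassert>
    #include <cstdint>

    int main() {
      uint64_t SP = 0x7fffffffd868;       // hypothetical SP after the SUB by Size
      uint64_t Alignment = 32;            // requested alignment > StackAlign
      uint64_t Aligned = SP & ~(Alignment - 1ULL);
      assert(Aligned == 0x7fffffffd860);  // rounded down to a 32-byte boundary
      assert(Aligned % Alignment == 0);
      return 0;
    }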
27062
27063SDValue X86TargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const {
27064 MachineFunction &MF = DAG.getMachineFunction();
27065 auto PtrVT = getPointerTy(MF.getDataLayout());
27066 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
27067
27068 const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
27069 SDLoc DL(Op);
27070
27071 if (!Subtarget.is64Bit() ||
27072 Subtarget.isCallingConvWin64(MF.getFunction().getCallingConv())) {
27073 // vastart just stores the address of the VarArgsFrameIndex slot into the
27074 // memory location argument.
27075 SDValue FR = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT);
27076 return DAG.getStore(Op.getOperand(0), DL, FR, Op.getOperand(1),
27077 MachinePointerInfo(SV));
27078 }
27079
27080 // __va_list_tag:
27081 // gp_offset (0 - 6 * 8)
27082 // fp_offset (48 - 48 + 8 * 16)
27083 // overflow_arg_area (points to parameters passed in memory).
27084 // reg_save_area
27085 SmallVector<SDValue, 8> MemOps;
27086 SDValue FIN = Op.getOperand(1);
27087 // Store gp_offset
27088 SDValue Store = DAG.getStore(
27089 Op.getOperand(0), DL,
27090 DAG.getConstant(FuncInfo->getVarArgsGPOffset(), DL, MVT::i32), FIN,
27091 MachinePointerInfo(SV));
27092 MemOps.push_back(Store);
27093
27094 // Store fp_offset
27095 FIN = DAG.getMemBasePlusOffset(FIN, TypeSize::Fixed(4), DL);
27096 Store = DAG.getStore(
27097 Op.getOperand(0), DL,
27098 DAG.getConstant(FuncInfo->getVarArgsFPOffset(), DL, MVT::i32), FIN,
27099 MachinePointerInfo(SV, 4));
27100 MemOps.push_back(Store);
27101
27102 // Store ptr to overflow_arg_area
27103 FIN = DAG.getNode(ISD::ADD, DL, PtrVT, FIN, DAG.getIntPtrConstant(4, DL));
27104 SDValue OVFIN = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT);
27105 Store =
27106 DAG.getStore(Op.getOperand(0), DL, OVFIN, FIN, MachinePointerInfo(SV, 8));
27107 MemOps.push_back(Store);
27108
27109 // Store ptr to reg_save_area.
27110 FIN = DAG.getNode(ISD::ADD, DL, PtrVT, FIN, DAG.getIntPtrConstant(
27111 Subtarget.isTarget64BitLP64() ? 8 : 4, DL));
27112 SDValue RSFIN = DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(), PtrVT);
27113 Store = DAG.getStore(
27114 Op.getOperand(0), DL, RSFIN, FIN,
27115 MachinePointerInfo(SV, Subtarget.isTarget64BitLP64() ? 16 : 12));
27116 MemOps.push_back(Store);
27117 return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOps);
27118}
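A sketch of the record those four stores populate, using the field names from the SysV AMD64 ABI (the struct name here is just for illustration); the byte offsets match the MachinePointerInfo offsets above for the LP64 case:

    // One element of the x86-64 va_list as written by LowerVASTART (LP64).
    struct VaListTag {
      unsigned int GpOffset;    // byte 0:  next GPR save slot, 0..48
      unsigned int FpOffset;    // byte 4:  next XMM save slot, 48..176
      void *OverflowArgArea;    // byte 8:  arguments passed on the stack
      void *RegSaveArea;        // byte 16: start of the register save area
    };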
27119
27120SDValue X86TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const {
27121 assert(Subtarget.is64Bit() &&
27122 "LowerVAARG only handles 64-bit va_arg!");
27123 assert(Op.getNumOperands() == 4);
27124
27125 MachineFunction &MF = DAG.getMachineFunction();
27126 if (Subtarget.isCallingConvWin64(MF.getFunction().getCallingConv()))
27127 // The Win64 ABI uses char* instead of a structure.
27128 return DAG.expandVAArg(Op.getNode());
27129
27130 SDValue Chain = Op.getOperand(0);
27131 SDValue SrcPtr = Op.getOperand(1);
27132 const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
27133 unsigned Align = Op.getConstantOperandVal(3);
27134 SDLoc dl(Op);
27135
27136 EVT ArgVT = Op.getNode()->getValueType(0);
27137 Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
27138 uint32_t ArgSize = DAG.getDataLayout().getTypeAllocSize(ArgTy);
27139 uint8_t ArgMode;
27140
27141 // Decide which area this value should be read from.
27142 // TODO: Implement the AMD64 ABI in its entirety. This simple
27143 // selection mechanism works only for the basic types.
27144 assert(ArgVT != MVT::f80 && "va_arg for f80 not yet implemented");
27145 if (ArgVT.isFloatingPoint() && ArgSize <= 16 /*bytes*/) {
27146 ArgMode = 2; // Argument passed in XMM register. Use fp_offset.
27147 } else {
27148 assert(ArgVT.isInteger() && ArgSize <= 32 /*bytes*/ &&
27149 "Unhandled argument type in LowerVAARG");
27150 ArgMode = 1; // Argument passed in GPR64 register(s). Use gp_offset.
27151 }
27152
27153 if (ArgMode == 2) {
27154 // Make sure using fp_offset makes sense.
27155 assert(!Subtarget.useSoftFloat() &&
27156 !(MF.getFunction().hasFnAttribute(Attribute::NoImplicitFloat)) &&
27157 Subtarget.hasSSE1());
27158 }
27159
27160 // Insert a VAARG node into the DAG.
27161 // VAARG returns two values: the variable argument address and the chain.
27162 SDValue InstOps[] = {Chain, SrcPtr,
27163 DAG.getTargetConstant(ArgSize, dl, MVT::i32),
27164 DAG.getTargetConstant(ArgMode, dl, MVT::i8),
27165 DAG.getTargetConstant(Align, dl, MVT::i32)};
27166 SDVTList VTs = DAG.getVTList(getPointerTy(DAG.getDataLayout()), MVT::Other);
27167 SDValue VAARG = DAG.getMemIntrinsicNode(
27168 Subtarget.isTarget64BitLP64() ? X86ISD::VAARG_64 : X86ISD::VAARG_X32, dl,
27169 VTs, InstOps, MVT::i64, MachinePointerInfo(SV),
27170 /*Alignment=*/std::nullopt,
27171 MachineMemOperand::MOLoad | MachineMemOperand::MOStore);
27172 Chain = VAARG.getValue(1);
27173
27174 // Load the next argument and return it
27175 return DAG.getLoad(ArgVT, dl, Chain, VAARG, MachinePointerInfo());
27176}
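For example, a double (floating point, 8 bytes) takes ArgMode 2 and is fetched through fp_offset, while an int or a 16-byte integer takes ArgMode 1 and is fetched through gp_offset; an f80 argument is rejected by the assert above because that case is not implemented yet.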
27177
27178static SDValue LowerVACOPY(SDValue Op, const X86Subtarget &Subtarget,
27179 SelectionDAG &DAG) {
27180 // X86-64 va_list is a struct { i32, i32, i8*, i8* }, except on Windows,
27181 // where a va_list is still an i8*.
27182 assert(Subtarget.is64Bit() && "This code only handles 64-bit va_copy!")(static_cast <bool> (Subtarget.is64Bit() && "This code only handles 64-bit va_copy!"
) ? void (0) : __assert_fail ("Subtarget.is64Bit() && \"This code only handles 64-bit va_copy!\""
, "llvm/lib/Target/X86/X86ISelLowering.cpp", 27182, __extension__
__PRETTY_FUNCTION__))
;
27183 if (Subtarget.isCallingConvWin64(
27184 DAG.getMachineFunction().getFunction().getCallingConv()))
27185 // Probably a Win64 va_copy.
27186 return DAG.expandVACopy(Op.getNode());
27187
27188 SDValue Chain = Op.getOperand(0);
27189 SDValue DstPtr = Op.getOperand(1);
27190 SDValue SrcPtr = Op.getOperand(2);
27191 const Value *DstSV = cast<SrcValueSDNode>(Op.getOperand(3))->getValue();
27192 const Value *SrcSV = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
27193 SDLoc DL(Op);
27194
27195 return DAG.getMemcpy(
27196 Chain, DL, DstPtr, SrcPtr,
27197 DAG.getIntPtrConstant(Subtarget.isTarget64BitLP64() ? 24 : 16, DL),
27198 Align(Subtarget.isTarget64BitLP64() ? 8 : 4), /*isVolatile*/ false, false,
27199 false, MachinePointerInfo(DstSV), MachinePointerInfo(SrcSV));
27200}
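The memcpy length above matches the struct size: 4 + 4 + 8 + 8 = 24 bytes under LP64 and 4 + 4 + 4 + 4 = 16 bytes under the x32 ABI, copied with the corresponding 8- or 4-byte alignment.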
27201
27202// Helper to get immediate/variable SSE shift opcode from other shift opcodes.
27203static unsigned getTargetVShiftUniformOpcode(unsigned Opc, bool IsVariable) {
27204 switch (Opc) {
27205 case ISD::SHL:
27206 case X86ISD::VSHL:
27207 case X86ISD::VSHLI:
27208 return IsVariable ? X86ISD::VSHL : X86ISD::VSHLI;
27209 case ISD::SRL:
27210 case X86ISD::VSRL:
27211 case X86ISD::VSRLI:
27212 return IsVariable ? X86ISD::VSRL : X86ISD::VSRLI;
27213 case ISD::SRA:
27214 case X86ISD::VSRA:
27215 case X86ISD::VSRAI:
27216 return IsVariable ? X86ISD::VSRA : X86ISD::VSRAI;
27217 }
27218 llvm_unreachable("Unknown target vector shift node")::llvm::llvm_unreachable_internal("Unknown target vector shift node"
, "llvm/lib/Target/X86/X86ISelLowering.cpp", 27218)
;
27219}
27220
27221/// Handle vector element shifts where the shift amount is a constant.
27222/// Takes immediate version of shift as input.
27223static SDValue getTargetVShiftByConstNode(unsigned Opc, const SDLoc &dl, MVT VT,
27224 SDValue SrcOp, uint64_t ShiftAmt,
27225 SelectionDAG &DAG) {
27226 MVT ElementType = VT.getVectorElementType();
27227
27228 // Bitcast the source vector to the output type, this is mainly necessary for
27229 // vXi8/vXi64 shifts.
27230 if (VT != SrcOp.getSimpleValueType())
27231 SrcOp = DAG.getBitcast(VT, SrcOp);
27232
27233 // Fold this packed shift into its first operand if ShiftAmt is 0.
27234 if (ShiftAmt == 0)
27235 return SrcOp;
27236
27237 // Check for ShiftAmt >= element width
27238 if (ShiftAmt >= ElementType.getSizeInBits()) {
27239 if (Opc == X86ISD::VSRAI)
27240 ShiftAmt = ElementType.getSizeInBits() - 1;
27241 else
27242 return DAG.getConstant(0, dl, VT);
27243 }
27244
27245 assert((Opc == X86ISD::VSHLI || Opc == X86ISD::VSRLI || Opc == X86ISD::VSRAI)
27246 && "Unknown target vector shift-by-constant node");
27247
27248 // Fold this packed vector shift into a build vector if SrcOp is a
27249 // vector of Constants or UNDEFs.
27250 if (ISD::isBuildVectorOfConstantSDNodes(SrcOp.getNode())) {
27251 unsigned ShiftOpc;
27252 switch (Opc) {
27253 default: llvm_unreachable("Unknown opcode!")::llvm::llvm_unreachable_internal("Unknown opcode!", "llvm/lib/Target/X86/X86ISelLowering.cpp"
, 27253)
;
27254 case X86ISD::VSHLI:
27255 ShiftOpc = ISD::SHL;
27256 break;
27257 case X86ISD::VSRLI:
27258 ShiftOpc = ISD::SRL;
27259 break;
27260 case X86ISD::VSRAI:
27261 ShiftOpc = ISD::SRA;
27262 break;
27263 }
27264
27265 SDValue Amt = DAG.getConstant(ShiftAmt, dl, VT);
27266 if (SDValue C = DAG.FoldConstantArithmetic(ShiftOpc, dl, VT, {SrcOp, Amt}))
27267 return C;
27268 }
27269
27270 return DAG.getNode(Opc, dl, VT, SrcOp,
27271 DAG.getTargetConstant(ShiftAmt, dl, MVT::i8));
27272}
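As a concrete example of the out-of-range handling above: a logical right shift of a v4i32 value by 35 folds to an all-zeros vector, whereas an arithmetic right shift by 35 is clamped to 31, so every lane still ends up as a broadcast of its sign bit.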
27273
27274/// Handle vector element shifts by a splat shift amount
27275static SDValue getTargetVShiftNode(unsigned Opc, const SDLoc &dl, MVT VT,
27276 SDValue SrcOp, SDValue ShAmt, int ShAmtIdx,
27277 const X86Subtarget &Subtarget,
27278 SelectionDAG &DAG) {
27279 MVT AmtVT = ShAmt.getSimpleValueType();
27280 assert(AmtVT.isVector() && "Vector shift type mismatch");
27281 assert(0 <= ShAmtIdx && ShAmtIdx < (int)AmtVT.getVectorNumElements() &&
27282 "Illegal vector splat index");
27283
27284 // Move the splat element to the bottom element.
27285 if (ShAmtIdx != 0) {
27286 SmallVector<int> Mask(AmtVT.getVectorNumElements(), -1);
27287 Mask[0] = ShAmtIdx;
27288 ShAmt = DAG.getVectorShuffle(AmtVT, dl, ShAmt, DAG.getUNDEF(AmtVT), Mask);
27289 }
27290
27291 // Peek through any zext node if we can get back to a 128-bit source.
27292 if (AmtVT.getScalarSizeInBits() == 64 &&
27293 (ShAmt.getOpcode() == ISD::ZERO_EXTEND ||
27294 ShAmt.getOpcode() == ISD::ZERO_EXTEND_VECTOR_INREG) &&
27295 ShAmt.getOperand(0).getValueType().isSimple() &&
27296 ShAmt.getOperand(0).getValueType().is128BitVector()) {
27297 ShAmt = ShAmt.getOperand(0);
27298 AmtVT = ShAmt.getSimpleValueType();
27299 }
27300
27301 // See if we can mask off the upper elements using the existing source node.
27302 // The shift uses the entire lower 64-bits of the amount vector, so no need to
27303 // do this for vXi64 types.
27304 bool IsMasked = false;
27305 if (AmtVT.getScalarSizeInBits() < 64) {
27306 if (ShAmt.getOpcode() == ISD::BUILD_VECTOR ||
27307 ShAmt.getOpcode() == ISD::SCALAR_TO_VECTOR) {
27308 // If the shift amount has come from a scalar, then zero-extend the scalar
27309 // before moving to the vector.
27310 ShAmt = DAG.getZExtOrTrunc(ShAmt.getOperand(0), dl, MVT::i32);
27311 ShAmt = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, ShAmt);
27312 ShAmt = DAG.getNode(X86ISD::VZEXT_MOVL, dl, MVT::v4i32, ShAmt);
27313 AmtVT = MVT::v4i32;
27314 IsMasked = true;
27315 } else if (ShAmt.getOpcode() == ISD::AND) {
27316 // See if the shift amount is already masked (e.g. for rotation modulo),
27317 // then we can zero-extend it by setting all the other mask elements to
27318 // zero.
27319 SmallVector<SDValue> MaskElts(
27320 AmtVT.getVectorNumElements(),
27321 DAG.getConstant(0, dl, AmtVT.getScalarType()));
27322 MaskElts[0] = DAG.getAllOnesConstant(dl, AmtVT.getScalarType());
27323 SDValue Mask = DAG.getBuildVector(AmtVT, dl, MaskElts);
27324 if ((Mask = DAG.FoldConstantArithmetic(ISD::AND, dl, AmtVT,
27325 {ShAmt.getOperand(1), Mask}))) {
27326 ShAmt = DAG.getNode(ISD::AND, dl, AmtVT, ShAmt.getOperand(0), Mask);
27327 IsMasked = true;
27328 }
27329 }
27330 }
27331
27332 // Extract if the shift amount vector is larger than 128-bits.
27333 if (AmtVT.getSizeInBits() > 128) {
27334 ShAmt = extract128BitVector(ShAmt, 0, DAG, dl);
27335 AmtVT = ShAmt.getSimpleValueType();
27336 }
27337
27338 // Zero-extend bottom element to v2i64 vector type, either by extension or
27339 // shuffle masking.
27340 if (!IsMasked && AmtVT.getScalarSizeInBits() < 64) {
27341 if (AmtVT == MVT::v4i32 && (ShAmt.getOpcode() == X86ISD::VBROADCAST ||
27342 ShAmt.getOpcode() == X86ISD::VBROADCAST_LOAD)) {
27343 ShAmt = DAG.getNode(X86ISD::VZEXT_MOVL, SDLoc(ShAmt), MVT::v4i32, ShAmt);
27344 } else if (Subtarget.hasSSE41()) {
27345 ShAmt = DAG.getNode(ISD::ZERO_EXTEND_VECTOR_INREG, SDLoc(ShAmt),
27346 MVT::v2i64, ShAmt);
27347 } else {
27348 SDValue ByteShift = DAG.getTargetConstant(
27349 (128 - AmtVT.getScalarSizeInBits()) / 8, SDLoc(ShAmt), MVT::i8);
27350 ShAmt = DAG.getBitcast(MVT::v16i8, ShAmt);
27351 ShAmt = DAG.getNode(X86ISD::VSHLDQ, SDLoc(ShAmt), MVT::v16i8, ShAmt,
27352 ByteShift);
27353 ShAmt = DAG.getNode(X86ISD::VSRLDQ, SDLoc(ShAmt), MVT::v16i8, ShAmt,
27354 ByteShift);
27355 }
27356 }
27357
27358 // Change opcode to non-immediate version.
27359 Opc = getTargetVShiftUniformOpcode(Opc, true);
27360
27361 // The return type has to be a 128-bit type with the same element
27362 // type as the input type.
27363 MVT EltVT = VT.getVectorElementType();
27364 MVT ShVT = MVT::getVectorVT(EltVT, 128 / EltVT.getSizeInBits());
27365
27366 ShAmt = DAG.getBitcast(ShVT, ShAmt);
27367 return DAG.getNode(Opc, dl, VT, SrcOp, ShAmt);
27368}
27369
27370/// Return Mask with the necessary casting or extending
27371/// for \p Mask according to \p MaskVT when lowering masking intrinsics
27372static SDValue getMaskNode(SDValue Mask, MVT MaskVT,
27373 const X86Subtarget &Subtarget, SelectionDAG &DAG,
27374 const SDLoc &dl) {
27375
27376 if (isAllOnesConstant(Mask))
27377 return DAG.getConstant(1, dl, MaskVT);
27378 if (X86::isZeroNode(Mask))
27379 return DAG.getConstant(0, dl, MaskVT);
27380
27381 assert(MaskVT.bitsLE(Mask.getSimpleValueType()) && "Unexpected mask size!");
27382
27383 if (Mask.getSimpleValueType() == MVT::i64 && Subtarget.is32Bit()) {
27384 assert(MaskVT == MVT::v64i1 && "Expected v64i1 mask!");
27385 assert(Subtarget.hasBWI() && "Expected AVX512BW target!");
27386 // In 32-bit mode a bitcast of i64 is illegal, so extend/split it.
27387 SDValue Lo, Hi;
27388 std::tie(Lo, Hi) = DAG.SplitScalar(Mask, dl, MVT::i32, MVT::i32);
27389 Lo = DAG.getBitcast(MVT::v32i1, Lo);
27390 Hi = DAG.getBitcast(MVT::v32i1, Hi);
27391 return DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v64i1, Lo, Hi);
27392 } else {
27393 MVT BitcastVT = MVT::getVectorVT(MVT::i1,
27394 Mask.getSimpleValueType().getSizeInBits());
27395 // When MaskVT equals v2i1 or v4i1, the low 2 or 4 elements
27396 // are extracted by EXTRACT_SUBVECTOR.
27397 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MaskVT,
27398 DAG.getBitcast(BitcastVT, Mask),
27399 DAG.getIntPtrConstant(0, dl));
27400 }
27401}
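For instance, an i8 mask used with a v2i1 operation takes the else branch: the mask is bitcast to v8i1 and EXTRACT_SUBVECTOR keeps only its low two elements, so bits 0 and 1 of the original mask are the only ones that participate.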
27402
27403/// Return (and \p Op, \p Mask) for compare instructions or
27404/// (vselect \p Mask, \p Op, \p PreservedSrc) for others along with the
27405/// necessary casting or extending for \p Mask when lowering masking intrinsics
27406static SDValue getVectorMaskingNode(SDValue Op, SDValue Mask,
27407 SDValue PreservedSrc,
27408 const X86Subtarget &Subtarget,
27409 SelectionDAG &DAG) {
27410 MVT VT = Op.getSimpleValueType();
27411 MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
27412 unsigned OpcodeSelect = ISD::VSELECT;
27413 SDLoc dl(Op);
27414
27415 if (isAllOnesConstant(Mask))
27416 return Op;
27417
27418 SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
27419
27420 if (PreservedSrc.isUndef())
27421 PreservedSrc = getZeroVector(VT, Subtarget, DAG, dl);
27422 return DAG.getNode(OpcodeSelect, dl, VT, VMask, Op, PreservedSrc);
27423}
27424
27425/// Creates an SDNode for a predicated scalar operation.
27426/// \returns (X86vselect \p Mask, \p Op, \p PreservedSrc).
27427/// The mask is coming as MVT::i8 and it should be transformed
27428/// to MVT::v1i1 while lowering masking intrinsics.
27429/// The main difference between ScalarMaskingNode and VectorMaskingNode is using
27430/// "X86select" instead of "vselect". We just can't create the "vselect" node
27431/// for a scalar instruction.
27432static SDValue getScalarMaskingNode(SDValue Op, SDValue Mask,
27433 SDValue PreservedSrc,
27434 const X86Subtarget &Subtarget,
27435 SelectionDAG &DAG) {
27436
27437 if (auto *MaskConst = dyn_cast<ConstantSDNode>(Mask))
27438 if (MaskConst->getZExtValue() & 0x1)
27439 return Op;
27440
27441 MVT VT = Op.getSimpleValueType();
27442 SDLoc dl(Op);
27443
27444 assert(Mask.getValueType() == MVT::i8 && "Unexpect type");
27445 SDValue IMask = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v1i1,
27446 DAG.getBitcast(MVT::v8i1, Mask),
27447 DAG.getIntPtrConstant(0, dl));
27448 if (Op.getOpcode() == X86ISD::FSETCCM ||
27449 Op.getOpcode() == X86ISD::FSETCCM_SAE ||
27450 Op.getOpcode() == X86ISD::VFPCLASSS)
27451 return DAG.getNode(ISD::AND, dl, VT, Op, IMask);
27452
27453 if (PreservedSrc.isUndef())
27454 PreservedSrc = getZeroVector(VT, Subtarget, DAG, dl);
27455 return DAG.getNode(X86ISD::SELECTS, dl, VT, IMask, Op, PreservedSrc);
27456}
27457
27458static int getSEHRegistrationNodeSize(const Function *Fn) {
27459 if (!Fn->hasPersonalityFn())
27460 report_fatal_error(
27461 "querying registration node size for function without personality");
27462 // The RegNodeSize is 6 32-bit words for SEH and 4 for C++ EH. See
27463 // WinEHStatePass for the full struct definition.
27464 switch (classifyEHPersonality(Fn->getPersonalityFn())) {
27465 case EHPersonality::MSVC_X86SEH: return 24;
27466 case EHPersonality::MSVC_CXX: return 16;
27467 default: break;
27468 }
27469 report_fatal_error(
27470 "can only recover FP for 32-bit MSVC EH personality functions");
27471}
27472
27473/// When the MSVC runtime transfers control to us, either to an outlined
27474/// function or when returning to a parent frame after catching an exception, we
27475/// recover the parent frame pointer by doing arithmetic on the incoming EBP.
27476/// Here's the math:
27477/// RegNodeBase = EntryEBP - RegNodeSize
27478/// ParentFP = RegNodeBase - ParentFrameOffset
27479/// Subtracting RegNodeSize takes us to the offset of the registration node, and
27480/// subtracting the offset (negative on x86) takes us back to the parent FP.
27481static SDValue recoverFramePointer(SelectionDAG &DAG, const Function *Fn,
27482 SDValue EntryEBP) {
27483 MachineFunction &MF = DAG.getMachineFunction();
27484 SDLoc dl;
27485
27486 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
27487 MVT PtrVT = TLI.getPointerTy(DAG.getDataLayout());
27488
27489 // It's possible that the parent function no longer has a personality function
27490 // if the exceptional code was optimized away, in which case we just return
27491 // the incoming EBP.
27492 if (!Fn->hasPersonalityFn())
27493 return EntryEBP;
27494
27495 // Get an MCSymbol that will ultimately resolve to the frame offset of the EH
27496 // registration, or the .set_setframe offset.
27497 MCSymbol *OffsetSym =
27498 MF.getMMI().getContext().getOrCreateParentFrameOffsetSymbol(
27499 GlobalValue::dropLLVMManglingEscape(Fn->getName()));
27500 SDValue OffsetSymVal = DAG.getMCSymbol(OffsetSym, PtrVT);
27501 SDValue ParentFrameOffset =
27502 DAG.getNode(ISD::LOCAL_RECOVER, dl, PtrVT, OffsetSymVal);
27503
27504 // Return EntryEBP + ParentFrameOffset for x64. This adjusts from RSP after
27505 // prologue to RBP in the parent function.
27506 const X86Subtarget &Subtarget = DAG.getSubtarget<X86Subtarget>();
27507 if (Subtarget.is64Bit())
27508 return DAG.getNode(ISD::ADD, dl, PtrVT, EntryEBP, ParentFrameOffset);
27509
27510 int RegNodeSize = getSEHRegistrationNodeSize(Fn);
27511 // RegNodeBase = EntryEBP - RegNodeSize
27512 // ParentFP = RegNodeBase - ParentFrameOffset
27513 SDValue RegNodeBase = DAG.getNode(ISD::SUB, dl, PtrVT, EntryEBP,
27514 DAG.getConstant(RegNodeSize, dl, PtrVT));
27515 return DAG.getNode(ISD::SUB, dl, PtrVT, RegNodeBase, ParentFrameOffset);
27516}
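Plugging illustrative 32-bit numbers into the formula above (none of these come from a real frame): with EntryEBP = 0x0019ff40 and the C++ EH RegNodeSize of 16, RegNodeBase is 0x0019ff30; a ParentFrameOffset of -0x18 then gives ParentFP = 0x0019ff30 - (-0x18) = 0x0019ff48, slightly above the incoming EBP, as expected for the parent's frame.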
27517
27518SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
27519 SelectionDAG &DAG) const {
27520 // Helper to detect if the operand is CUR_DIRECTION rounding mode.
27521 auto isRoundModeCurDirection = [](SDValue Rnd) {
27522 if (auto *C = dyn_cast<ConstantSDNode>(Rnd))
27523 return C->getAPIntValue() == X86::STATIC_ROUNDING::CUR_DIRECTION;
27524
27525 return false;
27526 };
27527 auto isRoundModeSAE = [](SDValue Rnd) {
27528 if (auto *C = dyn_cast<ConstantSDNode>(Rnd)) {
27529 unsigned RC = C->getZExtValue();
27530 if (RC & X86::STATIC_ROUNDING::NO_EXC) {
27531 // Clear the NO_EXC bit and check remaining bits.
27532 RC ^= X86::STATIC_ROUNDING::NO_EXC;
27533 // As a convenience we allow no other bits or explicitly
27534 // current direction.
27535 return RC == 0 || RC == X86::STATIC_ROUNDING::CUR_DIRECTION;
27536 }
27537 }
27538
27539 return false;
27540 };
27541 auto isRoundModeSAEToX = [](SDValue Rnd, unsigned &RC) {
27542 if (auto *C = dyn_cast<ConstantSDNode>(Rnd)) {
27543 RC = C->getZExtValue();
27544 if (RC & X86::STATIC_ROUNDING::NO_EXC) {
27545 // Clear the NO_EXC bit and check remaining bits.
27546 RC ^= X86::STATIC_ROUNDING::NO_EXC;
27547 return RC == X86::STATIC_ROUNDING::TO_NEAREST_INT ||
27548 RC == X86::STATIC_ROUNDING::TO_NEG_INF ||
27549 RC == X86::STATIC_ROUNDING::TO_POS_INF ||
27550 RC == X86::STATIC_ROUNDING::TO_ZERO;
27551 }
27552 }
27553
27554 return false;
27555 };
27556
27557 SDLoc dl(Op);
27558 unsigned IntNo = Op.getConstantOperandVal(0);
27559 MVT VT = Op.getSimpleValueType();
27560 const IntrinsicData* IntrData = getIntrinsicWithoutChain(IntNo);
27561
27562 // Propagate flags from original node to transformed node(s).
27563 SelectionDAG::FlagInserter FlagsInserter(DAG, Op->getFlags());
27564
27565 if (IntrData) {
27566 switch(IntrData->Type) {
27567 case INTR_TYPE_1OP: {
27568 // We specify 2 possible opcodes for intrinsics with rounding modes.
27569 // First, we check if the intrinsic may have non-default rounding mode,
27570 // (IntrData->Opc1 != 0), then we check the rounding mode operand.
27571 unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
27572 if (IntrWithRoundingModeOpcode != 0) {
27573 SDValue Rnd = Op.getOperand(2);
27574 unsigned RC = 0;
27575 if (isRoundModeSAEToX(Rnd, RC))
27576 return DAG.getNode(IntrWithRoundingModeOpcode, dl, Op.getValueType(),
27577 Op.getOperand(1),
27578 DAG.getTargetConstant(RC, dl, MVT::i32));
27579 if (!isRoundModeCurDirection(Rnd))
27580 return SDValue();
27581 }
27582 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
27583 Op.getOperand(1));
27584 }
27585 case INTR_TYPE_1OP_SAE: {
27586 SDValue Sae = Op.getOperand(2);
27587
27588 unsigned Opc;
27589 if (isRoundModeCurDirection(Sae))
27590 Opc = IntrData->Opc0;
27591 else if (isRoundModeSAE(Sae))
27592 Opc = IntrData->Opc1;
27593 else
27594 return SDValue();
27595
27596 return DAG.getNode(Opc, dl, Op.getValueType(), Op.getOperand(1));
27597 }
27598 case INTR_TYPE_2OP: {
27599 SDValue Src2 = Op.getOperand(2);
27600
27601 // We specify 2 possible opcodes for intrinsics with rounding modes.
27602 // First, we check if the intrinsic may have non-default rounding mode,
27603 // (IntrData->Opc1 != 0), then we check the rounding mode operand.
27604 unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
27605 if (IntrWithRoundingModeOpcode != 0) {
27606 SDValue Rnd = Op.getOperand(3);
27607 unsigned RC = 0;
27608 if (isRoundModeSAEToX(Rnd, RC))
27609 return DAG.getNode(IntrWithRoundingModeOpcode, dl, Op.getValueType(),
27610 Op.getOperand(1), Src2,
27611 DAG.getTargetConstant(RC, dl, MVT::i32));
27612 if (!isRoundModeCurDirection(Rnd))
27613 return SDValue();
27614 }
27615
27616 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
27617 Op.getOperand(1), Src2);
27618 }
27619 case INTR_TYPE_2OP_SAE: {
27620 SDValue Sae = Op.getOperand(3);
27621
27622 unsigned Opc;
27623 if (isRoundModeCurDirection(Sae))
27624 Opc = IntrData->Opc0;
27625 else if (isRoundModeSAE(Sae))
27626 Opc = IntrData->Opc1;
27627 else
27628 return SDValue();
27629
27630 return DAG.getNode(Opc, dl, Op.getValueType(), Op.getOperand(1),
27631 Op.getOperand(2));
27632 }
27633 case INTR_TYPE_3OP:
27634 case INTR_TYPE_3OP_IMM8: {
27635 SDValue Src1 = Op.getOperand(1);
27636 SDValue Src2 = Op.getOperand(2);
27637 SDValue Src3 = Op.getOperand(3);
27638
27639 if (IntrData->Type == INTR_TYPE_3OP_IMM8 &&
27640 Src3.getValueType() != MVT::i8) {
27641 Src3 = DAG.getTargetConstant(
27642 cast<ConstantSDNode>(Src3)->getZExtValue() & 0xff, dl, MVT::i8);
27643 }
27644
27645 // We specify 2 possible opcodes for intrinsics with rounding modes.
27646 // First, we check if the intrinsic may have non-default rounding mode,
27647 // (IntrData->Opc1 != 0), then we check the rounding mode operand.
27648 unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
27649 if (IntrWithRoundingModeOpcode != 0) {
27650 SDValue Rnd = Op.getOperand(4);
27651 unsigned RC = 0;
27652 if (isRoundModeSAEToX(Rnd, RC))
27653 return DAG.getNode(IntrWithRoundingModeOpcode, dl, Op.getValueType(),
27654 Src1, Src2, Src3,
27655 DAG.getTargetConstant(RC, dl, MVT::i32));
27656 if (!isRoundModeCurDirection(Rnd))
27657 return SDValue();
27658 }
27659
27660 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
27661 {Src1, Src2, Src3});
27662 }
27663 case INTR_TYPE_4OP_IMM8: {
27664 assert(Op.getOperand(4)->getOpcode() == ISD::TargetConstant);
27665 SDValue Src4 = Op.getOperand(4);
27666 if (Src4.getValueType() != MVT::i8) {
27667 Src4 = DAG.getTargetConstant(
27668 cast<ConstantSDNode>(Src4)->getZExtValue() & 0xff, dl, MVT::i8);
27669 }
27670
27671 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
27672 Op.getOperand(1), Op.getOperand(2), Op.getOperand(3),
27673 Src4);
27674 }
27675 case INTR_TYPE_1OP_MASK: {
27676 SDValue Src = Op.getOperand(1);
27677 SDValue PassThru = Op.getOperand(2);
27678 SDValue Mask = Op.getOperand(3);
27679 // We add rounding mode to the Node when
27680 // - RC Opcode is specified and
27681 // - RC is not "current direction".
27682 unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
27683 if (IntrWithRoundingModeOpcode != 0) {
27684 SDValue Rnd = Op.getOperand(4);
27685 unsigned RC = 0;
27686 if (isRoundModeSAEToX(Rnd, RC))
27687 return getVectorMaskingNode(
27688 DAG.getNode(IntrWithRoundingModeOpcode, dl, Op.getValueType(),
27689 Src, DAG.getTargetConstant(RC, dl, MVT::i32)),
27690 Mask, PassThru, Subtarget, DAG);
27691 if (!isRoundModeCurDirection(Rnd))
27692 return SDValue();
27693 }
27694 return getVectorMaskingNode(
27695 DAG.getNode(IntrData->Opc0, dl, VT, Src), Mask, PassThru,
27696 Subtarget, DAG);
27697 }
27698 case INTR_TYPE_1OP_MASK_SAE: {
27699 SDValue Src = Op.getOperand(1);
27700 SDValue PassThru = Op.getOperand(2);
27701 SDValue Mask = Op.getOperand(3);
27702 SDValue Rnd = Op.getOperand(4);
27703
27704 unsigned Opc;
27705 if (isRoundModeCurDirection(Rnd))
27706 Opc = IntrData->Opc0;
27707 else if (isRoundModeSAE(Rnd))
27708 Opc = IntrData->Opc1;
27709 else
27710 return SDValue();
27711
27712 return getVectorMaskingNode(DAG.getNode(Opc, dl, VT, Src), Mask, PassThru,
27713 Subtarget, DAG);
27714 }
27715 case INTR_TYPE_SCALAR_MASK: {
27716 SDValue Src1 = Op.getOperand(1);
27717 SDValue Src2 = Op.getOperand(2);
27718 SDValue passThru = Op.getOperand(3);
27719 SDValue Mask = Op.getOperand(4);
27720 unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
27721 // There are 2 kinds of intrinsics in this group:
27722 // (1) With suppress-all-exceptions (sae) or rounding mode - 6 operands.
27723 // (2) With rounding mode and sae - 7 operands.
27724 bool HasRounding = IntrWithRoundingModeOpcode != 0;
27725 if (Op.getNumOperands() == (5U + HasRounding)) {
27726 if (HasRounding) {
27727 SDValue Rnd = Op.getOperand(5);
27728 unsigned RC = 0;
27729 if (isRoundModeSAEToX(Rnd, RC))
27730 return getScalarMaskingNode(
27731 DAG.getNode(IntrWithRoundingModeOpcode, dl, VT, Src1, Src2,
27732 DAG.getTargetConstant(RC, dl, MVT::i32)),
27733 Mask, passThru, Subtarget, DAG);
27734 if (!isRoundModeCurDirection(Rnd))
27735 return SDValue();
27736 }
27737 return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src1,
27738 Src2),
27739 Mask, passThru, Subtarget, DAG);
27740 }
27741
27742 assert(Op.getNumOperands() == (6U + HasRounding) &&
27743 "Unexpected intrinsic form");
27744 SDValue RoundingMode = Op.getOperand(5);
27745 unsigned Opc = IntrData->Opc0;
27746 if (HasRounding) {
27747 SDValue Sae = Op.getOperand(6);
27748 if (isRoundModeSAE(Sae))
27749 Opc = IntrWithRoundingModeOpcode;
27750 else if (!isRoundModeCurDirection(Sae))
27751 return SDValue();
27752 }
27753 return getScalarMaskingNode(DAG.getNode(Opc, dl, VT, Src1,
27754 Src2, RoundingMode),
27755 Mask, passThru, Subtarget, DAG);
27756 }
27757 case INTR_TYPE_SCALAR_MASK_RND: {
27758 SDValue Src1 = Op.getOperand(1);
27759 SDValue Src2 = Op.getOperand(2);
27760 SDValue passThru = Op.getOperand(3);
27761 SDValue Mask = Op.getOperand(4);
27762 SDValue Rnd = Op.getOperand(5);
27763
27764 SDValue NewOp;
27765 unsigned RC = 0;
27766 if (isRoundModeCurDirection(Rnd))
27767 NewOp = DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2);
27768 else if (isRoundModeSAEToX(Rnd, RC))
27769 NewOp = DAG.getNode(IntrData->Opc1, dl, VT, Src1, Src2,
27770 DAG.getTargetConstant(RC, dl, MVT::i32));
27771 else
27772 return SDValue();
27773
27774 return getScalarMaskingNode(NewOp, Mask, passThru, Subtarget, DAG);
27775 }
27776 case INTR_TYPE_SCALAR_MASK_SAE: {
27777 SDValue Src1 = Op.getOperand(1);
27778 SDValue Src2 = Op.getOperand(2);
27779 SDValue passThru = Op.getOperand(3);
27780 SDValue Mask = Op.getOperand(4);
27781 SDValue Sae = Op.getOperand(5);
27782 unsigned Opc;
27783 if (isRoundModeCurDirection(Sae))
27784 Opc = IntrData->Opc0;
27785 else if (isRoundModeSAE(Sae))
27786 Opc = IntrData->Opc1;
27787 else
27788 return SDValue();
27789
27790 return getScalarMaskingNode(DAG.getNode(Opc, dl, VT, Src1, Src2),
27791 Mask, passThru, Subtarget, DAG);
27792 }
27793 case INTR_TYPE_2OP_MASK: {
27794 SDValue Src1 = Op.getOperand(1);
27795 SDValue Src2 = Op.getOperand(2);
27796 SDValue PassThru = Op.getOperand(3);
27797 SDValue Mask = Op.getOperand(4);
27798 SDValue NewOp;
27799 if (IntrData->Opc1 != 0) {
27800 SDValue Rnd = Op.getOperand(5);
27801 unsigned RC = 0;
27802 if (isRoundModeSAEToX(Rnd, RC))
27803 NewOp = DAG.getNode(IntrData->Opc1, dl, VT, Src1, Src2,
27804 DAG.getTargetConstant(RC, dl, MVT::i32));
27805 else if (!isRoundModeCurDirection(Rnd))
27806 return SDValue();
27807 }
27808 if (!NewOp)
27809 NewOp = DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2);
27810 return getVectorMaskingNode(NewOp, Mask, PassThru, Subtarget, DAG);
27811 }
27812 case INTR_TYPE_2OP_MASK_SAE: {
27813 SDValue Src1 = Op.getOperand(1);
27814 SDValue Src2 = Op.getOperand(2);
27815 SDValue PassThru = Op.getOperand(3);
27816 SDValue Mask = Op.getOperand(4);
27817
27818 unsigned Opc = IntrData->Opc0;
27819 if (IntrData->Opc1 != 0) {
27820 SDValue Sae = Op.getOperand(5);
27821 if (isRoundModeSAE(Sae))
27822 Opc = IntrData->Opc1;
27823 else if (!isRoundModeCurDirection(Sae))
27824 return SDValue();
27825 }
27826
27827 return getVectorMaskingNode(DAG.getNode(Opc, dl, VT, Src1, Src2),
27828 Mask, PassThru, Subtarget, DAG);
27829 }
27830 case INTR_TYPE_3OP_SCALAR_MASK_SAE: {
27831 SDValue Src1 = Op.getOperand(1);
27832 SDValue Src2 = Op.getOperand(2);
27833 SDValue Src3 = Op.getOperand(3);
27834 SDValue PassThru = Op.getOperand(4);
27835 SDValue Mask = Op.getOperand(5);
27836 SDValue Sae = Op.getOperand(6);
27837 unsigned Opc;
27838 if (isRoundModeCurDirection(Sae))
27839 Opc = IntrData->Opc0;
27840 else if (isRoundModeSAE(Sae))
27841 Opc = IntrData->Opc1;
27842 else
27843 return SDValue();
27844
27845 return getScalarMaskingNode(DAG.getNode(Opc, dl, VT, Src1, Src2, Src3),
27846 Mask, PassThru, Subtarget, DAG);
27847 }
27848 case INTR_TYPE_3OP_MASK_SAE: {
27849 SDValue Src1 = Op.getOperand(1);
27850 SDValue Src2 = Op.getOperand(2);
27851 SDValue Src3 = Op.getOperand(3);
27852 SDValue PassThru = Op.getOperand(4);
27853 SDValue Mask = Op.getOperand(5);
27854
27855 unsigned Opc = IntrData->Opc0;
27856 if (IntrData->Opc1 != 0) {
27857 SDValue Sae = Op.getOperand(6);
27858 if (isRoundModeSAE(Sae))
27859 Opc = IntrData->Opc1;
27860 else if (!isRoundModeCurDirection(Sae))
27861 return SDValue();
27862 }
27863 return getVectorMaskingNode(DAG.getNode(Opc, dl, VT, Src1, Src2, Src3),
27864 Mask, PassThru, Subtarget, DAG);
27865 }
27866 case BLENDV: {
27867 SDValue Src1 = Op.getOperand(1);
27868 SDValue Src2 = Op.getOperand(2);
27869 SDValue Src3 = Op.getOperand(3);
27870
27871 EVT MaskVT = Src3.getValueType().changeVectorElementTypeToInteger();
27872 Src3 = DAG.getBitcast(MaskVT, Src3);
27873
27874 // Reverse the operands to match VSELECT order.
27875 return DAG.getNode(IntrData->Opc0, dl, VT, Src3, Src2, Src1);
27876 }
27877 case VPERM_2OP : {
27878 SDValue Src1 = Op.getOperand(1);
27879 SDValue Src2 = Op.getOperand(2);
27880
27881 // Swap Src1 and Src2 in the node creation
27882 return DAG.getNode(IntrData->Opc0, dl, VT, Src2, Src1);
27883 }
27884 case CFMA_OP_MASKZ:
27885 case CFMA_OP_MASK: {
27886 SDValue Src1 = Op.getOperand(1);
27887 SDValue Src2 = Op.getOperand(2);
27888 SDValue Src3 = Op.getOperand(3);
27889 SDValue Mask = Op.getOperand(4);
27890 MVT VT = Op.getSimpleValueType();
27891
27892 SDValue PassThru = Src3;
27893 if (IntrData->Type == CFMA_OP_MASKZ)
27894 PassThru = getZeroVector(VT, Subtarget, DAG, dl);
27895
27896 // We add rounding mode to the Node when
27897 // - RC Opcode is specified and
27898 // - RC is not "current direction".
27899 SDValue NewOp;
27900 if (IntrData->Opc1 != 0) {
27901 SDValue Rnd = Op.getOperand(5);
27902 unsigned RC = 0;
27903 if (isRoundModeSAEToX(Rnd, RC))
27904 NewOp = DAG.getNode(IntrData->Opc1, dl, VT, Src1, Src2, Src3,
27905 DAG.getTargetConstant(RC, dl, MVT::i32));
27906 else if (!isRoundModeCurDirection(Rnd))
27907 return SDValue();
27908 }
27909 if (!NewOp)
27910 NewOp = DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2, Src3);
27911 return getVectorMaskingNode(NewOp, Mask, PassThru, Subtarget, DAG);
27912 }
27913 case IFMA_OP:
27914 // NOTE: We need to swizzle the operands to pass the multiply operands
27915 // first.
27916 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
27917 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
27918 case FPCLASSS: {
27919 SDValue Src1 = Op.getOperand(1);
27920 SDValue Imm = Op.getOperand(2);
27921 SDValue Mask = Op.getOperand(3);
27922 SDValue FPclass = DAG.getNode(IntrData->Opc0, dl, MVT::v1i1, Src1, Imm);
27923 SDValue FPclassMask = getScalarMaskingNode(FPclass, Mask, SDValue(),
27924 Subtarget, DAG);
27925 // Need to fill with zeros to ensure the bitcast will produce zeroes
27926 // for the upper bits. An EXTRACT_ELEMENT here wouldn't guarantee that.
27927 SDValue Ins = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v8i1,
27928 DAG.getConstant(0, dl, MVT::v8i1),
27929 FPclassMask, DAG.getIntPtrConstant(0, dl));
27930 return DAG.getBitcast(MVT::i8, Ins);
27931 }
27932
27933 case CMP_MASK_CC: {
27934 MVT MaskVT = Op.getSimpleValueType();
27935 SDValue CC = Op.getOperand(3);
27936 SDValue Mask = Op.getOperand(4);
27937 // We specify 2 possible opcodes for intrinsics with rounding modes.
27938 // First, we check if the intrinsic may have non-default rounding mode,
27939 // (IntrData->Opc1 != 0), then we check the rounding mode operand.
27940 if (IntrData->Opc1 != 0) {
27941 SDValue Sae = Op.getOperand(5);
27942 if (isRoundModeSAE(Sae))
27943 return DAG.getNode(IntrData->Opc1, dl, MaskVT, Op.getOperand(1),
27944 Op.getOperand(2), CC, Mask, Sae);
27945 if (!isRoundModeCurDirection(Sae))
27946 return SDValue();
27947 }
27948 // Default rounding mode.
27949 return DAG.getNode(IntrData->Opc0, dl, MaskVT,
27950 {Op.getOperand(1), Op.getOperand(2), CC, Mask});
27951 }
27952 case CMP_MASK_SCALAR_CC: {
27953 SDValue Src1 = Op.getOperand(1);
27954 SDValue Src2 = Op.getOperand(2);
27955 SDValue CC = Op.getOperand(3);
27956 SDValue Mask = Op.getOperand(4);
27957
27958 SDValue Cmp;
27959 if (IntrData->Opc1 != 0) {
27960 SDValue Sae = Op.getOperand(5);
27961 if (isRoundModeSAE(Sae))
27962 Cmp = DAG.getNode(IntrData->Opc1, dl, MVT::v1i1, Src1, Src2, CC, Sae);
27963 else if (!isRoundModeCurDirection(Sae))
27964 return SDValue();
27965 }
27966 // Default rounding mode.
27967 if (!Cmp.getNode())
27968 Cmp = DAG.getNode(IntrData->Opc0, dl, MVT::v1i1, Src1, Src2, CC);
27969
27970 SDValue CmpMask = getScalarMaskingNode(Cmp, Mask, SDValue(),
27971 Subtarget, DAG);
27972 // Need to fill with zeros to ensure the bitcast will produce zeroes
27973 // for the upper bits. An EXTRACT_ELEMENT here wouldn't guarantee that.
27974 SDValue Ins = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v8i1,
27975 DAG.getConstant(0, dl, MVT::v8i1),
27976 CmpMask, DAG.getIntPtrConstant(0, dl));
27977 return DAG.getBitcast(MVT::i8, Ins);
27978 }
27979 case COMI: { // Comparison intrinsics
27980 ISD::CondCode CC = (ISD::CondCode)IntrData->Opc1;
27981 SDValue LHS = Op.getOperand(1);
27982 SDValue RHS = Op.getOperand(2);
27983 // Some conditions require the operands to be swapped.
27984 if (CC == ISD::SETLT || CC == ISD::SETLE)
27985 std::swap(LHS, RHS);
27986
27987 SDValue Comi = DAG.getNode(IntrData->Opc0, dl, MVT::i32, LHS, RHS);
27988 SDValue SetCC;
27989 switch (CC) {
27990 case ISD::SETEQ: { // (ZF = 0 and PF = 0)
27991 SetCC = getSETCC(X86::COND_E, Comi, dl, DAG);
27992 SDValue SetNP = getSETCC(X86::COND_NP, Comi, dl, DAG);
27993 SetCC = DAG.getNode(ISD::AND, dl, MVT::i8, SetCC, SetNP);
27994 break;
27995 }
27996 case ISD::SETNE: { // (ZF = 1 or PF = 1)
27997 SetCC = getSETCC(X86::COND_NE, Comi, dl, DAG);
27998 SDValue SetP = getSETCC(X86::COND_P, Comi, dl, DAG);
27999 SetCC = DAG.getNode(ISD::OR, dl, MVT::i8, SetCC, SetP);
28000 break;
28001 }
28002 case ISD::SETGT: // (CF = 0 and ZF = 0)
28003 case ISD::SETLT: { // Condition opposite to GT. Operands swapped above.
28004 SetCC = getSETCC(X86::COND_A, Comi, dl, DAG);
28005 break;
28006 }
28007 case ISD::SETGE: // CF = 0
28008 case ISD::SETLE: // Condition opposite to GE. Operands swapped above.
28009 SetCC = getSETCC(X86::COND_AE, Comi, dl, DAG);
28010 break;
28011 default:
28012 llvm_unreachable("Unexpected illegal condition!")::llvm::llvm_unreachable_internal("Unexpected illegal condition!"
, "llvm/lib/Target/X86/X86ISelLowering.cpp", 28012)
;
28013 }
28014 return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
28015 }
28016 case COMI_RM: { // Comparison intrinsics with Sae
28017 SDValue LHS = Op.getOperand(1);
28018 SDValue RHS = Op.getOperand(2);
28019 unsigned CondVal = Op.getConstantOperandVal(3);
28020 SDValue Sae = Op.getOperand(4);
28021
28022 SDValue FCmp;
28023 if (isRoundModeCurDirection(Sae))
28024 FCmp = DAG.getNode(X86ISD::FSETCCM, dl, MVT::v1i1, LHS, RHS,
28025 DAG.getTargetConstant(CondVal, dl, MVT::i8));
28026 else if (isRoundModeSAE(Sae))
28027 FCmp = DAG.getNode(X86ISD::FSETCCM_SAE, dl, MVT::v1i1, LHS, RHS,
28028 DAG.getTargetConstant(CondVal, dl, MVT::i8), Sae);
28029 else
28030 return SDValue();
28031 // Need to fill with zeros to ensure the bitcast will produce zeroes
28032 // for the upper bits. An EXTRACT_ELEMENT here wouldn't guarantee that.
28033 SDValue Ins = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v16i1,
28034 DAG.getConstant(0, dl, MVT::v16i1),
28035 FCmp, DAG.getIntPtrConstant(0, dl));
28036 return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32,
28037 DAG.getBitcast(MVT::i16, Ins));
28038 }
28039 case VSHIFT: {
28040 SDValue SrcOp = Op.getOperand(1);
28041 SDValue ShAmt = Op.getOperand(2);
28042     assert(ShAmt.getValueType() == MVT::i32 &&
28043            "Unexpected VSHIFT amount type");
28044
28045 // Catch shift-by-constant.
28046 if (auto *CShAmt = dyn_cast<ConstantSDNode>(ShAmt))
28047 return getTargetVShiftByConstNode(IntrData->Opc0, dl,
28048 Op.getSimpleValueType(), SrcOp,
28049 CShAmt->getZExtValue(), DAG);
28050
28051 ShAmt = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, ShAmt);
28052 return getTargetVShiftNode(IntrData->Opc0, dl, Op.getSimpleValueType(),
28053 SrcOp, ShAmt, 0, Subtarget, DAG);
28054 }
28055 case COMPRESS_EXPAND_IN_REG: {
28056 SDValue Mask = Op.getOperand(3);
28057 SDValue DataToCompress = Op.getOperand(1);
28058 SDValue PassThru = Op.getOperand(2);
28059 if (ISD::isBuildVectorAllOnes(Mask.getNode())) // return data as is
28060 return Op.getOperand(1);
28061
28062 // Avoid false dependency.
28063 if (PassThru.isUndef())
28064 PassThru = getZeroVector(VT, Subtarget, DAG, dl);
28065
28066 return DAG.getNode(IntrData->Opc0, dl, VT, DataToCompress, PassThru,
28067 Mask);
28068 }
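// Illustrative sketch (not part of this file): an in-register masked compress,
// the kind of intrinsic the COMPRESS_EXPAND_IN_REG case handles; assuming
// <immintrin.h> and AVX512F:
//
//   #include <immintrin.h>
//   __m512d compress(__m512d src, __mmask8 k, __m512d a) {
//     return _mm512_mask_compress_pd(src, k, a);  // all-ones k returns a as is
//   }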
28069 case FIXUPIMM:
28070 case FIXUPIMM_MASKZ: {
28071 SDValue Src1 = Op.getOperand(1);
28072 SDValue Src2 = Op.getOperand(2);
28073 SDValue Src3 = Op.getOperand(3);
28074 SDValue Imm = Op.getOperand(4);
28075 SDValue Mask = Op.getOperand(5);
28076 SDValue Passthru = (IntrData->Type == FIXUPIMM)
28077 ? Src1
28078 : getZeroVector(VT, Subtarget, DAG, dl);
28079
28080 unsigned Opc = IntrData->Opc0;
28081 if (IntrData->Opc1 != 0) {
28082 SDValue Sae = Op.getOperand(6);
28083 if (isRoundModeSAE(Sae))
28084 Opc = IntrData->Opc1;
28085 else if (!isRoundModeCurDirection(Sae))
28086 return SDValue();
28087 }
28088
28089 SDValue FixupImm = DAG.getNode(Opc, dl, VT, Src1, Src2, Src3, Imm);
28090
28091 if (Opc == X86ISD::VFIXUPIMM || Opc == X86ISD::VFIXUPIMM_SAE)
28092 return getVectorMaskingNode(FixupImm, Mask, Passthru, Subtarget, DAG);
28093
28094 return getScalarMaskingNode(FixupImm, Mask, Passthru, Subtarget, DAG);
28095 }
28096 case ROUNDP: {
28097     assert(IntrData->Opc0 == X86ISD::VRNDSCALE && "Unexpected opcode");
28098 // Clear the upper bits of the rounding immediate so that the legacy
28099 // intrinsic can't trigger the scaling behavior of VRNDSCALE.
28100 auto Round = cast<ConstantSDNode>(Op.getOperand(2));
28101 SDValue RoundingMode =
28102 DAG.getTargetConstant(Round->getZExtValue() & 0xf, dl, MVT::i32);
28103 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
28104 Op.getOperand(1), RoundingMode);
28105 }
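// Illustrative sketch (not part of this file): a legacy SSE4.1 rounding
// intrinsic lowered through the ROUNDP case above; only the low four bits of
// the immediate are kept, so VRNDSCALE's scaling field stays zero. Assuming
// <immintrin.h>:
//
//   #include <immintrin.h>
//   __m128 round_nearest(__m128 a) {
//     return _mm_round_ps(a, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
//   }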
28106 case ROUNDS: {
28107     assert(IntrData->Opc0 == X86ISD::VRNDSCALES && "Unexpected opcode");
28108 // Clear the upper bits of the rounding immediate so that the legacy
28109 // intrinsic can't trigger the scaling behavior of VRNDSCALE.
28110 auto Round = cast<ConstantSDNode>(Op.getOperand(3));
28111 SDValue RoundingMode =
28112 DAG.getTargetConstant(Round->getZExtValue() & 0xf, dl, MVT::i32);
28113 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
28114 Op.getOperand(1), Op.getOperand(2), RoundingMode);
28115 }
28116 case BEXTRI: {
28117     assert(IntrData->Opc0 == X86ISD::BEXTRI && "Unexpected opcode");
28118
28119 uint64_t Imm = Op.getConstantOperandVal(2);
28120 SDValue Control = DAG.getTargetConstant(Imm & 0xffff, dl,
28121 Op.getValueType());
28122 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
28123 Op.getOperand(1), Control);
28124 }
28125 // ADC/ADCX/SBB
28126 case ADX: {
28127 SDVTList CFVTs = DAG.getVTList(Op->getValueType(0), MVT::i32);
28128 SDVTList VTs = DAG.getVTList(Op.getOperand(2).getValueType(), MVT::i32);
28129
28130 SDValue Res;
28131 // If the carry in is zero, then we should just use ADD/SUB instead of
28132 // ADC/SBB.
28133 if (isNullConstant(Op.getOperand(1))) {
28134 Res = DAG.getNode(IntrData->Opc1, dl, VTs, Op.getOperand(2),
28135 Op.getOperand(3));
28136 } else {
28137 SDValue GenCF = DAG.getNode(X86ISD::ADD, dl, CFVTs, Op.getOperand(1),
28138 DAG.getConstant(-1, dl, MVT::i8));
28139 Res = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(2),
28140 Op.getOperand(3), GenCF.getValue(1));
28141 }
28142 SDValue SetCC = getSETCC(X86::COND_B, Res.getValue(1), dl, DAG);
28143 SDValue Results[] = { SetCC, Res };
28144 return DAG.getMergeValues(Results, dl);
28145 }
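// Illustrative sketch (not part of this file): an add-with-carry intrinsic of
// the kind that maps onto the ADX case above; when the carry-in is a constant
// zero the lowering emits a plain ADD. Assuming <immintrin.h>:
//
//   #include <immintrin.h>
//   unsigned char add32(unsigned int a, unsigned int b, unsigned int *out) {
//     return _addcarry_u32(0, a, b, out);  // carry-in 0 -> ADD, CF returned
//   }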
28146 case CVTPD2PS_MASK:
28147 case CVTPD2DQ_MASK:
28148 case CVTQQ2PS_MASK:
28149 case TRUNCATE_TO_REG: {
28150 SDValue Src = Op.getOperand(1);
28151 SDValue PassThru = Op.getOperand(2);
28152 SDValue Mask = Op.getOperand(3);
28153
28154 if (isAllOnesConstant(Mask))
28155 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Src);
28156
28157 MVT SrcVT = Src.getSimpleValueType();
28158 MVT MaskVT = MVT::getVectorVT(MVT::i1, SrcVT.getVectorNumElements());
28159 Mask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
28160 return DAG.getNode(IntrData->Opc1, dl, Op.getValueType(),
28161 {Src, PassThru, Mask});
28162 }
28163 case CVTPS2PH_MASK: {
28164 SDValue Src = Op.getOperand(1);
28165 SDValue Rnd = Op.getOperand(2);
28166 SDValue PassThru = Op.getOperand(3);
28167 SDValue Mask = Op.getOperand(4);
28168
28169 unsigned RC = 0;
28170 unsigned Opc = IntrData->Opc0;
28171 bool SAE = Src.getValueType().is512BitVector() &&
28172 (isRoundModeSAEToX(Rnd, RC) || isRoundModeSAE(Rnd));
28173 if (SAE) {
28174 Opc = X86ISD::CVTPS2PH_SAE;
28175 Rnd = DAG.getTargetConstant(RC, dl, MVT::i32);
28176 }
28177
28178 if (isAllOnesConstant(Mask))
28179 return DAG.getNode(Opc, dl, Op.getValueType(), Src, Rnd);
28180
28181 if (SAE)
28182 Opc = X86ISD::MCVTPS2PH_SAE;
28183 else
28184 Opc = IntrData->Opc1;
28185 MVT SrcVT = Src.getSimpleValueType();
28186 MVT MaskVT = MVT::getVectorVT(MVT::i1, SrcVT.getVectorNumElements());
28187 Mask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
28188 return DAG.getNode(Opc, dl, Op.getValueType(), Src, Rnd, PassThru, Mask);
28189 }
28190 case CVTNEPS2BF16_MASK: {
28191 SDValue Src = Op.getOperand(1);
28192 SDValue PassThru = Op.getOperand(2);
28193 SDValue Mask = Op.getOperand(3);
28194
28195 if (ISD::isBuildVectorAllOnes(Mask.getNode()))
28196 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Src);
28197
28198 // Break false dependency.
28199 if (PassThru.isUndef())
28200 PassThru = DAG.getConstant(0, dl, PassThru.getValueType());
28201
28202 return DAG.getNode(IntrData->Opc1, dl, Op.getValueType(), Src, PassThru,
28203 Mask);
28204 }
28205 default:
28206 break;
28207 }
28208 }
28209
28210 switch (IntNo) {
28211 default: return SDValue(); // Don't custom lower most intrinsics.
28212
28213  // ptest and testp intrinsics. The intrinsics these come from are designed to
28214  // return an integer value, not just an instruction, so lower it to the ptest
28215 // or testp pattern and a setcc for the result.
28216 case Intrinsic::x86_avx512_ktestc_b:
28217 case Intrinsic::x86_avx512_ktestc_w:
28218 case Intrinsic::x86_avx512_ktestc_d:
28219 case Intrinsic::x86_avx512_ktestc_q:
28220 case Intrinsic::x86_avx512_ktestz_b:
28221 case Intrinsic::x86_avx512_ktestz_w:
28222 case Intrinsic::x86_avx512_ktestz_d:
28223 case Intrinsic::x86_avx512_ktestz_q:
28224 case Intrinsic::x86_sse41_ptestz:
28225 case Intrinsic::x86_sse41_ptestc:
28226 case Intrinsic::x86_sse41_ptestnzc:
28227 case Intrinsic::x86_avx_ptestz_256:
28228 case Intrinsic::x86_avx_ptestc_256:
28229 case Intrinsic::x86_avx_ptestnzc_256:
28230 case Intrinsic::x86_avx_vtestz_ps:
28231 case Intrinsic::x86_avx_vtestc_ps:
28232 case Intrinsic::x86_avx_vtestnzc_ps:
28233 case Intrinsic::x86_avx_vtestz_pd:
28234 case Intrinsic::x86_avx_vtestc_pd:
28235 case Intrinsic::x86_avx_vtestnzc_pd:
28236 case Intrinsic::x86_avx_vtestz_ps_256:
28237 case Intrinsic::x86_avx_vtestc_ps_256:
28238 case Intrinsic::x86_avx_vtestnzc_ps_256:
28239 case Intrinsic::x86_avx_vtestz_pd_256:
28240 case Intrinsic::x86_avx_vtestc_pd_256:
28241 case Intrinsic::x86_avx_vtestnzc_pd_256: {
28242 unsigned TestOpc = X86ISD::PTEST;
28243 X86::CondCode X86CC;
28244 switch (IntNo) {
28245     default: llvm_unreachable("Bad fallthrough in Intrinsic lowering.");
28246 case Intrinsic::x86_avx512_ktestc_b:
28247 case Intrinsic::x86_avx512_ktestc_w:
28248 case Intrinsic::x86_avx512_ktestc_d:
28249 case Intrinsic::x86_avx512_ktestc_q:
28250 // CF = 1
28251 TestOpc = X86ISD::KTEST;
28252 X86CC = X86::COND_B;
28253 break;
28254 case Intrinsic::x86_avx512_ktestz_b:
28255 case Intrinsic::x86_avx512_ktestz_w:
28256 case Intrinsic::x86_avx512_ktestz_d:
28257 case Intrinsic::x86_avx512_ktestz_q:
28258 TestOpc = X86ISD::KTEST;
28259 X86CC = X86::COND_E;
28260 break;
28261 case Intrinsic::x86_avx_vtestz_ps:
28262 case Intrinsic::x86_avx_vtestz_pd:
28263 case Intrinsic::x86_avx_vtestz_ps_256:
28264 case Intrinsic::x86_avx_vtestz_pd_256:
28265 TestOpc = X86ISD::TESTP;
28266 [[fallthrough]];
28267 case Intrinsic::x86_sse41_ptestz:
28268 case Intrinsic::x86_avx_ptestz_256:
28269 // ZF = 1
28270 X86CC = X86::COND_E;
28271 break;
28272 case Intrinsic::x86_avx_vtestc_ps:
28273 case Intrinsic::x86_avx_vtestc_pd:
28274 case Intrinsic::x86_avx_vtestc_ps_256:
28275 case Intrinsic::x86_avx_vtestc_pd_256:
28276 TestOpc = X86ISD::TESTP;
28277 [[fallthrough]];
28278 case Intrinsic::x86_sse41_ptestc:
28279 case Intrinsic::x86_avx_ptestc_256:
28280 // CF = 1
28281 X86CC = X86::COND_B;
28282 break;
28283 case Intrinsic::x86_avx_vtestnzc_ps:
28284 case Intrinsic::x86_avx_vtestnzc_pd:
28285 case Intrinsic::x86_avx_vtestnzc_ps_256:
28286 case Intrinsic::x86_avx_vtestnzc_pd_256:
28287 TestOpc = X86ISD::TESTP;
28288 [[fallthrough]];
28289 case Intrinsic::x86_sse41_ptestnzc:
28290 case Intrinsic::x86_avx_ptestnzc_256:
28291 // ZF and CF = 0
28292 X86CC = X86::COND_A;
28293 break;
28294 }
28295
28296 SDValue LHS = Op.getOperand(1);
28297 SDValue RHS = Op.getOperand(2);
28298 SDValue Test = DAG.getNode(TestOpc, dl, MVT::i32, LHS, RHS);
28299 SDValue SetCC = getSETCC(X86CC, Test, dl, DAG);
28300 return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
28301 }
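// Illustrative sketch (not part of this file): a PTEST-based intrinsic handled
// by the block above; the flag result is materialized with a SETCC and then
// zero-extended to i32. Assuming <immintrin.h> and SSE4.1:
//
//   #include <immintrin.h>
//   int all_zero(__m128i v) {
//     return _mm_testz_si128(v, v);  // 1 iff every bit of v is zero
//   }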
28302
28303 case Intrinsic::x86_sse42_pcmpistria128:
28304 case Intrinsic::x86_sse42_pcmpestria128:
28305 case Intrinsic::x86_sse42_pcmpistric128:
28306 case Intrinsic::x86_sse42_pcmpestric128:
28307 case Intrinsic::x86_sse42_pcmpistrio128:
28308 case Intrinsic::x86_sse42_pcmpestrio128:
28309 case Intrinsic::x86_sse42_pcmpistris128:
28310 case Intrinsic::x86_sse42_pcmpestris128:
28311 case Intrinsic::x86_sse42_pcmpistriz128:
28312 case Intrinsic::x86_sse42_pcmpestriz128: {
28313 unsigned Opcode;
28314 X86::CondCode X86CC;
28315 switch (IntNo) {
28316     default: llvm_unreachable("Impossible intrinsic"); // Can't reach here.
28317 case Intrinsic::x86_sse42_pcmpistria128:
28318 Opcode = X86ISD::PCMPISTR;
28319 X86CC = X86::COND_A;
28320 break;
28321 case Intrinsic::x86_sse42_pcmpestria128:
28322 Opcode = X86ISD::PCMPESTR;
28323 X86CC = X86::COND_A;
28324 break;
28325 case Intrinsic::x86_sse42_pcmpistric128:
28326 Opcode = X86ISD::PCMPISTR;
28327 X86CC = X86::COND_B;
28328 break;
28329 case Intrinsic::x86_sse42_pcmpestric128:
28330 Opcode = X86ISD::PCMPESTR;
28331 X86CC = X86::COND_B;
28332 break;
28333 case Intrinsic::x86_sse42_pcmpistrio128:
28334 Opcode = X86ISD::PCMPISTR;
28335 X86CC = X86::COND_O;
28336 break;
28337 case Intrinsic::x86_sse42_pcmpestrio128:
28338 Opcode = X86ISD::PCMPESTR;
28339 X86CC = X86::COND_O;
28340 break;
28341 case Intrinsic::x86_sse42_pcmpistris128:
28342 Opcode = X86ISD::PCMPISTR;
28343 X86CC = X86::COND_S;
28344 break;
28345 case Intrinsic::x86_sse42_pcmpestris128:
28346 Opcode = X86ISD::PCMPESTR;
28347 X86CC = X86::COND_S;
28348 break;
28349 case Intrinsic::x86_sse42_pcmpistriz128:
28350 Opcode = X86ISD::PCMPISTR;
28351 X86CC = X86::COND_E;
28352 break;
28353 case Intrinsic::x86_sse42_pcmpestriz128:
28354 Opcode = X86ISD::PCMPESTR;
28355 X86CC = X86::COND_E;
28356 break;
28357 }
28358 SmallVector<SDValue, 5> NewOps(llvm::drop_begin(Op->ops()));
28359 SDVTList VTs = DAG.getVTList(MVT::i32, MVT::v16i8, MVT::i32);
28360 SDValue PCMP = DAG.getNode(Opcode, dl, VTs, NewOps).getValue(2);
28361 SDValue SetCC = getSETCC(X86CC, PCMP, dl, DAG);
28362 return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
28363 }
28364
28365 case Intrinsic::x86_sse42_pcmpistri128:
28366 case Intrinsic::x86_sse42_pcmpestri128: {
28367 unsigned Opcode;
28368 if (IntNo == Intrinsic::x86_sse42_pcmpistri128)
28369 Opcode = X86ISD::PCMPISTR;
28370 else
28371 Opcode = X86ISD::PCMPESTR;
28372
28373 SmallVector<SDValue, 5> NewOps(llvm::drop_begin(Op->ops()));
28374 SDVTList VTs = DAG.getVTList(MVT::i32, MVT::v16i8, MVT::i32);
28375 return DAG.getNode(Opcode, dl, VTs, NewOps);
28376 }
28377
28378 case Intrinsic::x86_sse42_pcmpistrm128:
28379 case Intrinsic::x86_sse42_pcmpestrm128: {
28380 unsigned Opcode;
28381 if (IntNo == Intrinsic::x86_sse42_pcmpistrm128)
28382 Opcode = X86ISD::PCMPISTR;
28383 else
28384 Opcode = X86ISD::PCMPESTR;
28385
28386 SmallVector<SDValue, 5> NewOps(llvm::drop_begin(Op->ops()));
28387 SDVTList VTs = DAG.getVTList(MVT::i32, MVT::v16i8, MVT::i32);
28388 return DAG.getNode(Opcode, dl, VTs, NewOps).getValue(1);
28389 }
28390
28391 case Intrinsic::eh_sjlj_lsda: {
28392 MachineFunction &MF = DAG.getMachineFunction();
28393 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
28394 MVT PtrVT = TLI.getPointerTy(DAG.getDataLayout());
28395 auto &Context = MF.getMMI().getContext();
28396 MCSymbol *S = Context.getOrCreateSymbol(Twine("GCC_except_table") +
28397 Twine(MF.getFunctionNumber()));
28398 return DAG.getNode(getGlobalWrapperKind(), dl, VT,
28399 DAG.getMCSymbol(S, PtrVT));
28400 }
28401
28402 case Intrinsic::x86_seh_lsda: {
28403 // Compute the symbol for the LSDA. We know it'll get emitted later.
28404 MachineFunction &MF = DAG.getMachineFunction();
28405 SDValue Op1 = Op.getOperand(1);
28406 auto *Fn = cast<Function>(cast<GlobalAddressSDNode>(Op1)->getGlobal());
28407 MCSymbol *LSDASym = MF.getMMI().getContext().getOrCreateLSDASymbol(
28408 GlobalValue::dropLLVMManglingEscape(Fn->getName()));
28409
28410 // Generate a simple absolute symbol reference. This intrinsic is only
28411 // supported on 32-bit Windows, which isn't PIC.
28412 SDValue Result = DAG.getMCSymbol(LSDASym, VT);
28413 return DAG.getNode(X86ISD::Wrapper, dl, VT, Result);
28414 }
28415
28416 case Intrinsic::eh_recoverfp: {
28417 SDValue FnOp = Op.getOperand(1);
28418 SDValue IncomingFPOp = Op.getOperand(2);
28419 GlobalAddressSDNode *GSD = dyn_cast<GlobalAddressSDNode>(FnOp);
28420 auto *Fn = dyn_cast_or_null<Function>(GSD ? GSD->getGlobal() : nullptr);
28421 if (!Fn)
28422 report_fatal_error(
28423 "llvm.eh.recoverfp must take a function as the first argument");
28424 return recoverFramePointer(DAG, Fn, IncomingFPOp);
28425 }
28426
28427 case Intrinsic::localaddress: {
28428 // Returns one of the stack, base, or frame pointer registers, depending on
28429 // which is used to reference local variables.
28430 MachineFunction &MF = DAG.getMachineFunction();
28431 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
28432 unsigned Reg;
28433 if (RegInfo->hasBasePointer(MF))
28434 Reg = RegInfo->getBaseRegister();
28435 else { // Handles the SP or FP case.
28436 bool CantUseFP = RegInfo->hasStackRealignment(MF);
28437 if (CantUseFP)
28438 Reg = RegInfo->getPtrSizedStackRegister(MF);
28439 else
28440 Reg = RegInfo->getPtrSizedFrameRegister(MF);
28441 }
28442 return DAG.getCopyFromReg(DAG.getEntryNode(), dl, Reg, VT);
28443 }
28444 case Intrinsic::x86_avx512_vp2intersect_q_512:
28445 case Intrinsic::x86_avx512_vp2intersect_q_256:
28446 case Intrinsic::x86_avx512_vp2intersect_q_128:
28447 case Intrinsic::x86_avx512_vp2intersect_d_512:
28448 case Intrinsic::x86_avx512_vp2intersect_d_256:
28449 case Intrinsic::x86_avx512_vp2intersect_d_128: {
28450 MVT MaskVT = Op.getSimpleValueType();
28451
28452 SDVTList VTs = DAG.getVTList(MVT::Untyped, MVT::Other);
28453 SDLoc DL(Op);
28454
28455 SDValue Operation =
28456 DAG.getNode(X86ISD::VP2INTERSECT, DL, VTs,
28457 Op->getOperand(1), Op->getOperand(2));
28458
28459 SDValue Result0 = DAG.getTargetExtractSubreg(X86::sub_mask_0, DL,
28460 MaskVT, Operation);
28461 SDValue Result1 = DAG.getTargetExtractSubreg(X86::sub_mask_1, DL,
28462 MaskVT, Operation);
28463 return DAG.getMergeValues({Result0, Result1}, DL);
28464 }
28465 case Intrinsic::x86_mmx_pslli_w:
28466 case Intrinsic::x86_mmx_pslli_d:
28467 case Intrinsic::x86_mmx_pslli_q:
28468 case Intrinsic::x86_mmx_psrli_w:
28469 case Intrinsic::x86_mmx_psrli_d:
28470 case Intrinsic::x86_mmx_psrli_q:
28471 case Intrinsic::x86_mmx_psrai_w:
28472 case Intrinsic::x86_mmx_psrai_d: {
28473 SDLoc DL(Op);
28474 SDValue ShAmt = Op.getOperand(2);
28475 // If the argument is a constant, convert it to a target constant.
28476 if (auto *C = dyn_cast<ConstantSDNode>(ShAmt)) {
28477 // Clamp out of bounds shift amounts since they will otherwise be masked
28478 // to 8-bits which may make it no longer out of bounds.
28479 unsigned ShiftAmount = C->getAPIntValue().getLimitedValue(255);
28480 if (ShiftAmount == 0)
28481 return Op.getOperand(1);
28482
28483 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, Op.getValueType(),
28484 Op.getOperand(0), Op.getOperand(1),
28485 DAG.getTargetConstant(ShiftAmount, DL, MVT::i32));
28486 }
28487
28488 unsigned NewIntrinsic;
28489 switch (IntNo) {
28490     default: llvm_unreachable("Impossible intrinsic"); // Can't reach here.
28491 case Intrinsic::x86_mmx_pslli_w:
28492 NewIntrinsic = Intrinsic::x86_mmx_psll_w;
28493 break;
28494 case Intrinsic::x86_mmx_pslli_d:
28495 NewIntrinsic = Intrinsic::x86_mmx_psll_d;
28496 break;
28497 case Intrinsic::x86_mmx_pslli_q:
28498 NewIntrinsic = Intrinsic::x86_mmx_psll_q;
28499 break;
28500 case Intrinsic::x86_mmx_psrli_w:
28501 NewIntrinsic = Intrinsic::x86_mmx_psrl_w;
28502 break;
28503 case Intrinsic::x86_mmx_psrli_d:
28504 NewIntrinsic = Intrinsic::x86_mmx_psrl_d;
28505 break;
28506 case Intrinsic::x86_mmx_psrli_q:
28507 NewIntrinsic = Intrinsic::x86_mmx_psrl_q;
28508 break;
28509 case Intrinsic::x86_mmx_psrai_w:
28510 NewIntrinsic = Intrinsic::x86_mmx_psra_w;
28511 break;
28512 case Intrinsic::x86_mmx_psrai_d:
28513 NewIntrinsic = Intrinsic::x86_mmx_psra_d;
28514 break;
28515 }
28516
28517     // The vector shift intrinsics with scalars use 32-bit shift amounts, but
28518     // the SSE2/MMX shift instructions read 64 bits. Copy the 32 bits to an
28519     // MMX register.
28520 ShAmt = DAG.getNode(X86ISD::MMX_MOVW2D, DL, MVT::x86mmx, ShAmt);
28521 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, Op.getValueType(),
28522 DAG.getTargetConstant(NewIntrinsic, DL,
28523 getPointerTy(DAG.getDataLayout())),
28524 Op.getOperand(1), ShAmt);
28525 }
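// Illustrative sketch (not part of this file): an MMX shift-by-immediate
// intrinsic of the kind rewritten above (pslli -> psll, with a non-constant
// amount moved into an MMX register). Assuming <immintrin.h>:
//
//   #include <immintrin.h>
//   __m64 shift_words(__m64 v) {
//     return _mm_slli_pi16(v, 3);  // constant amount: kept as an immediate
//   }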
28526 case Intrinsic::thread_pointer: {
28527 if (Subtarget.isTargetELF()) {
28528 SDLoc dl(Op);
28529 EVT PtrVT = getPointerTy(DAG.getDataLayout());
28530 // Get the Thread Pointer, which is %gs:0 (32-bit) or %fs:0 (64-bit).
28531 Value *Ptr = Constant::getNullValue(Type::getInt8PtrTy(
28532 *DAG.getContext(), Subtarget.is64Bit() ? X86AS::FS : X86AS::GS));
28533 return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(),
28534 DAG.getIntPtrConstant(0, dl), MachinePointerInfo(Ptr));
28535 }
28536 report_fatal_error(
28537 "Target OS doesn't support __builtin_thread_pointer() yet.");
28538 }
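// Illustrative sketch (not part of this file): on ELF targets the builtin
// below becomes a load from %fs:0 (64-bit) or %gs:0 (32-bit), as implemented
// above; assuming a Clang/GCC-style compiler:
//
//   void *tp(void) {
//     return __builtin_thread_pointer();
//   }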
28539 }
28540}
28541
28542static SDValue getAVX2GatherNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
28543 SDValue Src, SDValue Mask, SDValue Base,
28544 SDValue Index, SDValue ScaleOp, SDValue Chain,
28545 const X86Subtarget &Subtarget) {
28546 SDLoc dl(Op);
28547 auto *C = dyn_cast<ConstantSDNode>(ScaleOp);
28548 // Scale must be constant.
28549 if (!C)
28550 return SDValue();
28551 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
28552 SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl,
28553 TLI.getPointerTy(DAG.getDataLayout()));
28554 EVT MaskVT = Mask.getValueType().changeVectorElementTypeToInteger();
28555 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::Other);
28556 // If source is undef or we know it won't be used, use a zero vector
28557 // to break register dependency.
28558 // TODO: use undef instead and let BreakFalseDeps deal with it?
28559 if (Src.isUndef() || ISD::isBuildVectorAllOnes(Mask.getNode()))
28560 Src = getZeroVector(Op.getSimpleValueType(), Subtarget, DAG, dl);
28561
28562 // Cast mask to an integer type.
28563 Mask = DAG.getBitcast(MaskVT, Mask);
28564
28565 MemIntrinsicSDNode *MemIntr = cast<MemIntrinsicSDNode>(Op);
28566
28567 SDValue Ops[] = {Chain, Src, Mask, Base, Index, Scale };
28568 SDValue Res =
28569 DAG.getMemIntrinsicNode(X86ISD::MGATHER, dl, VTs, Ops,
28570 MemIntr->getMemoryVT(), MemIntr->getMemOperand());
28571 return DAG.getMergeValues({Res, Res.getValue(1)}, dl);
28572}
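// Illustrative sketch (not part of this file): an AVX2 gather intrinsic that
// ends up in getAVX2GatherNode(); the scale operand must be a compile-time
// constant (1, 2, 4 or 8 at the ISA level), matching the "Scale must be
// constant" check. Assuming <immintrin.h> and AVX2:
//
//   #include <immintrin.h>
//   __m256i gather(const int *base, __m256i idx) {
//     return _mm256_i32gather_epi32(base, idx, 4);  // scale = sizeof(int)
//   }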
28573
28574static SDValue getGatherNode(SDValue Op, SelectionDAG &DAG,
28575 SDValue Src, SDValue Mask, SDValue Base,
28576 SDValue Index, SDValue ScaleOp, SDValue Chain,
28577 const X86Subtarget &Subtarget) {
28578 MVT VT = Op.getSimpleValueType();
28579 SDLoc dl(Op);
28580 auto *C = dyn_cast<ConstantSDNode>(ScaleOp);
28581 // Scale must be constant.
28582 if (!C)
28583 return SDValue();
28584 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
28585 SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl,
28586 TLI.getPointerTy(DAG.getDataLayout()));
28587 unsigned MinElts = std::min(Index.getSimpleValueType().getVectorNumElements(),
28588 VT.getVectorNumElements());
28589 MVT MaskVT = MVT::getVectorVT(MVT::i1, MinElts);
28590
28591 // We support two versions of the gather intrinsics. One with scalar mask and
28592 // one with vXi1 mask. Convert scalar to vXi1 if necessary.
28593 if (Mask.getValueType() != MaskVT)
28594 Mask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
28595
28596 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::Other);
28597 // If source is undef or we know it won't be used, use a zero vector
28598 // to break register dependency.
28599 // TODO: use undef instead and let BreakFalseDeps deal with it?
28600 if (Src.isUndef() || ISD::isBuildVectorAllOnes(Mask.getNode()))
28601 Src = getZeroVector(Op.getSimpleValueType(), Subtarget, DAG, dl);
28602
28603 MemIntrinsicSDNode *MemIntr = cast<MemIntrinsicSDNode>(Op);
28604
28605 SDValue Ops[] = {Chain, Src, Mask, Base, Index, Scale };
28606 SDValue Res =
28607 DAG.getMemIntrinsicNode(X86ISD::MGATHER, dl, VTs, Ops,
28608 MemIntr->getMemoryVT(), MemIntr->getMemOperand());
28609 return DAG.getMergeValues({Res, Res.getValue(1)}, dl);
28610}
28611
28612static SDValue getScatterNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
28613 SDValue Src, SDValue Mask, SDValue Base,
28614 SDValue Index, SDValue ScaleOp, SDValue Chain,
28615 const X86Subtarget &Subtarget) {
28616 SDLoc dl(Op);
28617 auto *C = dyn_cast<ConstantSDNode>(ScaleOp);
28618 // Scale must be constant.
28619 if (!C)
28620 return SDValue();
28621 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
28622 SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl,
28623 TLI.getPointerTy(DAG.getDataLayout()));
28624 unsigned MinElts = std::min(Index.getSimpleValueType().getVectorNumElements(),
28625 Src.getSimpleValueType().getVectorNumElements());
28626 MVT MaskVT = MVT::getVectorVT(MVT::i1, MinElts);
28627
28628 // We support two versions of the scatter intrinsics. One with scalar mask and
28629 // one with vXi1 mask. Convert scalar to vXi1 if necessary.
28630 if (Mask.getValueType() != MaskVT)
28631 Mask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
28632
28633 MemIntrinsicSDNode *MemIntr = cast<MemIntrinsicSDNode>(Op);
28634
28635 SDVTList VTs = DAG.getVTList(MVT::Other);
28636 SDValue Ops[] = {Chain, Src, Mask, Base, Index, Scale};
28637 SDValue Res =
28638 DAG.getMemIntrinsicNode(X86ISD::MSCATTER, dl, VTs, Ops,
28639 MemIntr->getMemoryVT(), MemIntr->getMemOperand());
28640 return Res;
28641}
28642
28643static SDValue getPrefetchNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
28644 SDValue Mask, SDValue Base, SDValue Index,
28645 SDValue ScaleOp, SDValue Chain,
28646 const X86Subtarget &Subtarget) {
28647 SDLoc dl(Op);
28648 auto *C = dyn_cast<ConstantSDNode>(ScaleOp);
28649 // Scale must be constant.
28650 if (!C)
28651 return SDValue();
28652 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
28653 SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl,
28654 TLI.getPointerTy(DAG.getDataLayout()));
28655 SDValue Disp = DAG.getTargetConstant(0, dl, MVT::i32);
28656 SDValue Segment = DAG.getRegister(0, MVT::i32);
28657 MVT MaskVT =
28658 MVT::getVectorVT(MVT::i1, Index.getSimpleValueType().getVectorNumElements());
28659 SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
28660 SDValue Ops[] = {VMask, Base, Scale, Index, Disp, Segment, Chain};
28661 SDNode *Res = DAG.getMachineNode(Opc, dl, MVT::Other, Ops);
28662 return SDValue(Res, 0);
28663}
28664
28665/// Handles the lowering of builtin intrinsics with chain that return their
28666/// value into registers EDX:EAX.
28667/// If operand SrcReg is a valid register identifier, then operand 2 of N is
28668/// copied to SrcReg. The assumption is that SrcReg is an implicit input to
28669/// TargetOpcode.
28670/// Returns a Glue value which can be used to add an extra copy-from-reg if the
28671/// expanded intrinsic implicitly defines extra registers (i.e. not just
28672/// EDX:EAX).
28673static SDValue expandIntrinsicWChainHelper(SDNode *N, const SDLoc &DL,
28674 SelectionDAG &DAG,
28675 unsigned TargetOpcode,
28676 unsigned SrcReg,
28677 const X86Subtarget &Subtarget,
28678 SmallVectorImpl<SDValue> &Results) {
28679 SDValue Chain = N->getOperand(0);
28680 SDValue Glue;
28681
28682 if (SrcReg) {
28683     assert(N->getNumOperands() == 3 && "Unexpected number of operands!");
28684 Chain = DAG.getCopyToReg(Chain, DL, SrcReg, N->getOperand(2), Glue);
28685 Glue = Chain.getValue(1);
28686 }
28687
28688 SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
28689 SDValue N1Ops[] = {Chain, Glue};
28690 SDNode *N1 = DAG.getMachineNode(
28691 TargetOpcode, DL, Tys, ArrayRef<SDValue>(N1Ops, Glue.getNode() ? 2 : 1));
28692 Chain = SDValue(N1, 0);
28693
28694   // The expanded instruction returns its 64-bit result in registers EDX:EAX.
28695 SDValue LO, HI;
28696 if (Subtarget.is64Bit()) {
28697 LO = DAG.getCopyFromReg(Chain, DL, X86::RAX, MVT::i64, SDValue(N1, 1));
28698 HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::RDX, MVT::i64,
28699 LO.getValue(2));
28700 } else {
28701 LO = DAG.getCopyFromReg(Chain, DL, X86::EAX, MVT::i32, SDValue(N1, 1));
28702 HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::EDX, MVT::i32,
28703 LO.getValue(2));
28704 }
28705 Chain = HI.getValue(1);
28706 Glue = HI.getValue(2);
28707
28708 if (Subtarget.is64Bit()) {
28709 // Merge the two 32-bit values into a 64-bit one.
28710 SDValue Tmp = DAG.getNode(ISD::SHL, DL, MVT::i64, HI,
28711 DAG.getConstant(32, DL, MVT::i8));
28712 Results.push_back(DAG.getNode(ISD::OR, DL, MVT::i64, LO, Tmp));
28713 Results.push_back(Chain);
28714 return Glue;
28715 }
28716
28717 // Use a buildpair to merge the two 32-bit values into a 64-bit one.
28718 SDValue Ops[] = { LO, HI };
28719 SDValue Pair = DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Ops);
28720 Results.push_back(Pair);
28721 Results.push_back(Chain);
28722 return Glue;
28723}
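// Illustrative sketch (not part of this file): the EDX:EAX pair produced by
// the helper above is merged into one 64-bit value the same way a C caller
// would combine the two halves; a minimal model of that merge:
//
//   unsigned long long merge(unsigned int lo, unsigned int hi) {
//     return ((unsigned long long)hi << 32) | lo;  // mirrors the SHL/OR nodes
//   }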
28724
28725/// Handles the lowering of builtin intrinsics that read the time stamp counter
28726/// (x86_rdtsc and x86_rdtscp). This function is also used to custom lower
28727/// READCYCLECOUNTER nodes.
28728static void getReadTimeStampCounter(SDNode *N, const SDLoc &DL, unsigned Opcode,
28729 SelectionDAG &DAG,
28730 const X86Subtarget &Subtarget,
28731 SmallVectorImpl<SDValue> &Results) {
28732 // The processor's time-stamp counter (a 64-bit MSR) is stored into the
28733 // EDX:EAX registers. EDX is loaded with the high-order 32 bits of the MSR
28734 // and the EAX register is loaded with the low-order 32 bits.
28735 SDValue Glue = expandIntrinsicWChainHelper(N, DL, DAG, Opcode,
28736 /* NoRegister */0, Subtarget,
28737 Results);
28738 if (Opcode != X86::RDTSCP)
28739 return;
28740
28741 SDValue Chain = Results[1];
28742 // Instruction RDTSCP loads the IA32:TSC_AUX_MSR (address C000_0103H) into
28743 // the ECX register. Add 'ecx' explicitly to the chain.
28744 SDValue ecx = DAG.getCopyFromReg(Chain, DL, X86::ECX, MVT::i32, Glue);
28745 Results[1] = ecx;
28746 Results.push_back(ecx.getValue(1));
28747}
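// Illustrative sketch (not part of this file): the RDTSCP flavour additionally
// returns IA32_TSC_AUX through ECX, which is why the extra copy-from-reg is
// appended above; assuming <x86intrin.h>:
//
//   #include <x86intrin.h>
//   unsigned long long read_tsc(unsigned int *aux) {
//     return __rdtscp(aux);  // counter in the return value, TSC_AUX in *aux
//   }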
28748
28749static SDValue LowerREADCYCLECOUNTER(SDValue Op, const X86Subtarget &Subtarget,
28750 SelectionDAG &DAG) {
28751 SmallVector<SDValue, 3> Results;
28752 SDLoc DL(Op);
28753 getReadTimeStampCounter(Op.getNode(), DL, X86::RDTSC, DAG, Subtarget,
28754 Results);
28755 return DAG.getMergeValues(Results, DL);
28756}
28757
28758static SDValue MarkEHRegistrationNode(SDValue Op, SelectionDAG &DAG) {
28759 MachineFunction &MF = DAG.getMachineFunction();
28760 SDValue Chain = Op.getOperand(0);
28761 SDValue RegNode = Op.getOperand(2);
28762 WinEHFuncInfo *EHInfo = MF.getWinEHFuncInfo();
28763 if (!EHInfo)
28764 report_fatal_error("EH registrations only live in functions using WinEH");
28765
28766 // Cast the operand to an alloca, and remember the frame index.
28767 auto *FINode = dyn_cast<FrameIndexSDNode>(RegNode);
28768 if (!FINode)
28769 report_fatal_error("llvm.x86.seh.ehregnode expects a static alloca");
28770 EHInfo->EHRegNodeFrameIndex = FINode->getIndex();
28771
28772 // Return the chain operand without making any DAG nodes.
28773 return Chain;
28774}
28775
28776static SDValue MarkEHGuard(SDValue Op, SelectionDAG &DAG) {
28777 MachineFunction &MF = DAG.getMachineFunction();
28778 SDValue Chain = Op.getOperand(0);
28779 SDValue EHGuard = Op.getOperand(2);
28780 WinEHFuncInfo *EHInfo = MF.getWinEHFuncInfo();
28781 if (!EHInfo)
28782 report_fatal_error("EHGuard only live in functions using WinEH");
28783
28784 // Cast the operand to an alloca, and remember the frame index.
28785 auto *FINode = dyn_cast<FrameIndexSDNode>(EHGuard);
28786 if (!FINode)
28787 report_fatal_error("llvm.x86.seh.ehguard expects a static alloca");
28788 EHInfo->EHGuardFrameIndex = FINode->getIndex();
28789
28790 // Return the chain operand without making any DAG nodes.
28791 return Chain;
28792}
28793
28794/// Emit Truncating Store with signed or unsigned saturation.
28795static SDValue
28796EmitTruncSStore(bool SignedSat, SDValue Chain, const SDLoc &Dl, SDValue Val,
28797 SDValue Ptr, EVT MemVT, MachineMemOperand *MMO,
28798 SelectionDAG &DAG) {
28799 SDVTList VTs = DAG.getVTList(MVT::Other);
28800 SDValue Undef = DAG.getUNDEF(Ptr.getValueType());
28801 SDValue Ops[] = { Chain, Val, Ptr, Undef };
28802 unsigned Opc = SignedSat ? X86ISD::VTRUNCSTORES : X86ISD::VTRUNCSTOREUS;
28803 return DAG.getMemIntrinsicNode(Opc, Dl, VTs, Ops, MemVT, MMO);
28804}
28805
28806/// Emit Masked Truncating Store with signed or unsigned saturation.
28807static SDValue
28808EmitMaskedTruncSStore(bool SignedSat, SDValue Chain, const SDLoc &Dl,
28809 SDValue Val, SDValue Ptr, SDValue Mask, EVT MemVT,
28810 MachineMemOperand *MMO, SelectionDAG &DAG) {
28811 SDVTList VTs = DAG.getVTList(MVT::Other);
28812 SDValue Ops[] = { Chain, Val, Ptr, Mask };
28813 unsigned Opc = SignedSat ? X86ISD::VMTRUNCSTORES : X86ISD::VMTRUNCSTOREUS;
28814 return DAG.getMemIntrinsicNode(Opc, Dl, VTs, Ops, MemVT, MMO);
28815}
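// Illustrative sketch (not part of this file): one source of these saturating
// truncating stores is the AVX-512 down-converting store intrinsics; a
// possible user-level example, assuming <immintrin.h> and AVX512F:
//
//   #include <immintrin.h>
//   void store_sat(void *p, __mmask16 k, __m512i v) {
//     _mm512_mask_cvtsepi32_storeu_epi8(p, k, v);  // signed-saturating i32->i8
//   }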
28816
28817static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget &Subtarget,
28818 SelectionDAG &DAG) {
28819 unsigned IntNo = Op.getConstantOperandVal(1);
28820 const IntrinsicData *IntrData = getIntrinsicWithChain(IntNo);
28821 if (!IntrData) {
28822 switch (IntNo) {
28823
28824 case Intrinsic::swift_async_context_addr: {
28825 SDLoc dl(Op);
28826 auto &MF = DAG.getMachineFunction();
28827 auto X86FI = MF.getInfo<X86MachineFunctionInfo>();
28828 if (Subtarget.is64Bit()) {
28829 MF.getFrameInfo().setFrameAddressIsTaken(true);
28830 X86FI->setHasSwiftAsyncContext(true);
28831 SDValue Chain = Op->getOperand(0);
28832 SDValue CopyRBP = DAG.getCopyFromReg(Chain, dl, X86::RBP, MVT::i64);
28833 SDValue Result =
28834 SDValue(DAG.getMachineNode(X86::SUB64ri8, dl, MVT::i64, CopyRBP,
28835 DAG.getTargetConstant(8, dl, MVT::i32)),
28836 0);
28837 // Return { result, chain }.
28838 return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), Result,
28839 CopyRBP.getValue(1));
28840 } else {
28841 // 32-bit so no special extended frame, create or reuse an existing
28842 // stack slot.
28843 if (!X86FI->getSwiftAsyncContextFrameIdx())
28844 X86FI->setSwiftAsyncContextFrameIdx(
28845 MF.getFrameInfo().CreateStackObject(4, Align(4), false));
28846 SDValue Result =
28847 DAG.getFrameIndex(*X86FI->getSwiftAsyncContextFrameIdx(), MVT::i32);
28848 // Return { result, chain }.
28849 return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), Result,
28850 Op->getOperand(0));
28851 }
28852 }
28853
28854 case llvm::Intrinsic::x86_seh_ehregnode:
28855 return MarkEHRegistrationNode(Op, DAG);
28856 case llvm::Intrinsic::x86_seh_ehguard:
28857 return MarkEHGuard(Op, DAG);
28858 case llvm::Intrinsic::x86_rdpkru: {
28859 SDLoc dl(Op);
28860 SDVTList VTs = DAG.getVTList(MVT::i32, MVT::Other);
28861 // Create a RDPKRU node and pass 0 to the ECX parameter.
28862 return DAG.getNode(X86ISD::RDPKRU, dl, VTs, Op.getOperand(0),
28863 DAG.getConstant(0, dl, MVT::i32));
28864 }
28865 case llvm::Intrinsic::x86_wrpkru: {
28866 SDLoc dl(Op);
28867 // Create a WRPKRU node, pass the input to the EAX parameter, and pass 0
28868 // to the EDX and ECX parameters.
28869 return DAG.getNode(X86ISD::WRPKRU, dl, MVT::Other,
28870 Op.getOperand(0), Op.getOperand(2),
28871 DAG.getConstant(0, dl, MVT::i32),
28872 DAG.getConstant(0, dl, MVT::i32));
28873 }
28874 case llvm::Intrinsic::asan_check_memaccess: {
28875 // Mark this as adjustsStack because it will be lowered to a call.
28876 DAG.getMachineFunction().getFrameInfo().setAdjustsStack(true);
28877 // Don't do anything here, we will expand these intrinsics out later.
28878 return Op;
28879 }
28880 case llvm::Intrinsic::x86_flags_read_u32:
28881 case llvm::Intrinsic::x86_flags_read_u64:
28882 case llvm::Intrinsic::x86_flags_write_u32:
28883 case llvm::Intrinsic::x86_flags_write_u64: {
28884 // We need a frame pointer because this will get lowered to a PUSH/POP
28885 // sequence.
28886 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
28887 MFI.setHasCopyImplyingStackAdjustment(true);
28888 // Don't do anything here, we will expand these intrinsics out later
28889 // during FinalizeISel in EmitInstrWithCustomInserter.
28890 return Op;
28891 }
28892 case Intrinsic::x86_lwpins32:
28893 case Intrinsic::x86_lwpins64:
28894 case Intrinsic::x86_umwait:
28895 case Intrinsic::x86_tpause: {
28896 SDLoc dl(Op);
28897 SDValue Chain = Op->getOperand(0);
28898 SDVTList VTs = DAG.getVTList(MVT::i32, MVT::Other);
28899 unsigned Opcode;
28900
28901 switch (IntNo) {
28902       default: llvm_unreachable("Impossible intrinsic");
28903 case Intrinsic::x86_umwait:
28904 Opcode = X86ISD::UMWAIT;
28905 break;
28906 case Intrinsic::x86_tpause:
28907 Opcode = X86ISD::TPAUSE;
28908 break;
28909 case Intrinsic::x86_lwpins32:
28910 case Intrinsic::x86_lwpins64:
28911 Opcode = X86ISD::LWPINS;
28912 break;
28913 }
28914
28915 SDValue Operation =
28916 DAG.getNode(Opcode, dl, VTs, Chain, Op->getOperand(2),
28917 Op->getOperand(3), Op->getOperand(4));
28918 SDValue SetCC = getSETCC(X86::COND_B, Operation.getValue(0), dl, DAG);
28919 return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), SetCC,
28920 Operation.getValue(1));
28921 }
28922 case Intrinsic::x86_enqcmd:
28923 case Intrinsic::x86_enqcmds: {
28924 SDLoc dl(Op);
28925 SDValue Chain = Op.getOperand(0);
28926 SDVTList VTs = DAG.getVTList(MVT::i32, MVT::Other);
28927 unsigned Opcode;
28928 switch (IntNo) {
28929       default: llvm_unreachable("Impossible intrinsic!");
28930 case Intrinsic::x86_enqcmd:
28931 Opcode = X86ISD::ENQCMD;
28932 break;
28933 case Intrinsic::x86_enqcmds:
28934 Opcode = X86ISD::ENQCMDS;
28935 break;
28936 }
28937 SDValue Operation = DAG.getNode(Opcode, dl, VTs, Chain, Op.getOperand(2),
28938 Op.getOperand(3));
28939 SDValue SetCC = getSETCC(X86::COND_E, Operation.getValue(0), dl, DAG);
28940 return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), SetCC,
28941 Operation.getValue(1));
28942 }
28943 case Intrinsic::x86_aesenc128kl:
28944 case Intrinsic::x86_aesdec128kl:
28945 case Intrinsic::x86_aesenc256kl:
28946 case Intrinsic::x86_aesdec256kl: {
28947 SDLoc DL(Op);
28948 SDVTList VTs = DAG.getVTList(MVT::v2i64, MVT::i32, MVT::Other);
28949 SDValue Chain = Op.getOperand(0);
28950 unsigned Opcode;
28951
28952 switch (IntNo) {
28953       default: llvm_unreachable("Impossible intrinsic");
28954 case Intrinsic::x86_aesenc128kl:
28955 Opcode = X86ISD::AESENC128KL;
28956 break;
28957 case Intrinsic::x86_aesdec128kl:
28958 Opcode = X86ISD::AESDEC128KL;
28959 break;
28960 case Intrinsic::x86_aesenc256kl:
28961 Opcode = X86ISD::AESENC256KL;
28962 break;
28963 case Intrinsic::x86_aesdec256kl:
28964 Opcode = X86ISD::AESDEC256KL;
28965 break;
28966 }
28967
28968 MemIntrinsicSDNode *MemIntr = cast<MemIntrinsicSDNode>(Op);
28969 MachineMemOperand *MMO = MemIntr->getMemOperand();
28970 EVT MemVT = MemIntr->getMemoryVT();
28971 SDValue Operation = DAG.getMemIntrinsicNode(
28972 Opcode, DL, VTs, {Chain, Op.getOperand(2), Op.getOperand(3)}, MemVT,
28973 MMO);
28974 SDValue ZF = getSETCC(X86::COND_E, Operation.getValue(1), DL, DAG);
28975
28976 return DAG.getNode(ISD::MERGE_VALUES, DL, Op->getVTList(),
28977 {ZF, Operation.getValue(0), Operation.getValue(2)});
28978 }
28979 case Intrinsic::x86_aesencwide128kl:
28980 case Intrinsic::x86_aesdecwide128kl:
28981 case Intrinsic::x86_aesencwide256kl:
28982 case Intrinsic::x86_aesdecwide256kl: {
28983 SDLoc DL(Op);
28984 SDVTList VTs = DAG.getVTList(
28985 {MVT::i32, MVT::v2i64, MVT::v2i64, MVT::v2i64, MVT::v2i64, MVT::v2i64,
28986 MVT::v2i64, MVT::v2i64, MVT::v2i64, MVT::Other});
28987 SDValue Chain = Op.getOperand(0);
28988 unsigned Opcode;
28989
28990 switch (IntNo) {
28991       default: llvm_unreachable("Impossible intrinsic");
28992 case Intrinsic::x86_aesencwide128kl:
28993 Opcode = X86ISD::AESENCWIDE128KL;
28994 break;
28995 case Intrinsic::x86_aesdecwide128kl:
28996 Opcode = X86ISD::AESDECWIDE128KL;
28997 break;
28998 case Intrinsic::x86_aesencwide256kl:
28999 Opcode = X86ISD::AESENCWIDE256KL;
29000 break;
29001 case Intrinsic::x86_aesdecwide256kl:
29002 Opcode = X86ISD::AESDECWIDE256KL;
29003 break;
29004 }
29005
29006 MemIntrinsicSDNode *MemIntr = cast<MemIntrinsicSDNode>(Op);
29007 MachineMemOperand *MMO = MemIntr->getMemOperand();
29008 EVT MemVT = MemIntr->getMemoryVT();
29009 SDValue Operation = DAG.getMemIntrinsicNode(
29010 Opcode, DL, VTs,
29011 {Chain, Op.getOperand(2), Op.getOperand(3), Op.getOperand(4),
29012 Op.getOperand(5), Op.getOperand(6), Op.getOperand(7),
29013 Op.getOperand(8), Op.getOperand(9), Op.getOperand(10)},
29014 MemVT, MMO);
29015 SDValue ZF = getSETCC(X86::COND_E, Operation.getValue(0), DL, DAG);
29016
29017 return DAG.getNode(ISD::MERGE_VALUES, DL, Op->getVTList(),
29018 {ZF, Operation.getValue(1), Operation.getValue(2),
29019 Operation.getValue(3), Operation.getValue(4),
29020 Operation.getValue(5), Operation.getValue(6),
29021 Operation.getValue(7), Operation.getValue(8),
29022 Operation.getValue(9)});
29023 }
29024 case Intrinsic::x86_testui: {
29025 SDLoc dl(Op);
29026 SDValue Chain = Op.getOperand(0);
29027 SDVTList VTs = DAG.getVTList(MVT::i32, MVT::Other);
29028 SDValue Operation = DAG.getNode(X86ISD::TESTUI, dl, VTs, Chain);
29029 SDValue SetCC = getSETCC(X86::COND_B, Operation.getValue(0), dl, DAG);
29030 return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), SetCC,
29031 Operation.getValue(1));
29032 }
29033 case Intrinsic::x86_atomic_bts_rm:
29034 case Intrinsic::x86_atomic_btc_rm:
29035 case Intrinsic::x86_atomic_btr_rm: {
29036 SDLoc DL(Op);
29037 MVT VT = Op.getSimpleValueType();
29038 SDValue Chain = Op.getOperand(0);
29039 SDValue Op1 = Op.getOperand(2);
29040 SDValue Op2 = Op.getOperand(3);
29041 unsigned Opc = IntNo == Intrinsic::x86_atomic_bts_rm ? X86ISD::LBTS_RM
29042 : IntNo == Intrinsic::x86_atomic_btc_rm ? X86ISD::LBTC_RM
29043 : X86ISD::LBTR_RM;
29044 MachineMemOperand *MMO = cast<MemIntrinsicSDNode>(Op)->getMemOperand();
29045 SDValue Res =
29046 DAG.getMemIntrinsicNode(Opc, DL, DAG.getVTList(MVT::i32, MVT::Other),
29047 {Chain, Op1, Op2}, VT, MMO);
29048 Chain = Res.getValue(1);
29049 Res = DAG.getZExtOrTrunc(getSETCC(X86::COND_B, Res, DL, DAG), DL, VT);
29050 return DAG.getNode(ISD::MERGE_VALUES, DL, Op->getVTList(), Res, Chain);
29051 }
29052 case Intrinsic::x86_atomic_bts:
29053 case Intrinsic::x86_atomic_btc:
29054 case Intrinsic::x86_atomic_btr: {
29055 SDLoc DL(Op);
29056 MVT VT = Op.getSimpleValueType();
29057 SDValue Chain = Op.getOperand(0);
29058 SDValue Op1 = Op.getOperand(2);
29059 SDValue Op2 = Op.getOperand(3);
29060 unsigned Opc = IntNo == Intrinsic::x86_atomic_bts ? X86ISD::LBTS
29061 : IntNo == Intrinsic::x86_atomic_btc ? X86ISD::LBTC
29062 : X86ISD::LBTR;
29063 SDValue Size = DAG.getConstant(VT.getScalarSizeInBits(), DL, MVT::i32);
29064 MachineMemOperand *MMO = cast<MemIntrinsicSDNode>(Op)->getMemOperand();
29065 SDValue Res =
29066 DAG.getMemIntrinsicNode(Opc, DL, DAG.getVTList(MVT::i32, MVT::Other),
29067 {Chain, Op1, Op2, Size}, VT, MMO);
29068 Chain = Res.getValue(1);
29069 Res = DAG.getZExtOrTrunc(getSETCC(X86::COND_B, Res, DL, DAG), DL, VT);
29070 unsigned Imm = cast<ConstantSDNode>(Op2)->getZExtValue();
29071 if (Imm)
29072 Res = DAG.getNode(ISD::SHL, DL, VT, Res,
29073 DAG.getShiftAmountConstant(Imm, VT, DL));
29074 return DAG.getNode(ISD::MERGE_VALUES, DL, Op->getVTList(), Res, Chain);
29075 }
29076 case Intrinsic::x86_cmpccxadd32:
29077 case Intrinsic::x86_cmpccxadd64: {
29078 SDLoc DL(Op);
29079 SDValue Chain = Op.getOperand(0);
29080 SDValue Addr = Op.getOperand(2);
29081 SDValue Src1 = Op.getOperand(3);
29082 SDValue Src2 = Op.getOperand(4);
29083 SDValue CC = Op.getOperand(5);
29084 MachineMemOperand *MMO = cast<MemIntrinsicSDNode>(Op)->getMemOperand();
29085 SDValue Operation = DAG.getMemIntrinsicNode(
29086 X86ISD::CMPCCXADD, DL, Op->getVTList(), {Chain, Addr, Src1, Src2, CC},
29087 MVT::i32, MMO);
29088 return Operation;
29089 }
29090 case Intrinsic::x86_aadd32:
29091 case Intrinsic::x86_aadd64:
29092 case Intrinsic::x86_aand32:
29093 case Intrinsic::x86_aand64:
29094 case Intrinsic::x86_aor32:
29095 case Intrinsic::x86_aor64:
29096 case Intrinsic::x86_axor32:
29097 case Intrinsic::x86_axor64: {
29098 SDLoc DL(Op);
29099 SDValue Chain = Op.getOperand(0);
29100 SDValue Op1 = Op.getOperand(2);
29101 SDValue Op2 = Op.getOperand(3);
29102 MVT VT = Op2.getSimpleValueType();
29103 unsigned Opc = 0;
29104 switch (IntNo) {
29105 default:
29106         llvm_unreachable("Unknown Intrinsic");
29107 case Intrinsic::x86_aadd32:
29108 case Intrinsic::x86_aadd64:
29109 Opc = X86ISD::AADD;
29110 break;
29111 case Intrinsic::x86_aand32:
29112 case Intrinsic::x86_aand64:
29113 Opc = X86ISD::AAND;
29114 break;
29115 case Intrinsic::x86_aor32:
29116 case Intrinsic::x86_aor64:
29117 Opc = X86ISD::AOR;
29118 break;
29119 case Intrinsic::x86_axor32:
29120 case Intrinsic::x86_axor64:
29121 Opc = X86ISD::AXOR;
29122 break;
29123 }
29124 MachineMemOperand *MMO = cast<MemSDNode>(Op)->getMemOperand();
29125 return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(),
29126 {Chain, Op1, Op2}, VT, MMO);
29127 }
29128 case Intrinsic::x86_atomic_add_cc:
29129 case Intrinsic::x86_atomic_sub_cc:
29130 case Intrinsic::x86_atomic_or_cc:
29131 case Intrinsic::x86_atomic_and_cc:
29132 case Intrinsic::x86_atomic_xor_cc: {
29133 SDLoc DL(Op);
29134 SDValue Chain = Op.getOperand(0);
29135 SDValue Op1 = Op.getOperand(2);
29136 SDValue Op2 = Op.getOperand(3);
29137 X86::CondCode CC = (X86::CondCode)Op.getConstantOperandVal(4);
29138 MVT VT = Op2.getSimpleValueType();
29139 unsigned Opc = 0;
29140 switch (IntNo) {
29141 default:
29142         llvm_unreachable("Unknown Intrinsic");
29143 case Intrinsic::x86_atomic_add_cc:
29144 Opc = X86ISD::LADD;
29145 break;
29146 case Intrinsic::x86_atomic_sub_cc:
29147 Opc = X86ISD::LSUB;
29148 break;
29149 case Intrinsic::x86_atomic_or_cc:
29150 Opc = X86ISD::LOR;
29151 break;
29152 case Intrinsic::x86_atomic_and_cc:
29153 Opc = X86ISD::LAND;
29154 break;
29155 case Intrinsic::x86_atomic_xor_cc:
29156 Opc = X86ISD::LXOR;
29157 break;
29158 }
29159 MachineMemOperand *MMO = cast<MemIntrinsicSDNode>(Op)->getMemOperand();
29160 SDValue LockArith =
29161 DAG.getMemIntrinsicNode(Opc, DL, DAG.getVTList(MVT::i32, MVT::Other),
29162 {Chain, Op1, Op2}, VT, MMO);
29163 Chain = LockArith.getValue(1);
29164 return DAG.getMergeValues({getSETCC(CC, LockArith, DL, DAG), Chain}, DL);
29165 }
29166 }
29167 return SDValue();
29168 }
29169
29170 SDLoc dl(Op);
29171 switch(IntrData->Type) {
29172   default: llvm_unreachable("Unknown Intrinsic Type");
29173 case RDSEED:
29174 case RDRAND: {
29175 // Emit the node with the right value type.
29176 SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::i32, MVT::Other);
29177 SDValue Result = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(0));
29178
29179 // If the value returned by RDRAND/RDSEED was valid (CF=1), return 1.
29180     // Otherwise return the hardware result, which is always 0 in that case, cast to i32.
29181 SDValue Ops[] = {DAG.getZExtOrTrunc(Result, dl, Op->getValueType(1)),
29182 DAG.getConstant(1, dl, Op->getValueType(1)),
29183 DAG.getTargetConstant(X86::COND_B, dl, MVT::i8),
29184 SDValue(Result.getNode(), 1)};
29185 SDValue isValid = DAG.getNode(X86ISD::CMOV, dl, Op->getValueType(1), Ops);
29186
29187 // Return { result, isValid, chain }.
29188 return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), Result, isValid,
29189 SDValue(Result.getNode(), 2));
29190 }
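// Illustrative sketch (not part of this file): the RDRAND/RDSEED step
// intrinsics expose the CF-based validity bit computed above; assuming
// <immintrin.h> and RDRND support:
//
//   #include <immintrin.h>
//   int get_random(unsigned int *out) {
//     return _rdrand32_step(out);  // 1 if the hardware produced a value
//   }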
29191 case GATHER_AVX2: {
29192 SDValue Chain = Op.getOperand(0);
29193 SDValue Src = Op.getOperand(2);
29194 SDValue Base = Op.getOperand(3);
29195 SDValue Index = Op.getOperand(4);
29196 SDValue Mask = Op.getOperand(5);
29197 SDValue Scale = Op.getOperand(6);
29198 return getAVX2GatherNode(IntrData->Opc0, Op, DAG, Src, Mask, Base, Index,
29199 Scale, Chain, Subtarget);
29200 }
29201 case GATHER: {
29202 //gather(v1, mask, index, base, scale);
29203 SDValue Chain = Op.getOperand(0);
29204 SDValue Src = Op.getOperand(2);
29205 SDValue Base = Op.getOperand(3);
29206 SDValue Index = Op.getOperand(4);
29207 SDValue Mask = Op.getOperand(5);
29208 SDValue Scale = Op.getOperand(6);
29209 return getGatherNode(Op, DAG, Src, Mask, Base, Index, Scale,
29210 Chain, Subtarget);
29211 }
29212 case SCATTER: {
29213 //scatter(base, mask, index, v1, scale);
29214 SDValue Chain = Op.getOperand(0);
29215 SDValue Base = Op.getOperand(2);
29216 SDValue Mask = Op.getOperand(3);
29217 SDValue Index = Op.getOperand(4);
29218 SDValue Src = Op.getOperand(5);
29219 SDValue Scale = Op.getOperand(6);
29220 return getScatterNode(IntrData->Opc0, Op, DAG, Src, Mask, Base, Index,
29221 Scale, Chain, Subtarget);
29222 }
29223 case PREFETCH: {
29224 const APInt &HintVal = Op.getConstantOperandAPInt(6);
29225     assert((HintVal == 2 || HintVal == 3) &&
29226            "Wrong prefetch hint in intrinsic: should be 2 or 3");
29227 unsigned Opcode = (HintVal == 2 ? IntrData->Opc1 : IntrData->Opc0);
29228 SDValue Chain = Op.getOperand(0);
29229 SDValue Mask = Op.getOperand(2);
29230 SDValue Index = Op.getOperand(3);
29231 SDValue Base = Op.getOperand(4);
29232 SDValue Scale = Op.getOperand(5);
29233 return getPrefetchNode(Opcode, Op, DAG, Mask, Base, Index, Scale, Chain,
29234 Subtarget);
29235 }
29236 // Read Time Stamp Counter (RDTSC) and Processor ID (RDTSCP).
29237 case RDTSC: {
29238 SmallVector<SDValue, 2> Results;
29239 getReadTimeStampCounter(Op.getNode(), dl, IntrData->Opc0, DAG, Subtarget,
29240 Results);
29241 return DAG.getMergeValues(Results, dl);
29242 }
29243 // Read Performance Monitoring Counters.
29244 case RDPMC:
29245 // Read Processor Register.
29246 case RDPRU:
29247   // Get Extended Control Register.
29248 case XGETBV: {
29249 SmallVector<SDValue, 2> Results;
29250
29251 // RDPMC uses ECX to select the index of the performance counter to read.
29252 // RDPRU uses ECX to select the processor register to read.
29253 // XGETBV uses ECX to select the index of the XCR register to return.
29254 // The result is stored into registers EDX:EAX.
29255 expandIntrinsicWChainHelper(Op.getNode(), dl, DAG, IntrData->Opc0, X86::ECX,
29256 Subtarget, Results);
29257 return DAG.getMergeValues(Results, dl);
29258 }
29259 // XTEST intrinsics.
29260 case XTEST: {
29261 SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::Other);
29262 SDValue InTrans = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(0));
29263
29264 SDValue SetCC = getSETCC(X86::COND_NE, InTrans, dl, DAG);
29265 SDValue Ret = DAG.getNode(ISD::ZERO_EXTEND, dl, Op->getValueType(0), SetCC);
29266 return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(),
29267 Ret, SDValue(InTrans.getNode(), 1));
29268 }
29269 case TRUNCATE_TO_MEM_VI8:
29270 case TRUNCATE_TO_MEM_VI16:
29271 case TRUNCATE_TO_MEM_VI32: {
29272 SDValue Mask = Op.getOperand(4);
29273 SDValue DataToTruncate = Op.getOperand(3);
29274 SDValue Addr = Op.getOperand(2);
29275 SDValue Chain = Op.getOperand(0);
29276
29277 MemIntrinsicSDNode *MemIntr = dyn_cast<MemIntrinsicSDNode>(Op);
29278     assert(MemIntr && "Expected MemIntrinsicSDNode!");
29279
29280 EVT MemVT = MemIntr->getMemoryVT();
29281
29282 uint16_t TruncationOp = IntrData->Opc0;
29283 switch (TruncationOp) {
29284 case X86ISD::VTRUNC: {
29285 if (isAllOnesConstant(Mask)) // return just a truncate store
29286 return DAG.getTruncStore(Chain, dl, DataToTruncate, Addr, MemVT,
29287 MemIntr->getMemOperand());
29288
29289 MVT MaskVT = MVT::getVectorVT(MVT::i1, MemVT.getVectorNumElements());
29290 SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
29291 SDValue Offset = DAG.getUNDEF(VMask.getValueType());
29292
29293 return DAG.getMaskedStore(Chain, dl, DataToTruncate, Addr, Offset, VMask,
29294 MemVT, MemIntr->getMemOperand(), ISD::UNINDEXED,
29295 true /* truncating */);
29296 }
29297 case X86ISD::VTRUNCUS:
29298 case X86ISD::VTRUNCS: {
29299 bool IsSigned = (TruncationOp == X86ISD::VTRUNCS);
29300 if (isAllOnesConstant(Mask))
29301 return EmitTruncSStore(IsSigned, Chain, dl, DataToTruncate, Addr, MemVT,
29302 MemIntr->getMemOperand(), DAG);
29303
29304 MVT MaskVT = MVT::getVectorVT(MVT::i1, MemVT.getVectorNumElements());
29305 SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
29306
29307 return EmitMaskedTruncSStore(IsSigned, Chain, dl, DataToTruncate, Addr,
29308 VMask, MemVT, MemIntr->getMemOperand(), DAG);
29309 }
29310 default:
29311       llvm_unreachable("Unsupported truncstore intrinsic");
29312 }
29313 }
29314 }
29315}
29316
29317SDValue X86TargetLowering::LowerRETURNADDR(SDValue Op,
29318 SelectionDAG &DAG) const {
29319 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
29320 MFI.setReturnAddressIsTaken(true);
29321
29322 if (verifyReturnAddressArgumentIsConstant(Op, DAG))
29323 return SDValue();
29324
29325 unsigned Depth = Op.getConstantOperandVal(0);
29326 SDLoc dl(Op);
29327 EVT PtrVT = getPointerTy(DAG.getDataLayout());
29328
29329 if (Depth > 0) {
29330 SDValue FrameAddr = LowerFRAMEADDR(Op, DAG);
29331 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
29332 SDValue Offset = DAG.getConstant(RegInfo->getSlotSize(), dl, PtrVT);
29333 return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(),
29334 DAG.getNode(ISD::ADD, dl, PtrVT, FrameAddr, Offset),
29335 MachinePointerInfo());
29336 }
29337
29338 // Just load the return address.
29339 SDValue RetAddrFI = getReturnAddressFrameIndex(DAG);
29340 return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), RetAddrFI,
29341 MachinePointerInfo());
29342}
29343
29344SDValue X86TargetLowering::LowerADDROFRETURNADDR(SDValue Op,
29345 SelectionDAG &DAG) const {
29346 DAG.getMachineFunction().getFrameInfo().setReturnAddressIsTaken(true);
29347 return getReturnAddressFrameIndex(DAG);
29348}
29349
29350SDValue X86TargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const {
29351 MachineFunction &MF = DAG.getMachineFunction();
29352 MachineFrameInfo &MFI = MF.getFrameInfo();
29353 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
29354 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
29355 EVT VT = Op.getValueType();
29356
29357 MFI.setFrameAddressIsTaken(true);
29358
29359 if (MF.getTarget().getMCAsmInfo()->usesWindowsCFI()) {
29360 // Depth > 0 makes no sense on targets which use Windows unwind codes. It
29361 // is not possible to crawl up the stack without looking at the unwind codes
29362 // simultaneously.
29363 int FrameAddrIndex = FuncInfo->getFAIndex();
29364 if (!FrameAddrIndex) {
29365 // Set up a frame object for the return address.
29366 unsigned SlotSize = RegInfo->getSlotSize();
29367 FrameAddrIndex = MF.getFrameInfo().CreateFixedObject(
29368 SlotSize, /*SPOffset=*/0, /*IsImmutable=*/false);
29369 FuncInfo->setFAIndex(FrameAddrIndex);
29370 }
29371 return DAG.getFrameIndex(FrameAddrIndex, VT);
29372 }
29373
29374 unsigned FrameReg =
29375 RegInfo->getPtrSizedFrameRegister(DAG.getMachineFunction());
29376 SDLoc dl(Op); // FIXME probably not meaningful
29377 unsigned Depth = Op.getConstantOperandVal(0);
29378 assert(((FrameReg == X86::RBP && VT == MVT::i64) ||
29379 (FrameReg == X86::EBP && VT == MVT::i32)) &&
29380 "Invalid Frame Register!");
29381 SDValue FrameAddr = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, VT);
29382 while (Depth--)
29383 FrameAddr = DAG.getLoad(VT, dl, DAG.getEntryNode(), FrameAddr,
29384 MachinePointerInfo());
29385 return FrameAddr;
29386}
29387
29388// FIXME? Maybe this could be a TableGen attribute on some registers and
29389// this table could be generated automatically from RegInfo.
29390Register X86TargetLowering::getRegisterByName(const char* RegName, LLT VT,
29391 const MachineFunction &MF) const {
29392 const TargetFrameLowering &TFI = *Subtarget.getFrameLowering();
29393
29394 Register Reg = StringSwitch<unsigned>(RegName)
29395 .Case("esp", X86::ESP)
29396 .Case("rsp", X86::RSP)
29397 .Case("ebp", X86::EBP)
29398 .Case("rbp", X86::RBP)
29399 .Default(0);
29400
29401 if (Reg == X86::EBP || Reg == X86::RBP) {
29402 if (!TFI.hasFP(MF))
29403 report_fatal_error("register " + StringRef(RegName) +
29404 " is allocatable: function has no frame pointer");
29405#ifndef NDEBUG
29406 else {
29407 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
29408 Register FrameReg = RegInfo->getPtrSizedFrameRegister(MF);
29409 assert((FrameReg == X86::EBP || FrameReg == X86::RBP) &&
29410 "Invalid Frame Register!");
29411 }
29412#endif
29413 }
29414
29415 if (Reg)
29416 return Reg;
29417
29418 report_fatal_error("Invalid register name global variable");
29419}
29420
29421SDValue X86TargetLowering::LowerFRAME_TO_ARGS_OFFSET(SDValue Op,
29422 SelectionDAG &DAG) const {
29423 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
29424 return DAG.getIntPtrConstant(2 * RegInfo->getSlotSize(), SDLoc(Op));
29425}
29426
29427Register X86TargetLowering::getExceptionPointerRegister(
29428 const Constant *PersonalityFn) const {
29429 if (classifyEHPersonality(PersonalityFn) == EHPersonality::CoreCLR)
29430 return Subtarget.isTarget64BitLP64() ? X86::RDX : X86::EDX;
29431
29432 return Subtarget.isTarget64BitLP64() ? X86::RAX : X86::EAX;
29433}
29434
29435Register X86TargetLowering::getExceptionSelectorRegister(
29436 const Constant *PersonalityFn) const {
29437 // Funclet personalities don't use selectors (the runtime does the selection).
29438 if (isFuncletEHPersonality(classifyEHPersonality(PersonalityFn)))
29439 return X86::NoRegister;
29440 return Subtarget.isTarget64BitLP64() ? X86::RDX : X86::EDX;
29441}
29442
29443bool X86TargetLowering::needsFixedCatchObjects() const {
29444 return Subtarget.isTargetWin64();
29445}
29446
29447SDValue X86TargetLowering::LowerEH_RETURN(SDValue Op, SelectionDAG &DAG) const {
29448 SDValue Chain = Op.getOperand(0);
29449 SDValue Offset = Op.getOperand(1);
29450 SDValue Handler = Op.getOperand(2);
29451 SDLoc dl (Op);
29452
29453 EVT PtrVT = getPointerTy(DAG.getDataLayout());
29454 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
29455 Register FrameReg = RegInfo->getFrameRegister(DAG.getMachineFunction());
29456 assert(((FrameReg == X86::RBP && PtrVT == MVT::i64) ||
29457 (FrameReg == X86::EBP && PtrVT == MVT::i32)) &&
29458 "Invalid Frame Register!");
29459 SDValue Frame = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, PtrVT);
29460 Register StoreAddrReg = (PtrVT == MVT::i64) ? X86::RCX : X86::ECX;
29461
29462 SDValue StoreAddr = DAG.getNode(ISD::ADD, dl, PtrVT, Frame,
29463 DAG.getIntPtrConstant(RegInfo->getSlotSize(),
29464 dl));
29465 StoreAddr = DAG.getNode(ISD::ADD, dl, PtrVT, StoreAddr, Offset);
29466 Chain = DAG.getStore(Chain, dl, Handler, StoreAddr, MachinePointerInfo());
29467 Chain = DAG.getCopyToReg(Chain, dl, StoreAddrReg, StoreAddr);
29468
29469 return DAG.getNode(X86ISD::EH_RETURN, dl, MVT::Other, Chain,
29470 DAG.getRegister(StoreAddrReg, PtrVT));
29471}
29472
29473SDValue X86TargetLowering::lowerEH_SJLJ_SETJMP(SDValue Op,
29474 SelectionDAG &DAG) const {
29475 SDLoc DL(Op);
29476 // If the subtarget is not 64-bit, we may need the global base reg
29477 // after isel expands the pseudo, i.e., after the CGBR pass has run.
29478 // Therefore, ask for the GlobalBaseReg now, so that the pass
29479 // inserts the code for us in case we need it.
29480 // Otherwise, we will end up in a situation where we will
29481 // reference a virtual register that is not defined!
29482 if (!Subtarget.is64Bit()) {
29483 const X86InstrInfo *TII = Subtarget.getInstrInfo();
29484 (void)TII->getGlobalBaseReg(&DAG.getMachineFunction());
29485 }
29486 return DAG.getNode(X86ISD::EH_SJLJ_SETJMP, DL,
29487 DAG.getVTList(MVT::i32, MVT::Other),
29488 Op.getOperand(0), Op.getOperand(1));
29489}
29490
29491SDValue X86TargetLowering::lowerEH_SJLJ_LONGJMP(SDValue Op,
29492 SelectionDAG &DAG) const {
29493 SDLoc DL(Op);
29494 return DAG.getNode(X86ISD::EH_SJLJ_LONGJMP, DL, MVT::Other,
29495 Op.getOperand(0), Op.getOperand(1));
29496}
29497
29498SDValue X86TargetLowering::lowerEH_SJLJ_SETUP_DISPATCH(SDValue Op,
29499 SelectionDAG &DAG) const {
29500 SDLoc DL(Op);
29501 return DAG.getNode(X86ISD::EH_SJLJ_SETUP_DISPATCH, DL, MVT::Other,
29502 Op.getOperand(0));
29503}
29504
29505static SDValue LowerADJUST_TRAMPOLINE(SDValue Op, SelectionDAG &DAG) {
29506 return Op.getOperand(0);
29507}
29508
29509SDValue X86TargetLowering::LowerINIT_TRAMPOLINE(SDValue Op,
29510 SelectionDAG &DAG) const {
29511 SDValue Root = Op.getOperand(0);
29512 SDValue Trmp = Op.getOperand(1); // trampoline
29513 SDValue FPtr = Op.getOperand(2); // nested function
29514 SDValue Nest = Op.getOperand(3); // 'nest' parameter value
29515 SDLoc dl (Op);
29516
29517 const Value *TrmpAddr = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
29518 const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
29519
29520 if (Subtarget.is64Bit()) {
29521 SDValue OutChains[6];
29522
29523 // Large code-model.
29524 const unsigned char JMP64r = 0xFF; // 64-bit jmp through register opcode.
29525 const unsigned char MOV64ri = 0xB8; // X86::MOV64ri opcode.
29526
29527 const unsigned char N86R10 = TRI->getEncodingValue(X86::R10) & 0x7;
29528 const unsigned char N86R11 = TRI->getEncodingValue(X86::R11) & 0x7;
29529
29530 const unsigned char REX_WB = 0x40 | 0x08 | 0x01; // REX prefix
29531
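// Illustrative sketch (not part of the original source): with R11 encoding to
// 3, R10 to 2 and REX_WB == 0x49, the little-endian i16/i64/i8 stores below
// assemble this 23-byte trampoline (T = nested function address, N = 'nest'):
//   offset  0: 49 BB <T, 8 bytes>   movabsq $T, %r11
//   offset 10: 49 BA <N, 8 bytes>   movabsq $N, %r10
//   offset 20: 49 FF E3             jmpq   *%r11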
29532 // Load the pointer to the nested function into R11.
29533 unsigned OpCode = ((MOV64ri | N86R11) << 8) | REX_WB; // movabsq r11
29534 SDValue Addr = Trmp;
29535 OutChains[0] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, dl, MVT::i16),
29536 Addr, MachinePointerInfo(TrmpAddr));
29537
29538 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
29539 DAG.getConstant(2, dl, MVT::i64));
29540 OutChains[1] = DAG.getStore(Root, dl, FPtr, Addr,
29541 MachinePointerInfo(TrmpAddr, 2), Align(2));
29542
29543 // Load the 'nest' parameter value into R10.
29544 // R10 is specified in X86CallingConv.td
29545 OpCode = ((MOV64ri | N86R10) << 8) | REX_WB; // movabsq r10
29546 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
29547 DAG.getConstant(10, dl, MVT::i64));
29548 OutChains[2] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, dl, MVT::i16),
29549 Addr, MachinePointerInfo(TrmpAddr, 10));
29550
29551 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
29552 DAG.getConstant(12, dl, MVT::i64));
29553 OutChains[3] = DAG.getStore(Root, dl, Nest, Addr,
29554 MachinePointerInfo(TrmpAddr, 12), Align(2));
29555
29556 // Jump to the nested function.
29557 OpCode = (JMP64r << 8) | REX_WB; // jmpq *...
29558 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
29559 DAG.getConstant(20, dl, MVT::i64));
29560 OutChains[4] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, dl, MVT::i16),
29561 Addr, MachinePointerInfo(TrmpAddr, 20));
29562
29563 unsigned char ModRM = N86R11 | (4 << 3) | (3 << 6); // ...r11
29564 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
29565 DAG.getConstant(22, dl, MVT::i64));
29566 OutChains[5] = DAG.getStore(Root, dl, DAG.getConstant(ModRM, dl, MVT::i8),
29567 Addr, MachinePointerInfo(TrmpAddr, 22));
29568
29569 return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains);
29570 } else {
29571 const Function *Func =
29572 cast<Function>(cast<SrcValueSDNode>(Op.getOperand(5))->getValue());
29573 CallingConv::ID CC = Func->getCallingConv();
29574 unsigned NestReg;
29575
29576 switch (CC) {
29577 default:
29578 llvm_unreachable("Unsupported calling convention");
29579 case CallingConv::C:
29580 case CallingConv::X86_StdCall: {
29581 // Pass 'nest' parameter in ECX.
29582 // Must be kept in sync with X86CallingConv.td
29583 NestReg = X86::ECX;
29584
29585 // Check that ECX wasn't needed by an 'inreg' parameter.
29586 FunctionType *FTy = Func->getFunctionType();
29587 const AttributeList &Attrs = Func->getAttributes();
29588
29589 if (!Attrs.isEmpty() && !Func->isVarArg()) {
29590 unsigned InRegCount = 0;
29591 unsigned Idx = 0;
29592
29593 for (FunctionType::param_iterator I = FTy->param_begin(),
29594 E = FTy->param_end(); I != E; ++I, ++Idx)
29595 if (Attrs.hasParamAttr(Idx, Attribute::InReg)) {
29596 const DataLayout &DL = DAG.getDataLayout();
29597 // FIXME: should only count parameters that are lowered to integers.
29598 InRegCount += (DL.getTypeSizeInBits(*I) + 31) / 32;
29599 }
29600
29601 if (InRegCount > 2) {
29602 report_fatal_error("Nest register in use - reduce number of inreg"
29603 " parameters!");
29604 }
29605 }
29606 break;
29607 }
29608 case CallingConv::X86_FastCall:
29609 case CallingConv::X86_ThisCall:
29610 case CallingConv::Fast:
29611 case CallingConv::Tail:
29612 case CallingConv::SwiftTail:
29613 // Pass 'nest' parameter in EAX.
29614 // Must be kept in sync with X86CallingConv.td
29615 NestReg = X86::EAX;
29616 break;
29617 }
29618
29619 SDValue OutChains[4];
29620 SDValue Addr, Disp;
29621
29622 Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
29623 DAG.getConstant(10, dl, MVT::i32));
29624 Disp = DAG.getNode(ISD::SUB, dl, MVT::i32, FPtr, Addr);
29625
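// Illustrative sketch (not part of the original source): the stores below
// assemble this 10-byte 32-bit trampoline, where r is the low 3 bits of the
// nest register encoding and Disp = FPtr - (Trmp + 10), i.e. a rel32
// displacement measured from the end of the jmp:
//   offset 0: B8+r <Nest, 4 bytes>   movl $Nest, %nestreg
//   offset 5: E9   <Disp, 4 bytes>   jmp  FPtr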
29626 // This is storing the opcode for MOV32ri.
29627 const unsigned char MOV32ri = 0xB8; // X86::MOV32ri's opcode byte.
29628 const unsigned char N86Reg = TRI->getEncodingValue(NestReg) & 0x7;
29629 OutChains[0] =
29630 DAG.getStore(Root, dl, DAG.getConstant(MOV32ri | N86Reg, dl, MVT::i8),
29631 Trmp, MachinePointerInfo(TrmpAddr));
29632
29633 Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
29634 DAG.getConstant(1, dl, MVT::i32));
29635 OutChains[1] = DAG.getStore(Root, dl, Nest, Addr,
29636 MachinePointerInfo(TrmpAddr, 1), Align(1));
29637
29638 const unsigned char JMP = 0xE9; // jmp <32bit dst> opcode.
29639 Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
29640 DAG.getConstant(5, dl, MVT::i32));
29641 OutChains[2] =
29642 DAG.getStore(Root, dl, DAG.getConstant(JMP, dl, MVT::i8), Addr,
29643 MachinePointerInfo(TrmpAddr, 5), Align(1));
29644
29645 Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
29646 DAG.getConstant(6, dl, MVT::i32));
29647 OutChains[3] = DAG.getStore(Root, dl, Disp, Addr,
29648 MachinePointerInfo(TrmpAddr, 6), Align(1));
29649
29650 return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains);
29651 }
29652}
29653
29654SDValue X86TargetLowering::LowerGET_ROUNDING(SDValue Op,
29655 SelectionDAG &DAG) const {
29656 /*
29657 The rounding mode is in bits 11:10 of FPSR, and has the following
29658 settings:
29659 00 Round to nearest
29660 01 Round to -inf
29661 10 Round to +inf
29662 11 Round to 0
29663
29664 GET_ROUNDING, on the other hand, expects the following:
29665 -1 Undefined
29666 0 Round to 0
29667 1 Round to nearest
29668 2 Round to +inf
29669 3 Round to -inf
29670
29671 To perform the conversion, we use a packed lookup table of the four 2-bit
29672 values that we can index by FPSR[11:10]
29673 0x2d --> (0b00,10,11,01) --> (0,2,3,1) >> FPSR[11:10]
29674
29675 (0x2d >> ((FPSR & 0xc00) >> 9)) & 3
29676 */
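  // Illustrative sketch (not part of the original source): the packed-LUT
  // conversion above written out in plain C++; the lambda name is hypothetical
  // and exists only for this example.
  auto FPSRToGetRounding = [](unsigned FPSR) -> unsigned {
    return (0x2d >> ((FPSR & 0xc00) >> 9)) & 3;
  };
  assert(FPSRToGetRounding(0u << 10) == 1 && "00 -> round to nearest");
  assert(FPSRToGetRounding(1u << 10) == 3 && "01 -> round to -inf");
  assert(FPSRToGetRounding(2u << 10) == 2 && "10 -> round to +inf");
  assert(FPSRToGetRounding(3u << 10) == 0 && "11 -> round to 0");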
29677
29678 MachineFunction &MF = DAG.getMachineFunction();
29679 MVT VT = Op.getSimpleValueType();
29680 SDLoc DL(Op);
29681
29682 // Save FP Control Word to stack slot
29683 int SSFI = MF.getFrameInfo().CreateStackObject(2, Align(2), false);
29684 SDValue StackSlot =
29685 DAG.getFrameIndex(SSFI, getPointerTy(DAG.getDataLayout()));
29686
29687 MachinePointerInfo MPI = MachinePointerInfo::getFixedStack(MF, SSFI);
29688
29689 SDValue Chain = Op.getOperand(0);
29690 SDValue Ops[] = {Chain, StackSlot};
29691 Chain = DAG.getMemIntrinsicNode(X86ISD::FNSTCW16m, DL,
29692 DAG.getVTList(MVT::Other), Ops, MVT::i16, MPI,
29693 Align(2), MachineMemOperand::MOStore);
29694
29695 // Load FP Control Word from stack slot
29696 SDValue CWD = DAG.getLoad(MVT::i16, DL, Chain, StackSlot, MPI, Align(2));
29697 Chain = CWD.getValue(1);
29698
29699 // Mask and turn the control bits into a shift for the lookup table.
29700 SDValue Shift =
29701 DAG.getNode(ISD::SRL, DL, MVT::i16,
29702 DAG.getNode(ISD::AND, DL, MVT::i16,
29703 CWD, DAG.getConstant(0xc00, DL, MVT::i16)),
29704 DAG.getConstant(9, DL, MVT::i8));
29705 Shift = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, Shift);
29706
29707 SDValue LUT = DAG.getConstant(0x2d, DL, MVT::i32);
29708 SDValue RetVal =
29709 DAG.getNode(ISD::AND, DL, MVT::i32,
29710 DAG.getNode(ISD::SRL, DL, MVT::i32, LUT, Shift),
29711 DAG.getConstant(3, DL, MVT::i32));
29712
29713 RetVal = DAG.getZExtOrTrunc(RetVal, DL, VT);
29714
29715 return DAG.getMergeValues({RetVal, Chain}, DL);
29716}
29717
29718SDValue X86TargetLowering::LowerSET_ROUNDING(SDValue Op,
29719 SelectionDAG &DAG) const {
29720 MachineFunction &MF = DAG.getMachineFunction();
29721 SDLoc DL(Op);
29722 SDValue Chain = Op.getNode()->getOperand(0);
29723
29724 // FP control word may be set only from data in memory. So we need to allocate
29725 // stack space to save/load FP control word.
29726 int OldCWFrameIdx = MF.getFrameInfo().CreateStackObject(4, Align(4), false);
29727 SDValue StackSlot =
29728 DAG.getFrameIndex(OldCWFrameIdx, getPointerTy(DAG.getDataLayout()));
29729 MachinePointerInfo MPI = MachinePointerInfo::getFixedStack(MF, OldCWFrameIdx);
29730 MachineMemOperand *MMO =
29731 MF.getMachineMemOperand(MPI, MachineMemOperand::MOStore, 2, Align(2));
29732
29733 // Store FP control word into memory.
29734 SDValue Ops[] = {Chain, StackSlot};
29735 Chain = DAG.getMemIntrinsicNode(
29736 X86ISD::FNSTCW16m, DL, DAG.getVTList(MVT::Other), Ops, MVT::i16, MMO);
29737
29738 // Load FP Control Word from stack slot and clear RM field (bits 11:10).
29739 SDValue CWD = DAG.getLoad(MVT::i16, DL, Chain, StackSlot, MPI);
29740 Chain = CWD.getValue(1);
29741 CWD = DAG.getNode(ISD::AND, DL, MVT::i16, CWD.getValue(0),
29742 DAG.getConstant(0xf3ff, DL, MVT::i16));
29743
29744 // Calculate new rounding mode.
29745 SDValue NewRM = Op.getNode()->getOperand(1);
29746 SDValue RMBits;
29747 if (auto *CVal = dyn_cast<ConstantSDNode>(NewRM)) {
29748 uint64_t RM = CVal->getZExtValue();
29749 int FieldVal;
29750 switch (static_cast<RoundingMode>(RM)) {
29751 case RoundingMode::NearestTiesToEven: FieldVal = X86::rmToNearest; break;
29752 case RoundingMode::TowardNegative: FieldVal = X86::rmDownward; break;
29753 case RoundingMode::TowardPositive: FieldVal = X86::rmUpward; break;
29754 case RoundingMode::TowardZero: FieldVal = X86::rmTowardZero; break;
29755 default:
29756 llvm_unreachable("rounding mode is not supported by X86 hardware");
29757 }
29758 RMBits = DAG.getConstant(FieldVal, DL, MVT::i16);
29759 } else {
29760 // Need to convert argument into bits of control word:
29761 // 0 Round to 0 -> 11
29762 // 1 Round to nearest -> 00
29763 // 2 Round to +inf -> 10
29764 // 3 Round to -inf -> 01
29765 // The 2-bit value then needs to be shifted so that it occupies bits 11:10.
29766 // To make the conversion, put all these values into a value 0xc9 and shift
29767 // it left depending on the rounding mode:
29768 // (0xc9 << 4) & 0xc00 = X86::rmTowardZero
29769 // (0xc9 << 6) & 0xc00 = X86::rmToNearest
29770 // ...
29771 // (0xc9 << (2 * NewRM + 4)) & 0xc00
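// Illustrative worked values (not part of the original source):
//   NewRM = 0 (to zero):    (0xc9 << 4)  & 0xc00 = 0xc00  -> RM field 11
//   NewRM = 1 (to nearest): (0xc9 << 6)  & 0xc00 = 0x000  -> RM field 00
//   NewRM = 2 (to +inf):    (0xc9 << 8)  & 0xc00 = 0x800  -> RM field 10
//   NewRM = 3 (to -inf):    (0xc9 << 10) & 0xc00 = 0x400  -> RM field 01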
29772 SDValue ShiftValue =
29773 DAG.getNode(ISD::TRUNCATE, DL, MVT::i8,
29774 DAG.getNode(ISD::ADD, DL, MVT::i32,
29775 DAG.getNode(ISD::SHL, DL, MVT::i32, NewRM,
29776 DAG.getConstant(1, DL, MVT::i8)),
29777 DAG.getConstant(4, DL, MVT::i32)));
29778 SDValue Shifted =
29779 DAG.getNode(ISD::SHL, DL, MVT::i16, DAG.getConstant(0xc9, DL, MVT::i16),
29780 ShiftValue);
29781 RMBits = DAG.getNode(ISD::AND, DL, MVT::i16, Shifted,
29782 DAG.getConstant(0xc00, DL, MVT::i16));
29783 }
29784
29785 // Update rounding mode bits and store the new FP Control Word into stack.
29786 CWD = DAG.getNode(ISD::OR, DL, MVT::i16, CWD, RMBits);
29787 Chain = DAG.getStore(Chain, DL, CWD, StackSlot, MPI, Align(2));
29788
29789 // Load FP control word from the slot.
29790 SDValue OpsLD[] = {Chain, StackSlot};
29791 MachineMemOperand *MMOL =
29792 MF.getMachineMemOperand(MPI, MachineMemOperand::MOLoad, 2, Align(2));
29793 Chain = DAG.getMemIntrinsicNode(
29794 X86ISD::FLDCW16m, DL, DAG.getVTList(MVT::Other), OpsLD, MVT::i16, MMOL);
29795
29796 // If target supports SSE, set MXCSR as well. Rounding mode is encoded in the
29797 // same way but in bits 14:13.
29798 if (Subtarget.hasSSE1()) {
29799 // Store MXCSR into memory.
29800 Chain = DAG.getNode(
29801 ISD::INTRINSIC_VOID, DL, DAG.getVTList(MVT::Other), Chain,
29802 DAG.getTargetConstant(Intrinsic::x86_sse_stmxcsr, DL, MVT::i32),
29803 StackSlot);
29804
29805 // Load MXCSR from stack slot and clear RM field (bits 14:13).
29806 SDValue CWD = DAG.getLoad(MVT::i32, DL, Chain, StackSlot, MPI);
29807 Chain = CWD.getValue(1);
29808 CWD = DAG.getNode(ISD::AND, DL, MVT::i32, CWD.getValue(0),
29809 DAG.getConstant(0xffff9fff, DL, MVT::i32));
29810
29811 // Shift X87 RM bits from 11:10 to 14:13.
29812 RMBits = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, RMBits);
29813 RMBits = DAG.getNode(ISD::SHL, DL, MVT::i32, RMBits,
29814 DAG.getConstant(3, DL, MVT::i8));
29815
29816 // Update rounding mode bits and store the new FP Control Word into stack.
29817 CWD = DAG.getNode(ISD::OR, DL, MVT::i32, CWD, RMBits);
29818 Chain = DAG.getStore(Chain, DL, CWD, StackSlot, MPI, Align(4));
29819
29820 // Load MXCSR from the slot.
29821 Chain = DAG.getNode(
29822 ISD::INTRINSIC_VOID, DL, DAG.getVTList(MVT::Other), Chain,
29823 DAG.getTargetConstant(Intrinsic::x86_sse_ldmxcsr, DL, MVT::i32),
29824 StackSlot);
29825 }
29826
29827 return Chain;
29828}
29829
29830 /// Lower a vector CTLZ using the natively supported vector CTLZ instruction.
29831//
29832// i8/i16 vectors are implemented using the dword LZCNT vector instruction
29833// ( sub(trunc(lzcnt(zext32(x)))) ). In case zext32(x) is illegal,
29834// split the vector, perform the operation on its Lo and Hi parts and
29835// concatenate the results.
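// Illustrative worked example (not part of the original source): for an i8
// element x = 0x0A, zext32(x) = 0x0000000A has lzcnt 28, and subtracting the
// delta 32 - 8 = 24 yields ctlz8(0x0A) = 4.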
29836static SDValue LowerVectorCTLZ_AVX512CDI(SDValue Op, SelectionDAG &DAG,
29837 const X86Subtarget &Subtarget) {
29838 assert(Op.getOpcode() == ISD::CTLZ);
29839 SDLoc dl(Op);
29840 MVT VT = Op.getSimpleValueType();
29841 MVT EltVT = VT.getVectorElementType();
29842 unsigned NumElems = VT.getVectorNumElements();
29843
29844 assert((EltVT == MVT::i8 || EltVT == MVT::i16) &&
29845 "Unsupported element type");
29846
29847 // Split the vector; its Lo and Hi parts will be handled in the next iteration.
29848 if (NumElems > 16 ||
29849 (NumElems == 16 && !Subtarget.canExtendTo512DQ()))
29850 return splitVectorIntUnary(Op, DAG);
29851
29852 MVT NewVT = MVT::getVectorVT(MVT::i32, NumElems);
29853 assert((NewVT.is256BitVector() || NewVT.is512BitVector()) &&
29854 "Unsupported value type for operation");
29855
29856 // Use the natively supported vector instruction vplzcntd.
29857 Op = DAG.getNode(ISD::ZERO_EXTEND, dl, NewVT, Op.getOperand(0));
29858 SDValue CtlzNode = DAG.getNode(ISD::CTLZ, dl, NewVT, Op);
29859 SDValue TruncNode = DAG.getNode(ISD::TRUNCATE, dl, VT, CtlzNode);
29860 SDValue Delta = DAG.getConstant(32 - EltVT.getSizeInBits(), dl, VT);
29861
29862 return DAG.getNode(ISD::SUB, dl, VT, TruncNode, Delta);
29863}
29864
29865// Lower CTLZ using a PSHUFB lookup table implementation.
29866static SDValue LowerVectorCTLZInRegLUT(SDValue Op, const SDLoc &DL,
29867 const X86Subtarget &Subtarget,
29868 SelectionDAG &DAG) {
29869 MVT VT = Op.getSimpleValueType();
29870 int NumElts = VT.getVectorNumElements();
29871 int NumBytes = NumElts * (VT.getScalarSizeInBits() / 8);
29872 MVT CurrVT = MVT::getVectorVT(MVT::i8, NumBytes);
29873
29874 // Per-nibble leading zero PSHUFB lookup table.
29875 const int LUT[16] = {/* 0 */ 4, /* 1 */ 3, /* 2 */ 2, /* 3 */ 2,
29876 /* 4 */ 1, /* 5 */ 1, /* 6 */ 1, /* 7 */ 1,
29877 /* 8 */ 0, /* 9 */ 0, /* a */ 0, /* b */ 0,
29878 /* c */ 0, /* d */ 0, /* e */ 0, /* f */ 0};
29879
29880 SmallVector<SDValue, 64> LUTVec;
29881 for (int i = 0; i < NumBytes; ++i)
29882 LUTVec.push_back(DAG.getConstant(LUT[i % 16], DL, MVT::i8));
29883 SDValue InRegLUT = DAG.getBuildVector(CurrVT, DL, LUTVec);
29884
29885 // Begin by bitcasting the input to a byte vector, then split those bytes
29886 // into lo/hi nibbles and use the PSHUFB LUT to perform CTLZ on each of them.
29887 // If the hi input nibble is zero then we add both results together, otherwise
29888 // we just take the hi result (by masking the lo result to zero before the
29889 // add).
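  // Illustrative sketch (not part of the original source): a scalar model of
  // the per-byte computation described above; the lambda name is hypothetical.
  // PSHUFB applies the same LUT lookup to every byte lane in parallel.
  auto Ctlz8ViaNibbleLUT = [&](unsigned char X) -> int {
    int HiCnt = LUT[X >> 4];  // leading zeros within the high nibble
    int LoCnt = LUT[X & 0xf]; // leading zeros within the low nibble
    return (X >> 4) == 0 ? HiCnt + LoCnt : HiCnt;
  };
  // Ctlz8ViaNibbleLUT(0x1A) == 3, Ctlz8ViaNibbleLUT(0x0A) == 4,
  // Ctlz8ViaNibbleLUT(0x00) == 8.
  (void)Ctlz8ViaNibbleLUT;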
29890 SDValue Op0 = DAG.getBitcast(CurrVT, Op.getOperand(0));
29891 SDValue Zero = DAG.getConstant(0, DL, CurrVT);
29892
29893 SDValue NibbleShift = DAG.getConstant(0x4, DL, CurrVT);
29894 SDValue Lo = Op0;
29895 SDValue Hi = DAG.getNode(ISD::SRL, DL, CurrVT, Op0, NibbleShift);
29896 SDValue HiZ;
29897 if (CurrVT.is512BitVector()) {
29898 MVT MaskVT = MVT::getVectorVT(MVT::i1, CurrVT.getVectorNumElements());
29899 HiZ = DAG.getSetCC(DL, MaskVT, Hi, Zero, ISD::SETEQ);
29900 HiZ = DAG.getNode(ISD::SIGN_EXTEND, DL, CurrVT, HiZ);
29901 } else {
29902 HiZ = DAG.getSetCC(DL, CurrVT, Hi, Zero, ISD::SETEQ);
29903 }
29904
29905 Lo = DAG.getNode(X86ISD::PSHUFB, DL, CurrVT, InRegLUT, Lo);
29906 Hi = DAG.getNode(X86ISD::PSHUFB, DL, CurrVT, InRegLUT, Hi);
29907 Lo = DAG.getNode(ISD::AND, DL, CurrVT, Lo, HiZ);
29908 SDValue Res = DAG.getNode(ISD::ADD, DL, CurrVT, Lo, Hi);
29909
29910 // Merge the result back from vXi8 to VT, working on the lo/hi halves
29911 // of the current vector width in the same way we did for the nibbles.
29912 // If the upper half of the input element is zero then add the halves'
29913 // leading zero counts together, otherwise just use the upper half's.
29914 // Double the width of the result until we are at target width.
29915 while (CurrVT != VT) {
29916 int CurrScalarSizeInBits = CurrVT.getScalarSizeInBits();
29917 int CurrNumElts = CurrVT.getVectorNumElements();
29918 MVT NextSVT = MVT::getIntegerVT(CurrScalarSizeInBits * 2);
29919 MVT NextVT = MVT::getVectorVT(NextSVT, CurrNumElts / 2);
29920 SDValue Shift = DAG.getConstant(CurrScalarSizeInBits, DL, NextVT);
29921
29922 // Check if the upper half of the input element is zero.
29923 if (CurrVT.is512BitVector()) {
29924 MVT MaskVT = MVT::getVectorVT(MVT::i1, CurrVT.getVectorNumElements());
29925 HiZ = DAG.getSetCC(DL, MaskVT, DAG.getBitcast(CurrVT, Op0),
29926 DAG.getBitcast(CurrVT, Zero), ISD::SETEQ);
29927 HiZ = DAG.getNode(ISD::SIGN_EXTEND, DL, CurrVT, HiZ);
29928 } else {
29929 HiZ = DAG.getSetCC(DL, CurrVT, DAG.getBitcast(CurrVT, Op0),
29930 DAG.getBitcast(CurrVT, Zero), ISD::SETEQ);
29931 }
29932 HiZ = DAG.getBitcast(NextVT, HiZ);
29933
29934 // Move the upper/lower halves to the lower bits as we'll be extending to
29935 // NextVT. Mask the lower result to zero if HiZ is true and add the results
29936 // together.
29937 SDValue ResNext = Res = DAG.getBitcast(NextVT, Res);
29938 SDValue R0 = DAG.getNode(ISD::SRL, DL, NextVT, ResNext, Shift);
29939 SDValue R1 = DAG.getNode(ISD::SRL, DL, NextVT, HiZ, Shift);
29940 R1 = DAG.getNode(ISD::AND, DL, NextVT, ResNext, R1);
29941 Res = DAG.getNode(ISD::ADD, DL, NextVT, R0, R1);
29942 CurrVT = NextVT;
29943 }
29944
29945 return Res;
29946}
29947
29948static SDValue LowerVectorCTLZ(SDValue Op, const SDLoc &DL,
29949 const X86Subtarget &Subtarget,
29950 SelectionDAG &DAG) {
29951 MVT VT = Op.getSimpleValueType();
29952
29953 if (Subtarget.hasCDI() &&
29954 // vXi8 vectors need to be promoted to 512 bits for vXi32.
29955 (Subtarget.canExtendTo512DQ() || VT.getVectorElementType() != MVT::i8))
29956 return LowerVectorCTLZ_AVX512CDI(Op, DAG, Subtarget);
29957
29958 // Decompose 256-bit ops into smaller 128-bit ops.
29959 if (VT.is256BitVector() && !Subtarget.hasInt256())
29960 return splitVectorIntUnary(Op, DAG);
29961
29962 // Decompose 512-bit ops into smaller 256-bit ops.
29963 if (VT.is512BitVector() && !Subtarget.hasBWI())
29964 return splitVectorIntUnary(Op, DAG);
29965
29966 assert(Subtarget.hasSSSE3() && "Expected SSSE3 support for PSHUFB");
29967 return LowerVectorCTLZInRegLUT(Op, DL, Subtarget, DAG);
29968}
29969
29970static SDValue LowerCTLZ(SDValue Op, const X86Subtarget &Subtarget,
29971 SelectionDAG &DAG) {
29972 MVT VT = Op.getSimpleValueType();
29973 MVT OpVT = VT;
29974 unsigned NumBits = VT.getSizeInBits();
29975 SDLoc dl(Op);
29976 unsigned Opc = Op.getOpcode();
29977
29978 if (VT.isVector())
29979 return LowerVectorCTLZ(Op, dl, Subtarget, DAG);
29980
29981 Op = Op.getOperand(0);
29982 if (VT == MVT::i8) {
29983 // Zero extend to i32 since there is not an i8 bsr.
29984 OpVT = MVT::i32;
29985 Op = DAG.getNode(ISD::ZERO_EXTEND, dl, OpVT, Op);
29986 }
29987
29988 // Issue a bsr (scan bits in reverse) which also sets EFLAGS.
29989 SDVTList VTs = DAG.getVTList(OpVT, MVT::i32);
29990 Op = DAG.getNode(X86ISD::BSR, dl, VTs, Op);
29991
29992 if (Opc == ISD::CTLZ) {
29993 // If src is zero (i.e. bsr sets ZF), returns NumBits.
29994 SDValue Ops[] = {Op, DAG.getConstant(NumBits + NumBits - 1, dl, OpVT),
29995 DAG.getTargetConstant(X86::COND_E, dl, MVT::i8),
29996 Op.getValue(1)};
29997 Op = DAG.getNode(X86ISD::CMOV, dl, OpVT, Ops);
29998 }
29999
30000 // Finally xor with NumBits-1.
30001 Op = DAG.getNode(ISD::XOR, dl, OpVT, Op,
30002 DAG.getConstant(NumBits - 1, dl, OpVT));
30003
30004 if (VT == MVT::i8)
30005 Op = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op);
30006 return Op;
30007}
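// Illustrative sketch (not part of the original source): the scalar lowering
// above modelled in plain C++ for a 32-bit operand. The helper name is
// hypothetical, and __builtin_clz stands in for the BSR-derived bit index.
static unsigned scalarCtlz32Model(unsigned X) {
  if (X == 0)
    return (32 + 32 - 1) ^ (32 - 1);       // CMOV path: 63 ^ 31 == 32.
  unsigned BsrIdx = 31 - __builtin_clz(X); // index of the most significant set bit
  return BsrIdx ^ 31;                      // e.g. X = 0x0000FF00 -> 15 ^ 31 == 16
}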
30008
30009static SDValue LowerCTTZ(SDValue Op, const X86Subtarget &Subtarget,
30010 SelectionDAG &DAG) {
30011 MVT VT = Op.getSimpleValueType();
30012 unsigned NumBits = VT.getScalarSizeInBits();
30013 SDValue N0 = Op.getOperand(0);
30014 SDLoc dl(Op);
30015
30016 assert(!VT.isVector() && Op.getOpcode() == ISD::CTTZ &&
30017 "Only scalar CTTZ requires custom lowering");
30018
30019 // Issue a bsf (scan bits forward) which also sets EFLAGS.
30020 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
30021 Op = DAG.getNode(X86ISD::BSF, dl, VTs, N0);
30022
30023 // If src is known never zero we can skip the CMOV.
30024 if (DAG.isKnownNeverZero(N0))
30025 return Op;
30026
30027 // If src is zero (i.e. bsf sets ZF), returns NumBits.
30028 SDValue Ops[] = {Op, DAG.getConstant(NumBits, dl, VT),
30029 DAG.getTargetConstant(X86::COND_E, dl, MVT::i8),
30030 Op.getValue(1)};
30031 return DAG.getNode(X86ISD::CMOV, dl, VT, Ops);
30032}
30033
30034static SDValue lowerAddSub(SDValue Op, SelectionDAG &DAG,
30035 const X86Subtarget &Subtarget) {
30036 MVT VT = Op.getSimpleValueType();
30037 if (VT == MVT::i16 || VT == MVT::i32)
30038 return lowerAddSubToHorizontalOp(Op, DAG, Subtarget);
30039
30040 if (VT == MVT::v32i16 || VT == MVT::v64i8)
30041 return splitVectorIntBinary(Op, DAG);
30042
30043 assert(Op.getSimpleValueType().is256BitVector() &&
30044 Op.getSimpleValueType().isInteger() &&
30045 "Only handle AVX 256-bit vector integer operation");
30046 return splitVectorIntBinary(Op, DAG);
30047}
30048
30049static SDValue LowerADDSAT_SUBSAT(SDValue Op, SelectionDAG &DAG,
30050 const X86Subtarget &Subtarget) {
30051 MVT VT = Op.getSimpleValueType();
30052 SDValue X = Op.getOperand(0), Y = Op.getOperand(1);
30053 unsigned Opcode = Op.getOpcode();
30054 SDLoc DL(Op);
30055
30056 if (VT == MVT::v32i16 || VT == MVT::v64i8 ||
30057 (VT.is256BitVector() && !Subtarget.hasInt256())) {
30058 assert(Op.getSimpleValueType().isInteger() &&
30059 "Only handle AVX vector integer operation");
30060 return splitVectorIntBinary(Op, DAG);
30061 }
30062
30063 // Avoid the generic expansion with min/max if we don't have pminu*/pmaxu*.
30064 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
30065 EVT SetCCResultType =
30066 TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
30067
30068 unsigned BitWidth = VT.getScalarSizeInBits();
30069 if (Opcode == ISD::USUBSAT) {
30070 if (!TLI.isOperationLegal(ISD::UMAX, VT) || useVPTERNLOG(Subtarget, VT)) {
30071 // Handle a special-case with a bit-hack instead of cmp+select:
30072 // usubsat X, SMIN --> (X ^ SMIN) & (X s>> BW-1)
30073 // If the target can use VPTERNLOG, DAGToDAG will match this as
30074 // "vpsra + vpternlog" which is better than "vpmax + vpsub" with a
30075 // "broadcast" constant load.
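      // Illustrative worked example (not part of the original source), for
      // 8-bit elements where SMIN = 0x80:
      //   X = 0xC3: (0xC3 ^ 0x80) & (0xC3 s>> 7) = 0x43 & 0xFF = 0x43 = 0xC3 - 0x80
      //   X = 0x21: (0x21 ^ 0x80) & (0x21 s>> 7) = 0xA1 & 0x00 = 0x00 (saturated)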
30076 ConstantSDNode *C = isConstOrConstSplat(Y, true);
30077 if (C && C->getAPIntValue().isSignMask()) {
30078 SDValue SignMask = DAG.getConstant(C->getAPIntValue(), DL, VT);
30079 SDValue ShiftAmt = DAG.getConstant(BitWidth - 1, DL, VT);
30080 SDValue Xor = DAG.getNode(ISD::XOR, DL, VT, X, SignMask);
30081 SDValue Sra = DAG.getNode(ISD::SRA, DL, VT, X, ShiftAmt);
30082 return DAG.getNode(ISD::AND, DL, VT, Xor, Sra);
30083 }
30084 }
30085 if (!TLI.isOperationLegal(ISD::UMAX, VT)) {
30086 // usubsat X, Y --> (X >u Y) ? X - Y : 0
30087 SDValue Sub = DAG.getNode(ISD::SUB, DL, VT, X, Y);
30088 SDValue Cmp = DAG.getSetCC(DL, SetCCResultType, X, Y, ISD::SETUGT);
30089 // TODO: Move this to DAGCombiner?
30090 if (SetCCResultType == VT &&
30091 DAG.ComputeNumSignBits(Cmp) == VT.getScalarSizeInBits())
30092 return DAG.getNode(ISD::AND, DL, VT, Cmp, Sub);
30093 return DAG.getSelect(DL, VT, Cmp, Sub, DAG.getConstant(0, DL, VT));
30094 }
30095 }
30096
30097 if ((Opcode == ISD::SADDSAT || Opcode == ISD::SSUBSAT) &&
30098 (!VT.isVector() || VT == MVT::v2i64)) {
30099 APInt MinVal = APInt::getSignedMinValue(BitWidth);
30100 APInt MaxVal = APInt::getSignedMaxValue(BitWidth);
30101 SDValue Zero = DAG.getConstant(0, DL, VT);
30102 SDValue Result =
30103 DAG.getNode(Opcode == ISD::SADDSAT ? ISD::SADDO : ISD::SSUBO, DL,
30104 DAG.getVTList(VT, SetCCResultType), X, Y);
30105 SDValue SumDiff = Result.getValue(0);
30106 SDValue Overflow = Result.getValue(1);
30107 SDValue SatMin = DAG.getConstant(MinVal, DL, VT);
30108 SDValue SatMax = DAG.getConstant(MaxVal, DL, VT);
30109 SDValue SumNeg =
30110 DAG.getSetCC(DL, SetCCResultType, SumDiff, Zero, ISD::SETLT);
30111 Result = DAG.getSelect(DL, VT, SumNeg, SatMax, SatMin);
30112 return DAG.getSelect(DL, VT, Overflow, Result, SumDiff);
30113 }
30114
30115 // Use default expansion.
30116 return SDValue();
30117}
30118
30119static SDValue LowerABS(SDValue Op, const X86Subtarget &Subtarget,
30120 SelectionDAG &DAG) {
30121 MVT VT = Op.getSimpleValueType();
30122 if (VT == MVT::i16 || VT == MVT::i32 || VT == MVT::i64) {
30123 // Since X86 does not have CMOV for 8-bit integer, we don't convert
30124 // 8-bit integer abs to NEG and CMOV.
30125 SDLoc DL(Op);
30126 SDValue N0 = Op.getOperand(0);
30127 SDValue Neg = DAG.getNode(X86ISD::SUB, DL, DAG.getVTList(VT, MVT::i32),
30128 DAG.getConstant(0, DL, VT), N0);
30129 SDValue Ops[] = {N0, Neg, DAG.getTargetConstant(X86::COND_NS, DL, MVT::i8),
30130 SDValue(Neg.getNode(), 1)};
30131 return DAG.getNode(X86ISD::CMOV, DL, VT, Ops);
30132 }
30133
30134 // ABS(vXi64 X) --> VPBLENDVPD(X, 0-X, X).
30135 if ((VT == MVT::v2i64 || VT == MVT::v4i64) && Subtarget.hasSSE41()) {
30136 SDLoc DL(Op);
30137 SDValue Src = Op.getOperand(0);
30138 SDValue Sub =
30139 DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), Src);
30140 return DAG.getNode(X86ISD::BLENDV, DL, VT, Src, Sub, Src);
30141 }
30142
30143 if (VT.is256BitVector() && !Subtarget.hasInt256()) {
30144 assert(VT.isInteger() &&
30145 "Only handle AVX 256-bit vector integer operation");
30146 return splitVectorIntUnary(Op, DAG);
30147 }
30148
30149 if ((VT == MVT::v32i16 || VT == MVT::v64i8) && !Subtarget.hasBWI())
30150 return splitVectorIntUnary(Op, DAG);
30151
30152 // Default to expand.
30153 return SDValue();
30154}
30155
30156static SDValue LowerAVG(SDValue Op, const X86Subtarget &Subtarget,
30157 SelectionDAG &DAG) {
30158 MVT VT = Op.getSimpleValueType();
30159
30160 // For AVX1 cases, split to use legal ops.
30161 if (VT.is256BitVector() && !Subtarget.hasInt256())
30162 return splitVectorIntBinary(Op, DAG);
30163
30164 if (VT == MVT::v32i16 || VT == MVT::v64i8)
30165 return splitVectorIntBinary(Op, DAG);
30166
30167 // Default to expand.
30168 return SDValue();
30169}
30170
30171static SDValue LowerMINMAX(SDValue Op, const X86Subtarget &Subtarget,
30172 SelectionDAG &DAG) {
30173 MVT VT = Op.getSimpleValueType();
30174
30175 // For AVX1 cases, split to use legal ops.
30176 if (VT.is256BitVector() && !Subtarget.hasInt256())
30177 return splitVectorIntBinary(Op, DAG);
30178
30179 if (VT == MVT::v32i16 || VT == MVT::v64i8)
30180 return splitVectorIntBinary(Op, DAG);
30181
30182 // umax(x,1) --> sub(x,cmpeq(x,0))
30183 // TODO: Move this to expandIntMINMAX?
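  // Illustrative worked example (not part of the original source):
  //   x == 0: cmpeq(x,0) is all-ones (-1), so x - (-1) = 1.
  //   x != 0: cmpeq(x,0) is 0, so the result is x itself (already >= 1).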
30184 if (VT.isVector() && Op.getOpcode() == ISD::UMAX &&
30185 llvm::isOneOrOneSplat(Op.getOperand(1), true)) {
30186 SDLoc DL(Op);
30187 SDValue X = DAG.getFreeze(Op.getOperand(0));
30188 SDValue Zero = getZeroVector(VT, Subtarget, DAG, DL);
30189 return DAG.getNode(ISD::SUB, DL, VT, X,
30190 DAG.getSetCC(DL, VT, X, Zero, ISD::SETEQ));
30191 }
30192
30193 // Default to expand.
30194 return SDValue();
30195}
30196
30197static SDValue LowerABD(SDValue Op, const X86Subtarget &Subtarget,
30198 SelectionDAG &DAG) {
30199 MVT VT = Op.getSimpleValueType();
30200
30201 // For AVX1 cases, split to use legal ops.
30202 if (VT.is256BitVector() && !Subtarget.hasInt256())
30203 return splitVectorIntBinary(Op, DAG);
30204
30205 if ((VT == MVT::v32i16 || VT == MVT::v64i8) && !Subtarget.useBWIRegs())
30206 return splitVectorIntBinary(Op, DAG);
30207
30208 // TODO: Add TargetLowering expandABD() support.
30209 SDLoc dl(Op);
30210 bool IsSigned = Op.getOpcode() == ISD::ABDS;
30211 SDValue LHS = DAG.getFreeze(Op.getOperand(0));
30212 SDValue RHS = DAG.getFreeze(Op.getOperand(1));
30213 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
30214
30215 // abds(lhs, rhs) -> sub(smax(lhs,rhs), smin(lhs,rhs))
30216 // abdu(lhs, rhs) -> sub(umax(lhs,rhs), umin(lhs,rhs))
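  // Illustrative worked example (not part of the original source), for i8
  // lanes: abds(-3, 5) = smax(-3,5) - smin(-3,5) = 5 - (-3) = 8, and
  // abdu(0xFD, 0x05) = umax - umin = 0xFD - 0x05 = 0xF8.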
30217 unsigned MaxOpc = IsSigned ? ISD::SMAX : ISD::UMAX;
30218 unsigned MinOpc = IsSigned ? ISD::SMIN : ISD::UMIN;
30219 if (TLI.isOperationLegal(MaxOpc, VT) && TLI.isOperationLegal(MinOpc, VT)) {
30220 SDValue Max = DAG.getNode(MaxOpc, dl, VT, LHS, RHS);
30221 SDValue Min = DAG.getNode(MinOpc, dl, VT, LHS, RHS);
30222 return DAG.getNode(ISD::SUB, dl, VT, Max, Min);
30223 }
30224
30225 // abds(lhs, rhs) -> select(sgt(lhs,rhs), sub(lhs,rhs), sub(rhs,lhs))
30226 // abdu(lhs, rhs) -> select(ugt(lhs,rhs), sub(lhs,rhs), sub(rhs,lhs))
30227 EVT CCVT = TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
30228 ISD::CondCode CC = IsSigned ? ISD::CondCode::SETGT : ISD::CondCode::SETUGT;
30229 SDValue Cmp = DAG.getSetCC(dl, CCVT, LHS, RHS, CC);
30230 return DAG.getSelect(dl, VT, Cmp, DAG.getNode(ISD::SUB, dl, VT, LHS, RHS),
30231 DAG.getNode(ISD::SUB, dl, VT, RHS, LHS));
30232}
30233
30234static SDValue LowerMUL(SDValue Op, const X86Subtarget &Subtarget,
30235 SelectionDAG &DAG) {
30236 SDLoc dl(Op);
30237 MVT VT = Op.getSimpleValueType();
30238
30239 // Decompose 256-bit ops into 128-bit ops.
30240 if (VT.is256BitVector() && !Subtarget.hasInt256())
30241 return splitVectorIntBinary(Op, DAG);
30242
30243 if ((VT == MVT::v32i16 || VT == MVT::v64i8) && !Subtarget.hasBWI())
30244 return splitVectorIntBinary(Op, DAG);
30245
30246 SDValue A = Op.getOperand(0);
30247 SDValue B = Op.getOperand(1);
30248
30249 // Lower v16i8/v32i8/v64i8 mul as sign-extension to v8i16/v16i16/v32i16
30250 // vector pairs, multiply and truncate.
30251 if (VT == MVT::v16i8 || VT == MVT::v32i8 || VT == MVT::v64i8) {
30252 unsigned NumElts = VT.getVectorNumElements();
30253
30254 if ((VT == MVT::v16i8 && Subtarget.hasInt256()) ||
30255 (VT == MVT::v32i8 && Subtarget.canExtendTo512BW())) {
30256 MVT ExVT = MVT::getVectorVT(MVT::i16, VT.getVectorNumElements());
30257 return DAG.getNode(
30258 ISD::TRUNCATE, dl, VT,
30259 DAG.getNode(ISD::MUL, dl, ExVT,
30260 DAG.getNode(ISD::ANY_EXTEND, dl, ExVT, A),
30261 DAG.getNode(ISD::ANY_EXTEND, dl, ExVT, B)));
30262 }
30263
30264 MVT ExVT = MVT::getVectorVT(MVT::i16, NumElts / 2);
30265
30266 // Extract the lo/hi parts and any-extend them to i16.
30267 // We're going to mask off the low byte of each result element of the
30268 // pmullw, so it doesn't matter what's in the high byte of each 16-bit
30269 // element.
30270 SDValue Undef = DAG.getUNDEF(VT);
30271 SDValue ALo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, A, Undef));
30272 SDValue AHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, A, Undef));
30273
30274 SDValue BLo, BHi;
30275 if (ISD::isBuildVectorOfConstantSDNodes(B.getNode())) {
30276 // If the RHS is a constant, manually unpackl/unpackh.
30277 SmallVector<SDValue, 16> LoOps, HiOps;
30278 for (unsigned i = 0; i != NumElts; i += 16) {
30279 for (unsigned j = 0; j != 8; ++j) {
30280 LoOps.push_back(DAG.getAnyExtOrTrunc(B.getOperand(i + j), dl,
30281 MVT::i16));
30282 HiOps.push_back(DAG.getAnyExtOrTrunc(B.getOperand(i + j + 8), dl,
30283 MVT::i16));
30284 }
30285 }
30286
30287 BLo = DAG.getBuildVector(ExVT, dl, LoOps);
30288 BHi = DAG.getBuildVector(ExVT, dl, HiOps);
30289 } else {
30290 BLo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, B, Undef));
30291 BHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, B, Undef));
30292 }
30293
30294 // Multiply, mask the lower 8 bits of the lo/hi results and pack.
30295 SDValue RLo = DAG.getNode(ISD::MUL, dl, ExVT, ALo, BLo);
30296 SDValue RHi = DAG.getNode(ISD::MUL, dl, ExVT, AHi, BHi);
30297 return getPack(DAG, Subtarget, dl, VT, RLo, RHi);
30298 }
30299
30300 // Lower v4i32 mul as 2x shuffle, 2x pmuludq, 2x shuffle.
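  // Illustrative sketch (not part of the original source) of the dataflow:
  //   PMULUDQ <a|b|c|d>, <e|f|g|h>        -> 64-bit <a*e | c*g>
  //   PMULUDQ <b|_|d|_>, <f|_|h|_> (odds) -> 64-bit <b*f | d*h>
  // Bitcasting both back to v4i32 and shuffling with {0,4,2,6} picks the low
  // 32 bits of each product, giving <a*e | b*f | c*g | d*h> modulo 2^32.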
30301 if (VT == MVT::v4i32) {
30302 assert(Subtarget.hasSSE2() && !Subtarget.hasSSE41() &&
30303 "Should not custom lower when pmulld is available!");
30304
30305 // Extract the odd parts.
30306 static const int UnpackMask[] = { 1, -1, 3, -1 };
30307 SDValue Aodds = DAG.getVectorShuffle(VT, dl, A, A, UnpackMask);
30308 SDValue Bodds = DAG.getVectorShuffle(VT, dl, B, B, UnpackMask);
30309
30310 // Multiply the even parts.
30311 SDValue Evens = DAG.getNode(X86ISD::PMULUDQ, dl, MVT::v2i64,
30312 DAG.getBitcast(MVT::v2i64, A),
30313 DAG.getBitcast(MVT::v2i64, B));
30314 // Now multiply odd parts.
30315 SDValue Odds = DAG.getNode(X86ISD::PMULUDQ, dl, MVT::v2i64,
30316 DAG.getBitcast(MVT::v2i64, Aodds),
30317 DAG.getBitcast(MVT::v2i64, Bodds));
30318
30319 Evens = DAG.getBitcast(VT, Evens);
30320 Odds = DAG.getBitcast(VT, Odds);
30321
30322 // Merge the two vectors back together with a shuffle. This expands into 2
30323 // shuffles.
30324 static const int ShufMask[] = { 0, 4, 2, 6 };
30325 return DAG.getVectorShuffle(VT, dl, Evens, Odds, ShufMask);
30326 }
30327
30328 assert((VT == MVT::v2i64 || VT == MVT::v4i64 || VT == MVT::v8i64) &&
30329 "Only know how to lower V2I64/V4I64/V8I64 multiply");
30330 assert(!Subtarget.hasDQI() && "DQI should use MULLQ");
30331
30332 // Ahi = psrlqi(a, 32);
30333 // Bhi = psrlqi(b, 32);
30334 //
30335 // AloBlo = pmuludq(a, b);
30336 // AloBhi = pmuludq(a, Bhi);
30337 // AhiBlo = pmuludq(Ahi, b);
30338 //
30339 // Hi = psllqi(AloBhi + AhiBlo, 32);
30340 // return AloBlo + Hi;
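  // Illustrative worked equation (not part of the original source): writing
  // a = Alo + 2^32*Ahi and b = Blo + 2^32*Bhi gives
  //   a*b = Alo*Blo + 2^32*(Alo*Bhi + Ahi*Blo) + 2^64*Ahi*Bhi,
  // and the 2^64 term vanishes modulo 2^64, which is exactly
  // AloBlo + ((AloBhi + AhiBlo) << 32) as computed below.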
30341 KnownBits AKnown = DAG.computeKnownBits(A);
30342 KnownBits BKnown = DAG.computeKnownBits(B);
30343
30344 APInt LowerBitsMask = APInt::getLowBitsSet(64, 32);
30345 bool ALoIsZero = LowerBitsMask.isSubsetOf(AKnown.Zero);
30346 bool BLoIsZero = LowerBitsMask.isSubsetOf(BKnown.Zero);
30347
30348 APInt UpperBitsMask = APInt::getHighBitsSet(64, 32);
30349 bool AHiIsZero = UpperBitsMask.isSubsetOf(AKnown.Zero);
30350 bool BHiIsZero = UpperBitsMask.isSubsetOf(BKnown.Zero);
30351
30352 SDValue Zero = DAG.getConstant(0, dl, VT);
30353
30354 // Only multiply lo/hi halves that aren't known to be zero.
30355 SDValue AloBlo = Zero;
30356 if (!ALoIsZero && !BLoIsZero)
30357 AloBlo = DAG.getNode(X86ISD::PMULUDQ, dl, VT, A, B);
30358
30359 SDValue AloBhi = Zero;
30360 if (!ALoIsZero && !BHiIsZero) {
30361 SDValue Bhi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, B, 32, DAG);
30362 AloBhi = DAG.getNode(X86ISD::PMULUDQ, dl, VT, A, Bhi);
30363 }
30364
30365 SDValue AhiBlo = Zero;
30366 if (!AHiIsZero && !BLoIsZero) {
30367 SDValue Ahi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, A, 32, DAG);
30368 AhiBlo = DAG.getNode(X86ISD::PMULUDQ, dl, VT, Ahi, B);
30369 }
30370
30371 SDValue Hi = DAG.getNode(ISD::ADD, dl, VT, AloBhi, AhiBlo);
30372 Hi = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, Hi, 32, DAG);
30373
30374 return DAG.getNode(ISD::ADD, dl, VT, AloBlo, Hi);
30375}
30376
30377static SDValue LowervXi8MulWithUNPCK(SDValue A, SDValue B, const SDLoc &dl,
30378 MVT VT, bool IsSigned,
30379 const X86Subtarget &Subtarget,
30380 SelectionDAG &DAG,
30381 SDValue *Low = nullptr) {
30382 unsigned NumElts = VT.getVectorNumElements();
30383
30384 // For vXi8 we will unpack the low and high half of each 128 bit lane to widen
30385 // to a vXi16 type. Do the multiplies, shift the results and pack the half
30386 // lane results back together.
30387
30388 // We'll take different approaches for signed and unsigned.
30389 // For unsigned we'll use punpcklbw/punpckhbw to zero extend the bytes
30390 // and use pmullw to calculate the full 16-bit product.
30391 // For signed we'll use punpcklbw/punpckhbw to extend the bytes to words and
30392 // shift them left into the upper byte of each word. This allows us to use
30393 // pmulhw to calculate the full 16-bit product. This trick means we don't
30394 // need to sign extend the bytes to use pmullw.
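// Illustrative worked example (not part of the original source): for signed
// bytes a = -3 and b = 5, the words (a << 8) = 0xFD00 = -768 and
// (b << 8) = 0x0500 = 1280 multiply to -983040 = 0xFFF10000, and the high
// 16 bits that pmulhw returns are 0xFFF1 = -15 = a*b.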
30395
30396 MVT ExVT = MVT::getVectorVT(MVT::i16, NumElts / 2);
30397 SDValue Zero = DAG.getConstant(0, dl, VT);
30398
30399 SDValue ALo, AHi;
30400 if (IsSigned) {
30401 ALo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, Zero, A));
30402 AHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, Zero, A));
30403 } else {
30404 ALo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, A, Zero));
30405 AHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, A, Zero));
30406 }
30407
30408 SDValue BLo, BHi;
30409 if (ISD::isBuildVectorOfConstantSDNodes(B.getNode())) {
30410 // If the RHS is a constant, manually unpackl/unpackh and extend.
30411 SmallVector<SDValue, 16> LoOps, HiOps;
30412 for (unsigned i = 0; i != NumElts; i += 16) {
30413 for (unsigned j = 0; j != 8; ++j) {
30414 SDValue LoOp = B.getOperand(i + j);
30415 SDValue HiOp = B.getOperand(i + j + 8);
30416
30417 if (IsSigned) {
30418 LoOp = DAG.getAnyExtOrTrunc(LoOp, dl, MVT::i16);
30419 HiOp = DAG.getAnyExtOrTrunc(HiOp, dl, MVT::i16);
30420 LoOp = DAG.getNode(ISD::SHL, dl, MVT::i16, LoOp,
30421 DAG.getConstant(8, dl, MVT::i16));
30422 HiOp = DAG.getNode(ISD::SHL, dl, MVT::i16, HiOp,
30423 DAG.getConstant(8, dl, MVT::i16));
30424 } else {
30425 LoOp = DAG.getZExtOrTrunc(LoOp, dl, MVT::i16);
30426 HiOp = DAG.getZExtOrTrunc(HiOp, dl, MVT::i16);
30427 }
30428
30429 LoOps.push_back(LoOp);
30430 HiOps.push_back(HiOp);
30431 }
30432 }
30433
30434 BLo = DAG.getBuildVector(ExVT, dl, LoOps);
30435 BHi = DAG.getBuildVector(ExVT, dl, HiOps);
30436 } else if (IsSigned) {
30437 BLo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, Zero, B));
30438 BHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, Zero, B));
30439 } else {
30440 BLo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, B, Zero));
30441 BHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, B, Zero));
30442 }
30443
30444 // Multiply, lshr the upper 8 bits to the lower 8 bits of the lo/hi results and
30445 // pack back to vXi8.
30446 unsigned MulOpc = IsSigned ? ISD::MULHS : ISD::MUL;
30447 SDValue RLo = DAG.getNode(MulOpc, dl, ExVT, ALo, BLo);
30448 SDValue RHi = DAG.getNode(MulOpc, dl, ExVT, AHi, BHi);
30449
30450 if (Low)
30451 *Low = getPack(DAG, Subtarget, dl, VT, RLo, RHi);
30452
30453 return getPack(DAG, Subtarget, dl, VT, RLo, RHi, /*PackHiHalf*/ true);
30454}
30455
30456static SDValue LowerMULH(SDValue Op, const X86Subtarget &Subtarget,
30457 SelectionDAG &DAG) {
30458 SDLoc dl(Op);
30459 MVT VT = Op.getSimpleValueType();
30460 bool IsSigned = Op->getOpcode() == ISD::MULHS;
30461 unsigned NumElts = VT.getVectorNumElements();
30462 SDValue A = Op.getOperand(0);
30463 SDValue B = Op.getOperand(1);
30464
30465 // Decompose 256-bit ops into 128-bit ops.
30466 if (VT.is256BitVector() && !Subtarget.hasInt256())
30467 return splitVectorIntBinary(Op, DAG);
30468
30469 if ((VT == MVT::v32i16 || VT == MVT::v64i8) && !Subtarget.hasBWI())
30470 return splitVectorIntBinary(Op, DAG);
30471
30472 if (VT == MVT::v4i32 || VT == MVT::v8i32 || VT == MVT::v16i32) {
30473 assert((VT == MVT::v4i32 && Subtarget.hasSSE2()) ||
30474 (VT == MVT::v8i32 && Subtarget.hasInt256()) ||
30475 (VT == MVT::v16i32 && Subtarget.hasAVX512()));
30476
30477 // PMULxD operations multiply each even value (starting at 0) of LHS with
30478 // the related value of RHS and produce a widened result.
30479 // E.g., PMULUDQ <4 x i32> <a|b|c|d>, <4 x i32> <e|f|g|h>
30480 // => <2 x i64> <ae|cg>
30481 //
30482 // In other words, to have all the results, we need to perform two PMULxD:
30483 // 1. one with the even values.
30484 // 2. one with the odd values.
30485 // To achieve #2, we need to place the odd values at an even position.
30486 //
30487 // Place the odd value at an even position (basically, shift all values 1
30488 // step to the left):
30489 const int Mask[] = {1, -1, 3, -1, 5, -1, 7, -1,
30490 9, -1, 11, -1, 13, -1, 15, -1};
30491 // <a|b|c|d> => <b|undef|d|undef>
30492 SDValue Odd0 =
30493 DAG.getVectorShuffle(VT, dl, A, A, ArrayRef(&Mask[0], NumElts));
30494 // <e|f|g|h> => <f|undef|h|undef>
30495 SDValue Odd1 =
30496 DAG.getVectorShuffle(VT, dl, B, B, ArrayRef(&Mask[0], NumElts));
30497
30498 // Emit two multiplies, one for the lower 2 ints and one for the higher 2
30499 // ints.
30500 MVT MulVT = MVT::getVectorVT(MVT::i64, NumElts / 2);
30501 unsigned Opcode =
30502 (IsSigned && Subtarget.hasSSE41()) ? X86ISD::PMULDQ : X86ISD::PMULUDQ;
30503 // PMULUDQ <4 x i32> <a|b|c|d>, <4 x i32> <e|f|g|h>
30504 // => <2 x i64> <ae|cg>
30505 SDValue Mul1 = DAG.getBitcast(VT, DAG.getNode(Opcode, dl, MulVT,
30506 DAG.getBitcast(MulVT, A),
30507 DAG.getBitcast(MulVT, B)));
30508 // PMULUDQ <4 x i32> <b|undef|d|undef>, <4 x i32> <f|undef|h|undef>
30509 // => <2 x i64> <bf|dh>
30510 SDValue Mul2 = DAG.getBitcast(VT, DAG.getNode(Opcode, dl, MulVT,
30511 DAG.getBitcast(MulVT, Odd0),
30512 DAG.getBitcast(MulVT, Odd1)));
30513
30514 // Shuffle it back into the right order.
30515 SmallVector<int, 16> ShufMask(NumElts);
30516 for (int i = 0; i != (int)NumElts; ++i)
30517 ShufMask[i] = (i / 2) * 2 + ((i % 2) * NumElts) + 1;
30518
30519 SDValue Res = DAG.getVectorShuffle(VT, dl, Mul1, Mul2, ShufMask);
30520
30521     // If we have a signed multiply but no PMULDQ, fix up the result of an
30522 // unsigned multiply.
30523 if (IsSigned && !Subtarget.hasSSE41()) {
30524 SDValue Zero = DAG.getConstant(0, dl, VT);
30525 SDValue T1 = DAG.getNode(ISD::AND, dl, VT,
30526 DAG.getSetCC(dl, VT, Zero, A, ISD::SETGT), B);
30527 SDValue T2 = DAG.getNode(ISD::AND, dl, VT,
30528 DAG.getSetCC(dl, VT, Zero, B, ISD::SETGT), A);
30529
30530 SDValue Fixup = DAG.getNode(ISD::ADD, dl, VT, T1, T2);
30531 Res = DAG.getNode(ISD::SUB, dl, VT, Res, Fixup);
30532 }
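    // [Editor's note, not part of the original source] A sketch of why the
    // fixup above recovers MULHS from an unsigned multiply: writing the signed
    // values as a_s = a_u - 2^32*[a < 0] and b_s = b_u - 2^32*[b < 0] gives
    //   a_s*b_s = a_u*b_u - 2^32*(b_u*[a < 0] + a_u*[b < 0]) + 2^64*[a < 0][b < 0],
    // so the high 32 bits, taken modulo 2^32, satisfy
    //   mulhs(a, b) = mulhu(a, b) - (a < 0 ? b : 0) - (b < 0 ? a : 0),
    // which is exactly T1 + T2 (each SETGT mask is all-ones when the operand
    // is negative). Quick check with a = -1, b = 2: mulhu = 1, fixup = 2,
    // and 1 - 2 = -1 = mulhs(-1, 2).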
30533
30534 return Res;
30535 }
30536
30537 // Only i8 vectors should need custom lowering after this.
30538   assert((VT == MVT::v16i8 || (VT == MVT::v32i8 && Subtarget.hasInt256()) ||
30539           (VT == MVT::v64i8 && Subtarget.hasBWI())) &&
30540          "Unsupported vector type");
30541
30542 // Lower v16i8/v32i8 as extension to v8i16/v16i16 vector pairs, multiply,
30543 // logical shift down the upper half and pack back to i8.
30544
30545 // With SSE41 we can use sign/zero extend, but for pre-SSE41 we unpack
30546 // and then ashr/lshr the upper bits down to the lower bits before multiply.
30547
30548 if ((VT == MVT::v16i8 && Subtarget.hasInt256()) ||
30549 (VT == MVT::v32i8 && Subtarget.canExtendTo512BW())) {
30550 MVT ExVT = MVT::getVectorVT(MVT::i16, NumElts);
30551 unsigned ExAVX = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
30552 SDValue ExA = DAG.getNode(ExAVX, dl, ExVT, A);
30553 SDValue ExB = DAG.getNode(ExAVX, dl, ExVT, B);
30554 SDValue Mul = DAG.getNode(ISD::MUL, dl, ExVT, ExA, ExB);
30555 Mul = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExVT, Mul, 8, DAG);
30556 return DAG.getNode(ISD::TRUNCATE, dl, VT, Mul);
30557 }
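  // [Editor's note, not part of the original source] Worked example of the
  // widening path above, unsigned case: a = 200 (0xC8), b = 3. After
  // zero-extension the i16 multiply yields 600 = 0x0258; the logical shift
  // right by 8 leaves 0x02, and the truncate returns mulhu = 2. Signed case:
  // a = -56 (0xC8), b = 3 gives 0xFF58 after the multiply, and bits [15:8]
  // give 0xFF = -1 = mulhs(-56, 3).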
30558
30559 return LowervXi8MulWithUNPCK(A, B, dl, VT, IsSigned, Subtarget, DAG);
30560}
30561
30562// Custom lowering for SMULO/UMULO.
30563static SDValue LowerMULO(SDValue Op, const X86Subtarget &Subtarget,
30564 SelectionDAG &DAG) {
30565 MVT VT = Op.getSimpleValueType();
30566
30567 // Scalars defer to LowerXALUO.
30568 if (!VT.isVector())
30569 return LowerXALUO(Op, DAG);
30570
30571 SDLoc dl(Op);
30572 bool IsSigned = Op->getOpcode() == ISD::SMULO;
30573 SDValue A = Op.getOperand(0);
30574 SDValue B = Op.getOperand(1);
30575 EVT OvfVT = Op->getValueType(1);
30576
30577 if ((VT == MVT::v32i8 && !Subtarget.hasInt256()) ||
30578 (VT == MVT::v64i8 && !Subtarget.hasBWI())) {
30579 // Extract the LHS Lo/Hi vectors
30580 SDValue LHSLo, LHSHi;
30581 std::tie(LHSLo, LHSHi) = splitVector(A, DAG, dl);
30582
30583 // Extract the RHS Lo/Hi vectors
30584 SDValue RHSLo, RHSHi;
30585 std::tie(RHSLo, RHSHi) = splitVector(B, DAG, dl);
30586
30587 EVT LoOvfVT, HiOvfVT;
30588 std::tie(LoOvfVT, HiOvfVT) = DAG.GetSplitDestVTs(OvfVT);
30589 SDVTList LoVTs = DAG.getVTList(LHSLo.getValueType(), LoOvfVT);
30590 SDVTList HiVTs = DAG.getVTList(LHSHi.getValueType(), HiOvfVT);
30591
30592 // Issue the split operations.
30593 SDValue Lo = DAG.getNode(Op.getOpcode(), dl, LoVTs, LHSLo, RHSLo);
30594 SDValue Hi = DAG.getNode(Op.getOpcode(), dl, HiVTs, LHSHi, RHSHi);
30595
30596 // Join the separate data results and the overflow results.
30597 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
30598 SDValue Ovf = DAG.getNode(ISD::CONCAT_VECTORS, dl, OvfVT, Lo.getValue(1),
30599 Hi.getValue(1));
30600
30601 return DAG.getMergeValues({Res, Ovf}, dl);
30602 }
30603
30604 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
30605 EVT SetccVT =
30606 TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
30607
30608 if ((VT == MVT::v16i8 && Subtarget.hasInt256()) ||
30609 (VT == MVT::v32i8 && Subtarget.canExtendTo512BW())) {
30610 unsigned NumElts = VT.getVectorNumElements();
30611 MVT ExVT = MVT::getVectorVT(MVT::i16, NumElts);
30612 unsigned ExAVX = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
30613 SDValue ExA = DAG.getNode(ExAVX, dl, ExVT, A);
30614 SDValue ExB = DAG.getNode(ExAVX, dl, ExVT, B);
30615 SDValue Mul = DAG.getNode(ISD::MUL, dl, ExVT, ExA, ExB);
30616
30617 SDValue Low = DAG.getNode(ISD::TRUNCATE, dl, VT, Mul);
30618
30619 SDValue Ovf;
30620 if (IsSigned) {
30621 SDValue High, LowSign;
30622 if (OvfVT.getVectorElementType() == MVT::i1 &&
30623 (Subtarget.hasBWI() || Subtarget.canExtendTo512DQ())) {
30624         // Rather than truncating, try to do the compare on vXi16 or vXi32.
30625 // Shift the high down filling with sign bits.
30626 High = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, Mul, 8, DAG);
30627 // Fill all 16 bits with the sign bit from the low.
30628 LowSign =
30629 getTargetVShiftByConstNode(X86ISD::VSHLI, dl, ExVT, Mul, 8, DAG);
30630 LowSign = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, LowSign,
30631 15, DAG);
30632 SetccVT = OvfVT;
30633 if (!Subtarget.hasBWI()) {
30634 // We can't do a vXi16 compare so sign extend to v16i32.
30635 High = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v16i32, High);
30636 LowSign = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v16i32, LowSign);
30637 }
30638 } else {
30639 // Otherwise do the compare at vXi8.
30640 High = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExVT, Mul, 8, DAG);
30641 High = DAG.getNode(ISD::TRUNCATE, dl, VT, High);
30642 LowSign =
30643 DAG.getNode(ISD::SRA, dl, VT, Low, DAG.getConstant(7, dl, VT));
30644 }
30645
30646 Ovf = DAG.getSetCC(dl, SetccVT, LowSign, High, ISD::SETNE);
30647 } else {
30648 SDValue High =
30649 getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExVT, Mul, 8, DAG);
30650 if (OvfVT.getVectorElementType() == MVT::i1 &&
30651 (Subtarget.hasBWI() || Subtarget.canExtendTo512DQ())) {
30652         // Rather than truncating, try to do the compare on vXi16 or vXi32.
30653 SetccVT = OvfVT;
30654 if (!Subtarget.hasBWI()) {
30655 // We can't do a vXi16 compare so sign extend to v16i32.
30656 High = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v16i32, High);
30657 }
30658 } else {
30659 // Otherwise do the compare at vXi8.
30660 High = DAG.getNode(ISD::TRUNCATE, dl, VT, High);
30661 }
30662
30663 Ovf =
30664 DAG.getSetCC(dl, SetccVT, High,
30665 DAG.getConstant(0, dl, High.getValueType()), ISD::SETNE);
30666 }
30667
30668 Ovf = DAG.getSExtOrTrunc(Ovf, dl, OvfVT);
30669
30670 return DAG.getMergeValues({Low, Ovf}, dl);
30671 }
30672
30673 SDValue Low;
30674 SDValue High =
30675 LowervXi8MulWithUNPCK(A, B, dl, VT, IsSigned, Subtarget, DAG, &Low);
30676
30677 SDValue Ovf;
30678 if (IsSigned) {
30679 // SMULO overflows if the high bits don't match the sign of the low.
30680 SDValue LowSign =
30681 DAG.getNode(ISD::SRA, dl, VT, Low, DAG.getConstant(7, dl, VT));
30682 Ovf = DAG.getSetCC(dl, SetccVT, LowSign, High, ISD::SETNE);
30683 } else {
30684 // UMULO overflows if the high bits are non-zero.
30685 Ovf =
30686 DAG.getSetCC(dl, SetccVT, High, DAG.getConstant(0, dl, VT), ISD::SETNE);
30687 }
30688
30689 Ovf = DAG.getSExtOrTrunc(Ovf, dl, OvfVT);
30690
30691 return DAG.getMergeValues({Low, Ovf}, dl);
30692}
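// [Editor's note, not part of the original source] The overflow tests above
// follow the usual rule: for UMULO the product overflows iff its high half is
// non-zero; for SMULO it overflows iff the high half differs from the
// sign-extension of the low half. E.g. for i8, 100 * 2 = 200 = 0x00C8: the
// low byte 0xC8 is -56, whose sign-extension would be 0xFF, but the high byte
// is 0x00, so the signed multiply overflows while the unsigned one does not.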
30693
30694SDValue X86TargetLowering::LowerWin64_i128OP(SDValue Op, SelectionDAG &DAG) const {
30695   assert(Subtarget.isTargetWin64() && "Unexpected target");
30696 EVT VT = Op.getValueType();
30697   assert(VT.isInteger() && VT.getSizeInBits() == 128 &&
30698          "Unexpected return type for lowering");
30699
30700 if (isa<ConstantSDNode>(Op->getOperand(1))) {
30701 SmallVector<SDValue> Result;
30702 if (expandDIVREMByConstant(Op.getNode(), Result, MVT::i64, DAG))
30703 return DAG.getNode(ISD::BUILD_PAIR, SDLoc(Op), VT, Result[0], Result[1]);
30704 }
30705
30706 RTLIB::Libcall LC;
30707 bool isSigned;
30708 switch (Op->getOpcode()) {
30709   default: llvm_unreachable("Unexpected request for libcall!");
30710 case ISD::SDIV: isSigned = true; LC = RTLIB::SDIV_I128; break;
30711 case ISD::UDIV: isSigned = false; LC = RTLIB::UDIV_I128; break;
30712 case ISD::SREM: isSigned = true; LC = RTLIB::SREM_I128; break;
30713 case ISD::UREM: isSigned = false; LC = RTLIB::UREM_I128; break;
30714 }
30715
30716 SDLoc dl(Op);
30717 SDValue InChain = DAG.getEntryNode();
30718
30719 TargetLowering::ArgListTy Args;
30720 TargetLowering::ArgListEntry Entry;
30721 for (unsigned i = 0, e = Op->getNumOperands(); i != e; ++i) {
30722 EVT ArgVT = Op->getOperand(i).getValueType();
30723     assert(ArgVT.isInteger() && ArgVT.getSizeInBits() == 128 &&
30724            "Unexpected argument type for lowering");
30725 SDValue StackPtr = DAG.CreateStackTemporary(ArgVT, 16);
30726 int SPFI = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
30727 MachinePointerInfo MPI =
30728 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SPFI);
30729 Entry.Node = StackPtr;
30730 InChain =
30731 DAG.getStore(InChain, dl, Op->getOperand(i), StackPtr, MPI, Align(16));
30732 Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
30733 Entry.Ty = PointerType::get(ArgTy,0);
30734 Entry.IsSExt = false;
30735 Entry.IsZExt = false;
30736 Args.push_back(Entry);
30737 }
30738
30739 SDValue Callee = DAG.getExternalSymbol(getLibcallName(LC),
30740 getPointerTy(DAG.getDataLayout()));
30741
30742 TargetLowering::CallLoweringInfo CLI(DAG);
30743 CLI.setDebugLoc(dl)
30744 .setChain(InChain)
30745 .setLibCallee(
30746 getLibcallCallingConv(LC),
30747 static_cast<EVT>(MVT::v2i64).getTypeForEVT(*DAG.getContext()), Callee,
30748 std::move(Args))
30749 .setInRegister()
30750 .setSExtResult(isSigned)
30751 .setZExtResult(!isSigned);
30752
30753 std::pair<SDValue, SDValue> CallInfo = LowerCallTo(CLI);
30754 return DAG.getBitcast(VT, CallInfo.first);
30755}
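// [Editor's note, not part of the original source] Sketch of the convention
// modeled above: each i128 operand is spilled to a 16-byte-aligned stack slot
// and the libcall (e.g. __divti3 for RTLIB::SDIV_I128, assuming the usual
// compiler-rt/libgcc helper names) receives pointers to those slots, while
// the 128-bit result comes back in XMM0, which is why the call is typed as
// v2i64 and then bitcast back to the original i128 type.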
30756
30757SDValue X86TargetLowering::LowerWin64_FP_TO_INT128(SDValue Op,
30758 SelectionDAG &DAG,
30759 SDValue &Chain) const {
30760   assert(Subtarget.isTargetWin64() && "Unexpected target");
30761 EVT VT = Op.getValueType();
30762 bool IsStrict = Op->isStrictFPOpcode();
30763
30764 SDValue Arg = Op.getOperand(IsStrict ? 1 : 0);
30765 EVT ArgVT = Arg.getValueType();
30766
30767   assert(VT.isInteger() && VT.getSizeInBits() == 128 &&
30768          "Unexpected return type for lowering");
30769
30770 RTLIB::Libcall LC;
30771 if (Op->getOpcode() == ISD::FP_TO_SINT ||
30772 Op->getOpcode() == ISD::STRICT_FP_TO_SINT)
30773 LC = RTLIB::getFPTOSINT(ArgVT, VT);
30774 else
30775 LC = RTLIB::getFPTOUINT(ArgVT, VT);
30776   assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unexpected request for libcall!");
30777
30778 SDLoc dl(Op);
30779 MakeLibCallOptions CallOptions;
30780 Chain = IsStrict ? Op.getOperand(0) : DAG.getEntryNode();
30781
30782 SDValue Result;
30783 // Expect the i128 argument returned as a v2i64 in xmm0, cast back to the
30784 // expected VT (i128).
30785 std::tie(Result, Chain) =
30786 makeLibCall(DAG, LC, MVT::v2i64, Arg, CallOptions, dl, Chain);
30787 Result = DAG.getBitcast(VT, Result);
30788 return Result;
30789}
30790
30791SDValue X86TargetLowering::LowerWin64_INT128_TO_FP(SDValue Op,
30792 SelectionDAG &DAG) const {
30793   assert(Subtarget.isTargetWin64() && "Unexpected target");
30794 EVT VT = Op.getValueType();
30795 bool IsStrict = Op->isStrictFPOpcode();
30796
30797 SDValue Arg = Op.getOperand(IsStrict ? 1 : 0);
30798 EVT ArgVT = Arg.getValueType();
30799
30800   assert(ArgVT.isInteger() && ArgVT.getSizeInBits() == 128 &&
30801          "Unexpected argument type for lowering");
30802
30803 RTLIB::Libcall LC;
30804 if (Op->getOpcode() == ISD::SINT_TO_FP ||
30805 Op->getOpcode() == ISD::STRICT_SINT_TO_FP)
30806 LC = RTLIB::getSINTTOFP(ArgVT, VT);
30807 else
30808 LC = RTLIB::getUINTTOFP(ArgVT, VT);
30809   assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unexpected request for libcall!");
30810
30811 SDLoc dl(Op);
30812 MakeLibCallOptions CallOptions;
30813 SDValue Chain = IsStrict ? Op.getOperand(0) : DAG.getEntryNode();
30814
30815 // Pass the i128 argument as an indirect argument on the stack.
30816 SDValue StackPtr = DAG.CreateStackTemporary(ArgVT, 16);
30817 int SPFI = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
30818 MachinePointerInfo MPI =
30819 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SPFI);
30820 Chain = DAG.getStore(Chain, dl, Arg, StackPtr, MPI, Align(16));
30821
30822 SDValue Result;
30823 std::tie(Result, Chain) =
30824 makeLibCall(DAG, LC, VT, StackPtr, CallOptions, dl, Chain);
30825 return IsStrict ? DAG.getMergeValues({Result, Chain}, dl) : Result;
30826}
30827
30828// Return true if the required (according to Opcode) shift-imm form is natively
30829 // supported by the Subtarget.
30830static bool supportedVectorShiftWithImm(MVT VT, const X86Subtarget &Subtarget,
30831 unsigned Opcode) {
30832 if (!(VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()))
30833 return false;
30834
30835 if (VT.getScalarSizeInBits() < 16)
30836 return false;
30837
30838 if (VT.is512BitVector() && Subtarget.useAVX512Regs() &&
30839 (VT.getScalarSizeInBits() > 16 || Subtarget.hasBWI()))
30840 return true;
30841
30842 bool LShift = (VT.is128BitVector() && Subtarget.hasSSE2()) ||
30843 (VT.is256BitVector() && Subtarget.hasInt256());
30844
30845 bool AShift = LShift && (Subtarget.hasAVX512() ||
30846 (VT != MVT::v2i64 && VT != MVT::v4i64));
30847 return (Opcode == ISD::SRA) ? AShift : LShift;
30848}
30849
30850// The shift amount is a variable, but it is the same for all vector lanes.
30851// These instructions are defined together with shift-immediate.
30852static
30853bool supportedVectorShiftWithBaseAmnt(MVT VT, const X86Subtarget &Subtarget,
30854 unsigned Opcode) {
30855 return supportedVectorShiftWithImm(VT, Subtarget, Opcode);
30856}
30857
30858// Return true if the required (according to Opcode) variable-shift form is
30859 // natively supported by the Subtarget.
30860static bool supportedVectorVarShift(MVT VT, const X86Subtarget &Subtarget,
30861 unsigned Opcode) {
30862 if (!(VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()))
30863 return false;
30864
30865 if (!Subtarget.hasInt256() || VT.getScalarSizeInBits() < 16)
30866 return false;
30867
30868 // vXi16 supported only on AVX-512, BWI
30869 if (VT.getScalarSizeInBits() == 16 && !Subtarget.hasBWI())
30870 return false;
30871
30872 if (Subtarget.hasAVX512() &&
30873 (Subtarget.useAVX512Regs() || !VT.is512BitVector()))
30874 return true;
30875
30876 bool LShift = VT.is128BitVector() || VT.is256BitVector();
30877 bool AShift = LShift && VT != MVT::v2i64 && VT != MVT::v4i64;
30878 return (Opcode == ISD::SRA) ? AShift : LShift;
30879}
30880
30881static SDValue LowerShiftByScalarImmediate(SDValue Op, SelectionDAG &DAG,
30882 const X86Subtarget &Subtarget) {
30883 MVT VT = Op.getSimpleValueType();
30884 SDLoc dl(Op);
30885 SDValue R = Op.getOperand(0);
30886 SDValue Amt = Op.getOperand(1);
30887 unsigned X86Opc = getTargetVShiftUniformOpcode(Op.getOpcode(), false);
30888
30889 auto ArithmeticShiftRight64 = [&](uint64_t ShiftAmt) {
30890     assert((VT == MVT::v2i64 || VT == MVT::v4i64) && "Unexpected SRA type");
30891 MVT ExVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() * 2);
30892 SDValue Ex = DAG.getBitcast(ExVT, R);
30893
30894 // ashr(R, 63) === cmp_slt(R, 0)
30895 if (ShiftAmt == 63 && Subtarget.hasSSE42()) {
30896       assert((VT != MVT::v4i64 || Subtarget.hasInt256()) &&
30897              "Unsupported PCMPGT op");
30898 return DAG.getNode(X86ISD::PCMPGT, dl, VT, DAG.getConstant(0, dl, VT), R);
30899 }
30900
30901 if (ShiftAmt >= 32) {
30902 // Splat sign to upper i32 dst, and SRA upper i32 src to lower i32.
30903 SDValue Upper =
30904 getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, Ex, 31, DAG);
30905 SDValue Lower = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, Ex,
30906 ShiftAmt - 32, DAG);
30907 if (VT == MVT::v2i64)
30908 Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower, {5, 1, 7, 3});
30909 if (VT == MVT::v4i64)
30910 Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower,
30911 {9, 1, 11, 3, 13, 5, 15, 7});
30912 } else {
30913 // SRA upper i32, SRL whole i64 and select lower i32.
30914 SDValue Upper = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, Ex,
30915 ShiftAmt, DAG);
30916 SDValue Lower =
30917 getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, R, ShiftAmt, DAG);
30918 Lower = DAG.getBitcast(ExVT, Lower);
30919 if (VT == MVT::v2i64)
30920 Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower, {4, 1, 6, 3});
30921 if (VT == MVT::v4i64)
30922 Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower,
30923 {8, 1, 10, 3, 12, 5, 14, 7});
30924 }
30925 return DAG.getBitcast(VT, Ex);
30926 };
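  // [Editor's note, not part of the original source] Sketch of the lambda
  // above on one i64 element <lo, hi>: for ShiftAmt >= 32 the result is
  // <hi >>s (ShiftAmt - 32), hi >>s 31>, i.e. the arithmetically shifted high
  // half plus a sign splat; for ShiftAmt < 32 the low 32 bits come from the
  // logical i64 shift (which pulls bits of hi down) and the high 32 bits from
  // hi >>s ShiftAmt, with the shuffles selecting those halves per element.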
30927
30928 // Optimize shl/srl/sra with constant shift amount.
30929 APInt APIntShiftAmt;
30930 if (!X86::isConstantSplat(Amt, APIntShiftAmt))
30931 return SDValue();
30932
30933 // If the shift amount is out of range, return undef.
30934 if (APIntShiftAmt.uge(VT.getScalarSizeInBits()))
30935 return DAG.getUNDEF(VT);
30936
30937 uint64_t ShiftAmt = APIntShiftAmt.getZExtValue();
30938
30939 if (supportedVectorShiftWithImm(VT, Subtarget, Op.getOpcode())) {
30940     // Hardware support for vector shifts is sparse, which makes us scalarize the
30941     // vector operations in many cases. Also, on Sandy Bridge ADD is faster than
30942 // shl: (shl V, 1) -> (add (freeze V), (freeze V))
30943 if (Op.getOpcode() == ISD::SHL && ShiftAmt == 1) {
30944 // R may be undef at run-time, but (shl R, 1) must be an even number (LSB
30945 // must be 0). (add undef, undef) however can be any value. To make this
30946 // safe, we must freeze R to ensure that register allocation uses the same
30947 // register for an undefined value. This ensures that the result will
30948 // still be even and preserves the original semantics.
30949 R = DAG.getFreeze(R);
30950 return DAG.getNode(ISD::ADD, dl, VT, R, R);
30951 }
30952
30953 return getTargetVShiftByConstNode(X86Opc, dl, VT, R, ShiftAmt, DAG);
30954 }
30955
30956 // i64 SRA needs to be performed as partial shifts.
30957 if (((!Subtarget.hasXOP() && VT == MVT::v2i64) ||
30958 (Subtarget.hasInt256() && VT == MVT::v4i64)) &&
30959 Op.getOpcode() == ISD::SRA)
30960 return ArithmeticShiftRight64(ShiftAmt);
30961
30962 if (VT == MVT::v16i8 || (Subtarget.hasInt256() && VT == MVT::v32i8) ||
30963 (Subtarget.hasBWI() && VT == MVT::v64i8)) {
30964 unsigned NumElts = VT.getVectorNumElements();
30965 MVT ShiftVT = MVT::getVectorVT(MVT::i16, NumElts / 2);
30966
30967 // Simple i8 add case
30968 if (Op.getOpcode() == ISD::SHL && ShiftAmt == 1) {
30969 // R may be undef at run-time, but (shl R, 1) must be an even number (LSB
30970 // must be 0). (add undef, undef) however can be any value. To make this
30971 // safe, we must freeze R to ensure that register allocation uses the same
30972 // register for an undefined value. This ensures that the result will
30973 // still be even and preserves the original semantics.
30974 R = DAG.getFreeze(R);
30975 return DAG.getNode(ISD::ADD, dl, VT, R, R);
30976 }
30977
30978 // ashr(R, 7) === cmp_slt(R, 0)
30979 if (Op.getOpcode() == ISD::SRA && ShiftAmt == 7) {
30980 SDValue Zeros = DAG.getConstant(0, dl, VT);
30981 if (VT.is512BitVector()) {
30982         assert(VT == MVT::v64i8 && "Unexpected element type!");
30983 SDValue CMP = DAG.getSetCC(dl, MVT::v64i1, Zeros, R, ISD::SETGT);
30984 return DAG.getNode(ISD::SIGN_EXTEND, dl, VT, CMP);
30985 }
30986 return DAG.getNode(X86ISD::PCMPGT, dl, VT, Zeros, R);
30987 }
30988
30989 // XOP can shift v16i8 directly instead of as shift v8i16 + mask.
30990 if (VT == MVT::v16i8 && Subtarget.hasXOP())
30991 return SDValue();
30992
30993 if (Op.getOpcode() == ISD::SHL) {
30994 // Make a large shift.
30995 SDValue SHL = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, ShiftVT, R,
30996 ShiftAmt, DAG);
30997 SHL = DAG.getBitcast(VT, SHL);
30998 // Zero out the rightmost bits.
30999 APInt Mask = APInt::getHighBitsSet(8, 8 - ShiftAmt);
31000 return DAG.getNode(ISD::AND, dl, VT, SHL, DAG.getConstant(Mask, dl, VT));
31001 }
31002 if (Op.getOpcode() == ISD::SRL) {
31003 // Make a large shift.
31004 SDValue SRL = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ShiftVT, R,
31005 ShiftAmt, DAG);
31006 SRL = DAG.getBitcast(VT, SRL);
31007 // Zero out the leftmost bits.
31008 APInt Mask = APInt::getLowBitsSet(8, 8 - ShiftAmt);
31009 return DAG.getNode(ISD::AND, dl, VT, SRL, DAG.getConstant(Mask, dl, VT));
31010 }
31011 if (Op.getOpcode() == ISD::SRA) {
31012 // ashr(R, Amt) === sub(xor(lshr(R, Amt), Mask), Mask)
31013 SDValue Res = DAG.getNode(ISD::SRL, dl, VT, R, Amt);
31014
31015 SDValue Mask = DAG.getConstant(128 >> ShiftAmt, dl, VT);
31016 Res = DAG.getNode(ISD::XOR, dl, VT, Res, Mask);
31017 Res = DAG.getNode(ISD::SUB, dl, VT, Res, Mask);
31018 return Res;
31019 }
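      // [Editor's note, not part of the original source] Worked example of the
      // xor/sub identity above for i8, ShiftAmt = 4, R = 0x90 (-112): the
      // logical shift gives 0x09, Mask = 128 >> 4 = 0x08, 0x09 ^ 0x08 = 0x01,
      // and 0x01 - 0x08 = 0xF9 = -7 = (-112) >> 4 arithmetically. The xor/sub
      // pair sign-extends from bit (7 - ShiftAmt), turning the logical shift
      // into an arithmetic one.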
31020     llvm_unreachable("Unknown shift opcode.");
31021 }
31022
31023 return SDValue();
31024}
31025
31026static SDValue LowerShiftByScalarVariable(SDValue Op, SelectionDAG &DAG,
31027 const X86Subtarget &Subtarget) {
31028 MVT VT = Op.getSimpleValueType();
31029 SDLoc dl(Op);
31030 SDValue R = Op.getOperand(0);
31031 SDValue Amt = Op.getOperand(1);
31032 unsigned Opcode = Op.getOpcode();
31033 unsigned X86OpcI = getTargetVShiftUniformOpcode(Opcode, false);
31034
31035 int BaseShAmtIdx = -1;
31036 if (SDValue BaseShAmt = DAG.getSplatSourceVector(Amt, BaseShAmtIdx)) {
31037 if (supportedVectorShiftWithBaseAmnt(VT, Subtarget, Opcode))
31038 return getTargetVShiftNode(X86OpcI, dl, VT, R, BaseShAmt, BaseShAmtIdx,
31039 Subtarget, DAG);
31040
31041 // vXi8 shifts - shift as v8i16 + mask result.
31042 if (((VT == MVT::v16i8 && !Subtarget.canExtendTo512DQ()) ||
31043 (VT == MVT::v32i8 && !Subtarget.canExtendTo512BW()) ||
31044 VT == MVT::v64i8) &&
31045 !Subtarget.hasXOP()) {
31046 unsigned NumElts = VT.getVectorNumElements();
31047 MVT ExtVT = MVT::getVectorVT(MVT::i16, NumElts / 2);
31048 if (supportedVectorShiftWithBaseAmnt(ExtVT, Subtarget, Opcode)) {
31049 unsigned LogicalOp = (Opcode == ISD::SHL ? ISD::SHL : ISD::SRL);
31050 unsigned LogicalX86Op = getTargetVShiftUniformOpcode(LogicalOp, false);
31051
31052 // Create the mask using vXi16 shifts. For shift-rights we need to move
31053 // the upper byte down before splatting the vXi8 mask.
31054 SDValue BitMask = DAG.getConstant(-1, dl, ExtVT);
31055 BitMask = getTargetVShiftNode(LogicalX86Op, dl, ExtVT, BitMask,
31056 BaseShAmt, BaseShAmtIdx, Subtarget, DAG);
31057 if (Opcode != ISD::SHL)
31058 BitMask = getTargetVShiftByConstNode(LogicalX86Op, dl, ExtVT, BitMask,
31059 8, DAG);
31060 BitMask = DAG.getBitcast(VT, BitMask);
31061 BitMask = DAG.getVectorShuffle(VT, dl, BitMask, BitMask,
31062 SmallVector<int, 64>(NumElts, 0));
31063
31064 SDValue Res = getTargetVShiftNode(LogicalX86Op, dl, ExtVT,
31065 DAG.getBitcast(ExtVT, R), BaseShAmt,
31066 BaseShAmtIdx, Subtarget, DAG);
31067 Res = DAG.getBitcast(VT, Res);
31068 Res = DAG.getNode(ISD::AND, dl, VT, Res, BitMask);
31069
31070 if (Opcode == ISD::SRA) {
31071 // ashr(R, Amt) === sub(xor(lshr(R, Amt), SignMask), SignMask)
31072 // SignMask = lshr(SignBit, Amt) - safe to do this with PSRLW.
31073 SDValue SignMask = DAG.getConstant(0x8080, dl, ExtVT);
31074 SignMask =
31075 getTargetVShiftNode(LogicalX86Op, dl, ExtVT, SignMask, BaseShAmt,
31076 BaseShAmtIdx, Subtarget, DAG);
31077 SignMask = DAG.getBitcast(VT, SignMask);
31078 Res = DAG.getNode(ISD::XOR, dl, VT, Res, SignMask);
31079 Res = DAG.getNode(ISD::SUB, dl, VT, Res, SignMask);
31080 }
31081 return Res;
31082 }
31083 }
31084 }
31085
31086 return SDValue();
31087}
31088
31089// Convert a shift/rotate left amount to a multiplication scale factor.
31090static SDValue convertShiftLeftToScale(SDValue Amt, const SDLoc &dl,
31091 const X86Subtarget &Subtarget,
31092 SelectionDAG &DAG) {
31093 MVT VT = Amt.getSimpleValueType();
31094 if (!(VT == MVT::v8i16 || VT == MVT::v4i32 ||
31095 (Subtarget.hasInt256() && VT == MVT::v16i16) ||
31096 (Subtarget.hasAVX512() && VT == MVT::v32i16) ||
31097 (!Subtarget.hasAVX512() && VT == MVT::v16i8) ||
31098 (Subtarget.hasInt256() && VT == MVT::v32i8) ||
31099 (Subtarget.hasBWI() && VT == MVT::v64i8)))
31100 return SDValue();
31101
31102 MVT SVT = VT.getVectorElementType();
31103 unsigned SVTBits = SVT.getSizeInBits();
31104 unsigned NumElems = VT.getVectorNumElements();
31105
31106 APInt UndefElts;
31107 SmallVector<APInt> EltBits;
31108 if (getTargetConstantBitsFromNode(Amt, SVTBits, UndefElts, EltBits)) {
31109 APInt One(SVTBits, 1);
31110 SmallVector<SDValue> Elts(NumElems, DAG.getUNDEF(SVT));
31111 for (unsigned I = 0; I != NumElems; ++I) {
31112 if (UndefElts[I] || EltBits[I].uge(SVTBits))
31113 continue;
31114 uint64_t ShAmt = EltBits[I].getZExtValue();
31115 Elts[I] = DAG.getConstant(One.shl(ShAmt), dl, SVT);
31116 }
31117 return DAG.getBuildVector(VT, dl, Elts);
31118 }
31119
31120 // If the target doesn't support variable shifts, use either FP conversion
31121 // or integer multiplication to avoid shifting each element individually.
31122 if (VT == MVT::v4i32) {
31123 Amt = DAG.getNode(ISD::SHL, dl, VT, Amt, DAG.getConstant(23, dl, VT));
31124 Amt = DAG.getNode(ISD::ADD, dl, VT, Amt,
31125 DAG.getConstant(0x3f800000U, dl, VT));
31126 Amt = DAG.getBitcast(MVT::v4f32, Amt);
31127 return DAG.getNode(ISD::FP_TO_SINT, dl, VT, Amt);
31128 }
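    // [Editor's note, not part of the original source] Why the v4i32 path
    // above works: (Amt << 23) + 0x3f800000 builds the IEEE-754 single with
    // exponent 127 + Amt and a zero mantissa, i.e. the float 2^Amt, so
    // FP_TO_SINT recovers the integer scale 2^Amt. E.g. Amt = 5 gives
    // 0x02800000 + 0x3f800000 = 0x42000000 = 32.0f -> 32. For Amt = 31 the
    // conversion is out of the signed i32 range, but x86's CVTTPS2DQ returns
    // 0x80000000, which still multiplies as 2^31 modulo 2^32 (an assumption
    // about the target conversion behavior, not spelled out here).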
31129
31130 // AVX2 can more effectively perform this as a zext/trunc to/from v8i32.
31131 if (VT == MVT::v8i16 && !Subtarget.hasAVX2()) {
31132 SDValue Z = DAG.getConstant(0, dl, VT);
31133 SDValue Lo = DAG.getBitcast(MVT::v4i32, getUnpackl(DAG, dl, VT, Amt, Z));
31134 SDValue Hi = DAG.getBitcast(MVT::v4i32, getUnpackh(DAG, dl, VT, Amt, Z));
31135 Lo = convertShiftLeftToScale(Lo, dl, Subtarget, DAG);
31136 Hi = convertShiftLeftToScale(Hi, dl, Subtarget, DAG);
31137 if (Subtarget.hasSSE41())
31138 return DAG.getNode(X86ISD::PACKUS, dl, VT, Lo, Hi);
31139 return getPack(DAG, Subtarget, dl, VT, Lo, Hi);
31140 }
31141
31142 return SDValue();
31143}
31144
31145static SDValue LowerShift(SDValue Op, const X86Subtarget &Subtarget,
31146 SelectionDAG &DAG) {
31147 MVT VT = Op.getSimpleValueType();
31148 SDLoc dl(Op);
31149 SDValue R = Op.getOperand(0);
31150 SDValue Amt = Op.getOperand(1);
31151 unsigned EltSizeInBits = VT.getScalarSizeInBits();
31152 bool ConstantAmt = ISD::isBuildVectorOfConstantSDNodes(Amt.getNode());
31153
31154 unsigned Opc = Op.getOpcode();
31155 unsigned X86OpcV = getTargetVShiftUniformOpcode(Opc, true);
31156 unsigned X86OpcI = getTargetVShiftUniformOpcode(Opc, false);
31157
31158   assert(VT.isVector() && "Custom lowering only for vector shifts!");
31159   assert(Subtarget.hasSSE2() && "Only custom lower when we have SSE2!");
31160
31161 if (SDValue V = LowerShiftByScalarImmediate(Op, DAG, Subtarget))
31162 return V;
31163
31164 if (SDValue V = LowerShiftByScalarVariable(Op, DAG, Subtarget))
31165 return V;
31166
31167 if (supportedVectorVarShift(VT, Subtarget, Opc))
31168 return Op;
31169
31170 // i64 vector arithmetic shift can be emulated with the transform:
31171 // M = lshr(SIGN_MASK, Amt)
31172 // ashr(R, Amt) === sub(xor(lshr(R, Amt), M), M)
31173 if (((VT == MVT::v2i64 && !Subtarget.hasXOP()) ||
31174 (VT == MVT::v4i64 && Subtarget.hasInt256())) &&
31175 Opc == ISD::SRA) {
31176 SDValue S = DAG.getConstant(APInt::getSignMask(64), dl, VT);
31177 SDValue M = DAG.getNode(ISD::SRL, dl, VT, S, Amt);
31178 R = DAG.getNode(ISD::SRL, dl, VT, R, Amt);
31179 R = DAG.getNode(ISD::XOR, dl, VT, R, M);
31180 R = DAG.getNode(ISD::SUB, dl, VT, R, M);
31181 return R;
31182 }
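  // [Editor's note, not part of the original source] This is the same xor/sub
  // sign-extension trick used for the vXi8 immediate case: M = SIGN_MASK >> Amt
  // is the logically shifted-down sign bit, so xor-ing it into lshr(R, Amt)
  // and subtracting it back sign-extends the result from bit (63 - Amt).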
31183
31184 // XOP has 128-bit variable logical/arithmetic shifts.
31185 // +ve/-ve Amt = shift left/right.
31186 if (Subtarget.hasXOP() && (VT == MVT::v2i64 || VT == MVT::v4i32 ||
31187 VT == MVT::v8i16 || VT == MVT::v16i8)) {
31188 if (Opc == ISD::SRL || Opc == ISD::SRA) {
31189 SDValue Zero = DAG.getConstant(0, dl, VT);
31190 Amt = DAG.getNode(ISD::SUB, dl, VT, Zero, Amt);
31191 }
31192 if (Opc == ISD::SHL || Opc == ISD::SRL)
31193 return DAG.getNode(X86ISD::VPSHL, dl, VT, R, Amt);
31194 if (Opc == ISD::SRA)
31195 return DAG.getNode(X86ISD::VPSHA, dl, VT, R, Amt);
31196 }
31197
31198 // 2i64 vector logical shifts can efficiently avoid scalarization - do the
31199 // shifts per-lane and then shuffle the partial results back together.
31200 if (VT == MVT::v2i64 && Opc != ISD::SRA) {
31201 // Splat the shift amounts so the scalar shifts above will catch it.
31202 SDValue Amt0 = DAG.getVectorShuffle(VT, dl, Amt, Amt, {0, 0});
31203 SDValue Amt1 = DAG.getVectorShuffle(VT, dl, Amt, Amt, {1, 1});
31204 SDValue R0 = DAG.getNode(Opc, dl, VT, R, Amt0);
31205 SDValue R1 = DAG.getNode(Opc, dl, VT, R, Amt1);
31206 return DAG.getVectorShuffle(VT, dl, R0, R1, {0, 3});
31207 }
31208
31209 // If possible, lower this shift as a sequence of two shifts by
31210 // constant plus a BLENDing shuffle instead of scalarizing it.
31211 // Example:
31212 // (v4i32 (srl A, (build_vector < X, Y, Y, Y>)))
31213 //
31214 // Could be rewritten as:
31215 // (v4i32 (MOVSS (srl A, <Y,Y,Y,Y>), (srl A, <X,X,X,X>)))
31216 //
31217 // The advantage is that the two shifts from the example would be
31218 // lowered as X86ISD::VSRLI nodes in parallel before blending.
31219 if (ConstantAmt && (VT == MVT::v8i16 || VT == MVT::v4i32 ||
31220 (VT == MVT::v16i16 && Subtarget.hasInt256()))) {
31221 SDValue Amt1, Amt2;
31222 unsigned NumElts = VT.getVectorNumElements();
31223 SmallVector<int, 8> ShuffleMask;
31224 for (unsigned i = 0; i != NumElts; ++i) {
31225 SDValue A = Amt->getOperand(i);
31226 if (A.isUndef()) {
31227 ShuffleMask.push_back(SM_SentinelUndef);
31228 continue;
31229 }
31230 if (!Amt1 || Amt1 == A) {
31231 ShuffleMask.push_back(i);
31232 Amt1 = A;
31233 continue;
31234 }
31235 if (!Amt2 || Amt2 == A) {
31236 ShuffleMask.push_back(i + NumElts);
31237 Amt2 = A;
31238 continue;
31239 }
31240 break;
31241 }
31242
31243 // Only perform this blend if we can perform it without loading a mask.
31244 if (ShuffleMask.size() == NumElts && Amt1 && Amt2 &&
31245 (VT != MVT::v16i16 ||
31246 is128BitLaneRepeatedShuffleMask(VT, ShuffleMask)) &&
31247 (VT == MVT::v4i32 || Subtarget.hasSSE41() || Opc != ISD::SHL ||
31248 canWidenShuffleElements(ShuffleMask))) {
31249 auto *Cst1 = dyn_cast<ConstantSDNode>(Amt1);
31250 auto *Cst2 = dyn_cast<ConstantSDNode>(Amt2);
31251 if (Cst1 && Cst2 && Cst1->getAPIntValue().ult(EltSizeInBits) &&
31252 Cst2->getAPIntValue().ult(EltSizeInBits)) {
31253 SDValue Shift1 = getTargetVShiftByConstNode(X86OpcI, dl, VT, R,
31254 Cst1->getZExtValue(), DAG);
31255 SDValue Shift2 = getTargetVShiftByConstNode(X86OpcI, dl, VT, R,
31256 Cst2->getZExtValue(), DAG);
31257 return DAG.getVectorShuffle(VT, dl, Shift1, Shift2, ShuffleMask);
31258 }
31259 }
31260 }
31261
31262 // If possible, lower this packed shift into a vector multiply instead of
31263 // expanding it into a sequence of scalar shifts.
31264 // For v32i8 cases, it might be quicker to split/extend to vXi16 shifts.
31265 if (Opc == ISD::SHL && !(VT == MVT::v32i8 && (Subtarget.hasXOP() ||
31266 Subtarget.canExtendTo512BW())))
31267 if (SDValue Scale = convertShiftLeftToScale(Amt, dl, Subtarget, DAG))
31268 return DAG.getNode(ISD::MUL, dl, VT, R, Scale);
31269
31270 // Constant ISD::SRL can be performed efficiently on vXi16 vectors as we
31271 // can replace with ISD::MULHU, creating scale factor from (NumEltBits - Amt).
31272 if (Opc == ISD::SRL && ConstantAmt &&
31273 (VT == MVT::v8i16 || (VT == MVT::v16i16 && Subtarget.hasInt256()))) {
31274 SDValue EltBits = DAG.getConstant(EltSizeInBits, dl, VT);
31275 SDValue RAmt = DAG.getNode(ISD::SUB, dl, VT, EltBits, Amt);
31276 if (SDValue Scale = convertShiftLeftToScale(RAmt, dl, Subtarget, DAG)) {
31277 SDValue Zero = DAG.getConstant(0, dl, VT);
31278 SDValue ZAmt = DAG.getSetCC(dl, VT, Amt, Zero, ISD::SETEQ);
31279 SDValue Res = DAG.getNode(ISD::MULHU, dl, VT, R, Scale);
31280 return DAG.getSelect(dl, VT, ZAmt, R, Res);
31281 }
31282 }
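  // [Editor's note, not part of the original source] The MULHU rewrite above
  // relies on srl(x, c) == mulhu(x, 2^(16 - c)) for 1 <= c <= 15, e.g.
  // 0xABCD >> 4 == (0xABCD * 0x1000) >> 16 == 0x0ABC. A zero shift amount has
  // no representable scale (it would need 2^16), which is why the
  // SETEQ-with-zero select above keeps R for those lanes.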
31283
31284 // Constant ISD::SRA can be performed efficiently on vXi16 vectors as we
31285 // can replace with ISD::MULHS, creating scale factor from (NumEltBits - Amt).
31286 // TODO: Special case handling for shift by 0/1, really we can afford either
31287 // of these cases in pre-SSE41/XOP/AVX512 but not both.
31288 if (Opc == ISD::SRA && ConstantAmt &&
31289 (VT == MVT::v8i16 || (VT == MVT::v16i16 && Subtarget.hasInt256())) &&
31290 ((Subtarget.hasSSE41() && !Subtarget.hasXOP() &&
31291 !Subtarget.hasAVX512()) ||
31292 DAG.isKnownNeverZero(Amt))) {
31293 SDValue EltBits = DAG.getConstant(EltSizeInBits, dl, VT);
31294 SDValue RAmt = DAG.getNode(ISD::SUB, dl, VT, EltBits, Amt);
31295 if (SDValue Scale = convertShiftLeftToScale(RAmt, dl, Subtarget, DAG)) {
31296 SDValue Amt0 =
31297 DAG.getSetCC(dl, VT, Amt, DAG.getConstant(0, dl, VT), ISD::SETEQ);
31298 SDValue Amt1 =
31299 DAG.getSetCC(dl, VT, Amt, DAG.getConstant(1, dl, VT), ISD::SETEQ);
31300 SDValue Sra1 =
31301 getTargetVShiftByConstNode(X86ISD::VSRAI, dl, VT, R, 1, DAG);
31302 SDValue Res = DAG.getNode(ISD::MULHS, dl, VT, R, Scale);
31303 Res = DAG.getSelect(dl, VT, Amt0, R, Res);
31304 return DAG.getSelect(dl, VT, Amt1, Sra1, Res);
31305 }
31306 }
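  // [Editor's note, not part of the original source] Likewise for MULHS:
  // sra(x, c) == mulhs(x, 2^(16 - c)) holds for 2 <= c <= 15, but c == 1
  // would need the scale 2^15 = 0x8000, which is -32768 as a signed i16 and
  // flips the sign of the product; hence the extra select that uses a real
  // VSRAI-by-1 for lanes with Amt == 1 (and R itself for Amt == 0).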
31307
31308 // v4i32 Non Uniform Shifts.
31309 // If the shift amount is constant we can shift each lane using the SSE2
31310 // immediate shifts, else we need to zero-extend each lane to the lower i64
31311 // and shift using the SSE2 variable shifts.
31312 // The separate results can then be blended together.
31313 if (VT == MVT::v4i32) {
31314 SDValue Amt0, Amt1, Amt2, Amt3;
31315 if (ConstantAmt) {
31316 Amt0 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {0, 0, 0, 0});
31317 Amt1 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {1, 1, 1, 1});
31318 Amt2 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {2, 2, 2, 2});
31319 Amt3 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {3, 3, 3, 3});
31320 } else {
31321 // The SSE2 shifts use the lower i64 as the same shift amount for
31322 // all lanes and the upper i64 is ignored. On AVX we're better off
31323 // just zero-extending, but for SSE just duplicating the top 16-bits is
31324 // cheaper and has the same effect for out of range values.
31325 if (Subtarget.hasAVX()) {
31326 SDValue Z = DAG.getConstant(0, dl, VT);
31327 Amt0 = DAG.getVectorShuffle(VT, dl, Amt, Z, {0, 4, -1, -1});
31328 Amt1 = DAG.getVectorShuffle(VT, dl, Amt, Z, {1, 5, -1, -1});
31329 Amt2 = DAG.getVectorShuffle(VT, dl, Amt, Z, {2, 6, -1, -1});
31330 Amt3 = DAG.getVectorShuffle(VT, dl, Amt, Z, {3, 7, -1, -1});
31331 } else {
31332 SDValue Amt01 = DAG.getBitcast(MVT::v8i16, Amt);
31333 SDValue Amt23 = DAG.getVectorShuffle(MVT::v8i16, dl, Amt01, Amt01,
31334 {4, 5, 6, 7, -1, -1, -1, -1});
31335 SDValue Msk02 = getV4X86ShuffleImm8ForMask({0, 1, 1, 1}, dl, DAG);
31336 SDValue Msk13 = getV4X86ShuffleImm8ForMask({2, 3, 3, 3}, dl, DAG);
31337 Amt0 = DAG.getNode(X86ISD::PSHUFLW, dl, MVT::v8i16, Amt01, Msk02);
31338 Amt1 = DAG.getNode(X86ISD::PSHUFLW, dl, MVT::v8i16, Amt01, Msk13);
31339 Amt2 = DAG.getNode(X86ISD::PSHUFLW, dl, MVT::v8i16, Amt23, Msk02);
31340 Amt3 = DAG.getNode(X86ISD::PSHUFLW, dl, MVT::v8i16, Amt23, Msk13);
31341 }
31342 }
31343
31344 unsigned ShOpc = ConstantAmt ? Opc : X86OpcV;
31345 SDValue R0 = DAG.getNode(ShOpc, dl, VT, R, DAG.getBitcast(VT, Amt0));
31346 SDValue R1 = DAG.getNode(ShOpc, dl, VT, R, DAG.getBitcast(VT, Amt1));
31347 SDValue R2 = DAG.getNode(ShOpc, dl, VT, R, DAG.getBitcast(VT, Amt2));
31348 SDValue R3 = DAG.getNode(ShOpc, dl, VT, R, DAG.getBitcast(VT, Amt3));
31349
31350 // Merge the shifted lane results optimally with/without PBLENDW.
31351 // TODO - ideally shuffle combining would handle this.
31352 if (Subtarget.hasSSE41()) {
31353 SDValue R02 = DAG.getVectorShuffle(VT, dl, R0, R2, {0, -1, 6, -1});
31354 SDValue R13 = DAG.getVectorShuffle(VT, dl, R1, R3, {-1, 1, -1, 7});
31355 return DAG.getVectorShuffle(VT, dl, R02, R13, {0, 5, 2, 7});
31356 }
31357 SDValue R01 = DAG.getVectorShuffle(VT, dl, R0, R1, {0, -1, -1, 5});
31358 SDValue R23 = DAG.getVectorShuffle(VT, dl, R2, R3, {2, -1, -1, 7});
31359 return DAG.getVectorShuffle(VT, dl, R01, R23, {0, 3, 4, 7});
31360 }
31361
31362 // It's worth extending once and using the vXi16/vXi32 shifts for smaller
31363 // types, but without AVX512 the extra overheads to get from vXi8 to vXi32
31364 // make the existing SSE solution better.
31365   // NOTE: We honor the preferred vector width before promoting to 512-bits.
31366 if ((Subtarget.hasInt256() && VT == MVT::v8i16) ||
31367 (Subtarget.canExtendTo512DQ() && VT == MVT::v16i16) ||
31368 (Subtarget.canExtendTo512DQ() && VT == MVT::v16i8) ||
31369 (Subtarget.canExtendTo512BW() && VT == MVT::v32i8) ||
31370 (Subtarget.hasBWI() && Subtarget.hasVLX() && VT == MVT::v16i8)) {
31371     assert((!Subtarget.hasBWI() || VT == MVT::v32i8 || VT == MVT::v16i8) &&
31372            "Unexpected vector type");
31373 MVT EvtSVT = Subtarget.hasBWI() ? MVT::i16 : MVT::i32;
31374 MVT ExtVT = MVT::getVectorVT(EvtSVT, VT.getVectorNumElements());
31375 unsigned ExtOpc = Opc == ISD::SRA ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
31376 R = DAG.getNode(ExtOpc, dl, ExtVT, R);
31377 Amt = DAG.getNode(ISD::ZERO_EXTEND, dl, ExtVT, Amt);
31378 return DAG.getNode(ISD::TRUNCATE, dl, VT,
31379 DAG.getNode(Opc, dl, ExtVT, R, Amt));
31380 }
31381
31382 // Constant ISD::SRA/SRL can be performed efficiently on vXi8 vectors as we
31383 // extend to vXi16 to perform a MUL scale effectively as a MUL_LOHI.
31384 if (ConstantAmt && (Opc == ISD::SRA || Opc == ISD::SRL) &&
31385 (VT == MVT::v16i8 || (VT == MVT::v32i8 && Subtarget.hasInt256()) ||
31386 (VT == MVT::v64i8 && Subtarget.hasBWI())) &&
31387 !Subtarget.hasXOP()) {
31388 int NumElts = VT.getVectorNumElements();
31389 SDValue Cst8 = DAG.getTargetConstant(8, dl, MVT::i8);
31390
31391 // Extend constant shift amount to vXi16 (it doesn't matter if the type
31392 // isn't legal).
31393 MVT ExVT = MVT::getVectorVT(MVT::i16, NumElts);
31394 Amt = DAG.getZExtOrTrunc(Amt, dl, ExVT);
31395 Amt = DAG.getNode(ISD::SUB, dl, ExVT, DAG.getConstant(8, dl, ExVT), Amt);
31396 Amt = DAG.getNode(ISD::SHL, dl, ExVT, DAG.getConstant(1, dl, ExVT), Amt);
31397     assert(ISD::isBuildVectorOfConstantSDNodes(Amt.getNode()) &&
31398            "Constant build vector expected");
31399
31400 if (VT == MVT::v16i8 && Subtarget.hasInt256()) {
31401 R = Opc == ISD::SRA ? DAG.getSExtOrTrunc(R, dl, ExVT)
31402 : DAG.getZExtOrTrunc(R, dl, ExVT);
31403 R = DAG.getNode(ISD::MUL, dl, ExVT, R, Amt);
31404 R = DAG.getNode(X86ISD::VSRLI, dl, ExVT, R, Cst8);
31405 return DAG.getZExtOrTrunc(R, dl, VT);
31406 }
31407
31408 SmallVector<SDValue, 16> LoAmt, HiAmt;
31409 for (int i = 0; i != NumElts; i += 16) {
31410 for (int j = 0; j != 8; ++j) {
31411 LoAmt.push_back(Amt.getOperand(i + j));
31412 HiAmt.push_back(Amt.getOperand(i + j + 8));
31413 }
31414 }
31415
31416 MVT VT16 = MVT::getVectorVT(MVT::i16, NumElts / 2);
31417 SDValue LoA = DAG.getBuildVector(VT16, dl, LoAmt);
31418 SDValue HiA = DAG.getBuildVector(VT16, dl, HiAmt);
31419
31420 SDValue LoR = DAG.getBitcast(VT16, getUnpackl(DAG, dl, VT, R, R));
31421 SDValue HiR = DAG.getBitcast(VT16, getUnpackh(DAG, dl, VT, R, R));
31422 LoR = DAG.getNode(X86OpcI, dl, VT16, LoR, Cst8);
31423 HiR = DAG.getNode(X86OpcI, dl, VT16, HiR, Cst8);
31424 LoR = DAG.getNode(ISD::MUL, dl, VT16, LoR, LoA);
31425 HiR = DAG.getNode(ISD::MUL, dl, VT16, HiR, HiA);
31426 LoR = DAG.getNode(X86ISD::VSRLI, dl, VT16, LoR, Cst8);
31427 HiR = DAG.getNode(X86ISD::VSRLI, dl, VT16, HiR, Cst8);
31428 return DAG.getNode(X86ISD::PACKUS, dl, VT, LoR, HiR);
31429 }
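  // [Editor's note, not part of the original source] Worked example of the
  // MUL-based vXi8 path above for SRA, ShiftAmt = 2, x = 0xF0 (-16): the
  // unpack + shift-by-8 leaves x sign-extended in an i16 lane (0xFFF0), the
  // multiply by 2^(8 - 2) = 64 gives 0xFC00, and the final logical shift by 8
  // leaves 0xFC = -4 = (-16) >> 2, with a zero upper byte ready for PACKUS.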
31430
31431 if (VT == MVT::v16i8 ||
31432 (VT == MVT::v32i8 && Subtarget.hasInt256() && !Subtarget.hasXOP()) ||
31433 (VT == MVT::v64i8 && Subtarget.hasBWI())) {
31434 MVT ExtVT = MVT::getVectorVT(MVT::i16, VT.getVectorNumElements() / 2);
31435
31436 auto SignBitSelect = [&](MVT SelVT, SDValue Sel, SDValue V0, SDValue V1) {
31437 if (VT.is512BitVector()) {
31438 // On AVX512BW targets we make use of the fact that VSELECT lowers
31439 // to a masked blend which selects bytes based just on the sign bit
31440 // extracted to a mask.
31441 MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
31442 V0 = DAG.getBitcast(VT, V0);
31443 V1 = DAG.getBitcast(VT, V1);
31444 Sel = DAG.getBitcast(VT, Sel);
31445 Sel = DAG.getSetCC(dl, MaskVT, DAG.getConstant(0, dl, VT), Sel,
31446 ISD::SETGT);
31447 return DAG.getBitcast(SelVT, DAG.getSelect(dl, VT, Sel, V0, V1));
31448 } else if (Subtarget.hasSSE41()) {
31449 // On SSE41 targets we can use PBLENDVB which selects bytes based just
31450 // on the sign bit.
31451 V0 = DAG.getBitcast(VT, V0);
31452 V1 = DAG.getBitcast(VT, V1);
31453 Sel = DAG.getBitcast(VT, Sel);
31454 return DAG.getBitcast(SelVT,
31455 DAG.getNode(X86ISD::BLENDV, dl, VT, Sel, V0, V1));
31456 }
31457 // On pre-SSE41 targets we test for the sign bit by comparing to
31458 // zero - a negative value will set all bits of the lanes to true
31459 // and VSELECT uses that in its OR(AND(V0,C),AND(V1,~C)) lowering.
31460 SDValue Z = DAG.getConstant(0, dl, SelVT);
31461 SDValue C = DAG.getNode(X86ISD::PCMPGT, dl, SelVT, Z, Sel);
31462 return DAG.getSelect(dl, SelVT, C, V0, V1);
31463 };
31464
31465 // Turn 'a' into a mask suitable for VSELECT: a = a << 5;
31466 // We can safely do this using i16 shifts as we're only interested in
31467 // the 3 lower bits of each byte.
31468 Amt = DAG.getBitcast(ExtVT, Amt);
31469 Amt = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, ExtVT, Amt, 5, DAG);
31470 Amt = DAG.getBitcast(VT, Amt);
31471
31472 if (Opc == ISD::SHL || Opc == ISD::SRL) {
31473 // r = VSELECT(r, shift(r, 4), a);
31474 SDValue M = DAG.getNode(Opc, dl, VT, R, DAG.getConstant(4, dl, VT));
31475 R = SignBitSelect(VT, Amt, M, R);
31476
31477 // a += a
31478 Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
31479
31480 // r = VSELECT(r, shift(r, 2), a);
31481 M = DAG.getNode(Opc, dl, VT, R, DAG.getConstant(2, dl, VT));
31482 R = SignBitSelect(VT, Amt, M, R);
31483
31484 // a += a
31485 Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
31486
31487 // return VSELECT(r, shift(r, 1), a);
31488 M = DAG.getNode(Opc, dl, VT, R, DAG.getConstant(1, dl, VT));
31489 R = SignBitSelect(VT, Amt, M, R);
31490 return R;
31491 }
31492
31493 if (Opc == ISD::SRA) {
31494 // For SRA we need to unpack each byte to the higher byte of a i16 vector
31495 // so we can correctly sign extend. We don't care what happens to the
31496 // lower byte.
31497 SDValue ALo = getUnpackl(DAG, dl, VT, DAG.getUNDEF(VT), Amt);
31498 SDValue AHi = getUnpackh(DAG, dl, VT, DAG.getUNDEF(VT), Amt);
31499 SDValue RLo = getUnpackl(DAG, dl, VT, DAG.getUNDEF(VT), R);
31500 SDValue RHi = getUnpackh(DAG, dl, VT, DAG.getUNDEF(VT), R);
31501 ALo = DAG.getBitcast(ExtVT, ALo);
31502 AHi = DAG.getBitcast(ExtVT, AHi);
31503 RLo = DAG.getBitcast(ExtVT, RLo);
31504 RHi = DAG.getBitcast(ExtVT, RHi);
31505
31506 // r = VSELECT(r, shift(r, 4), a);
31507 SDValue MLo = getTargetVShiftByConstNode(X86OpcI, dl, ExtVT, RLo, 4, DAG);
31508 SDValue MHi = getTargetVShiftByConstNode(X86OpcI, dl, ExtVT, RHi, 4, DAG);
31509 RLo = SignBitSelect(ExtVT, ALo, MLo, RLo);
31510 RHi = SignBitSelect(ExtVT, AHi, MHi, RHi);
31511
31512 // a += a
31513 ALo = DAG.getNode(ISD::ADD, dl, ExtVT, ALo, ALo);
31514 AHi = DAG.getNode(ISD::ADD, dl, ExtVT, AHi, AHi);
31515
31516 // r = VSELECT(r, shift(r, 2), a);
31517 MLo = getTargetVShiftByConstNode(X86OpcI, dl, ExtVT, RLo, 2, DAG);
31518 MHi = getTargetVShiftByConstNode(X86OpcI, dl, ExtVT, RHi, 2, DAG);
31519 RLo = SignBitSelect(ExtVT, ALo, MLo, RLo);
31520 RHi = SignBitSelect(ExtVT, AHi, MHi, RHi);
31521
31522 // a += a
31523 ALo = DAG.getNode(ISD::ADD, dl, ExtVT, ALo, ALo);
31524 AHi = DAG.getNode(ISD::ADD, dl, ExtVT, AHi, AHi);
31525
31526 // r = VSELECT(r, shift(r, 1), a);
31527 MLo = getTargetVShiftByConstNode(X86OpcI, dl, ExtVT, RLo, 1, DAG);
31528 MHi = getTargetVShiftByConstNode(X86OpcI, dl, ExtVT, RHi, 1, DAG);
31529 RLo = SignBitSelect(ExtVT, ALo, MLo, RLo);
31530 RHi = SignBitSelect(ExtVT, AHi, MHi, RHi);
31531
31532 // Logical shift the result back to the lower byte, leaving a zero upper
31533 // byte meaning that we can safely pack with PACKUSWB.
31534 RLo = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExtVT, RLo, 8, DAG);
31535 RHi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExtVT, RHi, 8, DAG);
31536 return DAG.getNode(X86ISD::PACKUS, dl, VT, RLo, RHi);
31537 }
31538 }
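  // [Editor's note, not part of the original source] The vXi8 lowering above
  // decomposes a variable shift into three conditional steps: after Amt << 5,
  // bit 2 of each 3-bit amount sits in the byte's sign bit, so the first
  // blend applies a shift by 4 only where that bit is set; doubling Amt then
  // moves bit 1 (and afterwards bit 0) into the sign bit for the shift-by-2
  // and shift-by-1 steps. E.g. an amount of 5 (0b101) applies 4, skips 2,
  // and applies 1.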
31539
31540 if (Subtarget.hasInt256() && !Subtarget.hasXOP() && VT == MVT::v16i16) {
31541 MVT ExtVT = MVT::v8i32;
31542 SDValue Z = DAG.getConstant(0, dl, VT);
31543 SDValue ALo = getUnpackl(DAG, dl, VT, Amt, Z);
31544 SDValue AHi = getUnpackh(DAG, dl, VT, Amt, Z);
31545 SDValue RLo = getUnpackl(DAG, dl, VT, Z, R);
31546 SDValue RHi = getUnpackh(DAG, dl, VT, Z, R);
31547 ALo = DAG.getBitcast(ExtVT, ALo);
31548 AHi = DAG.getBitcast(ExtVT, AHi);
31549 RLo = DAG.getBitcast(ExtVT, RLo);
31550 RHi = DAG.getBitcast(ExtVT, RHi);
31551 SDValue Lo = DAG.getNode(Opc, dl, ExtVT, RLo, ALo);
31552 SDValue Hi = DAG.getNode(Opc, dl, ExtVT, RHi, AHi);
31553 Lo = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExtVT, Lo, 16, DAG);
31554 Hi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExtVT, Hi, 16, DAG);
31555 return DAG.getNode(X86ISD::PACKUS, dl, VT, Lo, Hi);
31556 }
31557
31558 if (VT == MVT::v8i16) {
31559 // If we have a constant shift amount, the non-SSE41 path is best as
31560     // avoiding bitcasts makes it easier to constant fold and reduce to PBLENDW.
31561 bool UseSSE41 = Subtarget.hasSSE41() &&
31562 !ISD::isBuildVectorOfConstantSDNodes(Amt.getNode());
31563
31564 auto SignBitSelect = [&](SDValue Sel, SDValue V0, SDValue V1) {
31565 // On SSE41 targets we can use PBLENDVB which selects bytes based just on
31566 // the sign bit.
31567 if (UseSSE41) {
31568 MVT ExtVT = MVT::getVectorVT(MVT::i8, VT.getVectorNumElements() * 2);
31569 V0 = DAG.getBitcast(ExtVT, V0);
31570 V1 = DAG.getBitcast(ExtVT, V1);
31571 Sel = DAG.getBitcast(ExtVT, Sel);
31572 return DAG.getBitcast(
31573 VT, DAG.getNode(X86ISD::BLENDV, dl, ExtVT, Sel, V0, V1));
31574 }
31575 // On pre-SSE41 targets we splat the sign bit - a negative value will
31576 // set all bits of the lanes to true and VSELECT uses that in
31577 // its OR(AND(V0,C),AND(V1,~C)) lowering.
31578 SDValue C =
31579 getTargetVShiftByConstNode(X86ISD::VSRAI, dl, VT, Sel, 15, DAG);
31580 return DAG.getSelect(dl, VT, C, V0, V1);
31581 };
31582
31583 // Turn 'a' into a mask suitable for VSELECT: a = a << 12;
31584 if (UseSSE41) {
31585 // On SSE41 targets we need to replicate the shift mask in both
31586 // bytes for PBLENDVB.
31587 Amt = DAG.getNode(
31588 ISD::OR, dl, VT,
31589 getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, Amt, 4, DAG),
31590 getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, Amt, 12, DAG));
31591 } else {
31592 Amt = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, Amt, 12, DAG);
31593 }
31594
31595 // r = VSELECT(r, shift(r, 8), a);
31596 SDValue M = getTargetVShiftByConstNode(X86OpcI, dl, VT, R, 8, DAG);
31597 R = SignBitSelect(Amt, M, R);
31598
31599 // a += a
31600 Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
31601
31602 // r = VSELECT(r, shift(r, 4), a);
31603 M = getTargetVShiftByConstNode(X86OpcI, dl, VT, R, 4, DAG);
31604 R = SignBitSelect(Amt, M, R);
31605
31606 // a += a
31607 Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
31608
31609 // r = VSELECT(r, shift(r, 2), a);
31610 M = getTargetVShiftByConstNode(X86OpcI, dl, VT, R, 2, DAG);
31611 R = SignBitSelect(Amt, M, R);
31612
31613 // a += a
31614 Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
31615
31616 // return VSELECT(r, shift(r, 1), a);
31617 M = getTargetVShiftByConstNode(X86OpcI, dl, VT, R, 1, DAG);
31618 R = SignBitSelect(Amt, M, R);
31619 return R;
31620 }
31621
31622 // Decompose 256-bit shifts into 128-bit shifts.
31623 if (VT.is256BitVector())
31624 return splitVectorIntBinary(Op, DAG);
31625
31626 if (VT == MVT::v32i16 || VT == MVT::v64i8)
31627 return splitVectorIntBinary(Op, DAG);
31628
31629 return SDValue();
31630}
31631
31632static SDValue LowerFunnelShift(SDValue Op, const X86Subtarget &Subtarget,
31633 SelectionDAG &DAG) {
31634 MVT VT = Op.getSimpleValueType();
31635   assert((Op.getOpcode() == ISD::FSHL || Op.getOpcode() == ISD::FSHR) &&
31636          "Unexpected funnel shift opcode!");
31637
31638 SDLoc DL(Op);
31639 SDValue Op0 = Op.getOperand(0);
31640 SDValue Op1 = Op.getOperand(1);
31641 SDValue Amt = Op.getOperand(2);
31642 unsigned EltSizeInBits = VT.getScalarSizeInBits();
31643 bool IsFSHR = Op.getOpcode() == ISD::FSHR;
31644
31645 if (VT.isVector()) {
31646 APInt APIntShiftAmt;
31647 bool IsCstSplat = X86::isConstantSplat(Amt, APIntShiftAmt);
31648
31649 if (Subtarget.hasVBMI2() && EltSizeInBits > 8) {
31650 if (IsFSHR)
31651 std::swap(Op0, Op1);
31652
31653 if (IsCstSplat) {
31654 uint64_t ShiftAmt = APIntShiftAmt.urem(EltSizeInBits);
31655 SDValue Imm = DAG.getTargetConstant(ShiftAmt, DL, MVT::i8);
31656 return getAVX512Node(IsFSHR ? X86ISD::VSHRD : X86ISD::VSHLD, DL, VT,
31657 {Op0, Op1, Imm}, DAG, Subtarget);
31658 }
31659 return getAVX512Node(IsFSHR ? X86ISD::VSHRDV : X86ISD::VSHLDV, DL, VT,
31660 {Op0, Op1, Amt}, DAG, Subtarget);
31661 }
31662     assert((VT == MVT::v16i8 || VT == MVT::v32i8 || VT == MVT::v64i8 ||
31663             VT == MVT::v8i16 || VT == MVT::v16i16 || VT == MVT::v32i16 ||
31664             VT == MVT::v4i32 || VT == MVT::v8i32 || VT == MVT::v16i32) &&
31665            "Unexpected funnel shift type!");
31666
31667     // fshl(x,y,z) -> (unpack(y,x) << (z & (bw-1))) >> bw.
31668     // fshr(x,y,z) -> unpack(y,x) >> (z & (bw-1)).
31669 if (IsCstSplat)
31670 return SDValue();
31671
31672 SDValue AmtMask = DAG.getConstant(EltSizeInBits - 1, DL, VT);
31673 SDValue AmtMod = DAG.getNode(ISD::AND, DL, VT, Amt, AmtMask);
31674 bool IsCst = ISD::isBuildVectorOfConstantSDNodes(AmtMod.getNode());
31675
31676 // Constant vXi16 funnel shifts can be efficiently handled by default.
31677 if (IsCst && EltSizeInBits == 16)
31678 return SDValue();
31679
31680 unsigned ShiftOpc = IsFSHR ? ISD::SRL : ISD::SHL;
31681 unsigned NumElts = VT.getVectorNumElements();
31682 MVT ExtSVT = MVT::getIntegerVT(2 * EltSizeInBits);
31683 MVT ExtVT = MVT::getVectorVT(ExtSVT, NumElts / 2);
31684
31685 // Split 256-bit integers on XOP/pre-AVX2 targets.
31686 // Split 512-bit integers on non 512-bit BWI targets.
31687 if ((VT.is256BitVector() && ((Subtarget.hasXOP() && EltSizeInBits < 16) ||
31688 !Subtarget.hasAVX2())) ||
31689 (VT.is512BitVector() && !Subtarget.useBWIRegs() &&
31690 EltSizeInBits < 32)) {
31691 // Pre-mask the amount modulo using the wider vector.
31692 Op = DAG.getNode(Op.getOpcode(), DL, VT, Op0, Op1, AmtMod);
31693 return splitVectorOp(Op, DAG);
31694 }
31695
31696 // Attempt to fold scalar shift as unpack(y,x) << zext(splat(z))
31697 if (supportedVectorShiftWithBaseAmnt(ExtVT, Subtarget, ShiftOpc)) {
31698 int ScalarAmtIdx = -1;
31699 if (SDValue ScalarAmt = DAG.getSplatSourceVector(AmtMod, ScalarAmtIdx)) {
31700 // Uniform vXi16 funnel shifts can be efficiently handled by default.
31701 if (EltSizeInBits == 16)
31702 return SDValue();
31703
31704 SDValue Lo = DAG.getBitcast(ExtVT, getUnpackl(DAG, DL, VT, Op1, Op0));
31705 SDValue Hi = DAG.getBitcast(ExtVT, getUnpackh(DAG, DL, VT, Op1, Op0));
31706 Lo = getTargetVShiftNode(ShiftOpc, DL, ExtVT, Lo, ScalarAmt,
31707 ScalarAmtIdx, Subtarget, DAG);
31708 Hi = getTargetVShiftNode(ShiftOpc, DL, ExtVT, Hi, ScalarAmt,
31709 ScalarAmtIdx, Subtarget, DAG);
31710 return getPack(DAG, Subtarget, DL, VT, Lo, Hi, !IsFSHR);
31711 }
31712 }
31713
31714 MVT WideSVT = MVT::getIntegerVT(
31715 std::min<unsigned>(EltSizeInBits * 2, Subtarget.hasBWI() ? 16 : 32));
31716 MVT WideVT = MVT::getVectorVT(WideSVT, NumElts);
31717
31718     // If per-element shifts are legal, fall back to generic expansion.
31719 if (supportedVectorVarShift(VT, Subtarget, ShiftOpc) || Subtarget.hasXOP())
31720 return SDValue();
31721
31722 // Attempt to fold as:
31723 // fshl(x,y,z) -> (((aext(x) << bw) | zext(y)) << (z & (bw-1))) >> bw.
31724 // fshr(x,y,z) -> (((aext(x) << bw) | zext(y)) >> (z & (bw-1))).
31725 if (supportedVectorVarShift(WideVT, Subtarget, ShiftOpc) &&
31726 supportedVectorShiftWithImm(WideVT, Subtarget, ShiftOpc)) {
31727 Op0 = DAG.getNode(ISD::ANY_EXTEND, DL, WideVT, Op0);
31728 Op1 = DAG.getNode(ISD::ZERO_EXTEND, DL, WideVT, Op1);
31729 AmtMod = DAG.getNode(ISD::ZERO_EXTEND, DL, WideVT, AmtMod);
31730 Op0 = getTargetVShiftByConstNode(X86ISD::VSHLI, DL, WideVT, Op0,
31731 EltSizeInBits, DAG);
31732 SDValue Res = DAG.getNode(ISD::OR, DL, WideVT, Op0, Op1);
31733 Res = DAG.getNode(ShiftOpc, DL, WideVT, Res, AmtMod);
31734 if (!IsFSHR)
31735 Res = getTargetVShiftByConstNode(X86ISD::VSRLI, DL, WideVT, Res,
31736 EltSizeInBits, DAG);
31737 return DAG.getNode(ISD::TRUNCATE, DL, VT, Res);
31738 }
31739
31740 // Attempt to fold per-element (ExtVT) shift as unpack(y,x) << zext(z)
31741 if (((IsCst || !Subtarget.hasAVX512()) && !IsFSHR && EltSizeInBits <= 16) ||
31742 supportedVectorVarShift(ExtVT, Subtarget, ShiftOpc)) {
31743 SDValue Z = DAG.getConstant(0, DL, VT);
31744 SDValue RLo = DAG.getBitcast(ExtVT, getUnpackl(DAG, DL, VT, Op1, Op0));
31745 SDValue RHi = DAG.getBitcast(ExtVT, getUnpackh(DAG, DL, VT, Op1, Op0));
31746 SDValue ALo = DAG.getBitcast(ExtVT, getUnpackl(DAG, DL, VT, AmtMod, Z));
31747 SDValue AHi = DAG.getBitcast(ExtVT, getUnpackh(DAG, DL, VT, AmtMod, Z));
31748 SDValue Lo = DAG.getNode(ShiftOpc, DL, ExtVT, RLo, ALo);
31749 SDValue Hi = DAG.getNode(ShiftOpc, DL, ExtVT, RHi, AHi);
31750 return getPack(DAG, Subtarget, DL, VT, Lo, Hi, !IsFSHR);
31751 }
31752
31753 // Fallback to generic expansion.
31754 return SDValue();
31755 }
31756   assert(
31757       (VT == MVT::i8 || VT == MVT::i16 || VT == MVT::i32 || VT == MVT::i64) &&
31758       "Unexpected funnel shift type!");
31759
31760 // Expand slow SHLD/SHRD cases if we are not optimizing for size.
31761 bool OptForSize = DAG.shouldOptForSize();
31762 bool ExpandFunnel = !OptForSize && Subtarget.isSHLDSlow();
31763
31764 // fshl(x,y,z) -> (((aext(x) << bw) | zext(y)) << (z & (bw-1))) >> bw.
31765 // fshr(x,y,z) -> (((aext(x) << bw) | zext(y)) >> (z & (bw-1))).
31766 if ((VT == MVT::i8 || (ExpandFunnel && VT == MVT::i16)) &&
31767 !isa<ConstantSDNode>(Amt)) {
31768 SDValue Mask = DAG.getConstant(EltSizeInBits - 1, DL, Amt.getValueType());
31769 SDValue HiShift = DAG.getConstant(EltSizeInBits, DL, Amt.getValueType());
31770 Op0 = DAG.getAnyExtOrTrunc(Op0, DL, MVT::i32);
31771 Op1 = DAG.getZExtOrTrunc(Op1, DL, MVT::i32);
31772 Amt = DAG.getNode(ISD::AND, DL, Amt.getValueType(), Amt, Mask);
31773 SDValue Res = DAG.getNode(ISD::SHL, DL, MVT::i32, Op0, HiShift);
31774 Res = DAG.getNode(ISD::OR, DL, MVT::i32, Res, Op1);
31775 if (IsFSHR) {
31776 Res = DAG.getNode(ISD::SRL, DL, MVT::i32, Res, Amt);
31777 } else {
31778 Res = DAG.getNode(ISD::SHL, DL, MVT::i32, Res, Amt);
31779 Res = DAG.getNode(ISD::SRL, DL, MVT::i32, Res, HiShift);
31780 }
31781 return DAG.getZExtOrTrunc(Res, DL, VT);
31782 }
31783
31784 if (VT == MVT::i8 || ExpandFunnel)
31785 return SDValue();
31786
31787   // i16 needs the shift amount masked explicitly; i32/i64 have an implicit modulo.
31788 if (VT == MVT::i16) {
31789 Amt = DAG.getNode(ISD::AND, DL, Amt.getValueType(), Amt,
31790 DAG.getConstant(15, DL, Amt.getValueType()));
31791 unsigned FSHOp = (IsFSHR ? X86ISD::FSHR : X86ISD::FSHL);
31792 return DAG.getNode(FSHOp, DL, VT, Op0, Op1, Amt);
31793 }
31794
31795 return Op;
31796}
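For scalar i8 (and slow-SHLD i16) the code above concatenates the operands into a 32-bit value, performs one shift by the masked amount, and extracts the relevant byte. A standalone sketch of that identity for i8, assuming the usual fshl/fshr semantics (illustrative; the helper names are invented):

#include <cstdint>

// fshl(x,y,z) for i8 via a single widened shift, mirroring the expansion above.
static uint8_t fshl8(uint8_t X, uint8_t Y, uint8_t Z) {
  uint32_t Concat = (uint32_t(X) << 8) | Y;  // (aext(x) << bw) | zext(y)
  Z &= 7;                                    // z & (bw-1)
  return uint8_t((Concat << Z) >> 8);        // take the high byte of the shifted pair
}

static uint8_t fshr8(uint8_t X, uint8_t Y, uint8_t Z) {
  uint32_t Concat = (uint32_t(X) << 8) | Y;
  Z &= 7;
  return uint8_t(Concat >> Z);               // take the low byte after the shift
}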
31797
31798static SDValue LowerRotate(SDValue Op, const X86Subtarget &Subtarget,
31799 SelectionDAG &DAG) {
31800 MVT VT = Op.getSimpleValueType();
31801   assert(VT.isVector() && "Custom lowering only for vector rotates!");
31802
31803 SDLoc DL(Op);
31804 SDValue R = Op.getOperand(0);
31805 SDValue Amt = Op.getOperand(1);
31806 unsigned Opcode = Op.getOpcode();
31807 unsigned EltSizeInBits = VT.getScalarSizeInBits();
31808 int NumElts = VT.getVectorNumElements();
31809 bool IsROTL = Opcode == ISD::ROTL;
31810
31811 // Check for constant splat rotation amount.
31812 APInt CstSplatValue;
31813 bool IsCstSplat = X86::isConstantSplat(Amt, CstSplatValue);
31814
31815 // Check for splat rotate by zero.
31816 if (IsCstSplat && CstSplatValue.urem(EltSizeInBits) == 0)
31817 return R;
31818
31819 // AVX512 implicitly uses modulo rotation amounts.
31820 if (Subtarget.hasAVX512() && 32 <= EltSizeInBits) {
31821 // Attempt to rotate by immediate.
31822 if (IsCstSplat) {
31823 unsigned RotOpc = IsROTL ? X86ISD::VROTLI : X86ISD::VROTRI;
31824 uint64_t RotAmt = CstSplatValue.urem(EltSizeInBits);
31825 return DAG.getNode(RotOpc, DL, VT, R,
31826 DAG.getTargetConstant(RotAmt, DL, MVT::i8));
31827 }
31828
31829 // Else, fall-back on VPROLV/VPRORV.
31830 return Op;
31831 }
31832
31833 // AVX512 VBMI2 vXi16 - lower to funnel shifts.
31834 if (Subtarget.hasVBMI2() && 16 == EltSizeInBits) {
31835 unsigned FunnelOpc = IsROTL ? ISD::FSHL : ISD::FSHR;
31836 return DAG.getNode(FunnelOpc, DL, VT, R, R, Amt);
31837 }
31838
31839 SDValue Z = DAG.getConstant(0, DL, VT);
31840
31841 if (!IsROTL) {
31842     // If the ISD::ROTR amount is constant, we're always better off converting
31843     // to ISD::ROTL.
31844 if (SDValue NegAmt = DAG.FoldConstantArithmetic(ISD::SUB, DL, VT, {Z, Amt}))
31845 return DAG.getNode(ISD::ROTL, DL, VT, R, NegAmt);
31846
31847     // XOP targets always prefer ISD::ROTL.
31848 if (Subtarget.hasXOP())
31849 return DAG.getNode(ISD::ROTL, DL, VT, R,
31850 DAG.getNode(ISD::SUB, DL, VT, Z, Amt));
31851 }
31852
31853 // Split 256-bit integers on XOP/pre-AVX2 targets.
31854 if (VT.is256BitVector() && (Subtarget.hasXOP() || !Subtarget.hasAVX2()))
31855 return splitVectorIntBinary(Op, DAG);
31856
31857 // XOP has 128-bit vector variable + immediate rotates.
31858 // +ve/-ve Amt = rotate left/right - just need to handle ISD::ROTL.
31859 // XOP implicitly uses modulo rotation amounts.
31860 if (Subtarget.hasXOP()) {
31861     assert(IsROTL && "Only ROTL expected");
31862     assert(VT.is128BitVector() && "Only rotate 128-bit vectors!");
31863
31864 // Attempt to rotate by immediate.
31865 if (IsCstSplat) {
31866 uint64_t RotAmt = CstSplatValue.urem(EltSizeInBits);
31867 return DAG.getNode(X86ISD::VROTLI, DL, VT, R,
31868 DAG.getTargetConstant(RotAmt, DL, MVT::i8));
31869 }
31870
31871 // Use general rotate by variable (per-element).
31872 return Op;
31873 }
31874
31875   // Rotate by a uniform constant - expand back to shifts.
31876 if (IsCstSplat)
31877 return SDValue();
31878
31879 // Split 512-bit integers on non 512-bit BWI targets.
31880 if (VT.is512BitVector() && !Subtarget.useBWIRegs())
31881 return splitVectorIntBinary(Op, DAG);
31882
31883   assert(
31884       (VT == MVT::v4i32 || VT == MVT::v8i16 || VT == MVT::v16i8 ||
31885        ((VT == MVT::v8i32 || VT == MVT::v16i16 || VT == MVT::v32i8) &&
31886         Subtarget.hasAVX2()) ||
31887        ((VT == MVT::v32i16 || VT == MVT::v64i8) && Subtarget.useBWIRegs())) &&
31888       "Only vXi32/vXi16/vXi8 vector rotates supported");
31889
31890 MVT ExtSVT = MVT::getIntegerVT(2 * EltSizeInBits);
31891 MVT ExtVT = MVT::getVectorVT(ExtSVT, NumElts / 2);
31892
31893 SDValue AmtMask = DAG.getConstant(EltSizeInBits - 1, DL, VT);
31894 SDValue AmtMod = DAG.getNode(ISD::AND, DL, VT, Amt, AmtMask);
31895
31896 // Attempt to fold as unpack(x,x) << zext(splat(y)):
31897 // rotl(x,y) -> (unpack(x,x) << (y & (bw-1))) >> bw.
31898 // rotr(x,y) -> (unpack(x,x) >> (y & (bw-1))).
31899 if (EltSizeInBits == 8 || EltSizeInBits == 16 || EltSizeInBits == 32) {
31900 int BaseRotAmtIdx = -1;
31901 if (SDValue BaseRotAmt = DAG.getSplatSourceVector(AmtMod, BaseRotAmtIdx)) {
31902 if (EltSizeInBits == 16 && Subtarget.hasSSE41()) {
31903 unsigned FunnelOpc = IsROTL ? ISD::FSHL : ISD::FSHR;
31904 return DAG.getNode(FunnelOpc, DL, VT, R, R, Amt);
31905 }
31906 unsigned ShiftX86Opc = IsROTL ? X86ISD::VSHLI : X86ISD::VSRLI;
31907 SDValue Lo = DAG.getBitcast(ExtVT, getUnpackl(DAG, DL, VT, R, R));
31908 SDValue Hi = DAG.getBitcast(ExtVT, getUnpackh(DAG, DL, VT, R, R));
31909 Lo = getTargetVShiftNode(ShiftX86Opc, DL, ExtVT, Lo, BaseRotAmt,
31910 BaseRotAmtIdx, Subtarget, DAG);
31911 Hi = getTargetVShiftNode(ShiftX86Opc, DL, ExtVT, Hi, BaseRotAmt,
31912 BaseRotAmtIdx, Subtarget, DAG);
31913 return getPack(DAG, Subtarget, DL, VT, Lo, Hi, IsROTL);
31914 }
31915 }
31916
31917 // v16i8/v32i8/v64i8: Split rotation into rot4/rot2/rot1 stages and select by
31918 // the amount bit.
31919 // TODO: We're doing nothing here that we couldn't do for funnel shifts.
31920 if (EltSizeInBits == 8) {
31921 bool IsConstAmt = ISD::isBuildVectorOfConstantSDNodes(Amt.getNode());
31922 MVT WideVT =
31923 MVT::getVectorVT(Subtarget.hasBWI() ? MVT::i16 : MVT::i32, NumElts);
31924 unsigned ShiftOpc = IsROTL ? ISD::SHL : ISD::SRL;
31925
31926 // Attempt to fold as:
31927 // rotl(x,y) -> (((aext(x) << bw) | zext(x)) << (y & (bw-1))) >> bw.
31928 // rotr(x,y) -> (((aext(x) << bw) | zext(x)) >> (y & (bw-1))).
31929 if (supportedVectorVarShift(WideVT, Subtarget, ShiftOpc) &&
31930 supportedVectorShiftWithImm(WideVT, Subtarget, ShiftOpc)) {
31931 // If we're rotating by constant, just use default promotion.
31932 if (IsConstAmt)
31933 return SDValue();
31934 // See if we can perform this by widening to vXi16 or vXi32.
31935 R = DAG.getNode(ISD::ZERO_EXTEND, DL, WideVT, R);
31936 R = DAG.getNode(
31937 ISD::OR, DL, WideVT, R,
31938 getTargetVShiftByConstNode(X86ISD::VSHLI, DL, WideVT, R, 8, DAG));
31939 Amt = DAG.getNode(ISD::ZERO_EXTEND, DL, WideVT, AmtMod);
31940 R = DAG.getNode(ShiftOpc, DL, WideVT, R, Amt);
31941 if (IsROTL)
31942 R = getTargetVShiftByConstNode(X86ISD::VSRLI, DL, WideVT, R, 8, DAG);
31943 return DAG.getNode(ISD::TRUNCATE, DL, VT, R);
31944 }
31945
31946 // Attempt to fold as unpack(x,x) << zext(y):
31947 // rotl(x,y) -> (unpack(x,x) << (y & (bw-1))) >> bw.
31948 // rotr(x,y) -> (unpack(x,x) >> (y & (bw-1))).
31949 if (IsConstAmt || supportedVectorVarShift(ExtVT, Subtarget, ShiftOpc)) {
31950 // See if we can perform this by unpacking to lo/hi vXi16.
31951 SDValue RLo = DAG.getBitcast(ExtVT, getUnpackl(DAG, DL, VT, R, R));
31952 SDValue RHi = DAG.getBitcast(ExtVT, getUnpackh(DAG, DL, VT, R, R));
31953 SDValue ALo = DAG.getBitcast(ExtVT, getUnpackl(DAG, DL, VT, AmtMod, Z));
31954 SDValue AHi = DAG.getBitcast(ExtVT, getUnpackh(DAG, DL, VT, AmtMod, Z));
31955 SDValue Lo = DAG.getNode(ShiftOpc, DL, ExtVT, RLo, ALo);
31956 SDValue Hi = DAG.getNode(ShiftOpc, DL, ExtVT, RHi, AHi);
31957 return getPack(DAG, Subtarget, DL, VT, Lo, Hi, IsROTL);
31958 }
31959     assert((VT == MVT::v16i8 || VT == MVT::v32i8) && "Unsupported vXi8 type");
31960
31961 // We don't need ModuloAmt here as we just peek at individual bits.
31962 auto SignBitSelect = [&](MVT SelVT, SDValue Sel, SDValue V0, SDValue V1) {
31963 if (Subtarget.hasSSE41()) {
31964 // On SSE41 targets we can use PBLENDVB which selects bytes based just
31965 // on the sign bit.
31966 V0 = DAG.getBitcast(VT, V0);
31967 V1 = DAG.getBitcast(VT, V1);
31968 Sel = DAG.getBitcast(VT, Sel);
31969 return DAG.getBitcast(SelVT,
31970 DAG.getNode(X86ISD::BLENDV, DL, VT, Sel, V0, V1));
31971 }
31972 // On pre-SSE41 targets we test for the sign bit by comparing to
31973 // zero - a negative value will set all bits of the lanes to true
31974 // and VSELECT uses that in its OR(AND(V0,C),AND(V1,~C)) lowering.
31975 SDValue Z = DAG.getConstant(0, DL, SelVT);
31976 SDValue C = DAG.getNode(X86ISD::PCMPGT, DL, SelVT, Z, Sel);
31977 return DAG.getSelect(DL, SelVT, C, V0, V1);
31978 };
31979
31980 // ISD::ROTR is currently only profitable on AVX512 targets with VPTERNLOG.
31981 if (!IsROTL && !useVPTERNLOG(Subtarget, VT)) {
31982 Amt = DAG.getNode(ISD::SUB, DL, VT, Z, Amt);
31983 IsROTL = true;
31984 }
31985
31986 unsigned ShiftLHS = IsROTL ? ISD::SHL : ISD::SRL;
31987 unsigned ShiftRHS = IsROTL ? ISD::SRL : ISD::SHL;
31988
31989 // Turn 'a' into a mask suitable for VSELECT: a = a << 5;
31990 // We can safely do this using i16 shifts as we're only interested in
31991 // the 3 lower bits of each byte.
31992 Amt = DAG.getBitcast(ExtVT, Amt);
31993 Amt = DAG.getNode(ISD::SHL, DL, ExtVT, Amt, DAG.getConstant(5, DL, ExtVT));
31994 Amt = DAG.getBitcast(VT, Amt);
31995
31996 // r = VSELECT(r, rot(r, 4), a);
31997 SDValue M;
31998 M = DAG.getNode(
31999 ISD::OR, DL, VT,
32000 DAG.getNode(ShiftLHS, DL, VT, R, DAG.getConstant(4, DL, VT)),
32001 DAG.getNode(ShiftRHS, DL, VT, R, DAG.getConstant(4, DL, VT)));
32002 R = SignBitSelect(VT, Amt, M, R);
32003
32004 // a += a
32005 Amt = DAG.getNode(ISD::ADD, DL, VT, Amt, Amt);
32006
32007 // r = VSELECT(r, rot(r, 2), a);
32008 M = DAG.getNode(
32009 ISD::OR, DL, VT,
32010 DAG.getNode(ShiftLHS, DL, VT, R, DAG.getConstant(2, DL, VT)),
32011 DAG.getNode(ShiftRHS, DL, VT, R, DAG.getConstant(6, DL, VT)));
32012 R = SignBitSelect(VT, Amt, M, R);
32013
32014 // a += a
32015 Amt = DAG.getNode(ISD::ADD, DL, VT, Amt, Amt);
32016
32017 // return VSELECT(r, rot(r, 1), a);
32018 M = DAG.getNode(
32019 ISD::OR, DL, VT,
32020 DAG.getNode(ShiftLHS, DL, VT, R, DAG.getConstant(1, DL, VT)),
32021 DAG.getNode(ShiftRHS, DL, VT, R, DAG.getConstant(7, DL, VT)));
32022 return SignBitSelect(VT, Amt, M, R);
32023 }
32024
32025 bool IsSplatAmt = DAG.isSplatValue(Amt);
32026 bool ConstantAmt = ISD::isBuildVectorOfConstantSDNodes(Amt.getNode());
32027 bool LegalVarShifts = supportedVectorVarShift(VT, Subtarget, ISD::SHL) &&
32028 supportedVectorVarShift(VT, Subtarget, ISD::SRL);
32029
32030 // Fallback for splats + all supported variable shifts.
32031   // Fallback for non-constant AVX2 vXi16 as well.
32032 if (IsSplatAmt || LegalVarShifts || (Subtarget.hasAVX2() && !ConstantAmt)) {
32033 Amt = DAG.getNode(ISD::AND, DL, VT, Amt, AmtMask);
32034 SDValue AmtR = DAG.getConstant(EltSizeInBits, DL, VT);
32035 AmtR = DAG.getNode(ISD::SUB, DL, VT, AmtR, Amt);
32036 SDValue SHL = DAG.getNode(IsROTL ? ISD::SHL : ISD::SRL, DL, VT, R, Amt);
32037 SDValue SRL = DAG.getNode(IsROTL ? ISD::SRL : ISD::SHL, DL, VT, R, AmtR);
32038 return DAG.getNode(ISD::OR, DL, VT, SHL, SRL);
32039 }
32040
32041 // Everything below assumes ISD::ROTL.
32042 if (!IsROTL) {
32043 Amt = DAG.getNode(ISD::SUB, DL, VT, Z, Amt);
32044 IsROTL = true;
32045 }
32046
32047 // ISD::ROT* uses modulo rotate amounts.
32048 Amt = DAG.getNode(ISD::AND, DL, VT, Amt, AmtMask);
32049
32050   assert(IsROTL && "Only ROTL supported");
32051
32052 // As with shifts, attempt to convert the rotation amount to a multiplication
32053 // factor, fallback to general expansion.
32054 SDValue Scale = convertShiftLeftToScale(Amt, DL, Subtarget, DAG);
32055 if (!Scale)
32056 return SDValue();
32057
32058 // v8i16/v16i16: perform unsigned multiply hi/lo and OR the results.
32059 if (EltSizeInBits == 16) {
32060 SDValue Lo = DAG.getNode(ISD::MUL, DL, VT, R, Scale);
32061 SDValue Hi = DAG.getNode(ISD::MULHU, DL, VT, R, Scale);
32062 return DAG.getNode(ISD::OR, DL, VT, Lo, Hi);
32063 }
32064
32065 // v4i32: make use of the PMULUDQ instruction to multiply 2 lanes of v4i32
32066 // to v2i64 results at a time. The upper 32-bits contain the wrapped bits
32067 // that can then be OR'd with the lower 32-bits.
32068   assert(VT == MVT::v4i32 && "Only v4i32 vector rotate expected");
32069 static const int OddMask[] = {1, -1, 3, -1};
32070 SDValue R13 = DAG.getVectorShuffle(VT, DL, R, R, OddMask);
32071 SDValue Scale13 = DAG.getVectorShuffle(VT, DL, Scale, Scale, OddMask);
32072
32073 SDValue Res02 = DAG.getNode(X86ISD::PMULUDQ, DL, MVT::v2i64,
32074 DAG.getBitcast(MVT::v2i64, R),
32075 DAG.getBitcast(MVT::v2i64, Scale));
32076 SDValue Res13 = DAG.getNode(X86ISD::PMULUDQ, DL, MVT::v2i64,
32077 DAG.getBitcast(MVT::v2i64, R13),
32078 DAG.getBitcast(MVT::v2i64, Scale13));
32079 Res02 = DAG.getBitcast(VT, Res02);
32080 Res13 = DAG.getBitcast(VT, Res13);
32081
32082 return DAG.getNode(ISD::OR, DL, VT,
32083 DAG.getVectorShuffle(VT, DL, Res02, Res13, {0, 4, 2, 6}),
32084 DAG.getVectorShuffle(VT, DL, Res02, Res13, {1, 5, 3, 7}));
32085}
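The tail of LowerRotate converts each rotate amount into a scale factor (1 << amt) and forms the result from the low and high halves of an unsigned multiply, which is what the ISD::MUL/ISD::MULHU pair above computes per lane. A standalone 16-bit scalar sketch of the identity (illustrative only; the helper name is invented):

#include <cstdint>

// rotl16(x, n) expressed as low-product OR high-product of x * 2^n, mirroring
// the MUL/MULHU combination used above for v8i16/v16i16.
static uint16_t rotl16ViaMul(uint16_t X, unsigned N) {
  uint32_t Scale = 1u << (N & 15);
  uint32_t Prod = uint32_t(X) * Scale;       // 32-bit product of two 16-bit values
  uint16_t Lo = uint16_t(Prod);              // ISD::MUL result (shifted-out-left bits lost)
  uint16_t Hi = uint16_t(Prod >> 16);        // ISD::MULHU result (the wrapped-around bits)
  return Lo | Hi;
}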
32086
32087/// Returns true if the operand type is exactly twice the native width, and
32088/// the corresponding cmpxchg8b or cmpxchg16b instruction is available.
32089/// Used to know whether to use cmpxchg8/16b when expanding atomic operations
32090/// (otherwise we leave them alone to become __sync_fetch_and_... calls).
32091bool X86TargetLowering::needsCmpXchgNb(Type *MemType) const {
32092 unsigned OpWidth = MemType->getPrimitiveSizeInBits();
32093
32094 if (OpWidth == 64)
32095 return Subtarget.canUseCMPXCHG8B() && !Subtarget.is64Bit();
32096 if (OpWidth == 128)
32097 return Subtarget.canUseCMPXCHG16B();
32098
32099 return false;
32100}
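needsCmpXchgNb asks whether the access is exactly twice the native width and the matching cmpxchg8b/cmpxchg16b is available; only then are oversized atomics expanded inline instead of becoming library calls. A hedged source-level illustration (the type and function names are invented, and the inline expansion assumes cmpxchg16b is available to the compiler):

#include <atomic>
#include <cstdint>

struct Pair { uint64_t A, B; };   // 128-bit, trivially copyable payload

// On x86-64 with cmpxchg16b available, an atomic RMW on a 16-byte object
// (twice the native width) can be expanded through a cmpxchg16b loop rather
// than a __sync/__atomic library call; 8-byte objects on 32-bit targets play
// the analogous role via cmpxchg8b.
Pair swapPair(std::atomic<Pair> &P, Pair New) {
  return P.exchange(New);
}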
32101
32102TargetLoweringBase::AtomicExpansionKind
32103X86TargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI) const {
32104 Type *MemType = SI->getValueOperand()->getType();
32105
32106 bool NoImplicitFloatOps =
32107 SI->getFunction()->hasFnAttribute(Attribute::NoImplicitFloat);
32108 if (MemType->getPrimitiveSizeInBits() == 64 && !Subtarget.is64Bit() &&
32109 !Subtarget.useSoftFloat() && !NoImplicitFloatOps &&
32110 (Subtarget.hasSSE1() || Subtarget.hasX87()))
32111 return AtomicExpansionKind::None;
32112
32113 return needsCmpXchgNb(MemType) ? AtomicExpansionKind::Expand
32114 : AtomicExpansionKind::None;
32115}
32116
32117// Note: this turns large loads into lock cmpxchg8b/16b.
32118// TODO: In 32-bit mode, use MOVLPS when SSE1 is available?
32119TargetLowering::AtomicExpansionKind
32120X86TargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const {
32121 Type *MemType = LI->getType();
32122
32123   // If this is a 64-bit atomic load on a 32-bit target and SSE2 is enabled, we
32124 // can use movq to do the load. If we have X87 we can load into an 80-bit
32125 // X87 register and store it to a stack temporary.
32126 bool NoImplicitFloatOps =
32127 LI->getFunction()->hasFnAttribute(Attribute::NoImplicitFloat);
32128 if (MemType->getPrimitiveSizeInBits() == 64 && !Subtarget.is64Bit() &&
32129 !Subtarget.useSoftFloat() && !NoImplicitFloatOps &&
32130 (Subtarget.hasSSE1() || Subtarget.hasX87()))
32131 return AtomicExpansionKind::None;
32132
32133 return needsCmpXchgNb(MemType) ? AtomicExpansionKind::CmpXChg
32134 : AtomicExpansionKind::None;
32135}
32136
32137enum BitTestKind : unsigned {
32138 UndefBit,
32139 ConstantBit,
32140 NotConstantBit,
32141 ShiftBit,
32142 NotShiftBit
32143};
32144
32145static std::pair<Value *, BitTestKind> FindSingleBitChange(Value *V) {
32146 using namespace llvm::PatternMatch;
32147 BitTestKind BTK = UndefBit;
32148 auto *C = dyn_cast<ConstantInt>(V);
32149 if (C) {
32150     // Check if V is a power of 2 or the NOT of a power of 2.
32151 if (isPowerOf2_64(C->getZExtValue()))
32152 BTK = ConstantBit;
32153 else if (isPowerOf2_64((~C->getValue()).getZExtValue()))
32154 BTK = NotConstantBit;
32155 return {V, BTK};
32156 }
32157
32158 // Check if V is some power of 2 pattern known to be non-zero
32159 auto *I = dyn_cast<Instruction>(V);
32160 if (I) {
32161 bool Not = false;
32162 // Check if we have a NOT
32163 Value *PeekI;
32164 if (match(I, m_c_Xor(m_Value(PeekI), m_AllOnes())) ||
32165 match(I, m_Sub(m_AllOnes(), m_Value(PeekI)))) {
32166 Not = true;
32167 I = dyn_cast<Instruction>(PeekI);
32168
32169       // If I is constant, it will fold and we can evaluate it later. If it's an
32170       // argument or something of that nature, we can't analyze it.
32171 if (I == nullptr)
32172 return {nullptr, UndefBit};
32173 }
32174     // We can only use 1 << X without more sophisticated analysis. C << X, where
32175     // C is a power of 2 but not 1, can result in zero, which cannot be translated
32176     // to a bittest. Likewise, any C >> X (either arithmetic or logical) can be zero.
32177 if (I->getOpcode() == Instruction::Shl) {
32178 // Todo(1): The cmpxchg case is pretty costly so matching `BLSI(X)`, `X &
32179 // -X` and some other provable power of 2 patterns that we can use CTZ on
32180 // may be profitable.
32181 // Todo(2): It may be possible in some cases to prove that Shl(C, X) is
32182 // non-zero even where C != 1. Likewise LShr(C, X) and AShr(C, X) may also
32183 // be provably a non-zero power of 2.
32184 // Todo(3): ROTL and ROTR patterns on a power of 2 C should also be
32185 // transformable to bittest.
32186 auto *ShiftVal = dyn_cast<ConstantInt>(I->getOperand(0));
32187 if (!ShiftVal)
32188 return {nullptr, UndefBit};
32189 if (ShiftVal->equalsInt(1))
32190 BTK = Not ? NotShiftBit : ShiftBit;
32191
32192 if (BTK == UndefBit)
32193 return {nullptr, UndefBit};
32194
32195 Value *BitV = I->getOperand(1);
32196
32197 Value *AndOp;
32198 const APInt *AndC;
32199 if (match(BitV, m_c_And(m_Value(AndOp), m_APInt(AndC)))) {
32200         // Read past a shift-mask instruction to find the count.
32201 if (*AndC == (I->getType()->getPrimitiveSizeInBits() - 1))
32202 BitV = AndOp;
32203 }
32204 return {BitV, BTK};
32205 }
32206 }
32207 return {nullptr, UndefBit};
32208}
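FindSingleBitChange classifies an atomicrmw value operand into the BitTestKind shapes above. At the C++ source level those shapes roughly correspond to the operands produced by idioms like the following (a hedged illustration; the names are invented):

#include <atomic>
#include <cstdint>

void bitKindExamples(std::atomic<uint32_t> &Flags, unsigned N) {
  Flags.fetch_or(0x10u);                  // ConstantBit: constant power of 2
  Flags.fetch_and(~0x10u);                // NotConstantBit: ~(power of 2)
  Flags.fetch_or(1u << (N & 31));         // ShiftBit: 1 << X (the & 31 is read past)
  Flags.fetch_and(~(1u << (N & 31)));     // NotShiftBit: ~(1 << X)
}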
32209
32210TargetLowering::AtomicExpansionKind
32211X86TargetLowering::shouldExpandLogicAtomicRMWInIR(AtomicRMWInst *AI) const {
32212 // If the atomicrmw's result isn't actually used, we can just add a "lock"
32213 // prefix to a normal instruction for these operations.
32214 if (AI->use_empty())
32215 return AtomicExpansionKind::None;
32216
32217 // If the atomicrmw's result is used by a single bit AND, we may use
32218 // bts/btr/btc instruction for these operations.
32219 // Note: InstCombinePass can cause a de-optimization here. It replaces the
32220 // SETCC(And(AtomicRMW(P, power_of_2), power_of_2)) with LShr and Xor
32221   // (depending on CC). That rewritten pattern could still use bts/btr/btc, but
32222   // we don't detect it.
32223 Instruction *I = AI->user_back();
32224 auto BitChange = FindSingleBitChange(AI->getValOperand());
32225 if (BitChange.second == UndefBit || !AI->hasOneUse() ||
32226 I->getOpcode() != Instruction::And ||
32227 AI->getType()->getPrimitiveSizeInBits() == 8 ||
32228 AI->getParent() != I->getParent())
32229 return AtomicExpansionKind::CmpXChg;
32230
32231 unsigned OtherIdx = I->getOperand(0) == AI ? 1 : 0;
32232
32233   // This is a redundant AND; it should get cleaned up elsewhere.
32234 if (AI == I->getOperand(OtherIdx))
32235 return AtomicExpansionKind::CmpXChg;
32236
32237   // The following instruction must be an AND with a single bit.
32238 if (BitChange.second == ConstantBit || BitChange.second == NotConstantBit) {
32239 auto *C1 = cast<ConstantInt>(AI->getValOperand());
32240 auto *C2 = dyn_cast<ConstantInt>(I->getOperand(OtherIdx));
32241 if (!C2 || !isPowerOf2_64(C2->getZExtValue())) {
32242 return AtomicExpansionKind::CmpXChg;
32243 }
32244 if (AI->getOperation() == AtomicRMWInst::And) {
32245 return ~C1->getValue() == C2->getValue()
32246 ? AtomicExpansionKind::BitTestIntrinsic
32247 : AtomicExpansionKind::CmpXChg;
32248 }
32249 return C1 == C2 ? AtomicExpansionKind::BitTestIntrinsic
32250 : AtomicExpansionKind::CmpXChg;
32251 }
32252
32253   assert(BitChange.second == ShiftBit || BitChange.second == NotShiftBit);
32254
32255 auto BitTested = FindSingleBitChange(I->getOperand(OtherIdx));
32256 if (BitTested.second != ShiftBit && BitTested.second != NotShiftBit)
32257 return AtomicExpansionKind::CmpXChg;
32258
32259   assert(BitChange.first != nullptr && BitTested.first != nullptr);
32260
32261 // If shift amounts are not the same we can't use BitTestIntrinsic.
32262 if (BitChange.first != BitTested.first)
32263 return AtomicExpansionKind::CmpXChg;
32264
32265   // For an atomic AND, the value must mask all but one bit and the following
32266   // AND must test the one bit that is unset in the mask.
32267 if (AI->getOperation() == AtomicRMWInst::And)
32268 return (BitChange.second == NotShiftBit && BitTested.second == ShiftBit)
32269 ? AtomicExpansionKind::BitTestIntrinsic
32270 : AtomicExpansionKind::CmpXChg;
32271
32272   // For an atomic XOR/OR, it must be setting and testing the same bit.
32273 return (BitChange.second == ShiftBit && BitTested.second == ShiftBit)
32274 ? AtomicExpansionKind::BitTestIntrinsic
32275 : AtomicExpansionKind::CmpXChg;
32276}
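shouldExpandLogicAtomicRMWInIR only chooses BitTestIntrinsic when the atomicrmw result feeds a single AND that tests the very bit being changed. A source-level idiom that is expected to match that shape, and hence lower to a single lock bts rather than a cmpxchg loop (a hedged sketch; the names are invented):

#include <atomic>
#include <cstdint>

// Set bit N and report whether it was already set. The atomicrmw result is
// consumed by one AND against the same single-bit mask, which is the shape
// the code above maps onto the x86 bit-test intrinsics.
bool testAndSetBit(std::atomic<uint32_t> &Flags, unsigned N) {
  uint32_t Mask = 1u << (N & 31);
  return (Flags.fetch_or(Mask) & Mask) != 0;
}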
32277
32278void X86TargetLowering::emitBitTestAtomicRMWIntrinsic(AtomicRMWInst *AI) const {
32279 IRBuilder<> Builder(AI);
32280 Builder.CollectMetadataToCopy(AI, {LLVMContext::MD_pcsections});
32281 Intrinsic::ID IID_C = Intrinsic::not_intrinsic;
32282 Intrinsic::ID IID_I = Intrinsic::not_intrinsic;
32283 switch (AI->getOperation()) {
32284 default:
32285     llvm_unreachable("Unknown atomic operation");
32286 case AtomicRMWInst::Or:
32287 IID_C = Intrinsic::x86_atomic_bts;
32288 IID_I = Intrinsic::x86_atomic_bts_rm;
32289 break;
32290 case AtomicRMWInst::Xor:
32291 IID_C = Intrinsic::x86_atomic_btc;
32292 IID_I = Intrinsic::x86_atomic_btc_rm;
32293 break;
32294 case AtomicRMWInst::And:
32295 IID_C = Intrinsic::x86_atomic_btr;
32296 IID_I = Intrinsic::x86_atomic_btr_rm;
32297 break;
32298 }
32299 Instruction *I = AI->user_back();
32300 LLVMContext &Ctx = AI->getContext();
32301 Value *Addr = Builder.CreatePointerCast(AI->getPointerOperand(),
32302 Type::getInt8PtrTy(Ctx));
32303 Function *BitTest = nullptr;
32304 Value *Result = nullptr;
32305 auto BitTested = FindSingleBitChange(AI->getValOperand());
32306   assert(BitTested.first != nullptr);
32307
32308 if (BitTested.second == ConstantBit || BitTested.second == NotConstantBit) {
32309 auto *C = cast<ConstantInt>(I->getOperand(I->getOperand(0) == AI ? 1 : 0));
32310
32311 BitTest = Intrinsic::getDeclaration(AI->getModule(), IID_C, AI->getType());
32312
32313 unsigned Imm = llvm::countr_zero(C->getZExtValue());
32314 Result = Builder.CreateCall(BitTest, {Addr, Builder.getInt8(Imm)});
32315 } else {
32316 BitTest = Intrinsic::getDeclaration(AI->getModule(), IID_I, AI->getType());
32317
32318     assert(BitTested.second == ShiftBit || BitTested.second == NotShiftBit);
32319
32320 Value *SI = BitTested.first;
32321     assert(SI != nullptr);
32322
32323     // BT{S|R|C} on a memory operand doesn't modulo the bit position, so we
32324     // need to mask it.
32325 unsigned ShiftBits = SI->getType()->getPrimitiveSizeInBits();
32326 Value *BitPos =
32327 Builder.CreateAnd(SI, Builder.getIntN(ShiftBits, ShiftBits - 1));
32328     // Todo(1): In many cases it may be provable that SI is less than
32329     // ShiftBits, in which case this mask is unnecessary.
32330 // Todo(2): In the fairly idiomatic case of P[X / sizeof_bits(X)] OP 1
32331 // << (X % sizeof_bits(X)) we can drop the shift mask and AGEN in
32332 // favor of just a raw BT{S|R|C}.
32333
32334 Result = Builder.CreateCall(BitTest, {Addr, BitPos});
32335 Result = Builder.CreateZExtOrTrunc(Result, AI->getType());
32336
32337     // If the result is only used for zero/non-zero status then we don't need
32338     // to shift the value back. Otherwise do so.
32339 for (auto It = I->user_begin(); It != I->user_end(); ++It) {
32340 if (auto *ICmp = dyn_cast<ICmpInst>(*It)) {
32341 if (ICmp->isEquality()) {
32342 auto *C0 = dyn_cast<ConstantInt>(ICmp->getOperand(0));
32343 auto *C1 = dyn_cast<ConstantInt>(ICmp->getOperand(1));
32344 if (C0 || C1) {
32345             assert(C0 == nullptr || C1 == nullptr);
32346 if ((C0 ? C0 : C1)->isZero())
32347 continue;
32348 }
32349 }
32350 }
32351 Result = Builder.CreateShl(Result, BitPos);
32352 break;
32353 }
32354 }
32355
32356 I->replaceAllUsesWith(Result);
32357 I->eraseFromParent();
32358 AI->eraseFromParent();
32359}
32360
32361static bool shouldExpandCmpArithRMWInIR(AtomicRMWInst *AI) {
32362 using namespace llvm::PatternMatch;
32363 if (!AI->hasOneUse())
32364 return false;
32365
32366 Value *Op = AI->getOperand(1);
32367 ICmpInst::Predicate Pred;
32368 Instruction *I = AI->user_back();
32369 AtomicRMWInst::BinOp Opc = AI->getOperation();
32370 if (Opc == AtomicRMWInst::Add) {
32371 if (match(I, m_c_ICmp(Pred, m_Sub(m_ZeroInt(), m_Specific(Op)), m_Value())))
32372 return Pred == CmpInst::ICMP_EQ || Pred == CmpInst::ICMP_NE;
32373 if (match(I, m_OneUse(m_c_Add(m_Specific(Op), m_Value())))) {
32374 if (match(I->user_back(), m_ICmp(Pred, m_Value(), m_ZeroInt())))
32375 return Pred == CmpInst::ICMP_SLT;
32376 if (match(I->user_back(), m_ICmp(Pred, m_Value(), m_AllOnes())))
32377 return Pred == CmpInst::ICMP_SGT;
32378 }
32379 return false;
32380 }
32381 if (Opc == AtomicRMWInst::Sub) {
32382 if (match(I, m_c_ICmp(Pred, m_Specific(Op), m_Value())))
32383 return Pred == CmpInst::ICMP_EQ || Pred == CmpInst::ICMP_NE;
32384 if (match(I, m_OneUse(m_Sub(m_Value(), m_Specific(Op))))) {
32385 if (match(I->user_back(), m_ICmp(Pred, m_Value(), m_ZeroInt())))
32386 return Pred == CmpInst::ICMP_SLT;
32387 if (match(I->user_back(), m_ICmp(Pred, m_Value(), m_AllOnes())))
32388 return Pred == CmpInst::ICMP_SGT;
32389 }
32390 return false;
32391 }
32392 if ((Opc == AtomicRMWInst::Or &&
32393 match(I, m_OneUse(m_c_Or(m_Specific(Op), m_Value())))) ||
32394 (Opc == AtomicRMWInst::And &&
32395 match(I, m_OneUse(m_c_And(m_Specific(Op), m_Value()))))) {
32396 if (match(I->user_back(), m_ICmp(Pred, m_Value(), m_ZeroInt())))
32397 return Pred == CmpInst::ICMP_EQ || Pred == CmpInst::ICMP_NE ||
32398 Pred == CmpInst::ICMP_SLT;
32399 if (match(I->user_back(), m_ICmp(Pred, m_Value(), m_AllOnes())))
32400 return Pred == CmpInst::ICMP_SGT;
32401 return false;
32402 }
32403 if (Opc == AtomicRMWInst::Xor) {
32404 if (match(I, m_c_ICmp(Pred, m_Specific(Op), m_Value())))
32405 return Pred == CmpInst::ICMP_EQ || Pred == CmpInst::ICMP_NE;
32406 if (match(I, m_OneUse(m_c_Xor(m_Specific(Op), m_Value())))) {
32407 if (match(I->user_back(), m_ICmp(Pred, m_Value(), m_ZeroInt())))
32408 return Pred == CmpInst::ICMP_SLT;
32409 if (match(I->user_back(), m_ICmp(Pred, m_Value(), m_AllOnes())))
32410 return Pred == CmpInst::ICMP_SGT;
32411 }
32412 return false;
32413 }
32414
32415 return false;
32416}
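shouldExpandCmpArithRMWInIR matches atomic arithmetic whose only observable use is a comparison that maps onto a condition code, so the whole sequence can become one locked instruction plus a flag test. The classic reference-count release is one such shape (a hedged sketch; the name is invented):

#include <atomic>

// Decrement a reference count and detect the transition to zero. Comparing
// the old value against the subtracted amount is the sub-and-test shape the
// code above recognizes, allowing a lock sub + ZF test instead of a cmpxchg
// loop.
bool releaseRef(std::atomic<int> &RefCount) {
  return RefCount.fetch_sub(1, std::memory_order_acq_rel) == 1;
}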
32417
32418void X86TargetLowering::emitCmpArithAtomicRMWIntrinsic(
32419 AtomicRMWInst *AI) const {
32420 IRBuilder<> Builder(AI);
32421 Builder.CollectMetadataToCopy(AI, {LLVMContext::MD_pcsections});
32422 Instruction *TempI = nullptr;
32423 LLVMContext &Ctx = AI->getContext();
32424 ICmpInst *ICI = dyn_cast<ICmpInst>(AI->user_back());
32425 if (!ICI) {
32426 TempI = AI->user_back();
32427     assert(TempI->hasOneUse() && "Must have one use");
32428 ICI = cast<ICmpInst>(TempI->user_back());
32429 }
32430 X86::CondCode CC = X86::COND_INVALID;
32431 ICmpInst::Predicate Pred = ICI->getPredicate();
32432 switch (Pred) {
32433 default:
32434     llvm_unreachable("Not supported Pred");
32435 case CmpInst::ICMP_EQ:
32436 CC = X86::COND_E;
32437 break;
32438 case CmpInst::ICMP_NE:
32439 CC = X86::COND_NE;
32440 break;
32441 case CmpInst::ICMP_SLT:
32442 CC = X86::COND_S;
32443 break;
32444 case CmpInst::ICMP_SGT:
32445 CC = X86::COND_NS;
32446 break;
32447 }
32448 Intrinsic::ID IID = Intrinsic::not_intrinsic;
32449 switch (AI->getOperation()) {
32450 default:
32451     llvm_unreachable("Unknown atomic operation");
32452 case AtomicRMWInst::Add:
32453 IID = Intrinsic::x86_atomic_add_cc;
32454 break;
32455 case AtomicRMWInst::Sub:
32456 IID = Intrinsic::x86_atomic_sub_cc;
32457 break;
32458 case AtomicRMWInst::Or:
32459 IID = Intrinsic::x86_atomic_or_cc;
32460 break;
32461 case AtomicRMWInst::And:
32462 IID = Intrinsic::x86_atomic_and_cc;
32463 break;
32464 case AtomicRMWInst::Xor:
32465 IID = Intrinsic::x86_atomic_xor_cc;
32466 break;
32467 }
32468 Function *CmpArith =
32469 Intrinsic::getDeclaration(AI->getModule(), IID, AI->getType());
32470 Value *Addr = Builder.CreatePointerCast(AI->getPointerOperand(),
32471 Type::getInt8PtrTy(Ctx));
32472 Value *Call = Builder.CreateCall(
32473 CmpArith, {Addr, AI->getValOperand(), Builder.getInt32((unsigned)CC)});
32474 Value *Result = Builder.CreateTrunc(Call, Type::getInt1Ty(Ctx));
32475 ICI->replaceAllUsesWith(Result);
32476 ICI->eraseFromParent();
32477 if (TempI)
32478 TempI->eraseFromParent();
32479 AI->eraseFromParent();
32480}
32481
32482TargetLowering::AtomicExpansionKind
32483X86TargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const {
32484 unsigned NativeWidth = Subtarget.is64Bit() ? 64 : 32;
32485 Type *MemType = AI->getType();
32486
32487 // If the operand is too big, we must see if cmpxchg8/16b is available
32488 // and default to library calls otherwise.
32489 if (MemType->getPrimitiveSizeInBits() > NativeWidth) {
32490 return needsCmpXchgNb(MemType) ? AtomicExpansionKind::CmpXChg
32491 : AtomicExpansionKind::None;
32492 }
32493
32494 AtomicRMWInst::BinOp Op = AI->getOperation();
32495 switch (Op) {
32496 case AtomicRMWInst::Xchg:
32497 return AtomicExpansionKind::None;
32498 case AtomicRMWInst::Add:
32499 case AtomicRMWInst::Sub:
32500 if (shouldExpandCmpArithRMWInIR(AI))
32501 return AtomicExpansionKind::CmpArithIntrinsic;
32502 // It's better to use xadd, xsub or xchg for these in other cases.
32503 return AtomicExpansionKind::None;
32504 case AtomicRMWInst::Or:
32505 case AtomicRMWInst::And:
32506 case AtomicRMWInst::Xor:
32507 if (shouldExpandCmpArithRMWInIR(AI))
32508 return AtomicExpansionKind::CmpArithIntrinsic;
32509 return shouldExpandLogicAtomicRMWInIR(AI);
32510 case AtomicRMWInst::Nand:
32511 case AtomicRMWInst::Max:
32512 case AtomicRMWInst::Min:
32513 case AtomicRMWInst::UMax:
32514 case AtomicRMWInst::UMin:
32515 case AtomicRMWInst::FAdd:
32516 case AtomicRMWInst::FSub:
32517 case AtomicRMWInst::FMax:
32518 case AtomicRMWInst::FMin:
32519 case AtomicRMWInst::UIncWrap:
32520 case AtomicRMWInst::UDecWrap:
32521 default:
32522 // These always require a non-trivial set of data operations on x86. We must
32523 // use a cmpxchg loop.
32524 return AtomicExpansionKind::CmpXChg;
32525 }
32526}
32527
32528LoadInst *
32529X86TargetLowering::lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const {
32530 unsigned NativeWidth = Subtarget.is64Bit() ? 64 : 32;
32531 Type *MemType = AI->getType();
32532 // Accesses larger than the native width are turned into cmpxchg/libcalls, so
32533 // there is no benefit in turning such RMWs into loads, and it is actually
32534   // harmful as it introduces an mfence.
32535 if (MemType->getPrimitiveSizeInBits() > NativeWidth)
32536 return nullptr;
32537
32538 // If this is a canonical idempotent atomicrmw w/no uses, we have a better
32539 // lowering available in lowerAtomicArith.
32540 // TODO: push more cases through this path.
32541 if (auto *C = dyn_cast<ConstantInt>(AI->getValOperand()))
32542 if (AI->getOperation() == AtomicRMWInst::Or && C->isZero() &&
32543 AI->use_empty())
32544 return nullptr;
32545
32546 IRBuilder<> Builder(AI);
32547 Builder.CollectMetadataToCopy(AI, {LLVMContext::MD_pcsections});
32548 Module *M = Builder.GetInsertBlock()->getParent()->getParent();
32549 auto SSID = AI->getSyncScopeID();
32550 // We must restrict the ordering to avoid generating loads with Release or
32551 // ReleaseAcquire orderings.
32552 auto Order = AtomicCmpXchgInst::getStrongestFailureOrdering(AI->getOrdering());
32553
32554 // Before the load we need a fence. Here is an example lifted from
32555 // http://www.hpl.hp.com/techreports/2012/HPL-2012-68.pdf showing why a fence
32556 // is required:
32557 // Thread 0:
32558 // x.store(1, relaxed);
32559 // r1 = y.fetch_add(0, release);
32560 // Thread 1:
32561 // y.fetch_add(42, acquire);
32562 // r2 = x.load(relaxed);
32563 // r1 = r2 = 0 is impossible, but becomes possible if the idempotent rmw is
32564   // lowered to just a load without a fence. An mfence flushes the store buffer,
32565 // making the optimization clearly correct.
32566 // FIXME: it is required if isReleaseOrStronger(Order) but it is not clear
32567 // otherwise, we might be able to be more aggressive on relaxed idempotent
32568 // rmw. In practice, they do not look useful, so we don't try to be
32569 // especially clever.
32570 if (SSID == SyncScope::SingleThread)
32571 // FIXME: we could just insert an ISD::MEMBARRIER here, except we are at
32572 // the IR level, so we must wrap it in an intrinsic.
32573 return nullptr;
32574
32575 if (!Subtarget.hasMFence())
32576 // FIXME: it might make sense to use a locked operation here but on a
32577 // different cache-line to prevent cache-line bouncing. In practice it
32578 // is probably a small win, and x86 processors without mfence are rare
32579 // enough that we do not bother.
32580 return nullptr;
32581
32582 Function *MFence =
32583 llvm::Intrinsic::getDeclaration(M, Intrinsic::x86_sse2_mfence);
32584 Builder.CreateCall(MFence, {});
32585
32586 // Finally we can emit the atomic load.
32587 LoadInst *Loaded = Builder.CreateAlignedLoad(
32588 AI->getType(), AI->getPointerOperand(), AI->getAlign());
32589 Loaded->setAtomic(Order, SSID);
32590 AI->replaceAllUsesWith(Loaded);
32591 AI->eraseFromParent();
32592 return Loaded;
32593}
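lowerIdempotentRMWIntoFencedLoad turns a no-op RMW, used only for its ordering and return value, into a fence followed by an ordinary atomic load. A source-level sketch of the idiom it targets and of the conceptual rewrite (illustrative only; std::atomic_thread_fence stands in for the MFENCE emitted above):

#include <atomic>

// An idempotent RMW used purely for its ordering and read-back...
int readWithRMW(std::atomic<int> &X) {
  return X.fetch_add(0, std::memory_order_seq_cst);
}

// ...is conceptually rewritten to fence + load, as done above at the IR level.
int readWithFence(std::atomic<int> &X) {
  std::atomic_thread_fence(std::memory_order_seq_cst); // stands in for the mfence
  return X.load(std::memory_order_seq_cst);
}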
32594
32595bool X86TargetLowering::lowerAtomicStoreAsStoreSDNode(const StoreInst &SI) const {
32596 if (!SI.isUnordered())
32597 return false;
32598 return ExperimentalUnorderedISEL;
32599}
32600bool X86TargetLowering::lowerAtomicLoadAsLoadSDNode(const LoadInst &LI) const {
32601 if (!LI.isUnordered())
32602 return false;
32603 return ExperimentalUnorderedISEL;
32604}
32605
32606
32607/// Emit a locked operation on a stack location which does not change any
32608/// memory location, but does involve a lock prefix. Location is chosen to be
32609/// a) very likely accessed only by a single thread to minimize cache traffic,
32610/// and b) definitely dereferenceable. Returns the new Chain result.
32611static SDValue emitLockedStackOp(SelectionDAG &DAG,
32612 const X86Subtarget &Subtarget, SDValue Chain,
32613 const SDLoc &DL) {
32614 // Implementation notes:
32615 // 1) LOCK prefix creates a full read/write reordering barrier for memory
32616 // operations issued by the current processor. As such, the location
32617 // referenced is not relevant for the ordering properties of the instruction.
32618   // See: Intel® 64 and IA-32 Architectures Software Developer’s Manual,
32619 // 8.2.3.9 Loads and Stores Are Not Reordered with Locked Instructions
32620 // 2) Using an immediate operand appears to be the best encoding choice
32621 // here since it doesn't require an extra register.
32622 // 3) OR appears to be very slightly faster than ADD. (Though, the difference
32623 // is small enough it might just be measurement noise.)
32624 // 4) When choosing offsets, there are several contributing factors:
32625 // a) If there's no redzone, we default to TOS. (We could allocate a cache
32626 // line aligned stack object to improve this case.)
32627 // b) To minimize our chances of introducing a false dependence, we prefer
32628 // to offset the stack usage from TOS slightly.
32629 // c) To minimize concerns about cross thread stack usage - in particular,
32630 // the idiomatic MyThreadPool.run([&StackVars]() {...}) pattern which
32631 // captures state in the TOS frame and accesses it from many threads -
32632 // we want to use an offset such that the offset is in a distinct cache
32633 // line from the TOS frame.
32634 //
32635 // For a general discussion of the tradeoffs and benchmark results, see:
32636 // https://shipilev.net/blog/2014/on-the-fence-with-dependencies/
32637
32638 auto &MF = DAG.getMachineFunction();
32639 auto &TFL = *Subtarget.getFrameLowering();
32640 const unsigned SPOffset = TFL.has128ByteRedZone(MF) ? -64 : 0;
32641
32642 if (Subtarget.is64Bit()) {
32643 SDValue Zero = DAG.getTargetConstant(0, DL, MVT::i32);
32644 SDValue Ops[] = {
32645 DAG.getRegister(X86::RSP, MVT::i64), // Base
32646 DAG.getTargetConstant(1, DL, MVT::i8), // Scale
32647 DAG.getRegister(0, MVT::i64), // Index
32648 DAG.getTargetConstant(SPOffset, DL, MVT::i32), // Disp
32649 DAG.getRegister(0, MVT::i16), // Segment.
32650 Zero,
32651 Chain};
32652 SDNode *Res = DAG.getMachineNode(X86::OR32mi8Locked, DL, MVT::i32,
32653 MVT::Other, Ops);
32654 return SDValue(Res, 1);
32655 }
32656
32657 SDValue Zero = DAG.getTargetConstant(0, DL, MVT::i32);
32658 SDValue Ops[] = {
32659 DAG.getRegister(X86::ESP, MVT::i32), // Base
32660 DAG.getTargetConstant(1, DL, MVT::i8), // Scale
32661 DAG.getRegister(0, MVT::i32), // Index
32662 DAG.getTargetConstant(SPOffset, DL, MVT::i32), // Disp
32663 DAG.getRegister(0, MVT::i16), // Segment.
32664 Zero,
32665 Chain
32666 };
32667 SDNode *Res = DAG.getMachineNode(X86::OR32mi8Locked, DL, MVT::i32,
32668 MVT::Other, Ops);
32669 return SDValue(Res, 1);
32670}
32671
32672static SDValue LowerATOMIC_FENCE(SDValue Op, const X86Subtarget &Subtarget,
32673 SelectionDAG &DAG) {
32674 SDLoc dl(Op);
32675 AtomicOrdering FenceOrdering =
32676 static_cast<AtomicOrdering>(Op.getConstantOperandVal(1));
32677 SyncScope::ID FenceSSID =
32678 static_cast<SyncScope::ID>(Op.getConstantOperandVal(2));
32679
32680 // The only fence that needs an instruction is a sequentially-consistent
32681 // cross-thread fence.
32682 if (FenceOrdering == AtomicOrdering::SequentiallyConsistent &&
32683 FenceSSID == SyncScope::System) {
32684 if (Subtarget.hasMFence())
32685 return DAG.getNode(X86ISD::MFENCE, dl, MVT::Other, Op.getOperand(0));
32686
32687 SDValue Chain = Op.getOperand(0);
32688 return emitLockedStackOp(DAG, Subtarget, Chain, dl);
32689 }
32690
32691 // MEMBARRIER is a compiler barrier; it codegens to a no-op.
32692 return DAG.getNode(ISD::MEMBARRIER, dl, MVT::Other, Op.getOperand(0));
32693}
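LowerATOMIC_FENCE only emits a real instruction, MFENCE or the locked stack OR, for a sequentially consistent cross-thread fence; weaker fences become a compiler-only MEMBARRIER. At the source level the distinction looks roughly like this (illustrative):

#include <atomic>

void fenceExamples() {
  std::atomic_thread_fence(std::memory_order_seq_cst); // needs MFENCE or the locked stack op
  std::atomic_thread_fence(std::memory_order_acquire); // MEMBARRIER: codegens to a no-op
  std::atomic_thread_fence(std::memory_order_release); // MEMBARRIER: codegens to a no-op
}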
32694
32695static SDValue LowerCMP_SWAP(SDValue Op, const X86Subtarget &Subtarget,
32696 SelectionDAG &DAG) {
32697 MVT T = Op.getSimpleValueType();
32698 SDLoc DL(Op);
32699 unsigned Reg = 0;
32700 unsigned size = 0;
32701 switch(T.SimpleTy) {
32702   default: llvm_unreachable("Invalid value type!");
32703 case MVT::i8: Reg = X86::AL; size = 1; break;
32704 case MVT::i16: Reg = X86::AX; size = 2; break;
32705 case MVT::i32: Reg = X86::EAX; size = 4; break;
32706 case MVT::i64:
32707     assert(Subtarget.is64Bit() && "Node not type legal!");
32708 Reg = X86::RAX; size = 8;
32709 break;
32710 }
32711 SDValue cpIn = DAG.getCopyToReg(Op.getOperand(0), DL, Reg,
32712 Op.getOperand(2), SDValue());
32713 SDValue Ops[] = { cpIn.getValue(0),
32714 Op.getOperand(1),
32715 Op.getOperand(3),
32716 DAG.getTargetConstant(size, DL, MVT::i8),
32717 cpIn.getValue(1) };
32718 SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
32719 MachineMemOperand *MMO = cast<AtomicSDNode>(Op)->getMemOperand();
32720 SDValue Result = DAG.getMemIntrinsicNode(X86ISD::LCMPXCHG_DAG, DL, Tys,
32721 Ops, T, MMO);
32722
32723 SDValue cpOut =
32724 DAG.getCopyFromReg(Result.getValue(0), DL, Reg, T, Result.getValue(1));
32725 SDValue EFLAGS = DAG.getCopyFromReg(cpOut.getValue(1), DL, X86::EFLAGS,
32726 MVT::i32, cpOut.getValue(2));
32727 SDValue Success = getSETCC(X86::COND_E, EFLAGS, DL, DAG);
32728
32729 return DAG.getNode(ISD::MERGE_VALUES, DL, Op->getVTList(),
32730 cpOut, Success, EFLAGS.getValue(1));
32731}
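
For orientation, the following sketch (not part of the LLVM source) shows the C++-level operation this lowering serves. The compare-exchange maps onto the sequence above: the expected value is copied into AL/AX/EAX/RAX, LCMPXCHG_DAG performs the locked compare-and-swap, and success is recovered from ZF via SETCC(COND_E).

#include <atomic>
#include <cstdint>

// Returns true when Slot held Expected and was replaced by Desired; on
// failure Expected is refreshed from memory (the CopyFromReg of RAX above).
bool tryUpdate(std::atomic<uint64_t> &Slot, uint64_t &Expected,
               uint64_t Desired) {
  return Slot.compare_exchange_strong(Expected, Desired);
}
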
32732
32733// Create MOVMSKB, taking into account whether we need to split for AVX1.
32734static SDValue getPMOVMSKB(const SDLoc &DL, SDValue V, SelectionDAG &DAG,
32735 const X86Subtarget &Subtarget) {
32736 MVT InVT = V.getSimpleValueType();
32737
32738 if (InVT == MVT::v64i8) {
32739 SDValue Lo, Hi;
32740 std::tie(Lo, Hi) = DAG.SplitVector(V, DL);
32741 Lo = getPMOVMSKB(DL, Lo, DAG, Subtarget);
32742 Hi = getPMOVMSKB(DL, Hi, DAG, Subtarget);
32743 Lo = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, Lo);
32744 Hi = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i64, Hi);
32745 Hi = DAG.getNode(ISD::SHL, DL, MVT::i64, Hi,
32746 DAG.getConstant(32, DL, MVT::i8));
32747 return DAG.getNode(ISD::OR, DL, MVT::i64, Lo, Hi);
32748 }
32749 if (InVT == MVT::v32i8 && !Subtarget.hasInt256()) {
32750 SDValue Lo, Hi;
32751 std::tie(Lo, Hi) = DAG.SplitVector(V, DL);
32752 Lo = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Lo);
32753 Hi = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Hi);
32754 Hi = DAG.getNode(ISD::SHL, DL, MVT::i32, Hi,
32755 DAG.getConstant(16, DL, MVT::i8));
32756 return DAG.getNode(ISD::OR, DL, MVT::i32, Lo, Hi);
32757 }
32758
32759 return DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, V);
32760}
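
The same split-and-merge idea, written with SSE2 intrinsics as an illustrative sketch (not part of the LLVM source), assuming the input has already been split into two 128-bit halves as in the v32i8-without-Int256 path above:

#include <cstdint>
#include <emmintrin.h> // SSE2: _mm_movemask_epi8

// Collect the sign bit of each of 32 bytes into one 32-bit mask.
static inline uint32_t movemask32(__m128i Lo, __m128i Hi) {
  uint32_t LoMask = static_cast<uint32_t>(_mm_movemask_epi8(Lo)); // bits 0..15
  uint32_t HiMask = static_cast<uint32_t>(_mm_movemask_epi8(Hi)); // bits 16..31
  return LoMask | (HiMask << 16); // mirrors the SHL-by-16 + OR above
}
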
32761
32762static SDValue LowerBITCAST(SDValue Op, const X86Subtarget &Subtarget,
32763 SelectionDAG &DAG) {
32764 SDValue Src = Op.getOperand(0);
32765 MVT SrcVT = Src.getSimpleValueType();
32766 MVT DstVT = Op.getSimpleValueType();
32767
32768 // Legalize (v64i1 (bitcast i64 (X))) by splitting the i64, bitcasting each
32769 // half to v32i1 and concatenating the result.
32770 if (SrcVT == MVT::i64 && DstVT == MVT::v64i1) {
32771 assert(!Subtarget.is64Bit() && "Expected 32-bit mode");
32772 assert(Subtarget.hasBWI() && "Expected BWI target");
32773 SDLoc dl(Op);
32774 SDValue Lo, Hi;
32775 std::tie(Lo, Hi) = DAG.SplitScalar(Src, dl, MVT::i32, MVT::i32);
32776 Lo = DAG.getBitcast(MVT::v32i1, Lo);
32777 Hi = DAG.getBitcast(MVT::v32i1, Hi);
32778 return DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v64i1, Lo, Hi);
32779 }
32780
32781 // Use MOVMSK for vector to scalar conversion to prevent scalarization.
32782 if ((SrcVT == MVT::v16i1 || SrcVT == MVT::v32i1) && DstVT.isScalarInteger()) {
32783 assert(!Subtarget.hasAVX512() && "Should use K-registers with AVX512");
32784 MVT SExtVT = SrcVT == MVT::v16i1 ? MVT::v16i8 : MVT::v32i8;
32785 SDLoc DL(Op);
32786 SDValue V = DAG.getSExtOrTrunc(Src, DL, SExtVT);
32787 V = getPMOVMSKB(DL, V, DAG, Subtarget);
32788 return DAG.getZExtOrTrunc(V, DL, DstVT);
32789 }
32790
32791 assert((SrcVT == MVT::v2i32 || SrcVT == MVT::v4i16 || SrcVT == MVT::v8i8 ||
32792 SrcVT == MVT::i64) && "Unexpected VT!");
32793
32794 assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
32795 if (!(DstVT == MVT::f64 && SrcVT == MVT::i64) &&
32796 !(DstVT == MVT::x86mmx && SrcVT.isVector()))
32797 // This conversion needs to be expanded.
32798 return SDValue();
32799
32800 SDLoc dl(Op);
32801 if (SrcVT.isVector()) {
32802 // Widen the input vector in the case of MVT::v2i32.
32803 // Example: from MVT::v2i32 to MVT::v4i32.
32804 MVT NewVT = MVT::getVectorVT(SrcVT.getVectorElementType(),
32805 SrcVT.getVectorNumElements() * 2);
32806 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, NewVT, Src,
32807 DAG.getUNDEF(SrcVT));
32808 } else {
32809 assert(SrcVT == MVT::i64 && !Subtarget.is64Bit() &&
32810 "Unexpected source type in LowerBITCAST");
32811 Src = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Src);
32812 }
32813
32814 MVT V2X64VT = DstVT == MVT::f64 ? MVT::v2f64 : MVT::v2i64;
32815 Src = DAG.getNode(ISD::BITCAST, dl, V2X64VT, Src);
32816
32817 if (DstVT == MVT::x86mmx)
32818 return DAG.getNode(X86ISD::MOVDQ2Q, dl, DstVT, Src);
32819
32820 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, DstVT, Src,
32821 DAG.getIntPtrConstant(0, dl));
32822}
32823
32824/// Compute the horizontal sum of bytes in V for the elements of VT.
32825///
32826/// Requires V to be a byte vector and VT to be an integer vector type with
32827/// wider elements than V's type. The width of the elements of VT determines
32828/// how many bytes of V are summed horizontally to produce each element of the
32829/// result.
32830static SDValue LowerHorizontalByteSum(SDValue V, MVT VT,
32831 const X86Subtarget &Subtarget,
32832 SelectionDAG &DAG) {
32833 SDLoc DL(V);
32834 MVT ByteVecVT = V.getSimpleValueType();
32835 MVT EltVT = VT.getVectorElementType();
32836 assert(ByteVecVT.getVectorElementType() == MVT::i8 &&
32837 "Expected value to have byte element type.");
32838 assert(EltVT != MVT::i8 &&
32839 "Horizontal byte sum only makes sense for wider elements!");
32840 unsigned VecSize = VT.getSizeInBits();
32841 assert(ByteVecVT.getSizeInBits() == VecSize && "Cannot change vector size!");
32842
32843 // The PSADBW instruction horizontally adds all bytes and leaves the result
32844 // in i64 chunks, thus directly computing the pop count for v2i64 and v4i64.
32845 if (EltVT == MVT::i64) {
32846 SDValue Zeros = DAG.getConstant(0, DL, ByteVecVT);
32847 MVT SadVecVT = MVT::getVectorVT(MVT::i64, VecSize / 64);
32848 V = DAG.getNode(X86ISD::PSADBW, DL, SadVecVT, V, Zeros);
32849 return DAG.getBitcast(VT, V);
32850 }
32851
32852 if (EltVT == MVT::i32) {
32853 // We unpack the low half and high half into i32s interleaved with zeros so
32854 // that we can use PSADBW to horizontally sum them. The most useful part of
32855 // this is that it lines up the results of two PSADBW instructions to be
32856 // two v2i64 vectors which concatenated are the 4 population counts. We can
32857 // then use PACKUSWB to shrink and concatenate them into a v4i32 again.
32858 SDValue Zeros = DAG.getConstant(0, DL, VT);
32859 SDValue V32 = DAG.getBitcast(VT, V);
32860 SDValue Low = getUnpackl(DAG, DL, VT, V32, Zeros);
32861 SDValue High = getUnpackh(DAG, DL, VT, V32, Zeros);
32862
32863 // Do the horizontal sums into two v2i64s.
32864 Zeros = DAG.getConstant(0, DL, ByteVecVT);
32865 MVT SadVecVT = MVT::getVectorVT(MVT::i64, VecSize / 64);
32866 Low = DAG.getNode(X86ISD::PSADBW, DL, SadVecVT,
32867 DAG.getBitcast(ByteVecVT, Low), Zeros);
32868 High = DAG.getNode(X86ISD::PSADBW, DL, SadVecVT,
32869 DAG.getBitcast(ByteVecVT, High), Zeros);
32870
32871 // Merge them together.
32872 MVT ShortVecVT = MVT::getVectorVT(MVT::i16, VecSize / 16);
32873 V = DAG.getNode(X86ISD::PACKUS, DL, ByteVecVT,
32874 DAG.getBitcast(ShortVecVT, Low),
32875 DAG.getBitcast(ShortVecVT, High));
32876
32877 return DAG.getBitcast(VT, V);
32878 }
32879
32880 // The only element type left is i16.
32881 assert(EltVT == MVT::i16 && "Unknown how to handle type");
32882
32883 // To obtain the pop count for each i16 element starting from the pop count
32884 // for i8 elements, shift the i16s left by 8, sum as i8s, and then shift the
32885 // i16s right by 8. It is important to shift as i16s because an i8 vector
32886 // shift isn't directly supported.
32887 SDValue ShifterV = DAG.getConstant(8, DL, VT);
32888 SDValue Shl = DAG.getNode(ISD::SHL, DL, VT, DAG.getBitcast(VT, V), ShifterV);
32889 V = DAG.getNode(ISD::ADD, DL, ByteVecVT, DAG.getBitcast(ByteVecVT, Shl),
32890 DAG.getBitcast(ByteVecVT, V));
32891 return DAG.getNode(ISD::SRL, DL, VT, DAG.getBitcast(VT, V), ShifterV);
32892}
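
A scalar model of what this function computes, given as an illustrative sketch rather than anything taken from the LLVM source: each destination element of width EltBits is the sum of its EltBits/8 source bytes, which is what the PSADBW, PACKUS, and shift sequences above achieve vector-wide.

#include <cstdint>
#include <vector>

std::vector<uint64_t> horizontalByteSum(const std::vector<uint8_t> &Bytes,
                                        unsigned EltBits) {
  const unsigned BytesPerElt = EltBits / 8;
  std::vector<uint64_t> Out(Bytes.size() / BytesPerElt, 0);
  for (size_t i = 0; i < Bytes.size(); ++i)
    Out[i / BytesPerElt] += Bytes[i]; // group-wise horizontal add
  return Out;
}
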
32893
32894static SDValue LowerVectorCTPOPInRegLUT(SDValue Op, const SDLoc &DL,
32895 const X86Subtarget &Subtarget,
32896 SelectionDAG &DAG) {
32897 MVT VT = Op.getSimpleValueType();
32898 MVT EltVT = VT.getVectorElementType();
32899 int NumElts = VT.getVectorNumElements();
32900 (void)EltVT;
32901 assert(EltVT == MVT::i8 && "Only vXi8 vector CTPOP lowering supported.");
32902
32903 // Implement a lookup table in register by using an algorithm based on:
32904 // http://wm.ite.pl/articles/sse-popcount.html
32905 //
32906 // The general idea is that every lower byte nibble in the input vector is an
32907 // index into an in-register pre-computed pop count table. We then split up the
32908 // input vector into two new ones: (1) a vector with only the shifted-right
32909 // higher nibbles for each byte and (2) a vector with the lower nibbles (and
32910 // masked out higher ones) for each byte. PSHUFB is used separately with both
32911 // to index the in-register table. Next, both are added and the result is an
32912 // i8 vector where each element contains the pop count for its input byte.
32913 const int LUT[16] = {/* 0 */ 0, /* 1 */ 1, /* 2 */ 1, /* 3 */ 2,
32914 /* 4 */ 1, /* 5 */ 2, /* 6 */ 2, /* 7 */ 3,
32915 /* 8 */ 1, /* 9 */ 2, /* a */ 2, /* b */ 3,
32916 /* c */ 2, /* d */ 3, /* e */ 3, /* f */ 4};
32917
32918 SmallVector<SDValue, 64> LUTVec;
32919 for (int i = 0; i < NumElts; ++i)
32920 LUTVec.push_back(DAG.getConstant(LUT[i % 16], DL, MVT::i8));
32921 SDValue InRegLUT = DAG.getBuildVector(VT, DL, LUTVec);
32922 SDValue M0F = DAG.getConstant(0x0F, DL, VT);
32923
32924 // High nibbles
32925 SDValue FourV = DAG.getConstant(4, DL, VT);
32926 SDValue HiNibbles = DAG.getNode(ISD::SRL, DL, VT, Op, FourV);
32927
32928 // Low nibbles
32929 SDValue LoNibbles = DAG.getNode(ISD::AND, DL, VT, Op, M0F);
32930
32931 // The input vector is used as the shuffle mask that indexes elements into the
32932 // LUT. After counting low and high nibbles, add the two results to obtain the
32933 // final pop count per i8 element.
32934 SDValue HiPopCnt = DAG.getNode(X86ISD::PSHUFB, DL, VT, InRegLUT, HiNibbles);
32935 SDValue LoPopCnt = DAG.getNode(X86ISD::PSHUFB, DL, VT, InRegLUT, LoNibbles);
32936 return DAG.getNode(ISD::ADD, DL, VT, HiPopCnt, LoPopCnt);
32937}
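
A scalar model of the in-register LUT algorithm, offered as an illustrative sketch (not part of the LLVM source). PSHUFB performs the table lookup for all bytes of the vector at once; here the same lookup is shown one byte at a time.

#include <cstdint>

uint8_t popcount8ViaLUT(uint8_t B) {
  static const uint8_t LUT[16] = {0, 1, 1, 2, 1, 2, 2, 3,
                                  1, 2, 2, 3, 2, 3, 3, 4};
  uint8_t Lo = B & 0x0F;        // low nibble, like the AND with 0x0F above
  uint8_t Hi = (B >> 4) & 0x0F; // high nibble, like the SRL by 4 above
  return LUT[Lo] + LUT[Hi];     // LoPopCnt + HiPopCnt
}
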
32938
32939// Please ensure that any codegen change from LowerVectorCTPOP is reflected in
32940// updated cost models in X86TTIImpl::getIntrinsicInstrCost.
32941static SDValue LowerVectorCTPOP(SDValue Op, const X86Subtarget &Subtarget,
32942 SelectionDAG &DAG) {
32943 MVT VT = Op.getSimpleValueType();
32944 assert((VT.is512BitVector() || VT.is256BitVector() || VT.is128BitVector()) &&
32945 "Unknown CTPOP type to handle");
32946 SDLoc DL(Op.getNode());
32947 SDValue Op0 = Op.getOperand(0);
32948
32949 // TRUNC(CTPOP(ZEXT(X))) to make use of vXi32/vXi64 VPOPCNT instructions.
32950 if (Subtarget.hasVPOPCNTDQ()) {
32951 unsigned NumElems = VT.getVectorNumElements();
32952 assert((VT.getVectorElementType() == MVT::i8 ||
32953 VT.getVectorElementType() == MVT::i16) && "Unexpected type");
32954 if (NumElems < 16 || (NumElems == 16 && Subtarget.canExtendTo512DQ())) {
32955 MVT NewVT = MVT::getVectorVT(MVT::i32, NumElems);
32956 Op = DAG.getNode(ISD::ZERO_EXTEND, DL, NewVT, Op0);
32957 Op = DAG.getNode(ISD::CTPOP, DL, NewVT, Op);
32958 return DAG.getNode(ISD::TRUNCATE, DL, VT, Op);
32959 }
32960 }
32961
32962 // Decompose 256-bit ops into smaller 128-bit ops.
32963 if (VT.is256BitVector() && !Subtarget.hasInt256())
32964 return splitVectorIntUnary(Op, DAG);
32965
32966 // Decompose 512-bit ops into smaller 256-bit ops.
32967 if (VT.is512BitVector() && !Subtarget.hasBWI())
32968 return splitVectorIntUnary(Op, DAG);
32969
32970 // For element types greater than i8, do vXi8 pop counts and a bytesum.
32971 if (VT.getScalarType() != MVT::i8) {
32972 MVT ByteVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8);
32973 SDValue ByteOp = DAG.getBitcast(ByteVT, Op0);
32974 SDValue PopCnt8 = DAG.getNode(ISD::CTPOP, DL, ByteVT, ByteOp);
32975 return LowerHorizontalByteSum(PopCnt8, VT, Subtarget, DAG);
32976 }
32977
32978 // We can't use the fast LUT approach, so fall back on LegalizeDAG.
32979 if (!Subtarget.hasSSSE3())
32980 return SDValue();
32981
32982 return LowerVectorCTPOPInRegLUT(Op0, DL, Subtarget, DAG);
32983}
32984
32985static SDValue LowerCTPOP(SDValue Op, const X86Subtarget &Subtarget,
32986 SelectionDAG &DAG) {
32987 assert(Op.getSimpleValueType().isVector() &&
32988 "We only do custom lowering for vector population count.");
32989 return LowerVectorCTPOP(Op, Subtarget, DAG);
32990}
32991
32992static SDValue LowerBITREVERSE_XOP(SDValue Op, SelectionDAG &DAG) {
32993 MVT VT = Op.getSimpleValueType();
32994 SDValue In = Op.getOperand(0);
32995 SDLoc DL(Op);
32996
32997 // For scalars, it's still beneficial to transfer to/from the SIMD unit to
32998 // perform the BITREVERSE.
32999 if (!VT.isVector()) {
33000 MVT VecVT = MVT::getVectorVT(VT, 128 / VT.getSizeInBits());
33001 SDValue Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, In);
33002 Res = DAG.getNode(ISD::BITREVERSE, DL, VecVT, Res);
33003 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Res,
33004 DAG.getIntPtrConstant(0, DL));
33005 }
33006
33007 int NumElts = VT.getVectorNumElements();
33008 int ScalarSizeInBytes = VT.getScalarSizeInBits() / 8;
33009
33010 // Decompose 256-bit ops into smaller 128-bit ops.
33011 if (VT.is256BitVector())
33012 return splitVectorIntUnary(Op, DAG);
33013
33014 assert(VT.is128BitVector() &&
33015 "Only 128-bit vector bitreverse lowering supported.");
33016
33017 // VPPERM reverses the bits of a byte with the permute Op (2 << 5), and we
33018 // perform the BSWAP in the shuffle.
33019 // It's best to shuffle using the second operand as this will implicitly allow
33020 // memory folding for multiple vectors.
33021 SmallVector<SDValue, 16> MaskElts;
33022 for (int i = 0; i != NumElts; ++i) {
33023 for (int j = ScalarSizeInBytes - 1; j >= 0; --j) {
33024 int SourceByte = 16 + (i * ScalarSizeInBytes) + j;
33025 int PermuteByte = SourceByte | (2 << 5);
33026 MaskElts.push_back(DAG.getConstant(PermuteByte, DL, MVT::i8));
33027 }
33028 }
33029
33030 SDValue Mask = DAG.getBuildVector(MVT::v16i8, DL, MaskElts);
33031 SDValue Res = DAG.getBitcast(MVT::v16i8, In);
33032 Res = DAG.getNode(X86ISD::VPPERM, DL, MVT::v16i8, DAG.getUNDEF(MVT::v16i8),
33033 Res, Mask);
33034 return DAG.getBitcast(VT, Res);
33035}
33036
33037static SDValue LowerBITREVERSE(SDValue Op, const X86Subtarget &Subtarget,
33038 SelectionDAG &DAG) {
33039 MVT VT = Op.getSimpleValueType();
33040
33041 if (Subtarget.hasXOP() && !VT.is512BitVector())
33042 return LowerBITREVERSE_XOP(Op, DAG);
33043
33044 assert(Subtarget.hasSSSE3() && "SSSE3 required for BITREVERSE");
33045
33046 SDValue In = Op.getOperand(0);
33047 SDLoc DL(Op);
33048
33049 assert(VT.getScalarType() == MVT::i8 &&
33050 "Only byte vector BITREVERSE supported");
33051
33052 // Split v64i8 without BWI so that we can still use the PSHUFB lowering.
33053 if (VT == MVT::v64i8 && !Subtarget.hasBWI())
33054 return splitVectorIntUnary(Op, DAG);
33055
33056 // Decompose 256-bit ops into smaller 128-bit ops on pre-AVX2.
33057 if (VT == MVT::v32i8 && !Subtarget.hasInt256())
33058 return splitVectorIntUnary(Op, DAG);
33059
33060 unsigned NumElts = VT.getVectorNumElements();
33061
33062 // If we have GFNI, we can use GF2P8AFFINEQB to reverse the bits.
33063 if (Subtarget.hasGFNI()) {
33064 MVT MatrixVT = MVT::getVectorVT(MVT::i64, NumElts / 8);
33065 SDValue Matrix = DAG.getConstant(0x8040201008040201ULL, DL, MatrixVT);
33066 Matrix = DAG.getBitcast(VT, Matrix);
33067 return DAG.getNode(X86ISD::GF2P8AFFINEQB, DL, VT, In, Matrix,
33068 DAG.getTargetConstant(0, DL, MVT::i8));
33069 }
33070
33071 // Perform BITREVERSE using PSHUFB lookups. Each byte is split into
33072 // two nibbles, and a PSHUFB lookup is used to find the bitreverse of each
33073 // 0-15 value (moved to the other nibble).
33074 SDValue NibbleMask = DAG.getConstant(0xF, DL, VT);
33075 SDValue Lo = DAG.getNode(ISD::AND, DL, VT, In, NibbleMask);
33076 SDValue Hi = DAG.getNode(ISD::SRL, DL, VT, In, DAG.getConstant(4, DL, VT));
33077
33078 const int LoLUT[16] = {
33079 /* 0 */ 0x00, /* 1 */ 0x80, /* 2 */ 0x40, /* 3 */ 0xC0,
33080 /* 4 */ 0x20, /* 5 */ 0xA0, /* 6 */ 0x60, /* 7 */ 0xE0,
33081 /* 8 */ 0x10, /* 9 */ 0x90, /* a */ 0x50, /* b */ 0xD0,
33082 /* c */ 0x30, /* d */ 0xB0, /* e */ 0x70, /* f */ 0xF0};
33083 const int HiLUT[16] = {
33084 /* 0 */ 0x00, /* 1 */ 0x08, /* 2 */ 0x04, /* 3 */ 0x0C,
33085 /* 4 */ 0x02, /* 5 */ 0x0A, /* 6 */ 0x06, /* 7 */ 0x0E,
33086 /* 8 */ 0x01, /* 9 */ 0x09, /* a */ 0x05, /* b */ 0x0D,
33087 /* c */ 0x03, /* d */ 0x0B, /* e */ 0x07, /* f */ 0x0F};
33088
33089 SmallVector<SDValue, 16> LoMaskElts, HiMaskElts;
33090 for (unsigned i = 0; i < NumElts; ++i) {
33091 LoMaskElts.push_back(DAG.getConstant(LoLUT[i % 16], DL, MVT::i8));
33092 HiMaskElts.push_back(DAG.getConstant(HiLUT[i % 16], DL, MVT::i8));
33093 }
33094
33095 SDValue LoMask = DAG.getBuildVector(VT, DL, LoMaskElts);
33096 SDValue HiMask = DAG.getBuildVector(VT, DL, HiMaskElts);
33097 Lo = DAG.getNode(X86ISD::PSHUFB, DL, VT, LoMask, Lo);
33098 Hi = DAG.getNode(X86ISD::PSHUFB, DL, VT, HiMask, Hi);
33099 return DAG.getNode(ISD::OR, DL, VT, Lo, Hi);
33100}
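
A scalar model of the PSHUFB byte bit-reverse, again an illustrative sketch rather than code from the LLVM tree: LoLUT maps a low nibble to its reversed value already placed in the high nibble, HiLUT maps a high nibble to its reversed value in the low nibble, and the OR recombines them.

#include <cstdint>

uint8_t bitreverse8ViaLUT(uint8_t B) {
  static const uint8_t LoLUT[16] = {0x00, 0x80, 0x40, 0xC0, 0x20, 0xA0, 0x60, 0xE0,
                                    0x10, 0x90, 0x50, 0xD0, 0x30, 0xB0, 0x70, 0xF0};
  static const uint8_t HiLUT[16] = {0x00, 0x08, 0x04, 0x0C, 0x02, 0x0A, 0x06, 0x0E,
                                    0x01, 0x09, 0x05, 0x0D, 0x03, 0x0B, 0x07, 0x0F};
  return LoLUT[B & 0x0F] | HiLUT[B >> 4]; // e.g. 0x01 -> 0x80, 0x80 -> 0x01
}
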
33101
33102static SDValue LowerPARITY(SDValue Op, const X86Subtarget &Subtarget,
33103 SelectionDAG &DAG) {
33104 SDLoc DL(Op);
33105 SDValue X = Op.getOperand(0);
33106 MVT VT = Op.getSimpleValueType();
33107
33108 // Special case. If the input fits in 8-bits we can use a single 8-bit TEST.
33109 if (VT == MVT::i8 ||
33110 DAG.MaskedValueIsZero(X, APInt::getBitsSetFrom(VT.getSizeInBits(), 8))) {
33111 X = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, X);
33112 SDValue Flags = DAG.getNode(X86ISD::CMP, DL, MVT::i32, X,
33113 DAG.getConstant(0, DL, MVT::i8));
33114 // Copy the inverse of the parity flag into a register with setcc.
33115 SDValue Setnp = getSETCC(X86::COND_NP, Flags, DL, DAG);
33116 // Extend to the original type.
33117 return DAG.getNode(ISD::ZERO_EXTEND, DL, VT, Setnp);
33118 }
33119
33120 // If we have POPCNT, use the default expansion.
33121 if (Subtarget.hasPOPCNT())
33122 return SDValue();
33123
33124 if (VT == MVT::i64) {
33125 // Xor the high and low 32-bit halves together using a 32-bit operation.
33126 SDValue Hi = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32,
33127 DAG.getNode(ISD::SRL, DL, MVT::i64, X,
33128 DAG.getConstant(32, DL, MVT::i8)));
33129 SDValue Lo = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, X);
33130 X = DAG.getNode(ISD::XOR, DL, MVT::i32, Lo, Hi);
33131 }
33132
33133 if (VT != MVT::i16) {
33134 // Xor the high and low 16-bits together using a 32-bit operation.
33135 SDValue Hi16 = DAG.getNode(ISD::SRL, DL, MVT::i32, X,
33136 DAG.getConstant(16, DL, MVT::i8));
33137 X = DAG.getNode(ISD::XOR, DL, MVT::i32, X, Hi16);
33138 } else {
33139 // If the input is 16-bits, we need to extend to use an i32 shift below.
33140 X = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, X);
33141 }
33142
33143 // Finally xor the low 2 bytes together and use an 8-bit flag-setting xor.
33144 // This should allow an h-reg to be used to save a shift.
33145 SDValue Hi = DAG.getNode(
33146 ISD::TRUNCATE, DL, MVT::i8,
33147 DAG.getNode(ISD::SRL, DL, MVT::i32, X, DAG.getConstant(8, DL, MVT::i8)));
33148 SDValue Lo = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, X);
33149 SDVTList VTs = DAG.getVTList(MVT::i8, MVT::i32);
33150 SDValue Flags = DAG.getNode(X86ISD::XOR, DL, VTs, Lo, Hi).getValue(1);
33151
33152 // Copy the inverse of the parity flag into a register with setcc.
33153 SDValue Setnp = getSETCC(X86::COND_NP, Flags, DL, DAG);
33154 // Extend to the original type.
33155 return DAG.getNode(ISD::ZERO_EXTEND, DL, VT, Setnp);
33156}
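
A scalar model of the XOR-folding, as an illustrative sketch and not the lowering itself: folding a value onto its lower half preserves parity, so the problem shrinks to 8 bits, at which point the hardware version simply reads PF from the flag-setting 8-bit XOR (SETNP yields 1 for odd parity). The scalar model keeps folding instead of reading flags.

#include <cstdint>

uint32_t parity32(uint32_t X) {
  X ^= X >> 16; // fold the two 16-bit halves (the 32-bit XOR above)
  X ^= X >> 8;  // fold the two remaining bytes (the flag-setting 8-bit XOR)
  X ^= X >> 4;  // the lowering stops here and reads PF; keep folding instead
  X ^= X >> 2;
  X ^= X >> 1;
  return X & 1; // 1 if the original value had an odd number of set bits
}
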
33157
33158static SDValue lowerAtomicArithWithLOCK(SDValue N, SelectionDAG &DAG,
33159 const X86Subtarget &Subtarget) {
33160 unsigned NewOpc = 0;
33161 switch (N->getOpcode()) {
33162 case ISD::ATOMIC_LOAD_ADD:
33163 NewOpc = X86ISD::LADD;
33164 break;
33165 case ISD::ATOMIC_LOAD_SUB:
33166 NewOpc = X86ISD::LSUB;
33167 break;
33168 case ISD::ATOMIC_LOAD_OR:
33169 NewOpc = X86ISD::LOR;
33170 break;
33171 case ISD::ATOMIC_LOAD_XOR:
33172 NewOpc = X86ISD::LXOR;
33173 break;
33174 case ISD::ATOMIC_LOAD_AND:
33175 NewOpc = X86ISD::LAND;
33176 break;
33177 default:
33178 llvm_unreachable("Unknown ATOMIC_LOAD_ opcode");
33179 }
33180
33181 MachineMemOperand *MMO = cast<MemSDNode>(N)->getMemOperand();
33182
33183 return DAG.getMemIntrinsicNode(
33184 NewOpc, SDLoc(N), DAG.getVTList(MVT::i32, MVT::Other),
33185 {N->getOperand(0), N->getOperand(1), N->getOperand(2)},
33186 /*MemVT=*/N->getSimpleValueType(0), MMO);
33187}
33188
33189/// Lower atomic_load_ops into LOCK-prefixed operations.
33190static SDValue lowerAtomicArith(SDValue N, SelectionDAG &DAG,
33191 const X86Subtarget &Subtarget) {
33192 AtomicSDNode *AN = cast<AtomicSDNode>(N.getNode());
33193 SDValue Chain = N->getOperand(0);
33194 SDValue LHS = N->getOperand(1);
33195 SDValue RHS = N->getOperand(2);
33196 unsigned Opc = N->getOpcode();
33197 MVT VT = N->getSimpleValueType(0);
33198 SDLoc DL(N);
33199
33200 // We can lower atomic_load_add into LXADD. However, any other atomicrmw op
33201 // can only be lowered when the result is unused. They should have already
33202 // been transformed into a cmpxchg loop in AtomicExpand.
33203 if (N->hasAnyUseOfValue(0)) {
33204 // Handle (atomic_load_sub p, v) as (atomic_load_add p, -v), to be able to
33205 // select LXADD if LOCK_SUB can't be selected.
33206 if (Opc == ISD::ATOMIC_LOAD_SUB) {
33207 RHS = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), RHS);
33208 return DAG.getAtomic(ISD::ATOMIC_LOAD_ADD, DL, VT, Chain, LHS,
33209 RHS, AN->getMemOperand());
33210 }
33211 assert(Opc == ISD::ATOMIC_LOAD_ADD &&
33212 "Used AtomicRMW ops other than Add should have been expanded!");
33213 return N;
33214 }
33215
33216 // Specialized lowering for the canonical form of an idempotent atomicrmw.
33217 // The core idea here is that since the memory location isn't actually
33218 // changing, all we need is a lowering for the *ordering* impacts of the
33219 // atomicrmw. As such, we can choose a different operation and memory
33220 // location to minimize impact on other code.
33221 if (Opc == ISD::ATOMIC_LOAD_OR && isNullConstant(RHS)) {
33222 // On X86, the only ordering which actually requires an instruction is
33223 // seq_cst which isn't SingleThread; everything else just needs to be preserved
33224 // during codegen and then dropped. Note that we expect (but don't assume)
33225 // that orderings other than seq_cst and acq_rel have been canonicalized to
33226 // a store or load.
33227 if (AN->getSuccessOrdering() == AtomicOrdering::SequentiallyConsistent &&
33228 AN->getSyncScopeID() == SyncScope::System) {
33229 // Prefer a locked operation against a stack location to minimize cache
33230 // traffic. This assumes that stack locations are very likely to be
33231 // accessed only by the owning thread.
33232 SDValue NewChain = emitLockedStackOp(DAG, Subtarget, Chain, DL);
33233 assert(!N->hasAnyUseOfValue(0));
33234 // NOTE: The getUNDEF is needed to give something for the unused result 0.
33235 return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(),
33236 DAG.getUNDEF(VT), NewChain);
33237 }
33238 // MEMBARRIER is a compiler barrier; it codegens to a no-op.
33239 SDValue NewChain = DAG.getNode(ISD::MEMBARRIER, DL, MVT::Other, Chain);
33240 assert(!N->hasAnyUseOfValue(0));
33241 // NOTE: The getUNDEF is needed to give something for the unused result 0.
33242 return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(),
33243 DAG.getUNDEF(VT), NewChain);
33244 }
33245
33246 SDValue LockOp = lowerAtomicArithWithLOCK(N, DAG, Subtarget);
33247 // RAUW the chain, but don't worry about the result, as it's unused.
33248 assert(!N->hasAnyUseOfValue(0));
33249 // NOTE: The getUNDEF is needed to give something for the unused result 0.
33250 return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(),
33251 DAG.getUNDEF(VT), LockOp.getValue(1));
33252}
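
The idempotent-RMW path above is aimed at source patterns like the following. This is an illustrative sketch, not something from the LLVM source, and whether a given frontend and pass pipeline actually reaches this exact lowering depends on earlier transforms; the point is that a seq_cst OR of 0 with an unused result only matters for ordering, so the backend can satisfy it with a locked operation on the thread's own stack instead of touching Flag's cache line.

#include <atomic>

void orderingOnlyRMW(std::atomic<unsigned> &Flag) {
  // Result discarded: only the seq_cst ordering effect of the RMW remains.
  (void)Flag.fetch_or(0u, std::memory_order_seq_cst);
}
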
33253
33254static SDValue LowerATOMIC_STORE(SDValue Op, SelectionDAG &DAG,
33255 const X86Subtarget &Subtarget) {
33256 auto *Node = cast<AtomicSDNode>(Op.getNode());
33257 SDLoc dl(Node);
33258 EVT VT = Node->getMemoryVT();
33259
33260 bool IsSeqCst =
33261 Node->getSuccessOrdering() == AtomicOrdering::SequentiallyConsistent;
33262 bool IsTypeLegal = DAG.getTargetLoweringInfo().isTypeLegal(VT);
33263
33264 // If this store is not sequentially consistent and the type is legal
33265 // we can just keep it.
33266 if (!IsSeqCst && IsTypeLegal)
33267 return Op;
33268
33269 if (VT == MVT::i64 && !IsTypeLegal) {
33270 // For illegal i64 atomic_stores, we can try to use MOVQ or MOVLPS if SSE
33271 // is enabled.
33272 bool NoImplicitFloatOps =
33273 DAG.getMachineFunction().getFunction().hasFnAttribute(
33274 Attribute::NoImplicitFloat);
33275 if (!Subtarget.useSoftFloat() && !NoImplicitFloatOps) {
33276 SDValue Chain;
33277 if (Subtarget.hasSSE1()) {
33278 SDValue SclToVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64,
33279 Node->getOperand(2));
33280 MVT StVT = Subtarget.hasSSE2() ? MVT::v2i64 : MVT::v4f32;
33281 SclToVec = DAG.getBitcast(StVT, SclToVec);
33282 SDVTList Tys = DAG.getVTList(MVT::Other);
33283 SDValue Ops[] = {Node->getChain(), SclToVec, Node->getBasePtr()};
33284 Chain = DAG.getMemIntrinsicNode(X86ISD::VEXTRACT_STORE, dl, Tys, Ops,
33285 MVT::i64, Node->getMemOperand());
33286 } else if (Subtarget.hasX87()) {
33287 // First load this into an 80-bit X87 register using a stack temporary.
33288 // This will put the whole integer into the significand.
33289 SDValue StackPtr = DAG.CreateStackTemporary(MVT::i64);
33290 int SPFI = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
33291 MachinePointerInfo MPI =
33292 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SPFI);
33293 Chain =
33294 DAG.getStore(Node->getChain(), dl, Node->getOperand(2), StackPtr,
33295 MPI, MaybeAlign(), MachineMemOperand::MOStore);
33296 SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other);
33297 SDValue LdOps[] = {Chain, StackPtr};
33298 SDValue Value = DAG.getMemIntrinsicNode(
33299 X86ISD::FILD, dl, Tys, LdOps, MVT::i64, MPI,
33300 /*Align*/ std::nullopt, MachineMemOperand::MOLoad);
33301 Chain = Value.getValue(1);
33302
33303 // Now use an FIST to do the atomic store.
33304 SDValue StoreOps[] = {Chain, Value, Node->getBasePtr()};
33305 Chain =
33306 DAG.getMemIntrinsicNode(X86ISD::FIST, dl, DAG.getVTList(MVT::Other),
33307 StoreOps, MVT::i64, Node->getMemOperand());
33308 }
33309
33310 if (Chain) {
33311 // If this is a sequentially consistent store, also emit an appropriate
33312 // barrier.
33313 if (IsSeqCst)
33314 Chain = emitLockedStackOp(DAG, Subtarget, Chain, dl);
33315
33316 return Chain;
33317 }
33318 }
33319 }
33320
33321 // Convert seq_cst store -> xchg
33322 // Convert wide store -> swap (-> cmpxchg8b/cmpxchg16b)
33323 // FIXME: 16-byte ATOMIC_SWAP isn't actually hooked up at the moment.
33324 SDValue Swap = DAG.getAtomic(ISD::ATOMIC_SWAP, dl,
33325 Node->getMemoryVT(),
33326 Node->getOperand(0),
33327 Node->getOperand(1), Node->getOperand(2),
33328 Node->getMemOperand());
33329 return Swap.getValue(1);
33330}
33331
33332static SDValue LowerADDSUBCARRY(SDValue Op, SelectionDAG &DAG) {
33333 SDNode *N = Op.getNode();
33334 MVT VT = N->getSimpleValueType(0);
33335 unsigned Opc = Op.getOpcode();
33336
33337 // Let legalize expand this if it isn't a legal type yet.
33338 if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
33339 return SDValue();
33340
33341 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
33342 SDLoc DL(N);
33343
33344 // Set the carry flag.
33345 SDValue Carry = Op.getOperand(2);
33346 EVT CarryVT = Carry.getValueType();
33347 Carry = DAG.getNode(X86ISD::ADD, DL, DAG.getVTList(CarryVT, MVT::i32),
33348 Carry, DAG.getAllOnesConstant(DL, CarryVT));
33349
33350 bool IsAdd = Opc == ISD::ADDCARRY || Opc == ISD::SADDO_CARRY;
33351 SDValue Sum = DAG.getNode(IsAdd ? X86ISD::ADC : X86ISD::SBB, DL, VTs,
33352 Op.getOperand(0), Op.getOperand(1),
33353 Carry.getValue(1));
33354
33355 bool IsSigned = Opc == ISD::SADDO_CARRY || Opc == ISD::SSUBO_CARRY;
33356 SDValue SetCC = getSETCC(IsSigned ? X86::COND_O : X86::COND_B,
33357 Sum.getValue(1), DL, DAG);
33358 if (N->getValueType(1) == MVT::i1)
33359 SetCC = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, SetCC);
33360
33361 return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Sum, SetCC);
33362}
33363
33364static SDValue LowerFSINCOS(SDValue Op, const X86Subtarget &Subtarget,
33365 SelectionDAG &DAG) {
33366 assert(Subtarget.isTargetDarwin() && Subtarget.is64Bit());
33367
33368 // For MacOSX, we want to call an alternative entry point: __sincos_stret,
33369 // which returns the values as { float, float } (in XMM0) or
33370 // { double, double } (which is returned in XMM0, XMM1).
33371 SDLoc dl(Op);
33372 SDValue Arg = Op.getOperand(0);
33373 EVT ArgVT = Arg.getValueType();
33374 Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
33375
33376 TargetLowering::ArgListTy Args;
33377 TargetLowering::ArgListEntry Entry;
33378
33379 Entry.Node = Arg;
33380 Entry.Ty = ArgTy;
33381 Entry.IsSExt = false;
33382 Entry.IsZExt = false;
33383 Args.push_back(Entry);
33384
33385 bool isF64 = ArgVT == MVT::f64;
33386 // Only optimize x86_64 for now. i386 is a bit messy. For f32,
33387 // the small struct {f32, f32} is returned in (eax, edx). For f64,
33388 // the results are returned via SRet in memory.
33389 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
33390 RTLIB::Libcall LC = isF64 ? RTLIB::SINCOS_STRET_F64 : RTLIB::SINCOS_STRET_F32;
33391 const char *LibcallName = TLI.getLibcallName(LC);
33392 SDValue Callee =
33393 DAG.getExternalSymbol(LibcallName, TLI.getPointerTy(DAG.getDataLayout()));
33394
33395 Type *RetTy = isF64 ? (Type *)StructType::get(ArgTy, ArgTy)
33396 : (Type *)FixedVectorType::get(ArgTy, 4);
33397
33398 TargetLowering::CallLoweringInfo CLI(DAG);
33399 CLI.setDebugLoc(dl)
33400 .setChain(DAG.getEntryNode())
33401 .setLibCallee(CallingConv::C, RetTy, Callee, std::move(Args));
33402
33403 std::pair<SDValue, SDValue> CallResult = TLI.LowerCallTo(CLI);
33404
33405 if (isF64)
33406 // Returned in xmm0 and xmm1.
33407 return CallResult.first;
33408
33409 // Returned in bits 0:31 and 32:63 of xmm0.
33410 SDValue SinVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ArgVT,
33411 CallResult.first, DAG.getIntPtrConstant(0, dl));
33412 SDValue CosVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ArgVT,
33413 CallResult.first, DAG.getIntPtrConstant(1, dl));
33414 SDVTList Tys = DAG.getVTList(ArgVT, ArgVT);
33415 return DAG.getNode(ISD::MERGE_VALUES, dl, Tys, SinVal, CosVal);
33416}
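
The source-level shape this lowering serves is computing both sin and cos of the same argument, as in the sketch below (illustrative only, not taken from the LLVM source). When the target is 64-bit Darwin, the two calls may be combined into a single __sincos_stret call, after which the code above extracts the sine from element 0 and the cosine from element 1 of the returned value.

#include <cmath>
#include <utility>

std::pair<float, float> sinAndCos(float X) {
  // Two libm calls at the source level; the backend may merge them into one
  // __sincos_stret call on x86-64 Darwin, as handled by LowerFSINCOS above.
  return {std::sin(X), std::cos(X)};
}
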
33417
33418/// Widen a vector input to a vector of NVT. The
33419/// input vector must have the same element type as NVT.
33420static SDValue ExtendToType(SDValue InOp, MVT NVT, SelectionDAG &DAG,
33421 bool FillWithZeroes = false) {
33422 // Check if InOp already has the right width.
33423 MVT InVT = InOp.getSimpleValueType();
33424 if (InVT == NVT)
33425 return InOp;
33426
33427 if (InOp.isUndef())
33428 return DAG.getUNDEF(NVT);
33429
33430 assert(InVT.getVectorElementType() == NVT.getVectorElementType() &&
33431 "input and widen element type must match");
33432
33433 unsigned InNumElts = InVT.getVectorNumElements();
33434 unsigned WidenNumElts = NVT.getVectorNumElements();
33435 assert(WidenNumElts > InNumElts && WidenNumElts % InNumElts == 0 &&
33436 "Unexpected request for vector widening");
33437
33438 SDLoc dl(InOp);
33439 if (InOp.getOpcode() == ISD::CONCAT_VECTORS &&
33440 InOp.getNumOperands() == 2) {
33441 SDValue N1 = InOp.getOperand(1);
33442 if ((ISD::isBuildVectorAllZeros(N1.getNode()) && FillWithZeroes) ||
33443 N1.isUndef()) {
33444 InOp = InOp.getOperand(0);
33445 InVT = InOp.getSimpleValueType();
33446 InNumElts = InVT.getVectorNumElements();
33447 }
33448 }
33449 if (ISD::isBuildVectorOfConstantSDNodes(InOp.getNode()) ||
33450 ISD::isBuildVectorOfConstantFPSDNodes(InOp.getNode())) {
33451 SmallVector<SDValue, 16> Ops;
33452 for (unsigned i = 0; i < InNumElts; ++i)
33453 Ops.push_back(InOp.getOperand(i));
33454
33455 EVT EltVT = InOp.getOperand(0).getValueType();
33456
33457 SDValue FillVal = FillWithZeroes ? DAG.getConstant(0, dl, EltVT) :
33458 DAG.getUNDEF(EltVT);
33459 for (unsigned i = 0; i < WidenNumElts - InNumElts; ++i)
33460 Ops.push_back(FillVal);
33461 return DAG.getBuildVector(NVT, dl, Ops);
33462 }
33463 SDValue FillVal = FillWithZeroes ? DAG.getConstant(0, dl, NVT) :
33464 DAG.getUNDEF(NVT);
33465 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, NVT, FillVal,
33466 InOp, DAG.getIntPtrConstant(0, dl));
33467}
33468
33469static SDValue LowerMSCATTER(SDValue Op, const X86Subtarget &Subtarget,
33470 SelectionDAG &DAG) {
33471 assert(Subtarget.hasAVX512() &&
33472 "MGATHER/MSCATTER are supported on AVX-512 arch only");
33473
33474 MaskedScatterSDNode *N = cast<MaskedScatterSDNode>(Op.getNode());
33475 SDValue Src = N->getValue();
33476 MVT VT = Src.getSimpleValueType();
33477 assert(VT.getScalarSizeInBits() >= 32 && "Unsupported scatter op");
33478 SDLoc dl(Op);
33479
33480 SDValue Scale = N->getScale();
33481 SDValue Index = N->getIndex();
33482 SDValue Mask = N->getMask();
33483 SDValue Chain = N->getChain();
33484 SDValue BasePtr = N->getBasePtr();
33485
33486 if (VT == MVT::v2f32 || VT == MVT::v2i32) {
33487 assert(Mask.getValueType() == MVT::v2i1 && "Unexpected mask type");
33488 // If the index is v2i64 and we have VLX we can use xmm for data and index.
33489 if (Index.getValueType() == MVT::v2i64 && Subtarget.hasVLX()) {
33490 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
33491 EVT WideVT = TLI.getTypeToTransformTo(*DAG.getContext(), VT);
33492 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, WideVT, Src, DAG.getUNDEF(VT));
33493 SDVTList VTs = DAG.getVTList(MVT::Other);
33494 SDValue Ops[] = {Chain, Src, Mask, BasePtr, Index, Scale};
33495 return DAG.getMemIntrinsicNode(X86ISD::MSCATTER, dl, VTs, Ops,
33496 N->getMemoryVT(), N->getMemOperand());
33497 }
33498 return SDValue();
33499 }
33500
33501 MVT IndexVT = Index.getSimpleValueType();
33502
33503 // If the index is v2i32, we're being called by type legalization and we
33504 // should just let the default handling take care of it.
33505 if (IndexVT == MVT::v2i32)
33506 return SDValue();
33507
33508 // If we don't have VLX and neither the passthru nor the index is 512 bits,
33509 // we need to widen until one is.
33510 if (!Subtarget.hasVLX() && !VT.is512BitVector() &&
33511 !Index.getSimpleValueType().is512BitVector()) {
33512 // Determine how much we need to widen by to get a 512-bit type.
33513 unsigned Factor = std::min(512/VT.getSizeInBits(),
33514 512/IndexVT.getSizeInBits());
33515 unsigned NumElts = VT.getVectorNumElements() * Factor;
33516
33517 VT = MVT::getVectorVT(VT.getVectorElementType(), NumElts);
33518 IndexVT = MVT::getVectorVT(IndexVT.getVectorElementType(), NumElts);
33519 MVT MaskVT = MVT::getVectorVT(MVT::i1, NumElts);
33520
33521 Src = ExtendToType(Src, VT, DAG);
33522 Index = ExtendToType(Index, IndexVT, DAG);
33523 Mask = ExtendToType(Mask, MaskVT, DAG, true);
33524 }
33525
33526 SDVTList VTs = DAG.getVTList(MVT::Other);
33527 SDValue Ops[] = {Chain, Src, Mask, BasePtr, Index, Scale};
33528 return DAG.getMemIntrinsicNode(X86ISD::MSCATTER, dl, VTs, Ops,
33529 N->getMemoryVT(), N->getMemOperand());
33530}
33531
33532static SDValue LowerMLOAD(SDValue Op, const X86Subtarget &Subtarget,
33533 SelectionDAG &DAG) {
33534
33535 MaskedLoadSDNode *N = cast<MaskedLoadSDNode>(Op.getNode());
33536 MVT VT = Op.getSimpleValueType();
33537 MVT ScalarVT = VT.getScalarType();
33538 SDValue Mask = N->getMask();
33539 MVT MaskVT = Mask.getSimpleValueType();
33540 SDValue PassThru = N->getPassThru();
33541 SDLoc dl(Op);
33542
33543 // Handle AVX masked loads which don't support passthru other than 0.
33544 if (MaskVT.getVectorElementType() != MVT::i1) {
33545 // We also allow undef in the isel pattern.
33546 if (PassThru.isUndef() || ISD::isBuildVectorAllZeros(PassThru.getNode()))
33547 return Op;
33548
33549 SDValue NewLoad = DAG.getMaskedLoad(
33550 VT, dl, N->getChain(), N->getBasePtr(), N->getOffset(), Mask,
33551 getZeroVector(VT, Subtarget, DAG, dl), N->getMemoryVT(),
33552 N->getMemOperand(), N->getAddressingMode(), N->getExtensionType(),
33553 N->isExpandingLoad());
33554 // Emit a blend.
33555 SDValue Select = DAG.getNode(ISD::VSELECT, dl, VT, Mask, NewLoad, PassThru);
33556 return DAG.getMergeValues({ Select, NewLoad.getValue(1) }, dl);
33557 }
33558
33559 assert((!N->isExpandingLoad() || Subtarget.hasAVX512()) &&
33560 "Expanding masked load is supported on AVX-512 target only!");
33561
33562 assert((!N->isExpandingLoad() || ScalarVT.getSizeInBits() >= 32) &&
33563 "Expanding masked load is supported for 32 and 64-bit types only!");
33564
33565 assert(Subtarget.hasAVX512() && !Subtarget.hasVLX() && !VT.is512BitVector() &&
33566 "Cannot lower masked load op.");
33567
33568 assert((ScalarVT.getSizeInBits() >= 32 ||
33569 (Subtarget.hasBWI() &&
33570 (ScalarVT == MVT::i8 || ScalarVT == MVT::i16))) &&
33571 "Unsupported masked load op.");
33572
33573 // This operation is legal for targets with VLX, but without
33574 // VLX the vector should be widened to 512 bits.
33575 unsigned NumEltsInWideVec = 512 / VT.getScalarSizeInBits();
33576 MVT WideDataVT = MVT::getVectorVT(ScalarVT, NumEltsInWideVec);
33577 PassThru = ExtendToType(PassThru, WideDataVT, DAG);
33578
33579 // Mask element has to be i1.
33580 assert(Mask.getSimpleValueType().getScalarType() == MVT::i1 &&
33581 "Unexpected mask type");
33582
33583 MVT WideMaskVT = MVT::getVectorVT(MVT::i1, NumEltsInWideVec);
33584
33585 Mask = ExtendToType(Mask, WideMaskVT, DAG, true);
33586 SDValue NewLoad = DAG.getMaskedLoad(
33587 WideDataVT, dl, N->getChain(), N->getBasePtr(), N->getOffset(), Mask,
33588 PassThru, N->getMemoryVT(), N->getMemOperand(), N->getAddressingMode(),
33589 N->getExtensionType(), N->isExpandingLoad());
33590
33591 SDValue Extract =
33592 DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, NewLoad.getValue(0),
33593 DAG.getIntPtrConstant(0, dl));
33594 SDValue RetOps[] = {Extract, NewLoad.getValue(1)};
33595 return DAG.getMergeValues(RetOps, dl);
33596}
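
A scalar model of the AVX masked-load-with-passthru handling above, given as an illustrative sketch and not as LLVM code: the hardware masked load produces zero in unselected lanes, and the explicit VSELECT blend re-introduces the passthru values in exactly those lanes.

#include <cstddef>
#include <vector>

std::vector<int> maskedLoadWithPassThru(const std::vector<int> &Mem,
                                        const std::vector<bool> &Mask,
                                        const std::vector<int> &PassThru) {
  std::vector<int> Out(Mask.size());
  for (size_t i = 0; i < Mask.size(); ++i)
    Out[i] = Mask[i] ? Mem[i] : PassThru[i]; // zeroing load + blend, fused
  return Out;
}
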
33597
33598static SDValue LowerMSTORE(SDValue Op, const X86Subtarget &Subtarget,
33599 SelectionDAG &DAG) {
33600 MaskedStoreSDNode *N = cast<MaskedStoreSDNode>(Op.getNode());
33601 SDValue DataToStore = N->getValue();
33602 MVT VT = DataToStore.getSimpleValueType();
33603 MVT ScalarVT = VT.getScalarType();
33604 SDValue Mask = N->getMask();
33605 SDLoc dl(Op);
33606
33607 assert((!N->isCompressingStore() || Subtarget.hasAVX512()) &&
33608 "Expanding masked load is supported on AVX-512 target only!");
33609
33610 assert((!N->isCompressingStore() || ScalarVT.getSizeInBits() >= 32) &&
33611 "Expanding masked load is supported for 32 and 64-bit types only!");
33612
33613 assert(Subtarget.hasAVX512() && !Subtarget.hasVLX() && !VT.is512BitVector() &&
33614 "Cannot lower masked store op.");
33615
33616 assert((ScalarVT.getSizeInBits() >= 32 ||
33617 (Subtarget.hasBWI() &&
33618 (ScalarVT == MVT::i8 || ScalarVT == MVT::i16))) &&
33619 "Unsupported masked store op.");
33620
33621 // This operation is legal for targets with VLX, but without
33622 // VLX the vector should be widened to 512 bits.
33623 unsigned NumEltsInWideVec = 512/VT.getScalarSizeInBits();
33624 MVT WideDataVT = MVT::getVectorVT(ScalarVT, NumEltsInWideVec);
33625
33626 // Mask element has to be i1.
33627 assert(Mask.getSimpleValueType().getScalarType() == MVT::i1 &&
33628 "Unexpected mask type");
33629
33630 MVT WideMaskVT = MVT::getVectorVT(MVT::i1, NumEltsInWideVec);
33631
33632 DataToStore = ExtendToType(DataToStore, WideDataVT, DAG);
33633 Mask = ExtendToType(Mask, WideMaskVT, DAG, true);
33634 return DAG.getMaskedStore(N->getChain(), dl, DataToStore, N->getBasePtr(),
33635 N->getOffset(), Mask, N->getMemoryVT(),
33636 N->getMemOperand(), N->getAddressingMode(),
33637 N->isTruncatingStore(), N->isCompressingStore());
33638}
33639
33640static SDValue LowerMGATHER(SDValue Op, const X86Subtarget &Subtarget,
33641 SelectionDAG &DAG) {
33642 assert(Subtarget.hasAVX2() &&
33643 "MGATHER/MSCATTER are supported on AVX-512/AVX-2 arch only");
33644
33645 MaskedGatherSDNode *N = cast<MaskedGatherSDNode>(Op.getNode());
33646 SDLoc dl(Op);
33647 MVT VT = Op.getSimpleValueType();
33648 SDValue Index = N->getIndex();
33649 SDValue Mask = N->getMask();
33650 SDValue PassThru = N->getPassThru();
33651 MVT IndexVT = Index.getSimpleValueType();
33652
33653 assert(VT.getScalarSizeInBits() >= 32 && "Unsupported gather op");
33654
33655 // If the index is v2i32, we're being called by type legalization.
33656 if (IndexVT == MVT::v2i32)
33657 return SDValue();
33658
33659 // If we don't have VLX and neither the passthru nor the index is 512 bits,
33660 // we need to widen until one is.
33661 MVT OrigVT = VT;
33662 if (Subtarget.hasAVX512() && !Subtarget.hasVLX() && !VT.is512BitVector() &&
33663 !IndexVT.is512BitVector()) {
33664 // Determine how much we need to widen by to get a 512-bit type.
33665 unsigned Factor = std::min(512/VT.getSizeInBits(),
33666 512/IndexVT.getSizeInBits());
33667
33668 unsigned NumElts = VT.getVectorNumElements() * Factor;
33669
33670 VT = MVT::getVectorVT(VT.getVectorElementType(), NumElts);
33671 IndexVT = MVT::getVectorVT(IndexVT.getVectorElementType(), NumElts);
33672 MVT MaskVT = MVT::getVectorVT(MVT::i1, NumElts);
33673
33674 PassThru = ExtendToType(PassThru, VT, DAG);
33675 Index = ExtendToType(Index, IndexVT, DAG);
33676 Mask = ExtendToType(Mask, MaskVT, DAG, true);
33677 }
33678
33679 // Break dependency on the data register.
33680 if (PassThru.isUndef())
33681 PassThru = getZeroVector(VT, Subtarget, DAG, dl);
33682
33683 SDValue Ops[] = { N->getChain(), PassThru, Mask, N->getBasePtr(), Index,
33684 N->getScale() };
33685 SDValue NewGather = DAG.getMemIntrinsicNode(
33686 X86ISD::MGATHER, dl, DAG.getVTList(VT, MVT::Other), Ops, N->getMemoryVT(),
33687 N->getMemOperand());
33688 SDValue Extract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OrigVT,
33689 NewGather, DAG.getIntPtrConstant(0, dl));
33690 return DAG.getMergeValues({Extract, NewGather.getValue(1)}, dl);
33691}
33692
33693static SDValue LowerADDRSPACECAST(SDValue Op, SelectionDAG &DAG) {
33694 SDLoc dl(Op);
33695 SDValue Src = Op.getOperand(0);
33696 MVT DstVT = Op.getSimpleValueType();
33697
33698 AddrSpaceCastSDNode *N = cast<AddrSpaceCastSDNode>(Op.getNode());
33699 unsigned SrcAS = N->getSrcAddressSpace();
33700
33701 assert(SrcAS != N->getDestAddressSpace() &&
33702 "addrspacecast must be between different address spaces");
33703
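// ptr32_uptr (unsigned 32-bit pointer) sources are zero-extended to 64 bits;
// any other 32-bit pointer is sign-extended, and 64-bit to 32-bit casts
// simply truncate.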
33704 if (SrcAS == X86AS::PTR32_UPTR && DstVT == MVT::i64) {
33705 Op = DAG.getNode(ISD::ZERO_EXTEND, dl, DstVT, Src);
33706 } else if (DstVT == MVT::i64) {
33707 Op = DAG.getNode(ISD::SIGN_EXTEND, dl, DstVT, Src);
33708 } else if (DstVT == MVT::i32) {
33709 Op = DAG.getNode(ISD::TRUNCATE, dl, DstVT, Src);
33710 } else {
33711 report_fatal_error("Bad address space in addrspacecast");
33712 }
33713 return Op;
33714}
33715
33716SDValue X86TargetLowering::LowerGC_TRANSITION(SDValue Op,
33717 SelectionDAG &DAG) const {
33718 // TODO: Eventually, the lowering of these nodes should be informed by or
33719 // deferred to the GC strategy for the function in which they appear. For
33720 // now, however, they must be lowered to something. Since they are logically
33721 // no-ops in the case of a null GC strategy (or a GC strategy which does not
33722 // require special handling for these nodes), lower them as literal NOOPs for
33723 // the time being.
33724 SmallVector<SDValue, 2> Ops;
33725 Ops.push_back(Op.getOperand(0));
33726 if (Op->getGluedNode())
33727 Ops.push_back(Op->getOperand(Op->getNumOperands() - 1));
33728
33729 SDVTList VTs = DAG.getVTList(MVT::Other, MVT::Glue);
33730 return SDValue(DAG.getMachineNode(X86::NOOP, SDLoc(Op), VTs, Ops), 0);
33731}
33732
33733// Custom split CVTPS2PH with wide types.
33734static SDValue LowerCVTPS2PH(SDValue Op, SelectionDAG &DAG) {
33735 SDLoc dl(Op);
33736 EVT VT = Op.getValueType();
33737 SDValue Lo, Hi;
33738 std::tie(Lo, Hi) = DAG.SplitVectorOperand(Op.getNode(), 0);
33739 EVT LoVT, HiVT;
33740 std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
33741 SDValue RC = Op.getOperand(1);
33742 Lo = DAG.getNode(X86ISD::CVTPS2PH, dl, LoVT, Lo, RC);
33743 Hi = DAG.getNode(X86ISD::CVTPS2PH, dl, HiVT, Hi, RC);
33744 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
33745}
33746
33747static StringRef getInstrStrFromOpNo(const SmallVectorImpl<StringRef> &AsmStrs,
33748 unsigned OpNo) {
33749 const APInt Operand(32, OpNo);
33750 std::string OpNoStr = llvm::toString(Operand, 10, false);
33751 std::string Str(" $");
33752
33753 std::string OpNoStr1(Str + OpNoStr); // e.g. " $1" (OpNo=1)
33754 std::string OpNoStr2(Str + "{" + OpNoStr + ":"); // With modifier, e.g. ${1:P}
33755
33756 auto I = StringRef::npos;
33757 for (auto &AsmStr : AsmStrs) {
33758 // Match the OpNo string exactly so that we don't match a sub-string,
33759 // e.g. "$12" contains "$1".
33760 if (AsmStr.endswith(OpNoStr1))
33761 I = AsmStr.size() - OpNoStr1.size();
33762
33763 // Get the index of operand in AsmStr.
33764 if (I == StringRef::npos)
33765 I = AsmStr.find(OpNoStr1 + ",");
33766 if (I == StringRef::npos)
33767 I = AsmStr.find(OpNoStr2);
33768
33769 if (I == StringRef::npos)
33770 continue;
33771
33772 assert(I > 0 && "Unexpected inline asm string!");
33773 // Remove the operand string and label (if they exist).
33774 // For example:
33775 // ".L__MSASMLABEL_.${:uid}__l:call dword ptr ${0:P}"
33776 // ==>
33777 // ".L__MSASMLABEL_.${:uid}__l:call dword ptr "
33778 // ==>
33779 // "call dword ptr "
33780 auto TmpStr = AsmStr.substr(0, I);
33781 I = TmpStr.rfind(':');
33782 if (I == StringRef::npos)
33783 return TmpStr;
33784
33785 assert(I < TmpStr.size() && "Unexpected inline asm string!");
33786 auto Asm = TmpStr.drop_front(I + 1);
33787 return Asm;
33788 }
33789
33790 return StringRef();
33791}
33792
33793bool X86TargetLowering::isInlineAsmTargetBranch(
33794 const SmallVectorImpl<StringRef> &AsmStrs, unsigned OpNo) const {
33795 StringRef InstrStr = getInstrStrFromOpNo(AsmStrs, OpNo);
33796
33797 if (InstrStr.contains("call"))
33798 return true;
33799
33800 return false;
33801}
33802
33803/// Provide custom lowering hooks for some operations.
33804SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
33805 switch (Op.getOpcode()) {
33806 default: llvm_unreachable("Should not custom lower this!");
33807 case ISD::ATOMIC_FENCE: return LowerATOMIC_FENCE(Op, Subtarget, DAG);
33808 case ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS:
33809 return LowerCMP_SWAP(Op, Subtarget, DAG);
33810 case ISD::CTPOP: return LowerCTPOP(Op, Subtarget, DAG);
33811 case ISD::ATOMIC_LOAD_ADD:
33812 case ISD::ATOMIC_LOAD_SUB:
33813 case ISD::ATOMIC_LOAD_OR:
33814 case ISD::ATOMIC_LOAD_XOR:
33815 case ISD::ATOMIC_LOAD_AND: return lowerAtomicArith(Op, DAG, Subtarget);
33816 case ISD::ATOMIC_STORE: return LowerATOMIC_STORE(Op, DAG, Subtarget);
33817 case ISD::BITREVERSE: return LowerBITREVERSE(Op, Subtarget, DAG);
33818 case ISD::PARITY: return LowerPARITY(Op, Subtarget, DAG);
33819 case ISD::BUILD_VECTOR: return LowerBUILD_VECTOR(Op, DAG);
33820 case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, Subtarget, DAG);
33821 case ISD::VECTOR_SHUFFLE: return lowerVECTOR_SHUFFLE(Op, Subtarget, DAG);
33822 case ISD::VSELECT: return LowerVSELECT(Op, DAG);
33823 case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG);
33824 case ISD::INSERT_VECTOR_ELT: return LowerINSERT_VECTOR_ELT(Op, DAG);
33825 case ISD::INSERT_SUBVECTOR: return LowerINSERT_SUBVECTOR(Op, Subtarget,DAG);
33826 case ISD::EXTRACT_SUBVECTOR: return LowerEXTRACT_SUBVECTOR(Op,Subtarget,DAG);
33827 case ISD::SCALAR_TO_VECTOR: return LowerSCALAR_TO_VECTOR(Op, Subtarget,DAG);
33828 case ISD::ConstantPool: return LowerConstantPool(Op, DAG);
33829 case ISD::GlobalAddress: return LowerGlobalAddress(Op, DAG);
33830 case ISD::GlobalTLSAddress: return LowerGlobalTLSAddress(Op, DAG);
33831 case ISD::ExternalSymbol: return LowerExternalSymbol(Op, DAG);
33832 case ISD::BlockAddress: return LowerBlockAddress(Op, DAG);
33833 case ISD::SHL_PARTS:
33834 case ISD::SRA_PARTS:
33835 case ISD::SRL_PARTS: return LowerShiftParts(Op, DAG);
33836 case ISD::FSHL:
33837 case ISD::FSHR: return LowerFunnelShift(Op, Subtarget, DAG);
33838 case ISD::STRICT_SINT_TO_FP:
33839 case ISD::SINT_TO_FP: return LowerSINT_TO_FP(Op, DAG);
33840 case ISD::STRICT_UINT_TO_FP:
33841 case ISD::UINT_TO_FP: return LowerUINT_TO_FP(Op, DAG);
33842 case ISD::TRUNCATE: return LowerTRUNCATE(Op, DAG);
33843 case ISD::ZERO_EXTEND: return LowerZERO_EXTEND(Op, Subtarget, DAG);
33844 case ISD::SIGN_EXTEND: return LowerSIGN_EXTEND(Op, Subtarget, DAG);
33845 case ISD::ANY_EXTEND: return LowerANY_EXTEND(Op, Subtarget, DAG);
33846 case ISD::ZERO_EXTEND_VECTOR_INREG:
33847 case ISD::SIGN_EXTEND_VECTOR_INREG:
33848 return LowerEXTEND_VECTOR_INREG(Op, Subtarget, DAG);
33849 case ISD::FP_TO_SINT:
33850 case ISD::STRICT_FP_TO_SINT:
33851 case ISD::FP_TO_UINT:
33852 case ISD::STRICT_FP_TO_UINT: return LowerFP_TO_INT(Op, DAG);
33853 case ISD::FP_TO_SINT_SAT:
33854 case ISD::FP_TO_UINT_SAT: return LowerFP_TO_INT_SAT(Op, DAG);
33855 case ISD::FP_EXTEND:
33856 case ISD::STRICT_FP_EXTEND: return LowerFP_EXTEND(Op, DAG);
33857 case ISD::FP_ROUND:
33858 case ISD::STRICT_FP_ROUND: return LowerFP_ROUND(Op, DAG);
33859 case ISD::FP16_TO_FP:
33860 case ISD::STRICT_FP16_TO_FP: return LowerFP16_TO_FP(Op, DAG);
33861 case ISD::FP_TO_FP16:
33862 case ISD::STRICT_FP_TO_FP16: return LowerFP_TO_FP16(Op, DAG);
33863 case ISD::FP_TO_BF16: return LowerFP_TO_BF16(Op, DAG);
33864 case ISD::LOAD: return LowerLoad(Op, Subtarget, DAG);
33865 case ISD::STORE: return LowerStore(Op, Subtarget, DAG);
33866 case ISD::FADD:
33867 case ISD::FSUB: return lowerFaddFsub(Op, DAG);
33868 case ISD::FROUND: return LowerFROUND(Op, DAG);
33869 case ISD::FABS:
33870 case ISD::FNEG: return LowerFABSorFNEG(Op, DAG);
33871 case ISD::FCOPYSIGN: return LowerFCOPYSIGN(Op, DAG);
33872 case ISD::FGETSIGN: return LowerFGETSIGN(Op, DAG);
33873 case ISD::LRINT:
33874 case ISD::LLRINT: return LowerLRINT_LLRINT(Op, DAG);
33875 case ISD::SETCC:
33876 case ISD::STRICT_FSETCC:
33877 case ISD::STRICT_FSETCCS: return LowerSETCC(Op, DAG);
33878 case ISD::SETCCCARRY: return LowerSETCCCARRY(Op, DAG);
33879 case ISD::SELECT: return LowerSELECT(Op, DAG);
33880 case ISD::BRCOND: return LowerBRCOND(Op, DAG);
33881 case ISD::JumpTable: return LowerJumpTable(Op, DAG);
33882 case ISD::VASTART: return LowerVASTART(Op, DAG);
33883 case ISD::VAARG: return LowerVAARG(Op, DAG);
33884 case ISD::VACOPY: return LowerVACOPY(Op, Subtarget, DAG);
33885 case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG);
33886 case ISD::INTRINSIC_VOID:
33887 case ISD::INTRINSIC_W_CHAIN: return LowerINTRINSIC_W_CHAIN(Op, Subtarget, DAG);
33888 case ISD::RETURNADDR: return LowerRETURNADDR(Op, DAG);
33889 case ISD::ADDROFRETURNADDR: return LowerADDROFRETURNADDR(Op, DAG);
33890 case ISD::FRAMEADDR: return LowerFRAMEADDR(Op, DAG);
33891 case ISD::FRAME_TO_ARGS_OFFSET:
33892 return LowerFRAME_TO_ARGS_OFFSET(Op, DAG);
33893 case ISD::DYNAMIC_STACKALLOC: return LowerDYNAMIC_STACKALLOC(Op, DAG);
33894 case ISD::EH_RETURN: return LowerEH_RETURN(Op, DAG);
33895 case ISD::EH_SJLJ_SETJMP: return lowerEH_SJLJ_SETJMP(Op, DAG);
33896 case ISD::EH_SJLJ_LONGJMP: return lowerEH_SJLJ_LONGJMP(Op, DAG);
33897 case ISD::EH_SJLJ_SETUP_DISPATCH:
33898 return lowerEH_SJLJ_SETUP_DISPATCH(Op, DAG);
33899 case ISD::INIT_TRAMPOLINE: return LowerINIT_TRAMPOLINE(Op, DAG);
33900 case ISD::ADJUST_TRAMPOLINE: return LowerADJUST_TRAMPOLINE(Op, DAG);
33901 case ISD::GET_ROUNDING: return LowerGET_ROUNDING(Op, DAG);
33902 case ISD::SET_ROUNDING: return LowerSET_ROUNDING(Op, DAG);
33903 case ISD::CTLZ:
33904 case ISD::CTLZ_ZERO_UNDEF: return LowerCTLZ(Op, Subtarget, DAG);
33905 case ISD::CTTZ:
33906 case ISD::CTTZ_ZERO_UNDEF: return LowerCTTZ(Op, Subtarget, DAG);
33907 case ISD::MUL: return LowerMUL(Op, Subtarget, DAG);
33908 case ISD::MULHS:
33909 case ISD::MULHU: return LowerMULH(Op, Subtarget, DAG);
33910 case ISD::ROTL:
33911 case ISD::ROTR: return LowerRotate(Op, Subtarget, DAG);
33912 case ISD::SRA:
33913 case ISD::SRL:
33914 case ISD::SHL: return LowerShift(Op, Subtarget, DAG);
33915 case ISD::SADDO:
33916 case ISD::UADDO:
33917 case ISD::SSUBO:
33918 case ISD::USUBO: return LowerXALUO(Op, DAG);
33919 case ISD::SMULO:
33920 case ISD::UMULO: return LowerMULO(Op, Subtarget, DAG);
33921 case ISD::READCYCLECOUNTER: return LowerREADCYCLECOUNTER(Op, Subtarget,DAG);
33922 case ISD::BITCAST: return LowerBITCAST(Op, Subtarget, DAG);
33923 case ISD::SADDO_CARRY:
33924 case ISD::SSUBO_CARRY:
33925 case ISD::ADDCARRY:
33926 case ISD::SUBCARRY: return LowerADDSUBCARRY(Op, DAG);
33927 case ISD::ADD:
33928 case ISD::SUB: return lowerAddSub(Op, DAG, Subtarget);
33929 case ISD::UADDSAT:
33930 case ISD::SADDSAT:
33931 case ISD::USUBSAT:
33932 case ISD::SSUBSAT: return LowerADDSAT_SUBSAT(Op, DAG, Subtarget);
33933 case ISD::SMAX:
33934 case ISD::SMIN:
33935 case ISD::UMAX:
33936 case ISD::UMIN: return LowerMINMAX(Op, Subtarget, DAG);
33937 case ISD::ABS: return LowerABS(Op, Subtarget, DAG);
33938 case ISD::ABDS:
33939 case ISD::ABDU: return LowerABD(Op, Subtarget, DAG);
33940 case ISD::AVGCEILU: return LowerAVG(Op, Subtarget, DAG);
33941 case ISD::FSINCOS: return LowerFSINCOS(Op, Subtarget, DAG);
33942 case ISD::MLOAD: return LowerMLOAD(Op, Subtarget, DAG);
33943 case ISD::MSTORE: return LowerMSTORE(Op, Subtarget, DAG);
33944 case ISD::MGATHER: return LowerMGATHER(Op, Subtarget, DAG);
33945 case ISD::MSCATTER: return LowerMSCATTER(Op, Subtarget, DAG);
33946 case ISD::GC_TRANSITION_START:
33947 case ISD::GC_TRANSITION_END: return LowerGC_TRANSITION(Op, DAG);
33948 case ISD::ADDRSPACECAST: return LowerADDRSPACECAST(Op, DAG);
33949 case X86ISD::CVTPS2PH: return LowerCVTPS2PH(Op, DAG);
33950 }
33951}
33952
33953/// Replace a node with an illegal result type with a new node built out of
33954/// custom code.
33955void X86TargetLowering::ReplaceNodeResults(SDNode *N,
33956 SmallVectorImpl<SDValue>&Results,
33957 SelectionDAG &DAG) const {
33958 SDLoc dl(N);
33959 switch (N->getOpcode()) {
33960 default:
33961#ifndef NDEBUG
33962 dbgs() << "ReplaceNodeResults: ";
33963 N->dump(&DAG);
33964#endif
33965 llvm_unreachable("Do not know how to custom type legalize this operation!");
33966 case X86ISD::CVTPH2PS: {
33967 EVT VT = N->getValueType(0);
33968 SDValue Lo, Hi;
33969 std::tie(Lo, Hi) = DAG.SplitVectorOperand(N, 0);
33970 EVT LoVT, HiVT;
33971 std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
33972 Lo = DAG.getNode(X86ISD::CVTPH2PS, dl, LoVT, Lo);
33973 Hi = DAG.getNode(X86ISD::CVTPH2PS, dl, HiVT, Hi);
33974 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
33975 Results.push_back(Res);
33976 return;
33977 }
33978 case X86ISD::STRICT_CVTPH2PS: {
33979 EVT VT = N->getValueType(0);
33980 SDValue Lo, Hi;
33981 std::tie(Lo, Hi) = DAG.SplitVectorOperand(N, 1);
33982 EVT LoVT, HiVT;
33983 std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
33984 Lo = DAG.getNode(X86ISD::STRICT_CVTPH2PS, dl, {LoVT, MVT::Other},
33985 {N->getOperand(0), Lo});
33986 Hi = DAG.getNode(X86ISD::STRICT_CVTPH2PS, dl, {HiVT, MVT::Other},
33987 {N->getOperand(0), Hi});
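// Merge the output chains of the two strict halves so later chained nodes
// stay ordered after both conversions.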
33988 SDValue Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
33989 Lo.getValue(1), Hi.getValue(1));
33990 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
33991 Results.push_back(Res);
33992 Results.push_back(Chain);
33993 return;
33994 }
33995 case X86ISD::CVTPS2PH:
33996 Results.push_back(LowerCVTPS2PH(SDValue(N, 0), DAG));
33997 return;
33998 case ISD::CTPOP: {
33999 assert(N->getValueType(0) == MVT::i64 && "Unexpected VT!");
34000 // Use a v2i64 if possible.
34001 bool NoImplicitFloatOps =
34002 DAG.getMachineFunction().getFunction().hasFnAttribute(
34003 Attribute::NoImplicitFloat);
34004 if (isTypeLegal(MVT::v2i64) && !NoImplicitFloatOps) {
34005 SDValue Wide =
34006 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, N->getOperand(0));
34007 Wide = DAG.getNode(ISD::CTPOP, dl, MVT::v2i64, Wide);
34008 // The bit count fits in 32 bits, so extract it as i32 and then zero
34009 // extend to i64. Otherwise we end up extracting bits 63:32 separately.
34010 Wide = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, Wide);
34011 Wide = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, Wide,
34012 DAG.getIntPtrConstant(0, dl));
34013 Wide = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, Wide);
34014 Results.push_back(Wide);
34015 }
34016 return;
34017 }
34018 case ISD::MUL: {
34019 EVT VT = N->getValueType(0);
34020 assert(getTypeAction(*DAG.getContext(), VT) == TypeWidenVector &&
34021 VT.getVectorElementType() == MVT::i8 && "Unexpected VT!");
34022 // Pre-promote these to vXi16 to avoid op legalization thinking all 16
34023 // elements are needed.
34024 MVT MulVT = MVT::getVectorVT(MVT::i16, VT.getVectorNumElements());
34025 SDValue Op0 = DAG.getNode(ISD::ANY_EXTEND, dl, MulVT, N->getOperand(0));
34026 SDValue Op1 = DAG.getNode(ISD::ANY_EXTEND, dl, MulVT, N->getOperand(1));
34027 SDValue Res = DAG.getNode(ISD::MUL, dl, MulVT, Op0, Op1);
34028 Res = DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
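// Pad the narrow result with undef subvectors so the final value has the
// legal v16i8 type.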
34029 unsigned NumConcats = 16 / VT.getVectorNumElements();
34030 SmallVector<SDValue, 8> ConcatOps(NumConcats, DAG.getUNDEF(VT));
34031 ConcatOps[0] = Res;
34032 Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v16i8, ConcatOps);
34033 Results.push_back(Res);
34034 return;
34035 }
34036 case ISD::SMULO:
34037 case ISD::UMULO: {
34038 EVT VT = N->getValueType(0);
34039 assert(getTypeAction(*DAG.getContext(), VT) == TypeWidenVector &&
34040 VT == MVT::v2i32 && "Unexpected VT!");
34041 bool IsSigned = N->getOpcode() == ISD::SMULO;
34042 unsigned ExtOpc = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
34043 SDValue Op0 = DAG.getNode(ExtOpc, dl, MVT::v2i64, N->getOperand(0));
34044 SDValue Op1 = DAG.getNode(ExtOpc, dl, MVT::v2i64, N->getOperand(1));
34045 SDValue Res = DAG.getNode(ISD::MUL, dl, MVT::v2i64, Op0, Op1);
34046 // Extract the high 32 bits from each result using PSHUFD.
34047 // TODO: Could use SRL+TRUNCATE but that doesn't become a PSHUFD.
34048 SDValue Hi = DAG.getBitcast(MVT::v4i32, Res);
34049 Hi = DAG.getVectorShuffle(MVT::v4i32, dl, Hi, Hi, {1, 3, -1, -1});
34050 Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, Hi,
34051 DAG.getIntPtrConstant(0, dl));
34052
34053 // Truncate the low bits of the result. This will become PSHUFD.
34054 Res = DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
34055
34056 SDValue HiCmp;
34057 if (IsSigned) {
34058 // SMULO overflows if the high bits don't match the sign of the low.
34059 HiCmp = DAG.getNode(ISD::SRA, dl, VT, Res, DAG.getConstant(31, dl, VT));
34060 } else {
34061 // UMULO overflows if the high bits are non-zero.
34062 HiCmp = DAG.getConstant(0, dl, VT);
34063 }
34064 SDValue Ovf = DAG.getSetCC(dl, N->getValueType(1), Hi, HiCmp, ISD::SETNE);
34065
34066 // Widen the result by padding with undef.
34067 Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, Res,
34068 DAG.getUNDEF(VT));
34069 Results.push_back(Res);
34070 Results.push_back(Ovf);
34071 return;
34072 }
34073 case X86ISD::VPMADDWD: {
34074 // Legalize types for X86ISD::VPMADDWD by widening.
34075 assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
34076
34077 EVT VT = N->getValueType(0);
34078 EVT InVT = N->getOperand(0).getValueType();
34079 assert(VT.getSizeInBits() < 128 && 128 % VT.getSizeInBits() == 0 &&
34080 "Expected a VT that divides into 128 bits.");
34081 assert(getTypeAction(*DAG.getContext(), VT) == TypeWidenVector &&
34082 "Unexpected type action!");
34083 unsigned NumConcat = 128 / InVT.getSizeInBits();
34084
34085 EVT InWideVT = EVT::getVectorVT(*DAG.getContext(),
34086 InVT.getVectorElementType(),
34087 NumConcat * InVT.getVectorNumElements());
34088 EVT WideVT = EVT::getVectorVT(*DAG.getContext(),
34089 VT.getVectorElementType(),
34090 NumConcat * VT.getVectorNumElements());
34091
34092 SmallVector<SDValue, 16> Ops(NumConcat, DAG.getUNDEF(InVT));
34093 Ops[0] = N->getOperand(0);
34094 SDValue InVec0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, InWideVT, Ops);
34095 Ops[0] = N->getOperand(1);
34096 SDValue InVec1 = DAG.getNode(ISD::CONCAT_VECTORS, dl, InWideVT, Ops);
34097
34098 SDValue Res = DAG.getNode(N->getOpcode(), dl, WideVT, InVec0, InVec1);
34099 Results.push_back(Res);
34100 return;
34101 }
34102 // We might have generated v2f32 FMIN/FMAX operations. Widen them to v4f32.
34103 case X86ISD::FMINC:
34104 case X86ISD::FMIN:
34105 case X86ISD::FMAXC:
34106 case X86ISD::FMAX: {
34107 EVT VT = N->getValueType(0);
34108 assert(VT == MVT::v2f32 && "Unexpected type (!= v2f32) on FMIN/FMAX.");
34109 SDValue UNDEF = DAG.getUNDEF(VT);
34110 SDValue LHS = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32,
34111 N->getOperand(0), UNDEF);
34112 SDValue RHS = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32,
34113 N->getOperand(1), UNDEF);
34114 Results.push_back(DAG.getNode(N->getOpcode(), dl, MVT::v4f32, LHS, RHS));
34115 return;
34116 }
34117 case ISD::SDIV:
34118 case ISD::UDIV:
34119 case ISD::SREM:
34120 case ISD::UREM: {
34121 EVT VT = N->getValueType(0);
34122 if (VT.isVector()) {
34123 assert(getTypeAction(*DAG.getContext(), VT) == TypeWidenVector &&
34124 "Unexpected type action!");
34125 // If this RHS is a constant splat vector we can widen this and let
34126 // division/remainder by constant optimize it.
34127 // TODO: Can we do something for non-splat?
34128 APInt SplatVal;
34129 if (ISD::isConstantSplatVector(N->getOperand(1).getNode(), SplatVal)) {
34130 unsigned NumConcats = 128 / VT.getSizeInBits();
34131 SmallVector<SDValue, 8> Ops0(NumConcats, DAG.getUNDEF(VT));
34132 Ops0[0] = N->getOperand(0);
34133 EVT ResVT = getTypeToTransformTo(*DAG.getContext(), VT);
34134 SDValue N0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, ResVT, Ops0);
34135 SDValue N1 = DAG.getConstant(SplatVal, dl, ResVT);
34136 SDValue Res = DAG.getNode(N->getOpcode(), dl, ResVT, N0, N1);
34137 Results.push_back(Res);
34138 }
34139 return;
34140 }
34141
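// Scalar (non-vector) division/remainder falls through to the Win64 i128
// helper.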
34142 SDValue V = LowerWin64_i128OP(SDValue(N,0), DAG);
34143 Results.push_back(V);
34144 return;
34145 }
34146 case ISD::TRUNCATE: {
34147 MVT VT = N->getSimpleValueType(0);
34148 if (getTypeAction(*DAG.getContext(), VT) != TypeWidenVector)
34149 return;
34150
34151 // The generic legalizer will try to widen the input type to the same
34152 // number of elements as the widened result type. But this isn't always
34153 // the best thing so do some custom legalization to avoid some cases.
34154 MVT WidenVT = getTypeToTransformTo(*DAG.getContext(), VT).getSimpleVT();
34155 SDValue In = N->getOperand(0);
34156 EVT InVT = In.getValueType();
34157
34158 unsigned InBits = InVT.getSizeInBits();
34159 if (128 % InBits == 0) {
34160 // 128-bit and smaller inputs should avoid truncate altogether and
34161 // just use a build_vector that will become a shuffle.
34162 // TODO: Widen and use a shuffle directly?
34163 MVT InEltVT = InVT.getSimpleVT().getVectorElementType();
34164 EVT EltVT = VT.getVectorElementType();
34165 unsigned WidenNumElts = WidenVT.getVectorNumElements();
34166 SmallVector<SDValue, 16> Ops(WidenNumElts, DAG.getUNDEF(EltVT));
34167 // Use the original element count so we don't do more scalar opts than
34168 // necessary.
34169 unsigned MinElts = VT.getVectorNumElements();
34170 for (unsigned i=0; i < MinElts; ++i) {
34171 SDValue Val = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, InEltVT, In,
34172 DAG.getIntPtrConstant(i, dl));
34173 Ops[i] = DAG.getNode(ISD::TRUNCATE, dl, EltVT, Val);
34174 }
34175 Results.push_back(DAG.getBuildVector(WidenVT, dl, Ops));
34176 return;
34177 }
34178 // With AVX512 there are some cases that can use a target specific
34179 // truncate node to go from 256/512 to less than 128 with zeros in the
34180 // upper elements of the 128 bit result.
34181 if (Subtarget.hasAVX512() && isTypeLegal(InVT)) {
34182 // We can use VTRUNC directly for 256 bits with VLX, or for any 512-bit input.
34183 if ((InBits == 256 && Subtarget.hasVLX()) || InBits == 512) {
34184 Results.push_back(DAG.getNode(X86ISD::VTRUNC, dl, WidenVT, In));
34185 return;
34186 }
34187 // There's one case we can widen to 512 bits and use VTRUNC.
34188 if (InVT == MVT::v4i64 && VT == MVT::v4i8 && isTypeLegal(MVT::v8i64)) {
34189 In = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i64, In,
34190 DAG.getUNDEF(MVT::v4i64));
34191 Results.push_back(DAG.getNode(X86ISD::VTRUNC, dl, WidenVT, In));
34192 return;
34193 }
34194 }
34195 if (Subtarget.hasVLX() && InVT == MVT::v8i64 && VT == MVT::v8i8 &&
34196 getTypeAction(*DAG.getContext(), InVT) == TypeSplitVector &&
34197 isTypeLegal(MVT::v4i64)) {
34198 // Input needs to be split and output needs to be widened. Let's use two
34199 // VTRUNCs, and shuffle their results together into the wider type.
34200 SDValue Lo, Hi;
34201 std::tie(Lo, Hi) = DAG.SplitVector(In, dl);
34202
34203 Lo = DAG.getNode(X86ISD::VTRUNC, dl, MVT::v16i8, Lo);
34204 Hi = DAG.getNode(X86ISD::VTRUNC, dl, MVT::v16i8, Hi);
34205 SDValue Res = DAG.getVectorShuffle(MVT::v16i8, dl, Lo, Hi,
34206 { 0, 1, 2, 3, 16, 17, 18, 19,
34207 -1, -1, -1, -1, -1, -1, -1, -1 });
34208 Results.push_back(Res);
34209 return;
34210 }
34211
34212 return;
34213 }
34214 case ISD::ANY_EXTEND:
34215 // Right now, only MVT::v8i8 has Custom action for an illegal type.
34216 // It's intended to custom handle the input type.
34217 assert(N->getValueType(0) == MVT::v8i8 &&
34218 "Do not know how to legalize this Node");
34219 return;
34220 case ISD::SIGN_EXTEND:
34221 case ISD::ZERO_EXTEND: {
34222 EVT VT = N->getValueType(0);
34223 SDValue In = N->getOperand(0);
34224 EVT InVT = In.getValueType();
34225 if (!Subtarget.hasSSE41() && VT == MVT::v4i64 &&
34226 (InVT == MVT::v4i16 || InVT == MVT::v4i8)){
34227 assert(getTypeAction(*DAG.getContext(), InVT) == TypeWidenVector &&
34228 "Unexpected type action!");
34229 assert(N->getOpcode() == ISD::SIGN_EXTEND && "Unexpected opcode");
34230 // Custom split this so we can extend i8/i16->i32 invec. This is better
34231 // since sign_extend_inreg i8/i16->i64 requires an extend to i32 using
34232 // sra. Then extending from i32 to i64 using pcmpgt. By custom splitting
34233 // we allow the sra from the extend to i32 to be shared by the split.
34234 In = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, In);
34235
34236 // Fill a vector with sign bits for each element.
34237 SDValue Zero = DAG.getConstant(0, dl, MVT::v4i32);
34238 SDValue SignBits = DAG.getSetCC(dl, MVT::v4i32, Zero, In, ISD::SETGT);
34239
34240 // Create an unpackl and unpackh to interleave the sign bits then bitcast
34241 // to v2i64.
34242 SDValue Lo = DAG.getVectorShuffle(MVT::v4i32, dl, In, SignBits,
34243 {0, 4, 1, 5});
34244 Lo = DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, Lo);
34245 SDValue Hi = DAG.getVectorShuffle(MVT::v4i32, dl, In, SignBits,
34246 {2, 6, 3, 7});
34247 Hi = DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, Hi);
34248
34249 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
34250 Results.push_back(Res);
34251 return;
34252 }
34253
34254 if (VT == MVT::v16i32 || VT == MVT::v8i64) {
34255 if (!InVT.is128BitVector()) {
34256 // Not a 128 bit vector, but maybe type legalization will promote
34257 // it to 128 bits.
34258 if (getTypeAction(*DAG.getContext(), InVT) != TypePromoteInteger)
34259 return;
34260 InVT = getTypeToTransformTo(*DAG.getContext(), InVT);
34261 if (!InVT.is128BitVector())
34262 return;
34263
34264 // Promote the input to 128 bits. Type legalization will turn this into
34265 // zext_inreg/sext_inreg.
34266 In = DAG.getNode(N->getOpcode(), dl, InVT, In);
34267 }
34268
34269 // Perform custom splitting instead of the two stage extend we would get
34270 // by default.
34271 EVT LoVT, HiVT;
34272 std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(N->getValueType(0));
34273 assert(isTypeLegal(LoVT) && "Split VT not legal?");
34274
34275 SDValue Lo = getEXTEND_VECTOR_INREG(N->getOpcode(), dl, LoVT, In, DAG);
34276
34277 // We need to shift the input over by half the number of elements.
34278 unsigned NumElts = InVT.getVectorNumElements();
34279 unsigned HalfNumElts = NumElts / 2;
34280 SmallVector<int, 16> ShufMask(NumElts, SM_SentinelUndef);
34281 for (unsigned i = 0; i != HalfNumElts; ++i)
34282 ShufMask[i] = i + HalfNumElts;
34283
34284 SDValue Hi = DAG.getVectorShuffle(InVT, dl, In, In, ShufMask);
34285 Hi = getEXTEND_VECTOR_INREG(N->getOpcode(), dl, HiVT, Hi, DAG);
34286
34287 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
34288 Results.push_back(Res);
34289 }
34290 return;
34291 }
34292 case ISD::FP_TO_SINT:
34293 case ISD::STRICT_FP_TO_SINT:
34294 case ISD::FP_TO_UINT:
34295 case ISD::STRICT_FP_TO_UINT: {
34296 bool IsStrict = N->isStrictFPOpcode();
34297 bool IsSigned = N->getOpcode() == ISD::FP_TO_SINT ||
34298 N->getOpcode() == ISD::STRICT_FP_TO_SINT;
34299 EVT VT = N->getValueType(0);
34300 SDValue Src = N->getOperand(IsStrict ? 1 : 0);
34301 SDValue Chain = IsStrict ? N->getOperand(0) : SDValue();
34302 EVT SrcVT = Src.getValueType();
34303
34304 SDValue Res;
34305 if (isSoftFP16(SrcVT)) {
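// Soft f16 sources are extended to f32 (or a vector of f32) first, and the
// FP-to-integer conversion is performed on the extended value.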
34306 EVT NVT = VT.isVector() ? VT.changeVectorElementType(MVT::f32) : MVT::f32;
34307 if (IsStrict) {
34308 Res =
34309 DAG.getNode(N->getOpcode(), dl, {VT, MVT::Other},
34310 {Chain, DAG.getNode(ISD::STRICT_FP_EXTEND, dl,
34311 {NVT, MVT::Other}, {Chain, Src})});
34312 Chain = Res.getValue(1);
34313 } else {
34314 Res = DAG.getNode(N->getOpcode(), dl, VT,
34315 DAG.getNode(ISD::FP_EXTEND, dl, NVT, Src));
34316 }
34317 Results.push_back(Res);
34318 if (IsStrict)
34319 Results.push_back(Chain);
34320
34321 return;
34322 }
34323
34324 if (VT.isVector() && Subtarget.hasFP16() &&
34325 SrcVT.getVectorElementType() == MVT::f16) {
34326 EVT EleVT = VT.getVectorElementType();
34327 EVT ResVT = EleVT == MVT::i32 ? MVT::v4i32 : MVT::v8i16;
34328
34329 if (SrcVT != MVT::v8f16) {
34330 SDValue Tmp =
34331 IsStrict ? DAG.getConstantFP(0.0, dl, SrcVT) : DAG.getUNDEF(SrcVT);
34332 SmallVector<SDValue, 4> Ops(SrcVT == MVT::v2f16 ? 4 : 2, Tmp);
34333 Ops[0] = Src;
34334 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8f16, Ops);
34335 }
34336
34337 if (IsStrict) {
34338 unsigned Opc =
34339 IsSigned ? X86ISD::STRICT_CVTTP2SI : X86ISD::STRICT_CVTTP2UI;
34340 Res =
34341 DAG.getNode(Opc, dl, {ResVT, MVT::Other}, {N->getOperand(0), Src});
34342 Chain = Res.getValue(1);
34343 } else {
34344 unsigned Opc = IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI;
34345 Res = DAG.getNode(Opc, dl, ResVT, Src);
34346 }
34347
34348 // TODO: Need to add exception check code for strict FP.
34349 if (EleVT.getSizeInBits() < 16) {
34350 MVT TmpVT = MVT::getVectorVT(EleVT.getSimpleVT(), 8);
34351 Res = DAG.getNode(ISD::TRUNCATE, dl, TmpVT, Res);
34352
34353 // Now widen to 128 bits.
34354 unsigned NumConcats = 128 / TmpVT.getSizeInBits();
34355 MVT ConcatVT = MVT::getVectorVT(EleVT.getSimpleVT(), 8 * NumConcats);
34356 SmallVector<SDValue, 8> ConcatOps(NumConcats, DAG.getUNDEF(TmpVT));
34357 ConcatOps[0] = Res;
34358 Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, ConcatVT, ConcatOps);
34359 }
34360
34361 Results.push_back(Res);
34362 if (IsStrict)
34363 Results.push_back(Chain);
34364
34365 return;
34366 }
34367
34368 if (VT.isVector() && VT.getScalarSizeInBits() < 32) {
34369 assert(getTypeAction(*DAG.getContext(), VT) == TypeWidenVector &&
34370 "Unexpected type action!");
34371
34372 // Try to create a 128 bit vector, but don't exceed a 32 bit element.
34373 unsigned NewEltWidth = std::min(128 / VT.getVectorNumElements(), 32U);
34374 MVT PromoteVT = MVT::getVectorVT(MVT::getIntegerVT(NewEltWidth),
34375 VT.getVectorNumElements());
34376 SDValue Res;
34377 SDValue Chain;
34378 if (IsStrict) {
34379 Res = DAG.getNode(ISD::STRICT_FP_TO_SINT, dl, {PromoteVT, MVT::Other},
34380 {N->getOperand(0), Src});
34381 Chain = Res.getValue(1);
34382 } else
34383 Res = DAG.getNode(ISD::FP_TO_SINT, dl, PromoteVT, Src);
34384
34385 // Preserve what we know about the size of the original result. If the
34386 // result is v2i32, we have to manually widen the assert.
34387 if (PromoteVT == MVT::v2i32)
34388 Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, Res,
34389 DAG.getUNDEF(MVT::v2i32));
34390
34391 Res = DAG.getNode(!IsSigned ? ISD::AssertZext : ISD::AssertSext, dl,
34392 Res.getValueType(), Res,
34393 DAG.getValueType(VT.getVectorElementType()));
34394
34395 if (PromoteVT == MVT::v2i32)
34396 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i32, Res,
34397 DAG.getIntPtrConstant(0, dl));
34398
34399 // Truncate back to the original width.
34400 Res = DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
34401
34402 // Now widen to 128 bits.
34403 unsigned NumConcats = 128 / VT.getSizeInBits();
34404 MVT ConcatVT = MVT::getVectorVT(VT.getSimpleVT().getVectorElementType(),
34405 VT.getVectorNumElements() * NumConcats);
34406 SmallVector<SDValue, 8> ConcatOps(NumConcats, DAG.getUNDEF(VT));
34407 ConcatOps[0] = Res;
34408 Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, ConcatVT, ConcatOps);
34409 Results.push_back(Res);
34410 if (IsStrict)
34411 Results.push_back(Chain);
34412 return;
34413 }
34414
34415
34416 if (VT == MVT::v2i32) {
34417 assert((!IsStrict || IsSigned || Subtarget.hasAVX512()) &&
34418 "Strict unsigned conversion requires AVX512");
34419 assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
34420 assert(getTypeAction(*DAG.getContext(), VT) == TypeWidenVector &&
34421 "Unexpected type action!");
34422 if (Src.getValueType() == MVT::v2f64) {
34423 if (!IsSigned && !Subtarget.hasAVX512()) {
34424 SDValue Res =
34425 expandFP_TO_UINT_SSE(MVT::v4i32, Src, dl, DAG, Subtarget);
34426 Results.push_back(Res);
34427 return;
34428 }
34429
34430 unsigned Opc;
34431 if (IsStrict)
34432 Opc = IsSigned ? X86ISD::STRICT_CVTTP2SI : X86ISD::STRICT_CVTTP2UI;
34433 else
34434 Opc = IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI;
34435
34436 // If we have VLX we can emit a target specific FP_TO_UINT node.
34437 if (!IsSigned && !Subtarget.hasVLX()) {
34438 // Otherwise we can defer to the generic legalizer which will widen
34439 // the input as well. This will be further widened during op
34440 // legalization to v8i32<-v8f64.
34441 // For strict nodes we'll need to widen ourselves.
34442 // FIXME: Fix the type legalizer to safely widen strict nodes?
34443 if (!IsStrict)
34444 return;
34445 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f64, Src,
34446 DAG.getConstantFP(0.0, dl, MVT::v2f64));
34447 Opc = N->getOpcode();
34448 }
34449 SDValue Res;
34450 SDValue Chain;
34451 if (IsStrict) {
34452 Res = DAG.getNode(Opc, dl, {MVT::v4i32, MVT::Other},
34453 {N->getOperand(0), Src});
34454 Chain = Res.getValue(1);
34455 } else {
34456 Res = DAG.getNode(Opc, dl, MVT::v4i32, Src);
34457 }
34458 Results.push_back(Res);
34459 if (IsStrict)
34460 Results.push_back(Chain);
34461 return;
34462 }
34463
34464 // Custom widen strict v2f32->v2i32 by padding with zeros.
34465 // FIXME: Should generic type legalizer do this?
34466 if (Src.getValueType() == MVT::v2f32 && IsStrict) {
34467 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, Src,
34468 DAG.getConstantFP(0.0, dl, MVT::v2f32));
34469 SDValue Res = DAG.getNode(N->getOpcode(), dl, {MVT::v4i32, MVT::Other},
34470 {N->getOperand(0), Src});
34471 Results.push_back(Res);
34472 Results.push_back(Res.getValue(1));
34473 return;
34474 }
34475
34476 // The FP_TO_INTHelper below only handles f32/f64/f80 scalar inputs,
34477 // so early out here.
34478 return;
34479 }
34480
34481 assert(!VT.isVector() && "Vectors should have been handled above!");
34482
34483 if ((Subtarget.hasDQI() && VT == MVT::i64 &&
34484 (SrcVT == MVT::f32 || SrcVT == MVT::f64)) ||
34485 (Subtarget.hasFP16() && SrcVT == MVT::f16)) {
34486 assert(!Subtarget.is64Bit() && "i64 should be legal");
34487 unsigned NumElts = Subtarget.hasVLX() ? 2 : 8;
34488 // If we use a 128-bit result we might need to use a target specific node.
34489 unsigned SrcElts =
34490 std::max(NumElts, 128U / (unsigned)SrcVT.getSizeInBits());
34491 MVT VecVT = MVT::getVectorVT(MVT::i64, NumElts);
34492 MVT VecInVT = MVT::getVectorVT(SrcVT.getSimpleVT(), SrcElts);
34493 unsigned Opc = N->getOpcode();
34494 if (NumElts != SrcElts) {
34495 if (IsStrict)
34496 Opc = IsSigned ? X86ISD::STRICT_CVTTP2SI : X86ISD::STRICT_CVTTP2UI;
34497 else
34498 Opc = IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI;
34499 }
34500
34501 SDValue ZeroIdx = DAG.getIntPtrConstant(0, dl);
34502 SDValue Res = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VecInVT,
34503 DAG.getConstantFP(0.0, dl, VecInVT), Src,
34504 ZeroIdx);
34505 SDValue Chain;
34506 if (IsStrict) {
34507 SDVTList Tys = DAG.getVTList(VecVT, MVT::Other);
34508 Res = DAG.getNode(Opc, SDLoc(N), Tys, N->getOperand(0), Res);
34509 Chain = Res.getValue(1);
34510 } else
34511 Res = DAG.getNode(Opc, SDLoc(N), VecVT, Res);
34512 Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Res, ZeroIdx);
34513 Results.push_back(Res);
34514 if (IsStrict)
34515 Results.push_back(Chain);
34516 return;
34517 }
34518
34519 if (VT == MVT::i128 && Subtarget.isTargetWin64()) {
34520 SDValue Chain;
34521 SDValue V = LowerWin64_FP_TO_INT128(SDValue(N, 0), DAG, Chain);
34522 Results.push_back(V);
34523 if (IsStrict)
34524 Results.push_back(Chain);
34525 return;
34526 }
34527
34528 if (SDValue V = FP_TO_INTHelper(SDValue(N, 0), DAG, IsSigned, Chain)) {
34529 Results.push_back(V);
34530 if (IsStrict)
34531 Results.push_back(Chain);
34532 }
34533 return;
34534 }
34535 case ISD::LRINT:
34536 case ISD::LLRINT: {
34537 if (SDValue V = LRINT_LLRINTHelper(N, DAG))
34538 Results.push_back(V);
34539 return;
34540 }
34541
34542 case ISD::SINT_TO_FP:
34543 case ISD::STRICT_SINT_TO_FP:
34544 case ISD::UINT_TO_FP:
34545 case ISD::STRICT_UINT_TO_FP: {
34546 bool IsStrict = N->isStrictFPOpcode();
34547 bool IsSigned = N->getOpcode() == ISD::SINT_TO_FP ||
34548 N->getOpcode() == ISD::STRICT_SINT_TO_FP;
34549 EVT VT = N->getValueType(0);
34550 SDValue Src = N->getOperand(IsStrict ? 1 : 0);
34551 if (VT.getVectorElementType() == MVT::f16 && Subtarget.hasFP16() &&
34552 Subtarget.hasVLX()) {
34553 if (Src.getValueType().getVectorElementType() == MVT::i16)
34554 return;
34555
34556 if (VT == MVT::v2f16 && Src.getValueType() == MVT::v2i32)
34557 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, Src,
34558 IsStrict ? DAG.getConstant(0, dl, MVT::v2i32)
34559 : DAG.getUNDEF(MVT::v2i32));
34560 if (IsStrict) {
34561 unsigned Opc =
34562 IsSigned ? X86ISD::STRICT_CVTSI2P : X86ISD::STRICT_CVTUI2P;
34563 SDValue Res = DAG.getNode(Opc, dl, {MVT::v8f16, MVT::Other},
34564 {N->getOperand(0), Src});
34565 Results.push_back(Res);
34566 Results.push_back(Res.getValue(1));
34567 } else {
34568 unsigned Opc = IsSigned ? X86ISD::CVTSI2P : X86ISD::CVTUI2P;
34569 Results.push_back(DAG.getNode(Opc, dl, MVT::v8f16, Src));
34570 }
34571 return;
34572 }
34573 if (VT != MVT::v2f32)
34574 return;
34575 EVT SrcVT = Src.getValueType();
34576 if (Subtarget.hasDQI() && Subtarget.hasVLX() && SrcVT == MVT::v2i64) {
34577 if (IsStrict) {
34578 unsigned Opc = IsSigned ? X86ISD::STRICT_CVTSI2P
34579 : X86ISD::STRICT_CVTUI2P;
34580 SDValue Res = DAG.getNode(Opc, dl, {MVT::v4f32, MVT::Other},
34581 {N->getOperand(0), Src});
34582 Results.push_back(Res);
34583 Results.push_back(Res.getValue(1));
34584 } else {
34585 unsigned Opc = IsSigned ? X86ISD::CVTSI2P : X86ISD::CVTUI2P;
34586 Results.push_back(DAG.getNode(Opc, dl, MVT::v4f32, Src));
34587 }
34588 return;
34589 }
34590 if (SrcVT == MVT::v2i64 && !IsSigned && Subtarget.is64Bit() &&
34591 Subtarget.hasSSE41() && !Subtarget.hasAVX512()) {
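// For inputs with the sign bit set, halve the value (folding the low bit
// back in so rounding is preserved), convert as signed, and double the
// result; values that fit in a signed i64 are converted directly.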
34592 SDValue Zero = DAG.getConstant(0, dl, SrcVT);
34593 SDValue One = DAG.getConstant(1, dl, SrcVT);
34594 SDValue Sign = DAG.getNode(ISD::OR, dl, SrcVT,
34595 DAG.getNode(ISD::SRL, dl, SrcVT, Src, One),
34596 DAG.getNode(ISD::AND, dl, SrcVT, Src, One));
34597 SDValue IsNeg = DAG.getSetCC(dl, MVT::v2i64, Src, Zero, ISD::SETLT);
34598 SDValue SignSrc = DAG.getSelect(dl, SrcVT, IsNeg, Sign, Src);
34599 SmallVector<SDValue, 4> SignCvts(4, DAG.getConstantFP(0.0, dl, MVT::f32));
34600 for (int i = 0; i != 2; ++i) {
34601 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64,
34602 SignSrc, DAG.getIntPtrConstant(i, dl));
34603 if (IsStrict)
34604 SignCvts[i] =
34605 DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {MVT::f32, MVT::Other},
34606 {N->getOperand(0), Elt});
34607 else
34608 SignCvts[i] = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::f32, Elt);
34609 };
34610 SDValue SignCvt = DAG.getBuildVector(MVT::v4f32, dl, SignCvts);
34611 SDValue Slow, Chain;
34612 if (IsStrict) {
34613 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
34614 SignCvts[0].getValue(1), SignCvts[1].getValue(1));
34615 Slow = DAG.getNode(ISD::STRICT_FADD, dl, {MVT::v4f32, MVT::Other},
34616 {Chain, SignCvt, SignCvt});
34617 Chain = Slow.getValue(1);
34618 } else {
34619 Slow = DAG.getNode(ISD::FADD, dl, MVT::v4f32, SignCvt, SignCvt);
34620 }
34621 IsNeg = DAG.getBitcast(MVT::v4i32, IsNeg);
34622 IsNeg =
34623 DAG.getVectorShuffle(MVT::v4i32, dl, IsNeg, IsNeg, {1, 3, -1, -1});
34624 SDValue Cvt = DAG.getSelect(dl, MVT::v4f32, IsNeg, Slow, SignCvt);
34625 Results.push_back(Cvt);
34626 if (IsStrict)
34627 Results.push_back(Chain);
34628 return;
34629 }
34630
34631 if (SrcVT != MVT::v2i32)
34632 return;
34633
34634 if (IsSigned || Subtarget.hasAVX512()) {
34635 if (!IsStrict)
34636 return;
34637
34638 // Custom widen strict v2i32->v2f32 to avoid scalarization.
34639 // FIXME: Should generic type legalizer do this?
34640 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, Src,
34641 DAG.getConstant(0, dl, MVT::v2i32));
34642 SDValue Res = DAG.getNode(N->getOpcode(), dl, {MVT::v4f32, MVT::Other},
34643 {N->getOperand(0), Src});
34644 Results.push_back(Res);
34645 Results.push_back(Res.getValue(1));
34646 return;
34647 }
34648
34649 assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
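// 0x4330000000000000 is the IEEE-754 bit pattern of 2^52: OR-ing the
// zero-extended i32 into its low mantissa bits produces the double 2^52 + x
// exactly, so subtracting the bias back out yields x as f64 before the
// final round to f32.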
34650 SDValue ZExtIn = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v2i64, Src);
34651 SDValue VBias = DAG.getConstantFP(
34652 llvm::bit_cast<double>(0x4330000000000000ULL), dl, MVT::v2f64);
34653 SDValue Or = DAG.getNode(ISD::OR, dl, MVT::v2i64, ZExtIn,
34654 DAG.getBitcast(MVT::v2i64, VBias));
34655 Or = DAG.getBitcast(MVT::v2f64, Or);
34656 if (IsStrict) {
34657 SDValue Sub = DAG.getNode(ISD::STRICT_FSUB, dl, {MVT::v2f64, MVT::Other},
34658 {N->getOperand(0), Or, VBias});
34659 SDValue Res = DAG.getNode(X86ISD::STRICT_VFPROUND, dl,
34660 {MVT::v4f32, MVT::Other},
34661 {Sub.getValue(1), Sub});
34662 Results.push_back(Res);
34663 Results.push_back(Res.getValue(1));
34664 } else {
34665 // TODO: Are there any fast-math-flags to propagate here?
34666 SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, Or, VBias);
34667 Results.push_back(DAG.getNode(X86ISD::VFPROUND, dl, MVT::v4f32, Sub));
34668 }
34669 return;
34670 }
34671 case ISD::STRICT_FP_ROUND:
34672 case ISD::FP_ROUND: {
34673 bool IsStrict = N->isStrictFPOpcode();
34674 SDValue Chain = IsStrict ? N->getOperand(0) : SDValue();
34675 SDValue Src = N->getOperand(IsStrict ? 1 : 0);
34676 SDValue Rnd = N->getOperand(IsStrict ? 2 : 1);
34677 EVT SrcVT = Src.getValueType();
34678 EVT VT = N->getValueType(0);
34679 SDValue V;
34680 if (VT == MVT::v2f16 && Src.getValueType() == MVT::v2f32) {
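// Pad the widened vector with zero rather than undef for the strict form so
// the extra lanes cannot raise spurious FP exceptions.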
34681 SDValue Ext = IsStrict ? DAG.getConstantFP(0.0, dl, MVT::v2f32)
34682 : DAG.getUNDEF(MVT::v2f32);
34683 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, Src, Ext);
34684 }
34685 if (!Subtarget.hasFP16() && VT.getVectorElementType() == MVT::f16) {
34686 assert(Subtarget.hasF16C() && "Cannot widen f16 without F16C");
34687 if (SrcVT.getVectorElementType() != MVT::f32)
34688 return;
34689
34690 if (IsStrict)
34691 V = DAG.getNode(X86ISD::STRICT_CVTPS2PH, dl, {MVT::v8i16, MVT::Other},
34692 {Chain, Src, Rnd});
34693 else
34694 V = DAG.getNode(X86ISD::CVTPS2PH, dl, MVT::v8i16, Src, Rnd);
34695
34696 Results.push_back(DAG.getBitcast(MVT::v8f16, V));
34697 if (IsStrict)
34698 Results.push_back(V.getValue(1));
34699 return;
34700 }
34701 if (!isTypeLegal(Src.getValueType()))
34702 return;
34703 EVT NewVT = VT.getVectorElementType() == MVT::f16 ? MVT::v8f16 : MVT::v4f32;
34704 if (IsStrict)
34705 V = DAG.getNode(X86ISD::STRICT_VFPROUND, dl, {NewVT, MVT::Other},
34706 {Chain, Src});
34707 else
34708 V = DAG.getNode(X86ISD::VFPROUND, dl, NewVT, Src);
34709 Results.push_back(V);
34710 if (IsStrict)
34711 Results.push_back(V.getValue(1));
34712 return;
34713 }
34714 case ISD::FP_EXTEND:
34715 case ISD::STRICT_FP_EXTEND: {
34716 // Right now, only MVT::v2f32 has OperationAction for FP_EXTEND.
34717 // No other ValueType for FP_EXTEND should reach this point.
34718 assert(N->getValueType(0) == MVT::v2f32 &&
34719 "Do not know how to legalize this Node");
34720 if (!Subtarget.hasFP16() || !Subtarget.hasVLX())
34721 return;
34722 bool IsStrict = N->isStrictFPOpcode();
34723 SDValue Src = N->getOperand(IsStrict ? 1 : 0);
34724 SDValue Ext = IsStrict ? DAG.getConstantFP(0.0, dl, MVT::v2f16)
34725 : DAG.getUNDEF(MVT::v2f16);
34726 SDValue V = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f16, Src, Ext);
34727 if (IsStrict)
34728 V = DAG.getNode(ISD::STRICT_FP_EXTEND, dl, {MVT::v4f32, MVT::Other},
34729 {N->getOperand(0), V});
34730 else
34731 V = DAG.getNode(ISD::FP_EXTEND, dl, MVT::v4f32, V);
34732 Results.push_back(V);
34733 if (IsStrict)
34734 Results.push_back(V.getValue(1));
34735 return;
34736 }
34737 case ISD::INTRINSIC_W_CHAIN: {
34738 unsigned IntNo = N->getConstantOperandVal(1);
34739 switch (IntNo) {
34740 default : llvm_unreachable("Do not know how to custom type "
34741 "legalize this intrinsic operation!");
34742 case Intrinsic::x86_rdtsc:
34743 return getReadTimeStampCounter(N, dl, X86::RDTSC, DAG, Subtarget,
34744 Results);
34745 case Intrinsic::x86_rdtscp:
34746 return getReadTimeStampCounter(N, dl, X86::RDTSCP, DAG, Subtarget,
34747 Results);
34748 case Intrinsic::x86_rdpmc:
34749 expandIntrinsicWChainHelper(N, dl, DAG, X86::RDPMC, X86::ECX, Subtarget,
34750 Results);
34751 return;
34752 case Intrinsic::x86_rdpru:
34753 expandIntrinsicWChainHelper(N, dl, DAG, X86::RDPRU, X86::ECX, Subtarget,
34754 Results);
34755 return;
34756 case Intrinsic::x86_xgetbv:
34757 expandIntrinsicWChainHelper(N, dl, DAG, X86::XGETBV, X86::ECX, Subtarget,
34758 Results);
34759 return;
34760 }
34761 }
34762 case ISD::READCYCLECOUNTER: {
34763 return getReadTimeStampCounter(N, dl, X86::RDTSC, DAG, Subtarget, Results);
34764 }
34765 case ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS: {
34766 EVT T = N->getValueType(0);
34767 assert((T == MVT::i64 || T == MVT::i128) && "can only expand cmpxchg pair");
34768 bool Regs64bit = T == MVT::i128;
34769 assert((!Regs64bit || Subtarget.canUseCMPXCHG16B()) &&
34770 "64-bit ATOMIC_CMP_SWAP_WITH_SUCCESS requires CMPXCHG16B");
34771 MVT HalfT = Regs64bit ? MVT::i64 : MVT::i32;
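// CMPXCHG8B/CMPXCHG16B expect the comparand in EDX:EAX (RDX:RAX) and the
// replacement value in ECX:EBX (RCX:RBX), so split both operands into
// halves and copy them into those registers before emitting the pseudo.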
34772 SDValue cpInL, cpInH;
34773 std::tie(cpInL, cpInH) =
34774 DAG.SplitScalar(N->getOperand(2), dl, HalfT, HalfT);
34775 cpInL = DAG.getCopyToReg(N->getOperand(0), dl,
34776 Regs64bit ? X86::RAX : X86::EAX, cpInL, SDValue());
34777 cpInH =
34778 DAG.getCopyToReg(cpInL.getValue(0), dl, Regs64bit ? X86::RDX : X86::EDX,
34779 cpInH, cpInL.getValue(1));
34780 SDValue swapInL, swapInH;
34781 std::tie(swapInL, swapInH) =
34782 DAG.SplitScalar(N->getOperand(3), dl, HalfT, HalfT);
34783 swapInH =
34784 DAG.getCopyToReg(cpInH.getValue(0), dl, Regs64bit ? X86::RCX : X86::ECX,
34785 swapInH, cpInH.getValue(1));
34786
34787 // In 64-bit mode we might need the base pointer in RBX, but we can't know
34788 // until later. So we keep the RBX input in a vreg and use a custom
34789 // inserter.
34790 // Since RBX will be a reserved register, the register allocator will not
34791 // ensure that its value is properly saved and restored around this
34792 // live range.
34793 SDValue Result;
34794 SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
34795 MachineMemOperand *MMO = cast<AtomicSDNode>(N)->getMemOperand();
34796 if (Regs64bit) {
34797 SDValue Ops[] = {swapInH.getValue(0), N->getOperand(1), swapInL,
34798 swapInH.getValue(1)};
34799 Result =
34800 DAG.getMemIntrinsicNode(X86ISD::LCMPXCHG16_DAG, dl, Tys, Ops, T, MMO);
34801 } else {
34802 swapInL = DAG.getCopyToReg(swapInH.getValue(0), dl, X86::EBX, swapInL,
34803 swapInH.getValue(1));
34804 SDValue Ops[] = {swapInL.getValue(0), N->getOperand(1),
34805 swapInL.getValue(1)};
34806 Result =
34807 DAG.getMemIntrinsicNode(X86ISD::LCMPXCHG8_DAG, dl, Tys, Ops, T, MMO);
34808 }
34809
34810 SDValue cpOutL = DAG.getCopyFromReg(Result.getValue(0), dl,
34811 Regs64bit ? X86::RAX : X86::EAX,
34812 HalfT, Result.getValue(1));
34813 SDValue cpOutH = DAG.getCopyFromReg(cpOutL.getValue(1), dl,
34814 Regs64bit ? X86::RDX : X86::EDX,
34815 HalfT, cpOutL.getValue(2));
34816 SDValue OpsF[] = { cpOutL.getValue(0), cpOutH.getValue(0)};
34817
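// CMPXCHG sets ZF when the exchange succeeded, so read EFLAGS and convert
// COND_E into the boolean success result expected by the node.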
34818 SDValue EFLAGS = DAG.getCopyFromReg(cpOutH.getValue(1), dl, X86::EFLAGS,
34819 MVT::i32, cpOutH.getValue(2));
34820 SDValue Success = getSETCC(X86::COND_E, EFLAGS, dl, DAG);
34821 Success = DAG.getZExtOrTrunc(Success, dl, N->getValueType(1));
34822
34823 Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, T, OpsF));
34824 Results.push_back(Success);
34825 Results.push_back(EFLAGS.getValue(1));
34826 return;
34827 }
34828 case ISD::ATOMIC_LOAD: {
34829 assert(N->getValueType(0) == MVT::i64 && "Unexpected VT!");
34830 bool NoImplicitFloatOps =
34831 DAG.getMachineFunction().getFunction().hasFnAttribute(
34832 Attribute::NoImplicitFloat);
34833 if (!Subtarget.useSoftFloat() && !NoImplicitFloatOps) {
34834 auto *Node = cast<AtomicSDNode>(N);
34835 if (Subtarget.hasSSE1()) {
34836 // Use a VZEXT_LOAD which will be selected as MOVQ or XORPS+MOVLPS.
34837 // Then extract the lower 64-bits.
34838 MVT LdVT = Subtarget.hasSSE2() ? MVT::v2i64 : MVT::v4f32;
34839 SDVTList Tys = DAG.getVTList(LdVT, MVT::Other);
34840 SDValue Ops[] = { Node->getChain(), Node->getBasePtr() };
34841 SDValue Ld = DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, dl, Tys, Ops,
34842 MVT::i64, Node->getMemOperand());
34843 if (Subtarget.hasSSE2()) {
34844 SDValue Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, Ld,
34845 DAG.getIntPtrConstant(0, dl));
34846 Results.push_back(Res);
34847 Results.push_back(Ld.getValue(1));
34848 return;
34849 }
34850 // We use an alternative sequence for SSE1 that extracts as v2f32 and
34851 // then casts to i64. This avoids a 128-bit stack temporary being
34852 // created by type legalization if we were to cast v4f32->v2i64.
34853 SDValue Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2f32, Ld,
34854 DAG.getIntPtrConstant(0, dl));
34855 Res = DAG.getBitcast(MVT::i64, Res);
34856 Results.push_back(Res);
34857 Results.push_back(Ld.getValue(1));
34858 return;
34859 }
34860 if (Subtarget.hasX87()) {
34861 // First load this into an 80-bit X87 register. This will put the whole
34862 // integer into the significand.
34863 SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other);
34864 SDValue Ops[] = { Node->getChain(), Node->getBasePtr() };
34865 SDValue Result = DAG.getMemIntrinsicNode(X86ISD::FILD,
34866 dl, Tys, Ops, MVT::i64,
34867 Node->getMemOperand());
34868 SDValue Chain = Result.getValue(1);
34869
34870 // Now store the X87 register to a stack temporary and convert to i64.
34871 // This store is not atomic and doesn't need to be.
34872 // FIXME: We don't need a stack temporary if the result of the load
34873 // is already being stored. We could just directly store there.
34874 SDValue StackPtr = DAG.CreateStackTemporary(MVT::i64);
34875 int SPFI = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
34876 MachinePointerInfo MPI =
34877 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SPFI);
34878 SDValue StoreOps[] = { Chain, Result, StackPtr };
34879 Chain = DAG.getMemIntrinsicNode(
34880 X86ISD::FIST, dl, DAG.getVTList(MVT::Other), StoreOps, MVT::i64,
34881 MPI, std::nullopt /*Align*/, MachineMemOperand::MOStore);
34882
34883 // Finally load the value back from the stack temporary and return it.
34884 // This load is not atomic and doesn't need to be.
34885 // This load will be further type legalized.
34886 Result = DAG.getLoad(MVT::i64, dl, Chain, StackPtr, MPI);
34887 Results.push_back(Result);
34888 Results.push_back(Result.getValue(1));
34889 return;
34890 }
34891 }
34892 // TODO: Use MOVLPS when SSE1 is available?
34893 // Delegate to generic TypeLegalization. Situations we can really handle
34894 // should have already been dealt with by AtomicExpandPass.cpp.
34895 break;
34896 }
34897 case ISD::ATOMIC_SWAP:
34898 case ISD::ATOMIC_LOAD_ADD:
34899 case ISD::ATOMIC_LOAD_SUB:
34900 case ISD::ATOMIC_LOAD_AND:
34901 case ISD::ATOMIC_LOAD_OR:
34902 case ISD::ATOMIC_LOAD_XOR:
34903 case ISD::ATOMIC_LOAD_NAND:
34904 case ISD::ATOMIC_LOAD_MIN:
34905 case ISD::ATOMIC_LOAD_MAX:
34906 case ISD::ATOMIC_LOAD_UMIN:
34907 case ISD::ATOMIC_LOAD_UMAX:
34908 // Delegate to generic TypeLegalization. Situations we can really handle
34909 // should have already been dealt with by AtomicExpandPass.cpp.
34910 break;
34911
34912 case ISD::BITCAST: {
34913 assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
34914 EVT DstVT = N->getValueType(0);
34915 EVT SrcVT = N->getOperand(0).getValueType();
34916
34917 // If this is a bitcast from a v64i1 k-register to a i64 on a 32-bit target
34918 // we can split using the k-register rather than memory.
34919 if (SrcVT == MVT::v64i1 && DstVT == MVT::i64 && Subtarget.hasBWI()) {
34920 assert(!Subtarget.is64Bit() && "Expected 32-bit mode");
34921 SDValue Lo, Hi;
34922 std::tie(Lo, Hi) = DAG.SplitVectorOperand(N, 0);
34923 Lo = DAG.getBitcast(MVT::i32, Lo);
34924 Hi = DAG.getBitcast(MVT::i32, Hi);
34925 SDValue Res = DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Lo, Hi);
34926 Results.push_back(Res);
34927 return;
34928 }
34929
34930 if (DstVT.isVector() && SrcVT == MVT::x86mmx) {
34931 // FIXME: Use v4f32 for SSE1?
34932 assert(Subtarget.hasSSE2() && "Requires SSE2");
34933 assert(getTypeAction(*DAG.getContext(), DstVT) == TypeWidenVector &&
34934 "Unexpected type action!");
34935 EVT WideVT = getTypeToTransformTo(*DAG.getContext(), DstVT);
34936 SDValue Res = DAG.getNode(X86ISD::MOVQ2DQ, dl, MVT::v2i64,
34937 N->getOperand(0));
34938 Res = DAG.getBitcast(WideVT, Res);
34939 Results.push_back(Res);
34940 return;
34941 }
34942
34943 return;
34944 }
34945 case ISD::MGATHER: {
34946 EVT VT = N->getValueType(0);
34947 if ((VT == MVT::v2f32 || VT == MVT::v2i32) &&
34948 (Subtarget.hasVLX() || !Subtarget.hasAVX512())) {
34949 auto *Gather = cast<MaskedGatherSDNode>(N);
34950 SDValue Index = Gather->getIndex();
34951 if (Index.getValueType() != MVT::v2i64)
34952 return;
34953 assert(getTypeAction(*DAG.getContext(), VT) == TypeWidenVector &&
34954 "Unexpected type action!");
34955 EVT WideVT = getTypeToTransformTo(*DAG.getContext(), VT);
34956 SDValue Mask = Gather->getMask();
34957 assert(Mask.getValueType() == MVT::v2i1 && "Unexpected mask type");
34958 SDValue PassThru = DAG.getNode(ISD::CONCAT_VECTORS, dl, WideVT,
34959 Gather->getPassThru(),
34960 DAG.getUNDEF(VT));
34961 if (!Subtarget.hasVLX()) {
34962 // We need to widen the mask, but the instruction will only use 2
34963 // of its elements. So we can use undef.
34964 Mask = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i1, Mask,
34965 DAG.getUNDEF(MVT::v2i1));
34966 Mask = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, Mask);
34967 }
34968 SDValue Ops[] = { Gather->getChain(), PassThru, Mask,
34969 Gather->getBasePtr(), Index, Gather->getScale() };
34970 SDValue Res = DAG.getMemIntrinsicNode(
34971 X86ISD::MGATHER, dl, DAG.getVTList(WideVT, MVT::Other), Ops,
34972 Gather->getMemoryVT(), Gather->getMemOperand());
34973 Results.push_back(Res);
34974 Results.push_back(Res.getValue(1));
34975 return;
34976 }
34977 return;
34978 }
34979 case ISD::LOAD: {
34980 // Use an f64/i64 load and a scalar_to_vector for v2f32/v2i32 loads. This
34981 // avoids scalarizing in 32-bit mode. In 64-bit mode this avoids a int->fp
34982 // cast since type legalization will try to use an i64 load.
34983 MVT VT = N->getSimpleValueType(0);
34984 assert(VT.isVector() && VT.getSizeInBits() == 64 && "Unexpected VT");
34985 assert(getTypeAction(*DAG.getContext(), VT) == TypeWidenVector &&
34986 "Unexpected type action!");
34987 if (!ISD::isNON_EXTLoad(N))
34988 return;
34989 auto *Ld = cast<LoadSDNode>(N);
34990 if (Subtarget.hasSSE2()) {
34991 MVT LdVT = Subtarget.is64Bit() && VT.isInteger() ? MVT::i64 : MVT::f64;
34992 SDValue Res = DAG.getLoad(LdVT, dl, Ld->getChain(), Ld->getBasePtr(),
34993 Ld->getPointerInfo(), Ld->getOriginalAlign(),
34994 Ld->getMemOperand()->getFlags());
34995 SDValue Chain = Res.getValue(1);
34996 MVT VecVT = MVT::getVectorVT(LdVT, 2);
34997 Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, Res);
34998 EVT WideVT = getTypeToTransformTo(*DAG.getContext(), VT);
34999 Res = DAG.getBitcast(WideVT, Res);
35000 Results.push_back(Res);
35001 Results.push_back(Chain);
35002 return;
35003 }
35004 assert(Subtarget.hasSSE1() && "Expected SSE");
35005 SDVTList Tys = DAG.getVTList(MVT::v4f32, MVT::Other);
35006 SDValue Ops[] = {Ld->getChain(), Ld->getBasePtr()};
35007 SDValue Res = DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, dl, Tys, Ops,
35008 MVT::i64, Ld->getMemOperand());
35009 Results.push_back(Res);
35010 Results.push_back(Res.getValue(1));
35011 return;
35012 }
35013 case ISD::ADDRSPACECAST: {
35014 SDValue V = LowerADDRSPACECAST(SDValue(N,0), DAG);
35015 Results.push_back(V);
35016 return;
35017 }
35018 case ISD::BITREVERSE: {
35019 assert(N->getValueType(0) == MVT::i64 && "Unexpected VT!");
35020 assert(Subtarget.hasXOP() && "Expected XOP");
35021 // We can use VPPERM by copying to a vector register and back. We'll need
35022 // to move the scalar in two i32 pieces.
35023 Results.push_back(LowerBITREVERSE(SDValue(N, 0), Subtarget, DAG));
35024 return;
35025 }
35026 case ISD::EXTRACT_VECTOR_ELT: {
35027 // f16 = extract vXf16 %vec, i64 %idx
35028 assert(N->getSimpleValueType(0) == MVT::f16 &&
35029 "Unexpected Value type of EXTRACT_VECTOR_ELT!");
35030 assert(Subtarget.hasFP16() && "Expected FP16");
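// Extract the element as an i16 through an integer-typed vector and bitcast
// it back, so no f16-typed extract is needed.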
35031 SDValue VecOp = N->getOperand(0);
35032 EVT ExtVT = VecOp.getValueType().changeVectorElementTypeToInteger();
35033 SDValue Split = DAG.getBitcast(ExtVT, N->getOperand(0));
35034 Split = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, Split,
35035 N->getOperand(1));
35036 Split = DAG.getBitcast(MVT::f16, Split);
35037 Results.push_back(Split);
35038 return;
35039 }
35040 }
35041}
35042
35043const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
35044 switch ((X86ISD::NodeType)Opcode) {
35045 case X86ISD::FIRST_NUMBER: break;
35046#define NODE_NAME_CASE(NODE) case X86ISD::NODE: return "X86ISD::" #NODE;
35047 NODE_NAME_CASE(BSF)
35048 NODE_NAME_CASE(BSR)
35049 NODE_NAME_CASE(FSHL)
35050 NODE_NAME_CASE(FSHR)
35051 NODE_NAME_CASE(FAND)
35052 NODE_NAME_CASE(FANDN)
35053 NODE_NAME_CASE(FOR)
35054 NODE_NAME_CASE(FXOR)
35055 NODE_NAME_CASE(FILD)
35056 NODE_NAME_CASE(FIST)
35057 NODE_NAME_CASE(FP_TO_INT_IN_MEM)
35058 NODE_NAME_CASE(FLD)
35059 NODE_NAME_CASE(FST)
35060 NODE_NAME_CASE(CALL)
35061 NODE_NAME_CASE(CALL_RVMARKER)
35062 NODE_NAME_CASE(BT)
35063 NODE_NAME_CASE(CMP)
35064 NODE_NAME_CASE(FCMP)
35065 NODE_NAME_CASE(STRICT_FCMP)
35066 NODE_NAME_CASE(STRICT_FCMPS)
35067 NODE_NAME_CASE(COMI)
35068 NODE_NAME_CASE(UCOMI)
35069 NODE_NAME_CASE(CMPM)
35070 NODE_NAME_CASE(CMPMM)
35071 NODE_NAME_CASE(STRICT_CMPM)
35072 NODE_NAME_CASE(CMPMM_SAE)
35073 NODE_NAME_CASE(SETCC)
35074 NODE_NAME_CASE(SETCC_CARRY)
35075 NODE_NAME_CASE(FSETCC)
35076 NODE_NAME_CASE(FSETCCM)
35077 NODE_NAME_CASE(FSETCCM_SAE)
35078 NODE_NAME_CASE(CMOV)
35079 NODE_NAME_CASE(BRCOND)
35080 NODE_NAME_CASE(RET_GLUE)
35081 NODE_NAME_CASE(IRET)
35082 NODE_NAME_CASE(REP_STOS)
35083 NODE_NAME_CASE(REP_MOVS)
35084 NODE_NAME_CASE(GlobalBaseReg)
35085 NODE_NAME_CASE(Wrapper)
35086 NODE_NAME_CASE(WrapperRIP)
35087 NODE_NAME_CASE(MOVQ2DQ)
35088 NODE_NAME_CASE(MOVDQ2Q)
35089 NODE_NAME_CASE(MMX_MOVD2W)
35090 NODE_NAME_CASE(MMX_MOVW2D)
35091 NODE_NAME_CASE(PEXTRB)
35092 NODE_NAME_CASE(PEXTRW)
35093 NODE_NAME_CASE(INSERTPS)
35094 NODE_NAME_CASE(PINSRB)
35095 NODE_NAME_CASE(PINSRW)
35096 NODE_NAME_CASE(PSHUFB)
35097 NODE_NAME_CASE(ANDNP)
35098 NODE_NAME_CASE(BLENDI)
35099 NODE_NAME_CASE(BLENDV)
35100 NODE_NAME_CASE(HADD)
35101 NODE_NAME_CASE(HSUB)
35102 NODE_NAME_CASE(FHADD)
35103 NODE_NAME_CASE(FHSUB)
35104 NODE_NAME_CASE(CONFLICT)
35105 NODE_NAME_CASE(FMAX)
35106 NODE_NAME_CASE(FMAXS)
35107 NODE_NAME_CASE(FMAX_SAE)
35108 NODE_NAME_CASE(FMAXS_SAE)
35109 NODE_NAME_CASE(FMIN)
35110 NODE_NAME_CASE(FMINS)
35111 NODE_NAME_CASE(FMIN_SAE)
35112 NODE_NAME_CASE(FMINS_SAE)
35113 NODE_NAME_CASE(FMAXC)
35114 NODE_NAME_CASE(FMINC)
35115 NODE_NAME_CASE(FRSQRT)
35116 NODE_NAME_CASE(FRCP)
35117 NODE_NAME_CASE(EXTRQI)
35118 NODE_NAME_CASE(INSERTQI)
35119 NODE_NAME_CASE(TLSADDR)
35120 NODE_NAME_CASE(TLSBASEADDR)
35121 NODE_NAME_CASE(TLSCALL)
35122 NODE_NAME_CASE(EH_SJLJ_SETJMP)
35123 NODE_NAME_CASE(EH_SJLJ_LONGJMP)
35124 NODE_NAME_CASE(EH_SJLJ_SETUP_DISPATCH)
35125 NODE_NAME_CASE(EH_RETURN)
35126 NODE_NAME_CASE(TC_RETURN)
35127 NODE_NAME_CASE(FNSTCW16m)
35128 NODE_NAME_CASE(FLDCW16m)
35129 NODE_NAME_CASE(LCMPXCHG_DAG)
35130 NODE_NAME_CASE(LCMPXCHG8_DAG)
35131 NODE_NAME_CASE(LCMPXCHG16_DAG)
35132 NODE_NAME_CASE(LCMPXCHG16_SAVE_RBX_DAG)
35133 NODE_NAME_CASE(LADD)
35134 NODE_NAME_CASE(LSUB)
35135 NODE_NAME_CASE(LOR)
35136 NODE_NAME_CASE(LXOR)
35137 NODE_NAME_CASE(LAND)
35138 NODE_NAME_CASE(LBTS)
35139 NODE_NAME_CASE(LBTC)
35140 NODE_NAME_CASE(LBTR)
35141 NODE_NAME_CASE(LBTS_RM)
35142 NODE_NAME_CASE(LBTC_RM)
35143 NODE_NAME_CASE(LBTR_RM)
35144 NODE_NAME_CASE(AADD)
35145 NODE_NAME_CASE(AOR)
35146 NODE_NAME_CASE(AXOR)
35147 NODE_NAME_CASE(AAND)
35148 NODE_NAME_CASE(VZEXT_MOVL)
35149 NODE_NAME_CASE(VZEXT_LOAD)
35150 NODE_NAME_CASE(VEXTRACT_STORE)
35151 NODE_NAME_CASE(VTRUNC)
35152 NODE_NAME_CASE(VTRUNCS)
35153 NODE_NAME_CASE(VTRUNCUS)
35154 NODE_NAME_CASE(VMTRUNC)
35155 NODE_NAME_CASE(VMTRUNCS)
35156 NODE_NAME_CASE(VMTRUNCUS)
35157 NODE_NAME_CASE(VTRUNCSTORES)
35158 NODE_NAME_CASE(VTRUNCSTOREUS)
35159 NODE_NAME_CASE(VMTRUNCSTORES)
35160 NODE_NAME_CASE(VMTRUNCSTOREUS)
35161 NODE_NAME_CASE(VFPEXT)
35162 NODE_NAME_CASE(STRICT_VFPEXT)
35163 NODE_NAME_CASE(VFPEXT_SAE)
35164 NODE_NAME_CASE(VFPEXTS)
35165 NODE_NAME_CASE(VFPEXTS_SAE)
35166 NODE_NAME_CASE(VFPROUND)
35167 NODE_NAME_CASE(STRICT_VFPROUND)
35168 NODE_NAME_CASE(VMFPROUND)
35169 NODE_NAME_CASE(VFPROUND_RND)
35170 NODE_NAME_CASE(VFPROUNDS)
35171 NODE_NAME_CASE(VFPROUNDS_RND)
35172 NODE_NAME_CASE(VSHLDQ)
35173 NODE_NAME_CASE(VSRLDQ)
35174 NODE_NAME_CASE(VSHL)
35175 NODE_NAME_CASE(VSRL)
35176 NODE_NAME_CASE(VSRA)
35177 NODE_NAME_CASE(VSHLI)
35178 NODE_NAME_CASE(VSRLI)
35179 NODE_NAME_CASE(VSRAI)
35180 NODE_NAME_CASE(VSHLV)
35181 NODE_NAME_CASE(VSRLV)
35182 NODE_NAME_CASE(VSRAV)
35183 NODE_NAME_CASE(VROTLI)
35184 NODE_NAME_CASE(VROTRI)
35185 NODE_NAME_CASE(VPPERM)
35186 NODE_NAME_CASE(CMPP)
35187 NODE_NAME_CASE(STRICT_CMPP)
35188 NODE_NAME_CASE(PCMPEQ)
35189 NODE_NAME_CASE(PCMPGT)
35190 NODE_NAME_CASE(PHMINPOS)
35191 NODE_NAME_CASE(ADD)
35192 NODE_NAME_CASE(SUB)
35193 NODE_NAME_CASE(ADC)
35194 NODE_NAME_CASE(SBB)
35195 NODE_NAME_CASE(SMUL)
35196 NODE_NAME_CASE(UMUL)
35197 NODE_NAME_CASE(OR)
35198 NODE_NAME_CASE(XOR)
35199 NODE_NAME_CASE(AND)
35200 NODE_NAME_CASE(BEXTR)
35201 NODE_NAME_CASE(BEXTRI)
35202 NODE_NAME_CASE(BZHI)
35203 NODE_NAME_CASE(PDEP)
35204 NODE_NAME_CASE(PEXT)
35205 NODE_NAME_CASE(MUL_IMM)
35206 NODE_NAME_CASE(MOVMSK)
35207 NODE_NAME_CASE(PTEST)
35208 NODE_NAME_CASE(TESTP)
35209 NODE_NAME_CASE(KORTEST)
35210 NODE_NAME_CASE(KTEST)
35211 NODE_NAME_CASE(KADD)
35212 NODE_NAME_CASE(KSHIFTL)
35213 NODE_NAME_CASE(KSHIFTR)
35214 NODE_NAME_CASE(PACKSS)
35215 NODE_NAME_CASE(PACKUS)
35216 NODE_NAME_CASE(PALIGNR)
35217 NODE_NAME_CASE(VALIGN)
35218 NODE_NAME_CASE(VSHLD)
35219 NODE_NAME_CASE(VSHRD)
35220 NODE_NAME_CASE(VSHLDV)
35221 NODE_NAME_CASE(VSHRDV)
35222 NODE_NAME_CASE(PSHUFD)
35223 NODE_NAME_CASE(PSHUFHW)
35224 NODE_NAME_CASE(PSHUFLW)
35225 NODE_NAME_CASE(SHUFP)
35226 NODE_NAME_CASE(SHUF128)
35227 NODE_NAME_CASE(MOVLHPS)
35228 NODE_NAME_CASE(MOVHLPS)
35229 NODE_NAME_CASE(MOVDDUP)
35230 NODE_NAME_CASE(MOVSHDUP)
35231 NODE_NAME_CASE(MOVSLDUP)
35232 NODE_NAME_CASE(MOVSD)
35233 NODE_NAME_CASE(MOVSS)
35234 NODE_NAME_CASE(MOVSH)
35235 NODE_NAME_CASE(UNPCKL)
35236 NODE_NAME_CASE(UNPCKH)
35237 NODE_NAME_CASE(VBROADCAST)
35238 NODE_NAME_CASE(VBROADCAST_LOAD)
35239 NODE_NAME_CASE(VBROADCASTM)
35240 NODE_NAME_CASE(SUBV_BROADCAST_LOAD)
35241 NODE_NAME_CASE(VPERMILPV)
35242 NODE_NAME_CASE(VPERMILPI)
35243 NODE_NAME_CASE(VPERM2X128)
35244 NODE_NAME_CASE(VPERMV)
35245 NODE_NAME_CASE(VPERMV3)
35246 NODE_NAME_CASE(VPERMI)
35247 NODE_NAME_CASE(VPTERNLOG)
35248 NODE_NAME_CASE(VFIXUPIMM)
35249 NODE_NAME_CASE(VFIXUPIMM_SAE)
35250 NODE_NAME_CASE(VFIXUPIMMS)
35251 NODE_NAME_CASE(VFIXUPIMMS_SAE)
35252 NODE_NAME_CASE(VRANGE)
35253 NODE_NAME_CASE(VRANGE_SAE)
35254 NODE_NAME_CASE(VRANGES)
35255 NODE_NAME_CASE(VRANGES_SAE)
35256 NODE_NAME_CASE(PMULUDQ)
35257 NODE_NAME_CASE(PMULDQ)
35258 NODE_NAME_CASE(PSADBW)
35259 NODE_NAME_CASE(DBPSADBW)
35260 NODE_NAME_CASE(VASTART_SAVE_XMM_REGS)
35261 NODE_NAME_CASE(VAARG_64)
35262 NODE_NAME_CASE(VAARG_X32)
35263 NODE_NAME_CASE(DYN_ALLOCA)
35264 NODE_NAME_CASE(MFENCE)
35265 NODE_NAME_CASE(SEG_ALLOCA)
35266 NODE_NAME_CASE(PROBED_ALLOCA)
35267 NODE_NAME_CASE(RDRAND)
35268 NODE_NAME_CASE(RDSEED)
35269 NODE_NAME_CASE(RDPKRU)
35270 NODE_NAME_CASE(WRPKRU)
35271 NODE_NAME_CASE(VPMADDUBSW)
35272 NODE_NAME_CASE(VPMADDWD)
35273 NODE_NAME_CASE(VPSHA)
35274 NODE_NAME_CASE(VPSHL)
35275 NODE_NAME_CASE(VPCOM)
35276 NODE_NAME_CASE(VPCOMU)
35277 NODE_NAME_CASE(VPERMIL2)
35278 NODE_NAME_CASE(FMSUB)
35279 NODE_NAME_CASE(STRICT_FMSUB)
35280 NODE_NAME_CASE(FNMADD)
35281 NODE_NAME_CASE(STRICT_FNMADD)
35282 NODE_NAME_CASE(FNMSUB)
35283 NODE_NAME_CASE(STRICT_FNMSUB)
35284 NODE_NAME_CASE(FMADDSUB)
35285 NODE_NAME_CASE(FMSUBADD)
35286 NODE_NAME_CASE(FMADD_RND)
35287 NODE_NAME_CASE(FNMADD_RND)
35288 NODE_NAME_CASE(FMSUB_RND)
35289 NODE_NAME_CASE(FNMSUB_RND)
35290 NODE_NAME_CASE(FMADDSUB_RND)
35291 NODE_NAME_CASE(FMSUBADD_RND)
35292 NODE_NAME_CASE(VFMADDC)
35293 NODE_NAME_CASE(VFMADDC_RND)
35294 NODE_NAME_CASE(VFCMADDC)
35295 NODE_NAME_CASE(VFCMADDC_RND)
35296 NODE_NAME_CASE(VFMULC)
35297 NODE_NAME_CASE(VFMULC_RND)
35298 NODE_NAME_CASE(VFCMULC)
35299 NODE_NAME_CASE(VFCMULC_RND)
35300 NODE_NAME_CASE(VFMULCSH)
35301 NODE_NAME_CASE(VFMULCSH_RND)
35302 NODE_NAME_CASE(VFCMULCSH)
35303 NODE_NAME_CASE(VFCMULCSH_RND)
35304 NODE_NAME_CASE(VFMADDCSH)
35305 NODE_NAME_CASE(VFMADDCSH_RND)
35306 NODE_NAME_CASE(VFCMADDCSH)
35307 NODE_NAME_CASE(VFCMADDCSH_RND)
35308 NODE_NAME_CASE(VPMADD52H)
35309 NODE_NAME_CASE(VPMADD52L)
35310 NODE_NAME_CASE(VRNDSCALE)
35311 NODE_NAME_CASE(STRICT_VRNDSCALE)
35312 NODE_NAME_CASE(VRNDSCALE_SAE)
35313 NODE_NAME_CASE(VRNDSCALES)
35314 NODE_NAME_CASE(VRNDSCALES_SAE)
35315 NODE_NAME_CASE(VREDUCE)
35316 NODE_NAME_CASE(VREDUCE_SAE)
35317 NODE_NAME_CASE(VREDUCES)
35318 NODE_NAME_CASE(VREDUCES_SAE)
35319 NODE_NAME_CASE(VGETMANT)
35320 NODE_NAME_CASE(VGETMANT_SAE)
35321 NODE_NAME_CASE(VGETMANTS)
35322 NODE_NAME_CASE(VGETMANTS_SAE)
35323 NODE_NAME_CASE(PCMPESTR)
35324 NODE_NAME_CASE(PCMPISTR)
35325 NODE_NAME_CASE(XTEST)
35326 NODE_NAME_CASE(COMPRESS)
35327 NODE_NAME_CASE(EXPAND)
35328 NODE_NAME_CASE(SELECTS)
35329 NODE_NAME_CASE(ADDSUB)
35330 NODE_NAME_CASE(RCP14)
35331 NODE_NAME_CASE(RCP14S)
35332 NODE_NAME_CASE(RCP28)
35333 NODE_NAME_CASE(RCP28_SAE)
35334 NODE_NAME_CASE(RCP28S)
35335 NODE_NAME_CASE(RCP28S_SAE)
35336 NODE_NAME_CASE(EXP2)
35337 NODE_NAME_CASE(EXP2_SAE)
35338 NODE_NAME_CASE(RSQRT14)
35339 NODE_NAME_CASE(RSQRT14S)
35340 NODE_NAME_CASE(RSQRT28)
35341 NODE_NAME_CASE(RSQRT28_SAE)
35342 NODE_NAME_CASE(RSQRT28S)
35343 NODE_NAME_CASE(RSQRT28S_SAE)
35344 NODE_NAME_CASE(FADD_RND)
35345 NODE_NAME_CASE(FADDS)
35346 NODE_NAME_CASE(FADDS_RND)
35347 NODE_NAME_CASE(FSUB_RND)
35348 NODE_NAME_CASE(FSUBS)
35349 NODE_NAME_CASE(FSUBS_RND)
35350 NODE_NAME_CASE(FMUL_RND)
35351 NODE_NAME_CASE(FMULS)
35352 NODE_NAME_CASE(FMULS_RND)
35353 NODE_NAME_CASE(FDIV_RND)
35354 NODE_NAME_CASE(FDIVS)
35355 NODE_NAME_CASE(FDIVS_RND)
35356 NODE_NAME_CASE(FSQRT_RND)
35357 NODE_NAME_CASE(FSQRTS)
35358 NODE_NAME_CASE(FSQRTS_RND)
35359 NODE_NAME_CASE(FGETEXP)
35360 NODE_NAME_CASE(FGETEXP_SAE)
35361 NODE_NAME_CASE(FGETEXPS)
35362 NODE_NAME_CASE(FGETEXPS_SAE)
35363 NODE_NAME_CASE(SCALEF)
35364 NODE_NAME_CASE(SCALEF_RND)
35365 NODE_NAME_CASE(SCALEFS)
35366 NODE_NAME_CASE(SCALEFS_RND)
35367 NODE_NAME_CASE(MULHRS)
35368 NODE_NAME_CASE(SINT_TO_FP_RND)
35369 NODE_NAME_CASE(UINT_TO_FP_RND)
35370 NODE_NAME_CASE(CVTTP2SI)
35371 NODE_NAME_CASE(CVTTP2UI)
35372 NODE_NAME_CASE(STRICT_CVTTP2SI)
35373 NODE_NAME_CASE(STRICT_CVTTP2UI)
35374 NODE_NAME_CASE(MCVTTP2SI)
35375 NODE_NAME_CASE(MCVTTP2UI)
35376 NODE_NAME_CASE(CVTTP2SI_SAE)
35377 NODE_NAME_CASE(CVTTP2UI_SAE)
35378 NODE_NAME_CASE(CVTTS2SI)
35379 NODE_NAME_CASE(CVTTS2UI)
35380 NODE_NAME_CASE(CVTTS2SI_SAE)
35381 NODE_NAME_CASE(CVTTS2UI_SAE)
35382 NODE_NAME_CASE(CVTSI2P)
35383 NODE_NAME_CASE(CVTUI2P)
35384 NODE_NAME_CASE(STRICT_CVTSI2P)
35385 NODE_NAME_CASE(STRICT_CVTUI2P)
35386 NODE_NAME_CASE(MCVTSI2P)
35387 NODE_NAME_CASE(MCVTUI2P)
35388 NODE_NAME_CASE(VFPCLASS)
35389 NODE_NAME_CASE(VFPCLASSS)
35390 NODE_NAME_CASE(MULTISHIFT)
35391 NODE_NAME_CASE(SCALAR_SINT_TO_FP)
35392 NODE_NAME_CASE(SCALAR_SINT_TO_FP_RND)
35393 NODE_NAME_CASE(SCALAR_UINT_TO_FP)
35394 NODE_NAME_CASE(SCALAR_UINT_TO_FP_RND)
35395 NODE_NAME_CASE(CVTPS2PH)
35396 NODE_NAME_CASE(STRICT_CVTPS2PH)
35397 NODE_NAME_CASE(CVTPS2PH_SAE)
35398 NODE_NAME_CASE(MCVTPS2PH)
35399 NODE_NAME_CASE(MCVTPS2PH_SAE)
35400 NODE_NAME_CASE(CVTPH2PS)
35401 NODE_NAME_CASE(STRICT_CVTPH2PS)
35402 NODE_NAME_CASE(CVTPH2PS_SAE)
35403 NODE_NAME_CASE(CVTP2SI)
35404 NODE_NAME_CASE(CVTP2UI)
35405 NODE_NAME_CASE(MCVTP2SI)
35406 NODE_NAME_CASE(MCVTP2UI)
35407 NODE_NAME_CASE(CVTP2SI_RND)
35408 NODE_NAME_CASE(CVTP2UI_RND)
35409 NODE_NAME_CASE(CVTS2SI)
35410 NODE_NAME_CASE(CVTS2UI)
35411 NODE_NAME_CASE(CVTS2SI_RND)
35412 NODE_NAME_CASE(CVTS2UI_RND)
35413 NODE_NAME_CASE(CVTNE2PS2BF16)
35414 NODE_NAME_CASE(CVTNEPS2BF16)
35415 NODE_NAME_CASE(MCVTNEPS2BF16)
35416 NODE_NAME_CASE(DPBF16PS)
35417 NODE_NAME_CASE(LWPINS)
35418 NODE_NAME_CASE(MGATHER)
35419 NODE_NAME_CASE(MSCATTER)
35420 NODE_NAME_CASE(VPDPBUSD)
35421 NODE_NAME_CASE(VPDPBUSDS)
35422 NODE_NAME_CASE(VPDPWSSD)
35423 NODE_NAME_CASE(VPDPWSSDS)
35424 NODE_NAME_CASE(VPSHUFBITQMB)
35425 NODE_NAME_CASE(GF2P8MULB)
35426 NODE_NAME_CASE(GF2P8AFFINEQB)
35427 NODE_NAME_CASE(GF2P8AFFINEINVQB)
35428 NODE_NAME_CASE(NT_CALL)
35429 NODE_NAME_CASE(NT_BRIND)
35430 NODE_NAME_CASE(UMWAIT)
35431 NODE_NAME_CASE(TPAUSE)
35432 NODE_NAME_CASE(ENQCMD)
35433 NODE_NAME_CASE(ENQCMDS)
35434 NODE_NAME_CASE(VP2INTERSECT)
35435 NODE_NAME_CASE(VPDPBSUD)
35436 NODE_NAME_CASE(VPDPBSUDS)
35437 NODE_NAME_CASE(VPDPBUUD)
35438 NODE_NAME_CASE(VPDPBUUDS)
35439 NODE_NAME_CASE(VPDPBSSD)
35440 NODE_NAME_CASE(VPDPBSSDS)
35441 NODE_NAME_CASE(AESENC128KL)
35442 NODE_NAME_CASE(AESDEC128KL)
35443 NODE_NAME_CASE(AESENC256KL)
35444 NODE_NAME_CASE(AESDEC256KL)
35445 NODE_NAME_CASE(AESENCWIDE128KL)
35446 NODE_NAME_CASE(AESDECWIDE128KL)
35447 NODE_NAME_CASE(AESENCWIDE256KL)
35448 NODE_NAME_CASE(AESDECWIDE256KL)
35449 NODE_NAME_CASE(CMPCCXADD)
35450 NODE_NAME_CASE(TESTUI)
35451 NODE_NAME_CASE(FP80_ADD)
35452 NODE_NAME_CASE(STRICT_FP80_ADD)
35453 }
35454 return nullptr;
35455#undef NODE_NAME_CASE
35456}
35457
35458/// Return true if the addressing mode represented by AM is legal for this
35459/// target, for a load/store of the specified type.
35460bool X86TargetLowering::isLegalAddressingMode(const DataLayout &DL,
35461 const AddrMode &AM, Type *Ty,
35462 unsigned AS,
35463 Instruction *I) const {
35464 // X86 supports extremely general addressing modes.
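// i.e. BaseGV + BaseReg + Scale*IndexReg + Disp, subject to the checks below.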
35465 CodeModel::Model M = getTargetMachine().getCodeModel();
35466
35467 // X86 allows a sign-extended 32-bit immediate field as a displacement.
35468 if (!X86::isOffsetSuitableForCodeModel(AM.BaseOffs, M, AM.BaseGV != nullptr))
35469 return false;
35470
35471 if (AM.BaseGV) {
35472 unsigned GVFlags = Subtarget.classifyGlobalReference(AM.BaseGV);
35473
35474 // If a reference to this global requires an extra load, we can't fold it.
35475 if (isGlobalStubReference(GVFlags))
35476 return false;
35477
35478 // If BaseGV requires a register for the PIC base, we cannot also have a
35479 // BaseReg specified.
35480 if (AM.HasBaseReg && isGlobalRelativeToPICBase(GVFlags))
35481 return false;
35482
35483 // If lower 4G is not available, then we must use rip-relative addressing.
35484 if ((M != CodeModel::Small || isPositionIndependent()) &&
35485 Subtarget.is64Bit() && (AM.BaseOffs || AM.Scale > 1))
35486 return false;
35487 }
35488
35489 switch (AM.Scale) {
35490 case 0:
35491 case 1:
35492 case 2:
35493 case 4:
35494 case 8:
35495 // These scales always work.
35496 break;
35497 case 3:
35498 case 5:
35499 case 9:
35500 // These scales are formed with basereg+scalereg. Only accept if there is
35501 // no basereg yet.
35502 if (AM.HasBaseReg)
35503 return false;
35504 break;
35505 default: // Other stuff never works.
35506 return false;
35507 }
35508
35509 return true;
35510}
35511
35512bool X86TargetLowering::isVectorShiftByScalarCheap(Type *Ty) const {
35513 unsigned Bits = Ty->getScalarSizeInBits();
35514
35515 // XOP has v16i8/v8i16/v4i32/v2i64 variable vector shifts.
35516 // Splitting for v32i8/v16i16 on XOP+AVX2 targets is still preferred.
35517 if (Subtarget.hasXOP() &&
35518 (Bits == 8 || Bits == 16 || Bits == 32 || Bits == 64))
35519 return false;
35520
35521 // AVX2 has vpsllv[dq] instructions (and other shifts) that make variable
35522 // shifts just as cheap as scalar ones.
35523 if (Subtarget.hasAVX2() && (Bits == 32 || Bits == 64))
35524 return false;
35525
35526 // AVX512BW has shifts such as vpsllvw.
35527 if (Subtarget.hasBWI() && Bits == 16)
35528 return false;
35529
35530 // Otherwise, it's significantly cheaper to shift by a scalar amount than by a
35531 // fully general vector.
35532 return true;
35533}
35534
35535bool X86TargetLowering::isBinOp(unsigned Opcode) const {
35536 switch (Opcode) {
35537 // These are non-commutative binops.
35538 // TODO: Add more X86ISD opcodes once we have test coverage.
35539 case X86ISD::ANDNP:
35540 case X86ISD::PCMPGT:
35541 case X86ISD::FMAX:
35542 case X86ISD::FMIN:
35543 case X86ISD::FANDN:
35544 case X86ISD::VPSHA:
35545 case X86ISD::VPSHL:
35546 case X86ISD::VSHLV:
35547 case X86ISD::VSRLV:
35548 case X86ISD::VSRAV:
35549 return true;
35550 }
35551
35552 return TargetLoweringBase::isBinOp(Opcode);
35553}
35554
35555bool X86TargetLowering::isCommutativeBinOp(unsigned Opcode) const {
35556 switch (Opcode) {
35557 // TODO: Add more X86ISD opcodes once we have test coverage.
35558 case X86ISD::PCMPEQ:
35559 case X86ISD::PMULDQ:
35560 case X86ISD::PMULUDQ:
35561 case X86ISD::FMAXC:
35562 case X86ISD::FMINC:
35563 case X86ISD::FAND:
35564 case X86ISD::FOR:
35565 case X86ISD::FXOR:
35566 return true;
35567 }
35568
35569 return TargetLoweringBase::isCommutativeBinOp(Opcode);
35570}
35571
35572bool X86TargetLowering::isTruncateFree(Type *Ty1, Type *Ty2) const {
35573 if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
35574 return false;
35575 unsigned NumBits1 = Ty1->getPrimitiveSizeInBits();
35576 unsigned NumBits2 = Ty2->getPrimitiveSizeInBits();
35577 return NumBits1 > NumBits2;
35578}
35579
35580bool X86TargetLowering::allowTruncateForTailCall(Type *Ty1, Type *Ty2) const {
35581 if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
35582 return false;
35583
35584 if (!isTypeLegal(EVT::getEVT(Ty1)))
35585 return false;
35586
35587 assert(Ty1->getPrimitiveSizeInBits() <= 64 && "i128 is probably not a noop");
35588
35589 // Assuming the caller doesn't have a zeroext or signext return parameter,
35590 // truncation all the way down to i1 is valid.
35591 return true;
35592}
35593
35594bool X86TargetLowering::isLegalICmpImmediate(int64_t Imm) const {
35595 return isInt<32>(Imm);
35596}
35597
35598bool X86TargetLowering::isLegalAddImmediate(int64_t Imm) const {
35599 // Can also use sub to handle negated immediates.
35600 return isInt<32>(Imm);
35601}
35602
35603bool X86TargetLowering::isLegalStoreImmediate(int64_t Imm) const {
35604 return isInt<32>(Imm);
35605}
35606
35607bool X86TargetLowering::isTruncateFree(EVT VT1, EVT VT2) const {
35608 if (!VT1.isScalarInteger() || !VT2.isScalarInteger())
35609 return false;
35610 unsigned NumBits1 = VT1.getSizeInBits();
35611 unsigned NumBits2 = VT2.getSizeInBits();
35612 return NumBits1 > NumBits2;
35613}
35614
35615bool X86TargetLowering::isZExtFree(Type *Ty1, Type *Ty2) const {
35616 // x86-64 implicitly zero-extends 32-bit results in 64-bit registers.
35617 return Ty1->isIntegerTy(32) && Ty2->isIntegerTy(64) && Subtarget.is64Bit();
35618}
35619
35620bool X86TargetLowering::isZExtFree(EVT VT1, EVT VT2) const {
35621 // x86-64 implicitly zero-extends 32-bit results in 64-bit registers.
35622 return VT1 == MVT::i32 && VT2 == MVT::i64 && Subtarget.is64Bit();
35623}
35624
35625bool X86TargetLowering::isZExtFree(SDValue Val, EVT VT2) const {
35626 EVT VT1 = Val.getValueType();
35627 if (isZExtFree(VT1, VT2))
35628 return true;
35629
35630 if (Val.getOpcode() != ISD::LOAD)
35631 return false;
35632
35633 if (!VT1.isSimple() || !VT1.isInteger() ||
35634 !VT2.isSimple() || !VT2.isInteger())
35635 return false;
35636
35637 switch (VT1.getSimpleVT().SimpleTy) {
35638 default: break;
35639 case MVT::i8:
35640 case MVT::i16:
35641 case MVT::i32:
35642 // X86 has 8, 16, and 32-bit zero-extending loads.
35643 return true;
35644 }
35645
35646 return false;
35647}
35648
35649bool X86TargetLowering::shouldSinkOperands(Instruction *I,
35650 SmallVectorImpl<Use *> &Ops) const {
35651 using namespace llvm::PatternMatch;
35652
35653 FixedVectorType *VTy = dyn_cast<FixedVectorType>(I->getType());
35654 if (!VTy)
35655 return false;
35656
35657 if (I->getOpcode() == Instruction::Mul &&
35658 VTy->getElementType()->isIntegerTy(64)) {
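// For 64-bit element multiplies, sinking the 32-bit extend patterns next to
// the multiply lets instruction selection see them in the same block and
// form PMULDQ/PMULUDQ instead of a general 64-bit multiply.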
35659 for (auto &Op : I->operands()) {
35660 // Make sure we are not already sinking this operand
35661 if (any_of(Ops, [&](Use *U) { return U->get() == Op; }))
35662 continue;
35663
35664 // Look for PMULDQ pattern where the input is a sext_inreg from vXi32 or
35665 // the PMULUDQ pattern where the input is a zext_inreg from vXi32.
35666 if (Subtarget.hasSSE41() &&
35667 match(Op.get(), m_AShr(m_Shl(m_Value(), m_SpecificInt(32)),
35668 m_SpecificInt(32)))) {
35669 Ops.push_back(&cast<Instruction>(Op)->getOperandUse(0));
35670 Ops.push_back(&Op);
35671 } else if (Subtarget.hasSSE2() &&
35672 match(Op.get(),
35673 m_And(m_Value(), m_SpecificInt(UINT64_C(0xffffffff))))) {
35674 Ops.push_back(&Op);
35675 }
35676 }
35677
35678 return !Ops.empty();
35679 }
35680
35681 // A uniform shift amount in a vector shift or funnel shift may be much
35682 // cheaper than a generic variable vector shift, so make that pattern visible
35683 // to SDAG by sinking the shuffle instruction next to the shift.
35684 int ShiftAmountOpNum = -1;
35685 if (I->isShift())
35686 ShiftAmountOpNum = 1;
35687 else if (auto *II = dyn_cast<IntrinsicInst>(I)) {
35688 if (II->getIntrinsicID() == Intrinsic::fshl ||
35689 II->getIntrinsicID() == Intrinsic::fshr)
35690 ShiftAmountOpNum = 2;
35691 }
35692
35693 if (ShiftAmountOpNum == -1)
35694 return false;
35695
35696 auto *Shuf = dyn_cast<ShuffleVectorInst>(I->getOperand(ShiftAmountOpNum));
35697 if (Shuf && getSplatIndex(Shuf->getShuffleMask()) >= 0 &&
35698 isVectorShiftByScalarCheap(I->getType())) {
35699 Ops.push_back(&I->getOperandUse(ShiftAmountOpNum));
35700 return true;
35701 }
35702
35703 return false;
35704}
35705
35706bool X86TargetLowering::shouldConvertPhiType(Type *From, Type *To) const {
35707 if (!Subtarget.is64Bit())
35708 return false;
35709 return TargetLowering::shouldConvertPhiType(From, To);
35710}
35711
35712bool X86TargetLowering::isVectorLoadExtDesirable(SDValue ExtVal) const {
35713 if (isa<MaskedLoadSDNode>(ExtVal.getOperand(0)))
35714 return false;
35715
35716 EVT SrcVT = ExtVal.getOperand(0).getValueType();
35717
35718 // There is no extending load for vXi1.
35719 if (SrcVT.getScalarType() == MVT::i1)
35720 return false;
35721
35722 return true;
35723}
35724
35725bool X86TargetLowering::isFMAFasterThanFMulAndFAdd(const MachineFunction &MF,
35726 EVT VT) const {
35727 if (!Subtarget.hasAnyFMA())
35728 return false;
35729
35730 VT = VT.getScalarType();
35731
35732 if (!VT.isSimple())
35733 return false;
35734
35735 switch (VT.getSimpleVT().SimpleTy) {
35736 case MVT::f16:
35737 return Subtarget.hasFP16();
35738 case MVT::f32:
35739 case MVT::f64:
35740 return true;
35741 default:
35742 break;
35743 }
35744
35745 return false;
35746}
35747
35748bool X86TargetLowering::isNarrowingProfitable(EVT SrcVT, EVT DestVT) const {
35749 // i16 instructions are longer (0x66 prefix) and potentially slower.
35750 return !(SrcVT == MVT::i32 && DestVT == MVT::i16);
35751}
35752
35753bool X86TargetLowering::shouldFoldSelectWithIdentityConstant(unsigned Opcode,
35754 EVT VT) const {
35755 // TODO: This is too general. There are cases where pre-AVX512 codegen would
35756 // benefit. The transform may also be profitable for scalar code.
35757 if (!Subtarget.hasAVX512())
35758 return false;
35759 if (!Subtarget.hasVLX() && !VT.is512BitVector())
35760 return false;
35761 if (!VT.isVector() || VT.getScalarType() == MVT::i1)
35762 return false;
35763
35764 return true;
35765}
35766
35767/// Targets can use this to indicate that they only support *some*
35768/// VECTOR_SHUFFLE operations, those with specific masks.
35769/// By default, if a target supports the VECTOR_SHUFFLE node, all mask values
35770/// are assumed to be legal.
35771bool X86TargetLowering::isShuffleMaskLegal(ArrayRef<int> Mask, EVT VT) const {
35772 if (!VT.isSimple())
35773 return false;
35774
35775 // Not for i1 vectors
35776 if (VT.getSimpleVT().getScalarType() == MVT::i1)
35777 return false;
35778
35779 // Very little shuffling can be done for 64-bit vectors right now.
35780 if (VT.getSimpleVT().getSizeInBits() == 64)
35781 return false;
35782
35783 // We only care that the types being shuffled are legal. The lowering can
35784 // handle any possible shuffle mask that results.
35785 return isTypeLegal(VT.getSimpleVT());
35786}
35787
35788bool X86TargetLowering::isVectorClearMaskLegal(ArrayRef<int> Mask,
35789 EVT VT) const {
35790 // Don't convert an 'and' into a shuffle that we don't directly support.
35791 // vpblendw and vpshufb for 256-bit vectors are not available on AVX1.
35792 if (!Subtarget.hasAVX2())
35793 if (VT == MVT::v32i8 || VT == MVT::v16i16)
35794 return false;
35795
35796 // Just delegate to the generic legality, clear masks aren't special.
35797 return isShuffleMaskLegal(Mask, VT);
35798}
35799
35800bool X86TargetLowering::areJTsAllowed(const Function *Fn) const {
35801 // If the subtarget is using thunks, we need to not generate jump tables.
35802 if (Subtarget.useIndirectThunkBranches())
35803 return false;
35804
35805 // Otherwise, fallback on the generic logic.
35806 return TargetLowering::areJTsAllowed(Fn);
35807}
35808
35809MVT X86TargetLowering::getPreferredSwitchConditionType(LLVMContext &Context,
35810 EVT ConditionVT) const {
35811 // Avoid 8 and 16 bit types because they increase the chance for unnecessary
35812 // zero-extensions.
35813 if (ConditionVT.getSizeInBits() < 32)
35814 return MVT::i32;
35815 return TargetLoweringBase::getPreferredSwitchConditionType(Context,
35816 ConditionVT);
35817}
35818
35819//===----------------------------------------------------------------------===//
35820// X86 Scheduler Hooks
35821//===----------------------------------------------------------------------===//
35822
35823 // Returns true if EFLAGS is consumed after this iterator in the rest of the
35824// basic block or any successors of the basic block.
35825static bool isEFLAGSLiveAfter(MachineBasicBlock::iterator Itr,
35826 MachineBasicBlock *BB) {
35827 // Scan forward through BB for a use/def of EFLAGS.
35828 for (const MachineInstr &mi : llvm::make_range(std::next(Itr), BB->end())) {
35829 if (mi.readsRegister(X86::EFLAGS))
35830 return true;
35831 // If we found a def, we can stop searching.
35832 if (mi.definesRegister(X86::EFLAGS))
35833 return false;
35834 }
35835
35836 // If we hit the end of the block, check whether EFLAGS is live into a
35837 // successor.
35838 for (MachineBasicBlock *Succ : BB->successors())
35839 if (Succ->isLiveIn(X86::EFLAGS))
35840 return true;
35841
35842 return false;
35843}
35844
35845/// Utility function to emit xbegin specifying the start of an RTM region.
35846static MachineBasicBlock *emitXBegin(MachineInstr &MI, MachineBasicBlock *MBB,
35847 const TargetInstrInfo *TII) {
35848 const DebugLoc &DL = MI.getDebugLoc();
35849
35850 const BasicBlock *BB = MBB->getBasicBlock();
35851 MachineFunction::iterator I = ++MBB->getIterator();
35852
35853 // For the v = xbegin(), we generate
35854 //
35855 // thisMBB:
35856 // xbegin sinkMBB
35857 //
35858 // mainMBB:
35859 // s0 = -1
35860 //
35861 // fallBB:
35862 // eax = # XABORT_DEF
35863 // s1 = eax
35864 //
35865 // sinkMBB:
35866 // v = phi(s0/mainBB, s1/fallBB)
35867
35868 MachineBasicBlock *thisMBB = MBB;
35869 MachineFunction *MF = MBB->getParent();
35870 MachineBasicBlock *mainMBB = MF->CreateMachineBasicBlock(BB);
35871 MachineBasicBlock *fallMBB = MF->CreateMachineBasicBlock(BB);
35872 MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB);
35873 MF->insert(I, mainMBB);
35874 MF->insert(I, fallMBB);
35875 MF->insert(I, sinkMBB);
35876
35877 if (isEFLAGSLiveAfter(MI, MBB)) {
35878 mainMBB->addLiveIn(X86::EFLAGS);
35879 fallMBB->addLiveIn(X86::EFLAGS);
35880 sinkMBB->addLiveIn(X86::EFLAGS);
35881 }
35882
35883 // Transfer the remainder of BB and its successor edges to sinkMBB.
35884 sinkMBB->splice(sinkMBB->begin(), MBB,
35885 std::next(MachineBasicBlock::iterator(MI)), MBB->end());
35886 sinkMBB->transferSuccessorsAndUpdatePHIs(MBB);
35887
35888 MachineRegisterInfo &MRI = MF->getRegInfo();
35889 Register DstReg = MI.getOperand(0).getReg();
35890 const TargetRegisterClass *RC = MRI.getRegClass(DstReg);
35891 Register mainDstReg = MRI.createVirtualRegister(RC);
35892 Register fallDstReg = MRI.createVirtualRegister(RC);
35893
35894 // thisMBB:
35895 // xbegin fallMBB
35896 // # fallthrough to mainMBB
35897 // # abort to fallMBB
35898 BuildMI(thisMBB, DL, TII->get(X86::XBEGIN_4)).addMBB(fallMBB);
35899 thisMBB->addSuccessor(mainMBB);
35900 thisMBB->addSuccessor(fallMBB);
35901
35902 // mainMBB:
35903 // mainDstReg := -1
35904 BuildMI(mainMBB, DL, TII->get(X86::MOV32ri), mainDstReg).addImm(-1);
35905 BuildMI(mainMBB, DL, TII->get(X86::JMP_1)).addMBB(sinkMBB);
35906 mainMBB->addSuccessor(sinkMBB);
35907
35908 // fallMBB:
35909 // ; pseudo instruction to model hardware's definition from XABORT
35910 // EAX := XABORT_DEF
35911 // fallDstReg := EAX
35912 BuildMI(fallMBB, DL, TII->get(X86::XABORT_DEF));
35913 BuildMI(fallMBB, DL, TII->get(TargetOpcode::COPY), fallDstReg)
35914 .addReg(X86::EAX);
35915 fallMBB->addSuccessor(sinkMBB);
35916
35917 // sinkMBB:
35918 // DstReg := phi(mainDstReg/mainBB, fallDstReg/fallBB)
35919 BuildMI(*sinkMBB, sinkMBB->begin(), DL, TII->get(X86::PHI), DstReg)
35920 .addReg(mainDstReg).addMBB(mainMBB)
35921 .addReg(fallDstReg).addMBB(fallMBB);
35922
35923 MI.eraseFromParent();
35924 return sinkMBB;
35925}
35926
35927MachineBasicBlock *
35928X86TargetLowering::EmitVAARGWithCustomInserter(MachineInstr &MI,
35929 MachineBasicBlock *MBB) const {
35930 // Emit va_arg instruction on X86-64.
35931
35932 // Operands to this pseudo-instruction:
35933 // 0 ) Output : destination address (reg)
35934 // 1-5) Input : va_list address (addr, i64mem)
35935 // 6 ) ArgSize : Size (in bytes) of vararg type
35936 // 7 ) ArgMode : 0=overflow only, 1=use gp_offset, 2=use fp_offset
35937 // 8 ) Align : Alignment of type
35938 // 9 ) EFLAGS (implicit-def)
35939
35940 assert(MI.getNumOperands() == 10 && "VAARG should have 10 operands!");
35941 static_assert(X86::AddrNumOperands == 5, "VAARG assumes 5 address operands");
35942
35943 Register DestReg = MI.getOperand(0).getReg();
35944 MachineOperand &Base = MI.getOperand(1);
35945 MachineOperand &Scale = MI.getOperand(2);
35946 MachineOperand &Index = MI.getOperand(3);
35947 MachineOperand &Disp = MI.getOperand(4);
35948 MachineOperand &Segment = MI.getOperand(5);
35949 unsigned ArgSize = MI.getOperand(6).getImm();
35950 unsigned ArgMode = MI.getOperand(7).getImm();
35951 Align Alignment = Align(MI.getOperand(8).getImm());
35952
35953 MachineFunction *MF = MBB->getParent();
35954
35955 // Memory Reference
35956 assert(MI.hasOneMemOperand() && "Expected VAARG to have one memoperand");
35957
35958 MachineMemOperand *OldMMO = MI.memoperands().front();
35959
35960 // Clone the MMO into two separate MMOs for loading and storing
35961 MachineMemOperand *LoadOnlyMMO = MF->getMachineMemOperand(
35962 OldMMO, OldMMO->getFlags() & ~MachineMemOperand::MOStore);
35963 MachineMemOperand *StoreOnlyMMO = MF->getMachineMemOperand(
35964 OldMMO, OldMMO->getFlags() & ~MachineMemOperand::MOLoad);
35965
35966 // Machine Information
35967 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
35968 MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
35969 const TargetRegisterClass *AddrRegClass =
35970 getRegClassFor(getPointerTy(MBB->getParent()->getDataLayout()));
35971 const TargetRegisterClass *OffsetRegClass = getRegClassFor(MVT::i32);
35972 const DebugLoc &DL = MI.getDebugLoc();
35973
35974 // struct va_list {
35975 // i32 gp_offset
35976 // i32 fp_offset
35977 // i64 overflow_area (address)
35978 // i64 reg_save_area (address)
35979 // }
35980 // sizeof(va_list) = 24
35981 // alignment(va_list) = 8
35982
35983 unsigned TotalNumIntRegs = 6;
35984 unsigned TotalNumXMMRegs = 8;
35985 bool UseGPOffset = (ArgMode == 1);
35986 bool UseFPOffset = (ArgMode == 2);
35987 unsigned MaxOffset = TotalNumIntRegs * 8 +
35988 (UseFPOffset ? TotalNumXMMRegs * 16 : 0);
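// The register save area holds 6 * 8 bytes of GP registers followed by
// 8 * 16 bytes of XMM registers, so the offset limit is 48 for gp_offset
// and 176 for fp_offset (SysV x86-64 ABI).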
35989
35990 /* Align ArgSize to a multiple of 8 */
35991 unsigned ArgSizeA8 = (ArgSize + 7) & ~7;
35992 bool NeedsAlign = (Alignment > 8);
35993
35994 MachineBasicBlock *thisMBB = MBB;
35995 MachineBasicBlock *overflowMBB;
35996 MachineBasicBlock *offsetMBB;
35997 MachineBasicBlock *endMBB;
35998
35999 unsigned OffsetDestReg = 0; // Argument address computed by offsetMBB
36000 unsigned OverflowDestReg = 0; // Argument address computed by overflowMBB
36001 unsigned OffsetReg = 0;
36002
36003 if (!UseGPOffset && !UseFPOffset) {
36004 // If we only pull from the overflow region, we don't create a branch.
36005 // We don't need to alter control flow.
36006 OffsetDestReg = 0; // unused
36007 OverflowDestReg = DestReg;
36008
36009 offsetMBB = nullptr;
36010 overflowMBB = thisMBB;
36011 endMBB = thisMBB;
36012 } else {
36013 // First emit code to check if gp_offset (or fp_offset) is below the bound.
36014 // If so, pull the argument from reg_save_area. (branch to offsetMBB)
36015 // If not, pull from overflow_area. (branch to overflowMBB)
36016 //
36017 // thisMBB
36018 // | .
36019 // | .
36020 // offsetMBB overflowMBB
36021 // | .
36022 // | .
36023 // endMBB
36024
36025 // Registers for the PHI in endMBB
36026 OffsetDestReg = MRI.createVirtualRegister(AddrRegClass);
36027 OverflowDestReg = MRI.createVirtualRegister(AddrRegClass);
36028
36029 const BasicBlock *LLVM_BB = MBB->getBasicBlock();
36030 overflowMBB = MF->CreateMachineBasicBlock(LLVM_BB);
36031 offsetMBB = MF->CreateMachineBasicBlock(LLVM_BB);
36032 endMBB = MF->CreateMachineBasicBlock(LLVM_BB);
36033
36034 MachineFunction::iterator MBBIter = ++MBB->getIterator();
36035
36036 // Insert the new basic blocks
36037 MF->insert(MBBIter, offsetMBB);
36038 MF->insert(MBBIter, overflowMBB);
36039 MF->insert(MBBIter, endMBB);
36040
36041 // Transfer the remainder of MBB and its successor edges to endMBB.
36042 endMBB->splice(endMBB->begin(), thisMBB,
36043 std::next(MachineBasicBlock::iterator(MI)), thisMBB->end());
36044 endMBB->transferSuccessorsAndUpdatePHIs(thisMBB);
36045
36046 // Make offsetMBB and overflowMBB successors of thisMBB
36047 thisMBB->addSuccessor(offsetMBB);
36048 thisMBB->addSuccessor(overflowMBB);
36049
36050 // endMBB is a successor of both offsetMBB and overflowMBB
36051 offsetMBB->addSuccessor(endMBB);
36052 overflowMBB->addSuccessor(endMBB);
36053
36054 // Load the offset value into a register
36055 OffsetReg = MRI.createVirtualRegister(OffsetRegClass);
36056 BuildMI(thisMBB, DL, TII->get(X86::MOV32rm), OffsetReg)
36057 .add(Base)
36058 .add(Scale)
36059 .add(Index)
36060 .addDisp(Disp, UseFPOffset ? 4 : 0)
36061 .add(Segment)
36062 .setMemRefs(LoadOnlyMMO);
36063
36064 // Check if there is enough room left to pull this argument.
36065 BuildMI(thisMBB, DL, TII->get(X86::CMP32ri))
36066 .addReg(OffsetReg)
36067 .addImm(MaxOffset + 8 - ArgSizeA8);
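  // For example, for an 8-byte integer argument in GP mode (ArgSizeA8 = 8) the
  // bound is 48 + 8 - 8 = 48, so gp_offset values 0..40 fall through to
  // offsetMBB while 48 and above branch to overflowMBB.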
36068
36069 // Branch to "overflowMBB" if offset >= max
36070 // Fall through to "offsetMBB" otherwise
36071 BuildMI(thisMBB, DL, TII->get(X86::JCC_1))
36072 .addMBB(overflowMBB).addImm(X86::COND_AE);
36073 }
36074
36075 // In offsetMBB, emit code to use the reg_save_area.
36076 if (offsetMBB) {
36077     assert(OffsetReg != 0);
36078
36079 // Read the reg_save_area address.
36080 Register RegSaveReg = MRI.createVirtualRegister(AddrRegClass);
36081 BuildMI(
36082 offsetMBB, DL,
36083 TII->get(Subtarget.isTarget64BitLP64() ? X86::MOV64rm : X86::MOV32rm),
36084 RegSaveReg)
36085 .add(Base)
36086 .add(Scale)
36087 .add(Index)
36088 .addDisp(Disp, Subtarget.isTarget64BitLP64() ? 16 : 12)
36089 .add(Segment)
36090 .setMemRefs(LoadOnlyMMO);
36091
36092 if (Subtarget.isTarget64BitLP64()) {
36093 // Zero-extend the offset
36094 Register OffsetReg64 = MRI.createVirtualRegister(AddrRegClass);
36095 BuildMI(offsetMBB, DL, TII->get(X86::SUBREG_TO_REG), OffsetReg64)
36096 .addImm(0)
36097 .addReg(OffsetReg)
36098 .addImm(X86::sub_32bit);
36099
36100 // Add the offset to the reg_save_area to get the final address.
36101 BuildMI(offsetMBB, DL, TII->get(X86::ADD64rr), OffsetDestReg)
36102 .addReg(OffsetReg64)
36103 .addReg(RegSaveReg);
36104 } else {
36105 // Add the offset to the reg_save_area to get the final address.
36106 BuildMI(offsetMBB, DL, TII->get(X86::ADD32rr), OffsetDestReg)
36107 .addReg(OffsetReg)
36108 .addReg(RegSaveReg);
36109 }
36110
36111 // Compute the offset for the next argument
36112 Register NextOffsetReg = MRI.createVirtualRegister(OffsetRegClass);
36113 BuildMI(offsetMBB, DL, TII->get(X86::ADD32ri), NextOffsetReg)
36114 .addReg(OffsetReg)
36115 .addImm(UseFPOffset ? 16 : 8);
36116
36117 // Store it back into the va_list.
36118 BuildMI(offsetMBB, DL, TII->get(X86::MOV32mr))
36119 .add(Base)
36120 .add(Scale)
36121 .add(Index)
36122 .addDisp(Disp, UseFPOffset ? 4 : 0)
36123 .add(Segment)
36124 .addReg(NextOffsetReg)
36125 .setMemRefs(StoreOnlyMMO);
36126
36127 // Jump to endMBB
36128 BuildMI(offsetMBB, DL, TII->get(X86::JMP_1))
36129 .addMBB(endMBB);
36130 }
36131
36132 //
36133 // Emit code to use overflow area
36134 //
36135
36136 // Load the overflow_area address into a register.
36137 Register OverflowAddrReg = MRI.createVirtualRegister(AddrRegClass);
36138 BuildMI(overflowMBB, DL,
36139 TII->get(Subtarget.isTarget64BitLP64() ? X86::MOV64rm : X86::MOV32rm),
36140 OverflowAddrReg)
36141 .add(Base)
36142 .add(Scale)
36143 .add(Index)
36144 .addDisp(Disp, 8)
36145 .add(Segment)
36146 .setMemRefs(LoadOnlyMMO);
36147
36148 // If we need to align it, do so. Otherwise, just copy the address
36149 // to OverflowDestReg.
36150 if (NeedsAlign) {
36151 // Align the overflow address
36152 Register TmpReg = MRI.createVirtualRegister(AddrRegClass);
36153
36154 // aligned_addr = (addr + (align-1)) & ~(align-1)
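    // e.g. addr = 0x1004 with align = 16: (0x1004 + 0xF) & ~0xF = 0x1010.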
36155 BuildMI(
36156 overflowMBB, DL,
36157 TII->get(Subtarget.isTarget64BitLP64() ? X86::ADD64ri32 : X86::ADD32ri),
36158 TmpReg)
36159 .addReg(OverflowAddrReg)
36160 .addImm(Alignment.value() - 1);
36161
36162 BuildMI(
36163 overflowMBB, DL,
36164 TII->get(Subtarget.isTarget64BitLP64() ? X86::AND64ri32 : X86::AND32ri),
36165 OverflowDestReg)
36166 .addReg(TmpReg)
36167 .addImm(~(uint64_t)(Alignment.value() - 1));
36168 } else {
36169 BuildMI(overflowMBB, DL, TII->get(TargetOpcode::COPY), OverflowDestReg)
36170 .addReg(OverflowAddrReg);
36171 }
36172
36173 // Compute the next overflow address after this argument.
36174 // (the overflow address should be kept 8-byte aligned)
36175 Register NextAddrReg = MRI.createVirtualRegister(AddrRegClass);
36176 BuildMI(
36177 overflowMBB, DL,
36178 TII->get(Subtarget.isTarget64BitLP64() ? X86::ADD64ri32 : X86::ADD32ri),
36179 NextAddrReg)
36180 .addReg(OverflowDestReg)
36181 .addImm(ArgSizeA8);
36182
36183 // Store the new overflow address.
36184 BuildMI(overflowMBB, DL,
36185 TII->get(Subtarget.isTarget64BitLP64() ? X86::MOV64mr : X86::MOV32mr))
36186 .add(Base)
36187 .add(Scale)
36188 .add(Index)
36189 .addDisp(Disp, 8)
36190 .add(Segment)
36191 .addReg(NextAddrReg)
36192 .setMemRefs(StoreOnlyMMO);
36193
36194 // If we branched, emit the PHI to the front of endMBB.
36195 if (offsetMBB) {
36196 BuildMI(*endMBB, endMBB->begin(), DL,
36197 TII->get(X86::PHI), DestReg)
36198 .addReg(OffsetDestReg).addMBB(offsetMBB)
36199 .addReg(OverflowDestReg).addMBB(overflowMBB);
36200 }
36201
36202 // Erase the pseudo instruction
36203 MI.eraseFromParent();
36204
36205 return endMBB;
36206}
36207
36208// The EFLAGS operand of SelectItr might be missing a kill marker
36209// because there were multiple uses of EFLAGS, and ISel didn't know
36210// which to mark. Figure out whether SelectItr should have had a
36211// kill marker, and set it if it should. Returns the correct kill
36212// marker value.
36213static bool checkAndUpdateEFLAGSKill(MachineBasicBlock::iterator SelectItr,
36214 MachineBasicBlock* BB,
36215 const TargetRegisterInfo* TRI) {
36216 if (isEFLAGSLiveAfter(SelectItr, BB))
36217 return false;
36218
36219 // We found a def, or hit the end of the basic block and EFLAGS wasn't live
36220 // out. SelectMI should have a kill flag on EFLAGS.
36221 SelectItr->addRegisterKilled(X86::EFLAGS, TRI);
36222 return true;
36223}
36224
36225// Return true if it is OK for this CMOV pseudo-opcode to be cascaded
36226// together with other CMOV pseudo-opcodes into a single basic-block with
36227// a conditional jump around it.
36228static bool isCMOVPseudo(MachineInstr &MI) {
36229 switch (MI.getOpcode()) {
36230 case X86::CMOV_FR16:
36231 case X86::CMOV_FR16X:
36232 case X86::CMOV_FR32:
36233 case X86::CMOV_FR32X:
36234 case X86::CMOV_FR64:
36235 case X86::CMOV_FR64X:
36236 case X86::CMOV_GR8:
36237 case X86::CMOV_GR16:
36238 case X86::CMOV_GR32:
36239 case X86::CMOV_RFP32:
36240 case X86::CMOV_RFP64:
36241 case X86::CMOV_RFP80:
36242 case X86::CMOV_VR64:
36243 case X86::CMOV_VR128:
36244 case X86::CMOV_VR128X:
36245 case X86::CMOV_VR256:
36246 case X86::CMOV_VR256X:
36247 case X86::CMOV_VR512:
36248 case X86::CMOV_VK1:
36249 case X86::CMOV_VK2:
36250 case X86::CMOV_VK4:
36251 case X86::CMOV_VK8:
36252 case X86::CMOV_VK16:
36253 case X86::CMOV_VK32:
36254 case X86::CMOV_VK64:
36255 return true;
36256
36257 default:
36258 return false;
36259 }
36260}
36261
36262// Helper function, which inserts PHI functions into SinkMBB:
36263// %Result(i) = phi [ %FalseValue(i), FalseMBB ], [ %TrueValue(i), TrueMBB ],
36264// where %FalseValue(i) and %TrueValue(i) are taken from the consequent CMOVs
36265// in [MIItBegin, MIItEnd) range. It returns the last MachineInstrBuilder for
36266// the last PHI function inserted.
36267static MachineInstrBuilder createPHIsForCMOVsInSinkBB(
36268 MachineBasicBlock::iterator MIItBegin, MachineBasicBlock::iterator MIItEnd,
36269 MachineBasicBlock *TrueMBB, MachineBasicBlock *FalseMBB,
36270 MachineBasicBlock *SinkMBB) {
36271 MachineFunction *MF = TrueMBB->getParent();
36272 const TargetInstrInfo *TII = MF->getSubtarget().getInstrInfo();
36273 const DebugLoc &DL = MIItBegin->getDebugLoc();
36274
36275 X86::CondCode CC = X86::CondCode(MIItBegin->getOperand(3).getImm());
36276 X86::CondCode OppCC = X86::GetOppositeBranchCondition(CC);
36277
36278 MachineBasicBlock::iterator SinkInsertionPoint = SinkMBB->begin();
36279
36280 // As we are creating the PHIs, we have to be careful if there is more than
36281 // one. Later CMOVs may reference the results of earlier CMOVs, but later
36282 // PHIs have to reference the individual true/false inputs from earlier PHIs.
36283 // That also means that PHI construction must work forward from earlier to
36284 // later, and that the code must maintain a mapping from earlier PHI's
36285 // destination registers, and the registers that went into the PHI.
36286 DenseMap<unsigned, std::pair<unsigned, unsigned>> RegRewriteTable;
36287 MachineInstrBuilder MIB;
36288
36289 for (MachineBasicBlock::iterator MIIt = MIItBegin; MIIt != MIItEnd; ++MIIt) {
36290 Register DestReg = MIIt->getOperand(0).getReg();
36291 Register Op1Reg = MIIt->getOperand(1).getReg();
36292 Register Op2Reg = MIIt->getOperand(2).getReg();
36293
36294 // If this CMOV we are generating is the opposite condition from
36295 // the jump we generated, then we have to swap the operands for the
36296 // PHI that is going to be generated.
36297 if (MIIt->getOperand(3).getImm() == OppCC)
36298 std::swap(Op1Reg, Op2Reg);
36299
36300 if (RegRewriteTable.contains(Op1Reg))
36301 Op1Reg = RegRewriteTable[Op1Reg].first;
36302
36303 if (RegRewriteTable.contains(Op2Reg))
36304 Op2Reg = RegRewriteTable[Op2Reg].second;
36305
36306 MIB = BuildMI(*SinkMBB, SinkInsertionPoint, DL, TII->get(X86::PHI), DestReg)
36307 .addReg(Op1Reg)
36308 .addMBB(FalseMBB)
36309 .addReg(Op2Reg)
36310 .addMBB(TrueMBB);
36311
36312 // Add this PHI to the rewrite table.
36313 RegRewriteTable[DestReg] = std::make_pair(Op1Reg, Op2Reg);
36314 }
36315
36316 return MIB;
36317}
36318
36319// Lower cascaded selects in the form of (SecondCMOV (FirstCMOV F, T, cc1), T, cc2).
36320MachineBasicBlock *
36321X86TargetLowering::EmitLoweredCascadedSelect(MachineInstr &FirstCMOV,
36322 MachineInstr &SecondCascadedCMOV,
36323 MachineBasicBlock *ThisMBB) const {
36324 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
36325 const DebugLoc &DL = FirstCMOV.getDebugLoc();
36326
36327 // We lower cascaded CMOVs such as
36328 //
36329 // (SecondCascadedCMOV (FirstCMOV F, T, cc1), T, cc2)
36330 //
36331 // to two successive branches.
36332 //
36333 // Without this, we would add a PHI between the two jumps, which ends up
36334 // creating a few copies all around. For instance, for
36335 //
36336 // (sitofp (zext (fcmp une)))
36337 //
36338 // we would generate:
36339 //
36340 // ucomiss %xmm1, %xmm0
36341 // movss <1.0f>, %xmm0
36342 // movaps %xmm0, %xmm1
36343 // jne .LBB5_2
36344 // xorps %xmm1, %xmm1
36345 // .LBB5_2:
36346 // jp .LBB5_4
36347 // movaps %xmm1, %xmm0
36348 // .LBB5_4:
36349 // retq
36350 //
36351 // because this custom-inserter would have generated:
36352 //
36353 // A
36354 // | \
36355 // | B
36356 // | /
36357 // C
36358 // | \
36359 // | D
36360 // | /
36361 // E
36362 //
36363 // A: X = ...; Y = ...
36364 // B: empty
36365 // C: Z = PHI [X, A], [Y, B]
36366 // D: empty
36367 // E: PHI [X, C], [Z, D]
36368 //
36369 // If we lower both CMOVs in a single step, we can instead generate:
36370 //
36371 // A
36372 // | \
36373 // | C
36374 // | /|
36375 // |/ |
36376 // | |
36377 // | D
36378 // | /
36379 // E
36380 //
36381 // A: X = ...; Y = ...
36382 // D: empty
36383 // E: PHI [X, A], [X, C], [Y, D]
36384 //
36385 // Which, in our sitofp/fcmp example, gives us something like:
36386 //
36387 // ucomiss %xmm1, %xmm0
36388 // movss <1.0f>, %xmm0
36389 // jne .LBB5_4
36390 // jp .LBB5_4
36391 // xorps %xmm0, %xmm0
36392 // .LBB5_4:
36393 // retq
36394 //
36395
36396 // We lower cascaded CMOV into two successive branches to the same block.
36397 // EFLAGS is used by both, so mark it as live in the second.
36398 const BasicBlock *LLVM_BB = ThisMBB->getBasicBlock();
36399 MachineFunction *F = ThisMBB->getParent();
36400 MachineBasicBlock *FirstInsertedMBB = F->CreateMachineBasicBlock(LLVM_BB);
36401 MachineBasicBlock *SecondInsertedMBB = F->CreateMachineBasicBlock(LLVM_BB);
36402 MachineBasicBlock *SinkMBB = F->CreateMachineBasicBlock(LLVM_BB);
36403
36404 MachineFunction::iterator It = ++ThisMBB->getIterator();
36405 F->insert(It, FirstInsertedMBB);
36406 F->insert(It, SecondInsertedMBB);
36407 F->insert(It, SinkMBB);
36408
36409 // For a cascaded CMOV, we lower it to two successive branches to
36410 // the same block (SinkMBB). EFLAGS is used by both, so mark it as live in
36411 // the FirstInsertedMBB.
36412 FirstInsertedMBB->addLiveIn(X86::EFLAGS);
36413
36414 // If the EFLAGS register isn't dead in the terminator, then claim that it's
36415 // live into the sink and copy blocks.
36416 const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
36417 if (!SecondCascadedCMOV.killsRegister(X86::EFLAGS) &&
36418 !checkAndUpdateEFLAGSKill(SecondCascadedCMOV, ThisMBB, TRI)) {
36419 SecondInsertedMBB->addLiveIn(X86::EFLAGS);
36420 SinkMBB->addLiveIn(X86::EFLAGS);
36421 }
36422
36423 // Transfer the remainder of ThisMBB and its successor edges to SinkMBB.
36424 SinkMBB->splice(SinkMBB->begin(), ThisMBB,
36425 std::next(MachineBasicBlock::iterator(FirstCMOV)),
36426 ThisMBB->end());
36427 SinkMBB->transferSuccessorsAndUpdatePHIs(ThisMBB);
36428
36429 // Fallthrough block for ThisMBB.
36430 ThisMBB->addSuccessor(FirstInsertedMBB);
36431 // The true block target of the first branch is always SinkMBB.
36432 ThisMBB->addSuccessor(SinkMBB);
36433 // Fallthrough block for FirstInsertedMBB.
36434 FirstInsertedMBB->addSuccessor(SecondInsertedMBB);
36435 // The true block for the branch of FirstInsertedMBB.
36436 FirstInsertedMBB->addSuccessor(SinkMBB);
36437 // This is fallthrough.
36438 SecondInsertedMBB->addSuccessor(SinkMBB);
36439
36440 // Create the conditional branch instructions.
36441 X86::CondCode FirstCC = X86::CondCode(FirstCMOV.getOperand(3).getImm());
36442 BuildMI(ThisMBB, DL, TII->get(X86::JCC_1)).addMBB(SinkMBB).addImm(FirstCC);
36443
36444 X86::CondCode SecondCC =
36445 X86::CondCode(SecondCascadedCMOV.getOperand(3).getImm());
36446 BuildMI(FirstInsertedMBB, DL, TII->get(X86::JCC_1)).addMBB(SinkMBB).addImm(SecondCC);
36447
36448 // SinkMBB:
36449 // %Result = phi [ %FalseValue, SecondInsertedMBB ], [ %TrueValue, ThisMBB ]
36450 Register DestReg = SecondCascadedCMOV.getOperand(0).getReg();
36451 Register Op1Reg = FirstCMOV.getOperand(1).getReg();
36452 Register Op2Reg = FirstCMOV.getOperand(2).getReg();
36453 MachineInstrBuilder MIB =
36454 BuildMI(*SinkMBB, SinkMBB->begin(), DL, TII->get(X86::PHI), DestReg)
36455 .addReg(Op1Reg)
36456 .addMBB(SecondInsertedMBB)
36457 .addReg(Op2Reg)
36458 .addMBB(ThisMBB);
36459
36460   // The edge from FirstInsertedMBB provides the same incoming value as the
36461   // edge from ThisMBB (the True operand of the SELECT_CC/CMOV nodes).
36462 MIB.addReg(FirstCMOV.getOperand(2).getReg()).addMBB(FirstInsertedMBB);
36463
36464 // Now remove the CMOVs.
36465 FirstCMOV.eraseFromParent();
36466 SecondCascadedCMOV.eraseFromParent();
36467
36468 return SinkMBB;
36469}
36470
36471MachineBasicBlock *
36472X86TargetLowering::EmitLoweredSelect(MachineInstr &MI,
36473 MachineBasicBlock *ThisMBB) const {
36474 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
36475 const DebugLoc &DL = MI.getDebugLoc();
36476
36477 // To "insert" a SELECT_CC instruction, we actually have to insert the
36478 // diamond control-flow pattern. The incoming instruction knows the
36479 // destination vreg to set, the condition code register to branch on, the
36480 // true/false values to select between and a branch opcode to use.
36481
36482 // ThisMBB:
36483 // ...
36484 // TrueVal = ...
36485 // cmpTY ccX, r1, r2
36486 // bCC copy1MBB
36487 // fallthrough --> FalseMBB
36488
36489 // This code lowers all pseudo-CMOV instructions. Generally it lowers these
36490 // as described above, by inserting a BB, and then making a PHI at the join
36491 // point to select the true and false operands of the CMOV in the PHI.
36492 //
36493 // The code also handles two different cases of multiple CMOV opcodes
36494 // in a row.
36495 //
36496 // Case 1:
36497 // In this case, there are multiple CMOVs in a row, all which are based on
36498 // the same condition setting (or the exact opposite condition setting).
36499 // In this case we can lower all the CMOVs using a single inserted BB, and
36500 // then make a number of PHIs at the join point to model the CMOVs. The only
36501   // trickiness here is that in a case like:
36502 //
36503 // t2 = CMOV cond1 t1, f1
36504 // t3 = CMOV cond1 t2, f2
36505 //
36506 // when rewriting this into PHIs, we have to perform some renaming on the
36507 // temps since you cannot have a PHI operand refer to a PHI result earlier
36508 // in the same block. The "simple" but wrong lowering would be:
36509 //
36510 // t2 = PHI t1(BB1), f1(BB2)
36511 // t3 = PHI t2(BB1), f2(BB2)
36512 //
36513 // but clearly t2 is not defined in BB1, so that is incorrect. The proper
36514 // renaming is to note that on the path through BB1, t2 is really just a
36515 // copy of t1, and do that renaming, properly generating:
36516 //
36517 // t2 = PHI t1(BB1), f1(BB2)
36518 // t3 = PHI t1(BB1), f2(BB2)
36519 //
36520 // Case 2:
36521 // CMOV ((CMOV F, T, cc1), T, cc2) is checked here and handled by a separate
36522 // function - EmitLoweredCascadedSelect.
36523
36524 X86::CondCode CC = X86::CondCode(MI.getOperand(3).getImm());
36525 X86::CondCode OppCC = X86::GetOppositeBranchCondition(CC);
36526 MachineInstr *LastCMOV = &MI;
36527 MachineBasicBlock::iterator NextMIIt = MachineBasicBlock::iterator(MI);
36528
36529 // Check for case 1, where there are multiple CMOVs with the same condition
36530 // first. Of the two cases of multiple CMOV lowerings, case 1 reduces the
36531 // number of jumps the most.
36532
36533 if (isCMOVPseudo(MI)) {
36534 // See if we have a string of CMOVS with the same condition. Skip over
36535 // intervening debug insts.
36536 while (NextMIIt != ThisMBB->end() && isCMOVPseudo(*NextMIIt) &&
36537 (NextMIIt->getOperand(3).getImm() == CC ||
36538 NextMIIt->getOperand(3).getImm() == OppCC)) {
36539 LastCMOV = &*NextMIIt;
36540 NextMIIt = next_nodbg(NextMIIt, ThisMBB->end());
36541 }
36542 }
36543
36544   // This checks for case 2, but only if we didn't already find case 1, as
36545   // indicated by LastCMOV still pointing at MI.
36546 if (LastCMOV == &MI && NextMIIt != ThisMBB->end() &&
36547 NextMIIt->getOpcode() == MI.getOpcode() &&
36548 NextMIIt->getOperand(2).getReg() == MI.getOperand(2).getReg() &&
36549 NextMIIt->getOperand(1).getReg() == MI.getOperand(0).getReg() &&
36550 NextMIIt->getOperand(1).isKill()) {
36551 return EmitLoweredCascadedSelect(MI, *NextMIIt, ThisMBB);
36552 }
36553
36554 const BasicBlock *LLVM_BB = ThisMBB->getBasicBlock();
36555 MachineFunction *F = ThisMBB->getParent();
36556 MachineBasicBlock *FalseMBB = F->CreateMachineBasicBlock(LLVM_BB);
36557 MachineBasicBlock *SinkMBB = F->CreateMachineBasicBlock(LLVM_BB);
36558
36559 MachineFunction::iterator It = ++ThisMBB->getIterator();
36560 F->insert(It, FalseMBB);
36561 F->insert(It, SinkMBB);
36562
36563 // If the EFLAGS register isn't dead in the terminator, then claim that it's
36564 // live into the sink and copy blocks.
36565 const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
36566 if (!LastCMOV->killsRegister(X86::EFLAGS) &&
36567 !checkAndUpdateEFLAGSKill(LastCMOV, ThisMBB, TRI)) {
36568 FalseMBB->addLiveIn(X86::EFLAGS);
36569 SinkMBB->addLiveIn(X86::EFLAGS);
36570 }
36571
36572 // Transfer any debug instructions inside the CMOV sequence to the sunk block.
36573 auto DbgRange = llvm::make_range(MachineBasicBlock::iterator(MI),
36574 MachineBasicBlock::iterator(LastCMOV));
36575 for (MachineInstr &MI : llvm::make_early_inc_range(DbgRange))
36576 if (MI.isDebugInstr())
36577 SinkMBB->push_back(MI.removeFromParent());
36578
36579 // Transfer the remainder of ThisMBB and its successor edges to SinkMBB.
36580 SinkMBB->splice(SinkMBB->end(), ThisMBB,
36581 std::next(MachineBasicBlock::iterator(LastCMOV)),
36582 ThisMBB->end());
36583 SinkMBB->transferSuccessorsAndUpdatePHIs(ThisMBB);
36584
36585 // Fallthrough block for ThisMBB.
36586 ThisMBB->addSuccessor(FalseMBB);
36587   // The true block target of the first (or only) branch is always SinkMBB.
36588 ThisMBB->addSuccessor(SinkMBB);
36589 // Fallthrough block for FalseMBB.
36590 FalseMBB->addSuccessor(SinkMBB);
36591
36592 // Create the conditional branch instruction.
36593 BuildMI(ThisMBB, DL, TII->get(X86::JCC_1)).addMBB(SinkMBB).addImm(CC);
36594
36595 // SinkMBB:
36596 // %Result = phi [ %FalseValue, FalseMBB ], [ %TrueValue, ThisMBB ]
36597 // ...
36598 MachineBasicBlock::iterator MIItBegin = MachineBasicBlock::iterator(MI);
36599 MachineBasicBlock::iterator MIItEnd =
36600 std::next(MachineBasicBlock::iterator(LastCMOV));
36601 createPHIsForCMOVsInSinkBB(MIItBegin, MIItEnd, ThisMBB, FalseMBB, SinkMBB);
36602
36603 // Now remove the CMOV(s).
36604 ThisMBB->erase(MIItBegin, MIItEnd);
36605
36606 return SinkMBB;
36607}
36608
36609static unsigned getSUBriOpcode(bool IsLP64, int64_t Imm) {
36610 if (IsLP64) {
36611 if (isInt<8>(Imm))
36612 return X86::SUB64ri8;
36613 return X86::SUB64ri32;
36614 } else {
36615 if (isInt<8>(Imm))
36616 return X86::SUB32ri8;
36617 return X86::SUB32ri;
36618 }
36619}
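// For example, getSUBriOpcode(true, 64) yields X86::SUB64ri8 because 64 fits in
// a signed 8-bit immediate, while getSUBriOpcode(true, 4096) yields
// X86::SUB64ri32.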
36620
36621MachineBasicBlock *
36622X86TargetLowering::EmitLoweredProbedAlloca(MachineInstr &MI,
36623 MachineBasicBlock *MBB) const {
36624 MachineFunction *MF = MBB->getParent();
36625 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
36626 const X86FrameLowering &TFI = *Subtarget.getFrameLowering();
36627 const DebugLoc &DL = MI.getDebugLoc();
36628 const BasicBlock *LLVM_BB = MBB->getBasicBlock();
36629
36630 const unsigned ProbeSize = getStackProbeSize(*MF);
36631
36632 MachineRegisterInfo &MRI = MF->getRegInfo();
36633 MachineBasicBlock *testMBB = MF->CreateMachineBasicBlock(LLVM_BB);
36634 MachineBasicBlock *tailMBB = MF->CreateMachineBasicBlock(LLVM_BB);
36635 MachineBasicBlock *blockMBB = MF->CreateMachineBasicBlock(LLVM_BB);
36636
36637 MachineFunction::iterator MBBIter = ++MBB->getIterator();
36638 MF->insert(MBBIter, testMBB);
36639 MF->insert(MBBIter, blockMBB);
36640 MF->insert(MBBIter, tailMBB);
36641
36642 Register sizeVReg = MI.getOperand(1).getReg();
36643
36644 Register physSPReg = TFI.Uses64BitFramePtr ? X86::RSP : X86::ESP;
36645
36646 Register TmpStackPtr = MRI.createVirtualRegister(
36647 TFI.Uses64BitFramePtr ? &X86::GR64RegClass : &X86::GR32RegClass);
36648 Register FinalStackPtr = MRI.createVirtualRegister(
36649 TFI.Uses64BitFramePtr ? &X86::GR64RegClass : &X86::GR32RegClass);
36650
36651 BuildMI(*MBB, {MI}, DL, TII->get(TargetOpcode::COPY), TmpStackPtr)
36652 .addReg(physSPReg);
36653 {
36654 const unsigned Opc = TFI.Uses64BitFramePtr ? X86::SUB64rr : X86::SUB32rr;
36655 BuildMI(*MBB, {MI}, DL, TII->get(Opc), FinalStackPtr)
36656 .addReg(TmpStackPtr)
36657 .addReg(sizeVReg);
36658 }
36659
36660 // test rsp size
36661
36662 BuildMI(testMBB, DL,
36663 TII->get(TFI.Uses64BitFramePtr ? X86::CMP64rr : X86::CMP32rr))
36664 .addReg(FinalStackPtr)
36665 .addReg(physSPReg);
36666
36667 BuildMI(testMBB, DL, TII->get(X86::JCC_1))
36668 .addMBB(tailMBB)
36669 .addImm(X86::COND_GE);
36670 testMBB->addSuccessor(blockMBB);
36671 testMBB->addSuccessor(tailMBB);
36672
36673   // Touch the block, then extend it. This is done in the opposite order from a
36674   // static probe, where we allocate and then touch, to avoid having to probe
36675   // the tail of the static alloca. Possible scenarios are:
36676 //
36677 // + ---- <- ------------ <- ------------- <- ------------ +
36678 // | |
36679 // [free probe] -> [page alloc] -> [alloc probe] -> [tail alloc] + -> [dyn probe] -> [page alloc] -> [dyn probe] -> [tail alloc] +
36680 // | |
36681 // + <- ----------- <- ------------ <- ----------- <- ------------ +
36682 //
36683 // The property we want to enforce is to never have more than [page alloc] between two probes.
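  // For example, assuming ProbeSize = 4096, a 10000-byte request runs blockMBB
  // three times (touch the stack, drop the stack pointer by 4096, repeat until
  // 12288 >= 10000), so no page is allocated without being touched.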
36684
36685 const unsigned XORMIOpc =
36686 TFI.Uses64BitFramePtr ? X86::XOR64mi8 : X86::XOR32mi8;
36687 addRegOffset(BuildMI(blockMBB, DL, TII->get(XORMIOpc)), physSPReg, false, 0)
36688 .addImm(0);
36689
36690 BuildMI(blockMBB, DL,
36691 TII->get(getSUBriOpcode(TFI.Uses64BitFramePtr, ProbeSize)), physSPReg)
36692 .addReg(physSPReg)
36693 .addImm(ProbeSize);
36694
36695
36696 BuildMI(blockMBB, DL, TII->get(X86::JMP_1)).addMBB(testMBB);
36697 blockMBB->addSuccessor(testMBB);
36698
36699 // Replace original instruction by the expected stack ptr
36700 BuildMI(tailMBB, DL, TII->get(TargetOpcode::COPY), MI.getOperand(0).getReg())
36701 .addReg(FinalStackPtr);
36702
36703 tailMBB->splice(tailMBB->end(), MBB,
36704 std::next(MachineBasicBlock::iterator(MI)), MBB->end());
36705 tailMBB->transferSuccessorsAndUpdatePHIs(MBB);
36706 MBB->addSuccessor(testMBB);
36707
36708 // Delete the original pseudo instruction.
36709 MI.eraseFromParent();
36710
36711 // And we're done.
36712 return tailMBB;
36713}
36714
36715MachineBasicBlock *
36716X86TargetLowering::EmitLoweredSegAlloca(MachineInstr &MI,
36717 MachineBasicBlock *BB) const {
36718 MachineFunction *MF = BB->getParent();
36719 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
36720 const DebugLoc &DL = MI.getDebugLoc();
36721 const BasicBlock *LLVM_BB = BB->getBasicBlock();
36722
36723   assert(MF->shouldSplitStack());
36724
36725 const bool Is64Bit = Subtarget.is64Bit();
36726 const bool IsLP64 = Subtarget.isTarget64BitLP64();
36727
36728 const unsigned TlsReg = Is64Bit ? X86::FS : X86::GS;
36729 const unsigned TlsOffset = IsLP64 ? 0x70 : Is64Bit ? 0x40 : 0x30;
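  // With these values the stack-limit compare below reads, e.g., %fs:0x70 on
  // LP64 targets and %gs:0x30 on 32-bit targets.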
36730
36731 // BB:
36732 // ... [Till the alloca]
36733 // If stacklet is not large enough, jump to mallocMBB
36734 //
36735 // bumpMBB:
36736 // Allocate by subtracting from RSP
36737 // Jump to continueMBB
36738 //
36739 // mallocMBB:
36740 // Allocate by call to runtime
36741 //
36742 // continueMBB:
36743 // ...
36744 // [rest of original BB]
36745 //
36746
36747 MachineBasicBlock *mallocMBB = MF->CreateMachineBasicBlock(LLVM_BB);
36748 MachineBasicBlock *bumpMBB = MF->CreateMachineBasicBlock(LLVM_BB);
36749 MachineBasicBlock *continueMBB = MF->CreateMachineBasicBlock(LLVM_BB);
36750
36751 MachineRegisterInfo &MRI = MF->getRegInfo();
36752 const TargetRegisterClass *AddrRegClass =
36753 getRegClassFor(getPointerTy(MF->getDataLayout()));
36754
36755 Register mallocPtrVReg = MRI.createVirtualRegister(AddrRegClass),
36756 bumpSPPtrVReg = MRI.createVirtualRegister(AddrRegClass),
36757 tmpSPVReg = MRI.createVirtualRegister(AddrRegClass),
36758 SPLimitVReg = MRI.createVirtualRegister(AddrRegClass),
36759 sizeVReg = MI.getOperand(1).getReg(),
36760 physSPReg =
36761 IsLP64 || Subtarget.isTargetNaCl64() ? X86::RSP : X86::ESP;
36762
36763 MachineFunction::iterator MBBIter = ++BB->getIterator();
36764
36765 MF->insert(MBBIter, bumpMBB);
36766 MF->insert(MBBIter, mallocMBB);
36767 MF->insert(MBBIter, continueMBB);
36768
36769 continueMBB->splice(continueMBB->begin(), BB,
36770 std::next(MachineBasicBlock::iterator(MI)), BB->end());
36771 continueMBB->transferSuccessorsAndUpdatePHIs(BB);
36772
36773 // Add code to the main basic block to check if the stack limit has been hit,
36774 // and if so, jump to mallocMBB otherwise to bumpMBB.
36775 BuildMI(BB, DL, TII->get(TargetOpcode::COPY), tmpSPVReg).addReg(physSPReg);
36776 BuildMI(BB, DL, TII->get(IsLP64 ? X86::SUB64rr:X86::SUB32rr), SPLimitVReg)
36777 .addReg(tmpSPVReg).addReg(sizeVReg);
36778 BuildMI(BB, DL, TII->get(IsLP64 ? X86::CMP64mr:X86::CMP32mr))
36779 .addReg(0).addImm(1).addReg(0).addImm(TlsOffset).addReg(TlsReg)
36780 .addReg(SPLimitVReg);
36781 BuildMI(BB, DL, TII->get(X86::JCC_1)).addMBB(mallocMBB).addImm(X86::COND_G);
36782
36783 // bumpMBB simply decreases the stack pointer, since we know the current
36784 // stacklet has enough space.
36785 BuildMI(bumpMBB, DL, TII->get(TargetOpcode::COPY), physSPReg)
36786 .addReg(SPLimitVReg);
36787 BuildMI(bumpMBB, DL, TII->get(TargetOpcode::COPY), bumpSPPtrVReg)
36788 .addReg(SPLimitVReg);
36789 BuildMI(bumpMBB, DL, TII->get(X86::JMP_1)).addMBB(continueMBB);
36790
36791 // Calls into a routine in libgcc to allocate more space from the heap.
36792 const uint32_t *RegMask =
36793 Subtarget.getRegisterInfo()->getCallPreservedMask(*MF, CallingConv::C);
36794 if (IsLP64) {
36795 BuildMI(mallocMBB, DL, TII->get(X86::MOV64rr), X86::RDI)
36796 .addReg(sizeVReg);
36797 BuildMI(mallocMBB, DL, TII->get(X86::CALL64pcrel32))
36798 .addExternalSymbol("__morestack_allocate_stack_space")
36799 .addRegMask(RegMask)
36800 .addReg(X86::RDI, RegState::Implicit)
36801 .addReg(X86::RAX, RegState::ImplicitDefine);
36802 } else if (Is64Bit) {
36803 BuildMI(mallocMBB, DL, TII->get(X86::MOV32rr), X86::EDI)
36804 .addReg(sizeVReg);
36805 BuildMI(mallocMBB, DL, TII->get(X86::CALL64pcrel32))
36806 .addExternalSymbol("__morestack_allocate_stack_space")
36807 .addRegMask(RegMask)
36808 .addReg(X86::EDI, RegState::Implicit)
36809 .addReg(X86::EAX, RegState::ImplicitDefine);
36810 } else {
36811 BuildMI(mallocMBB, DL, TII->get(X86::SUB32ri), physSPReg).addReg(physSPReg)
36812 .addImm(12);
36813 BuildMI(mallocMBB, DL, TII->get(X86::PUSH32r)).addReg(sizeVReg);
36814 BuildMI(mallocMBB, DL, TII->get(X86::CALLpcrel32))
36815 .addExternalSymbol("__morestack_allocate_stack_space")
36816 .addRegMask(RegMask)
36817 .addReg(X86::EAX, RegState::ImplicitDefine);
36818 }
36819
36820 if (!Is64Bit)
36821 BuildMI(mallocMBB, DL, TII->get(X86::ADD32ri), physSPReg).addReg(physSPReg)
36822 .addImm(16);
36823
36824 BuildMI(mallocMBB, DL, TII->get(TargetOpcode::COPY), mallocPtrVReg)
36825 .addReg(IsLP64 ? X86::RAX : X86::EAX);
36826 BuildMI(mallocMBB, DL, TII->get(X86::JMP_1)).addMBB(continueMBB);
36827
36828 // Set up the CFG correctly.
36829 BB->addSuccessor(bumpMBB);
36830 BB->addSuccessor(mallocMBB);
36831 mallocMBB->addSuccessor(continueMBB);
36832 bumpMBB->addSuccessor(continueMBB);
36833
36834 // Take care of the PHI nodes.
36835 BuildMI(*continueMBB, continueMBB->begin(), DL, TII->get(X86::PHI),
36836 MI.getOperand(0).getReg())
36837 .addReg(mallocPtrVReg)
36838 .addMBB(mallocMBB)
36839 .addReg(bumpSPPtrVReg)
36840 .addMBB(bumpMBB);
36841
36842 // Delete the original pseudo instruction.
36843 MI.eraseFromParent();
36844
36845 // And we're done.
36846 return continueMBB;
36847}
36848
36849MachineBasicBlock *
36850X86TargetLowering::EmitLoweredCatchRet(MachineInstr &MI,
36851 MachineBasicBlock *BB) const {
36852 MachineFunction *MF = BB->getParent();
36853 const TargetInstrInfo &TII = *Subtarget.getInstrInfo();
36854 MachineBasicBlock *TargetMBB = MI.getOperand(0).getMBB();
36855 const DebugLoc &DL = MI.getDebugLoc();
36856
36857   assert(!isAsynchronousEHPersonality(
36858              classifyEHPersonality(MF->getFunction().getPersonalityFn())) &&
36859          "SEH does not use catchret!");
36860
36861 // Only 32-bit EH needs to worry about manually restoring stack pointers.
36862 if (!Subtarget.is32Bit())
36863 return BB;
36864
36865 // C++ EH creates a new target block to hold the restore code, and wires up
36866 // the new block to the return destination with a normal JMP_4.
36867 MachineBasicBlock *RestoreMBB =
36868 MF->CreateMachineBasicBlock(BB->getBasicBlock());
36869   assert(BB->succ_size() == 1);
36870 MF->insert(std::next(BB->getIterator()), RestoreMBB);
36871 RestoreMBB->transferSuccessorsAndUpdatePHIs(BB);
36872 BB->addSuccessor(RestoreMBB);
36873 MI.getOperand(0).setMBB(RestoreMBB);
36874
36875 // Marking this as an EH pad but not a funclet entry block causes PEI to
36876 // restore stack pointers in the block.
36877 RestoreMBB->setIsEHPad(true);
36878
36879 auto RestoreMBBI = RestoreMBB->begin();
36880 BuildMI(*RestoreMBB, RestoreMBBI, DL, TII.get(X86::JMP_4)).addMBB(TargetMBB);
36881 return BB;
36882}
36883
36884MachineBasicBlock *
36885X86TargetLowering::EmitLoweredTLSAddr(MachineInstr &MI,
36886 MachineBasicBlock *BB) const {
36887 // So, here we replace TLSADDR with the sequence:
36888 // adjust_stackdown -> TLSADDR -> adjust_stackup.
36889 // We need this because TLSADDR is lowered into calls
36890 // inside MC, therefore without the two markers shrink-wrapping
36891   // may push the prologue/epilogue past them.
36892 const TargetInstrInfo &TII = *Subtarget.getInstrInfo();
36893 const DebugLoc &DL = MI.getDebugLoc();
36894 MachineFunction &MF = *BB->getParent();
36895
36896 // Emit CALLSEQ_START right before the instruction.
36897 unsigned AdjStackDown = TII.getCallFrameSetupOpcode();
36898 MachineInstrBuilder CallseqStart =
36899 BuildMI(MF, DL, TII.get(AdjStackDown)).addImm(0).addImm(0).addImm(0);
36900 BB->insert(MachineBasicBlock::iterator(MI), CallseqStart);
36901
36902 // Emit CALLSEQ_END right after the instruction.
36903 // We don't call erase from parent because we want to keep the
36904 // original instruction around.
36905 unsigned AdjStackUp = TII.getCallFrameDestroyOpcode();
36906 MachineInstrBuilder CallseqEnd =
36907 BuildMI(MF, DL, TII.get(AdjStackUp)).addImm(0).addImm(0);
36908 BB->insertAfter(MachineBasicBlock::iterator(MI), CallseqEnd);
36909
36910 return BB;
36911}
36912
36913MachineBasicBlock *
36914X86TargetLowering::EmitLoweredTLSCall(MachineInstr &MI,
36915 MachineBasicBlock *BB) const {
36916 // This is pretty easy. We're taking the value that we received from
36917 // our load from the relocation, sticking it in either RDI (x86-64)
36918 // or EAX and doing an indirect call. The return value will then
36919 // be in the normal return register.
36920 MachineFunction *F = BB->getParent();
36921 const X86InstrInfo *TII = Subtarget.getInstrInfo();
36922 const DebugLoc &DL = MI.getDebugLoc();
36923
36924   assert(Subtarget.isTargetDarwin() && "Darwin only instr emitted?");
36925   assert(MI.getOperand(3).isGlobal() && "This should be a global");
36926
36927 // Get a register mask for the lowered call.
36928 // FIXME: The 32-bit calls have non-standard calling conventions. Use a
36929 // proper register mask.
36930 const uint32_t *RegMask =
36931 Subtarget.is64Bit() ?
36932 Subtarget.getRegisterInfo()->getDarwinTLSCallPreservedMask() :
36933 Subtarget.getRegisterInfo()->getCallPreservedMask(*F, CallingConv::C);
36934 if (Subtarget.is64Bit()) {
36935 MachineInstrBuilder MIB =
36936 BuildMI(*BB, MI, DL, TII->get(X86::MOV64rm), X86::RDI)
36937 .addReg(X86::RIP)
36938 .addImm(0)
36939 .addReg(0)
36940 .addGlobalAddress(MI.getOperand(3).getGlobal(), 0,
36941 MI.getOperand(3).getTargetFlags())
36942 .addReg(0);
36943 MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL64m));
36944 addDirectMem(MIB, X86::RDI);
36945 MIB.addReg(X86::RAX, RegState::ImplicitDefine).addRegMask(RegMask);
36946 } else if (!isPositionIndependent()) {
36947 MachineInstrBuilder MIB =
36948 BuildMI(*BB, MI, DL, TII->get(X86::MOV32rm), X86::EAX)
36949 .addReg(0)
36950 .addImm(0)
36951 .addReg(0)
36952 .addGlobalAddress(MI.getOperand(3).getGlobal(), 0,
36953 MI.getOperand(3).getTargetFlags())
36954 .addReg(0);
36955 MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL32m));
36956 addDirectMem(MIB, X86::EAX);
36957 MIB.addReg(X86::EAX, RegState::ImplicitDefine).addRegMask(RegMask);
36958 } else {
36959 MachineInstrBuilder MIB =
36960 BuildMI(*BB, MI, DL, TII->get(X86::MOV32rm), X86::EAX)
36961 .addReg(TII->getGlobalBaseReg(F))
36962 .addImm(0)
36963 .addReg(0)
36964 .addGlobalAddress(MI.getOperand(3).getGlobal(), 0,
36965 MI.getOperand(3).getTargetFlags())
36966 .addReg(0);
36967 MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL32m));
36968 addDirectMem(MIB, X86::EAX);
36969 MIB.addReg(X86::EAX, RegState::ImplicitDefine).addRegMask(RegMask);
36970 }
36971
36972 MI.eraseFromParent(); // The pseudo instruction is gone now.
36973 return BB;
36974}
36975
36976static unsigned getOpcodeForIndirectThunk(unsigned RPOpc) {
36977 switch (RPOpc) {
36978 case X86::INDIRECT_THUNK_CALL32:
36979 return X86::CALLpcrel32;
36980 case X86::INDIRECT_THUNK_CALL64:
36981 return X86::CALL64pcrel32;
36982 case X86::INDIRECT_THUNK_TCRETURN32:
36983 return X86::TCRETURNdi;
36984 case X86::INDIRECT_THUNK_TCRETURN64:
36985 return X86::TCRETURNdi64;
36986 }
36987   llvm_unreachable("not indirect thunk opcode");
36988}
36989
36990static const char *getIndirectThunkSymbol(const X86Subtarget &Subtarget,
36991 unsigned Reg) {
36992 if (Subtarget.useRetpolineExternalThunk()) {
36993 // When using an external thunk for retpolines, we pick names that match the
36994 // names GCC happens to use as well. This helps simplify the implementation
36995 // of the thunks for kernels where they have no easy ability to create
36996 // aliases and are doing non-trivial configuration of the thunk's body. For
36997 // example, the Linux kernel will do boot-time hot patching of the thunk
36998 // bodies and cannot easily export aliases of these to loaded modules.
36999 //
37000 // Note that at any point in the future, we may need to change the semantics
37001 // of how we implement retpolines and at that time will likely change the
37002 // name of the called thunk. Essentially, there is no hard guarantee that
37003 // LLVM will generate calls to specific thunks, we merely make a best-effort
37004 // attempt to help out kernels and other systems where duplicating the
37005 // thunks is costly.
37006 switch (Reg) {
37007 case X86::EAX:
37008       assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
37009 return "__x86_indirect_thunk_eax";
37010 case X86::ECX:
37011       assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
37012 return "__x86_indirect_thunk_ecx";
37013 case X86::EDX:
37014       assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
37015 return "__x86_indirect_thunk_edx";
37016 case X86::EDI:
37017       assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
37018 return "__x86_indirect_thunk_edi";
37019 case X86::R11:
37020       assert(Subtarget.is64Bit() && "Should not be using a 64-bit thunk!");
37021 return "__x86_indirect_thunk_r11";
37022 }
37023     llvm_unreachable("unexpected reg for external indirect thunk");
37024 }
37025
37026 if (Subtarget.useRetpolineIndirectCalls() ||
37027 Subtarget.useRetpolineIndirectBranches()) {
37028 // When targeting an internal COMDAT thunk use an LLVM-specific name.
37029 switch (Reg) {
37030 case X86::EAX:
37031       assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
37032 return "__llvm_retpoline_eax";
37033 case X86::ECX:
37034       assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
37035 return "__llvm_retpoline_ecx";
37036 case X86::EDX:
37037       assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
37038 return "__llvm_retpoline_edx";
37039 case X86::EDI:
37040       assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
37041 return "__llvm_retpoline_edi";
37042 case X86::R11:
37043       assert(Subtarget.is64Bit() && "Should not be using a 64-bit thunk!");
37044 return "__llvm_retpoline_r11";
37045 }
37046     llvm_unreachable("unexpected reg for retpoline");
37047 }
37048
37049 if (Subtarget.useLVIControlFlowIntegrity()) {
37050     assert(Subtarget.is64Bit() && "Should not be using a 64-bit thunk!");
37051 return "__llvm_lvi_thunk_r11";
37052 }
37053   llvm_unreachable("getIndirectThunkSymbol() invoked without thunk feature");
37054}
37055
37056MachineBasicBlock *
37057X86TargetLowering::EmitLoweredIndirectThunk(MachineInstr &MI,
37058 MachineBasicBlock *BB) const {
37059 // Copy the virtual register into the R11 physical register and
37060 // call the retpoline thunk.
37061 const DebugLoc &DL = MI.getDebugLoc();
37062 const X86InstrInfo *TII = Subtarget.getInstrInfo();
37063 Register CalleeVReg = MI.getOperand(0).getReg();
37064 unsigned Opc = getOpcodeForIndirectThunk(MI.getOpcode());
37065
37066 // Find an available scratch register to hold the callee. On 64-bit, we can
37067 // just use R11, but we scan for uses anyway to ensure we don't generate
37068 // incorrect code. On 32-bit, we use one of EAX, ECX, or EDX that isn't
37069 // already a register use operand to the call to hold the callee. If none
37070 // are available, use EDI instead. EDI is chosen because EBX is the PIC base
37071 // register and ESI is the base pointer to realigned stack frames with VLAs.
37072 SmallVector<unsigned, 3> AvailableRegs;
37073 if (Subtarget.is64Bit())
37074 AvailableRegs.push_back(X86::R11);
37075 else
37076 AvailableRegs.append({X86::EAX, X86::ECX, X86::EDX, X86::EDI});
37077
37078 // Zero out any registers that are already used.
37079 for (const auto &MO : MI.operands()) {
37080 if (MO.isReg() && MO.isUse())
37081 for (unsigned &Reg : AvailableRegs)
37082 if (Reg == MO.getReg())
37083 Reg = 0;
37084 }
37085
37086 // Choose the first remaining non-zero available register.
37087 unsigned AvailableReg = 0;
37088 for (unsigned MaybeReg : AvailableRegs) {
37089 if (MaybeReg) {
37090 AvailableReg = MaybeReg;
37091 break;
37092 }
37093 }
37094 if (!AvailableReg)
37095 report_fatal_error("calling convention incompatible with retpoline, no "
37096 "available registers");
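  // For example, a 32-bit indirect call that already uses ECX as an argument
  // has its ECX slot zeroed above, so EAX (or EDX/EDI) is selected instead.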
37097
37098 const char *Symbol = getIndirectThunkSymbol(Subtarget, AvailableReg);
37099
37100 BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), AvailableReg)
37101 .addReg(CalleeVReg);
37102 MI.getOperand(0).ChangeToES(Symbol);
37103 MI.setDesc(TII->get(Opc));
37104 MachineInstrBuilder(*BB->getParent(), &MI)
37105 .addReg(AvailableReg, RegState::Implicit | RegState::Kill);
37106 return BB;
37107}
37108
37109/// SetJmp implies future control flow change upon calling the corresponding
37110/// LongJmp.
37111/// Instead of using the 'return' instruction, the long jump fixes the stack and
37112/// performs an indirect branch. To do so it uses the registers that were stored
37113/// in the jump buffer (when calling SetJmp).
37114/// In case the shadow stack is enabled we need to fix it as well, because some
37115/// return addresses will be skipped.
37116/// The function will save the SSP for future fixing in the function
37117/// emitLongJmpShadowStackFix.
37118/// \sa emitLongJmpShadowStackFix
37119/// \param [in] MI The temporary Machine Instruction for the builtin.
37120/// \param [in] MBB The Machine Basic Block that will be modified.
37121void X86TargetLowering::emitSetJmpShadowStackFix(MachineInstr &MI,
37122 MachineBasicBlock *MBB) const {
37123 const DebugLoc &DL = MI.getDebugLoc();
37124 MachineFunction *MF = MBB->getParent();
37125 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
37126 MachineRegisterInfo &MRI = MF->getRegInfo();
37127 MachineInstrBuilder MIB;
37128
37129 // Memory Reference.
37130 SmallVector<MachineMemOperand *, 2> MMOs(MI.memoperands_begin(),
37131 MI.memoperands_end());
37132
37133 // Initialize a register with zero.
37134 MVT PVT = getPointerTy(MF->getDataLayout());
37135 const TargetRegisterClass *PtrRC = getRegClassFor(PVT);
37136 Register ZReg = MRI.createVirtualRegister(PtrRC);
37137 unsigned XorRROpc = (PVT == MVT::i64) ? X86::XOR64rr : X86::XOR32rr;
37138 BuildMI(*MBB, MI, DL, TII->get(XorRROpc))
37139 .addDef(ZReg)
37140 .addReg(ZReg, RegState::Undef)
37141 .addReg(ZReg, RegState::Undef);
37142
37143 // Read the current SSP Register value to the zeroed register.
37144 Register SSPCopyReg = MRI.createVirtualRegister(PtrRC);
37145 unsigned RdsspOpc = (PVT == MVT::i64) ? X86::RDSSPQ : X86::RDSSPD;
37146 BuildMI(*MBB, MI, DL, TII->get(RdsspOpc), SSPCopyReg).addReg(ZReg);
37147
37148 // Write the SSP register value to offset 3 in input memory buffer.
37149 unsigned PtrStoreOpc = (PVT == MVT::i64) ? X86::MOV64mr : X86::MOV32mr;
37150 MIB = BuildMI(*MBB, MI, DL, TII->get(PtrStoreOpc));
37151 const int64_t SSPOffset = 3 * PVT.getStoreSize();
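  // With PVT == MVT::i64 this stores the SSP at buf + 24, and with MVT::i32 at
  // buf + 12 (slot 3 of the setjmp buffer either way).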
37152 const unsigned MemOpndSlot = 1;
37153 for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
37154 if (i == X86::AddrDisp)
37155 MIB.addDisp(MI.getOperand(MemOpndSlot + i), SSPOffset);
37156 else
37157 MIB.add(MI.getOperand(MemOpndSlot + i));
37158 }
37159 MIB.addReg(SSPCopyReg);
37160 MIB.setMemRefs(MMOs);
37161}
37162
37163MachineBasicBlock *
37164X86TargetLowering::emitEHSjLjSetJmp(MachineInstr &MI,
37165 MachineBasicBlock *MBB) const {
37166 const DebugLoc &DL = MI.getDebugLoc();
37167 MachineFunction *MF = MBB->getParent();
37168 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
37169 const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
37170 MachineRegisterInfo &MRI = MF->getRegInfo();
37171
37172 const BasicBlock *BB = MBB->getBasicBlock();
37173 MachineFunction::iterator I = ++MBB->getIterator();
37174
37175 // Memory Reference
37176 SmallVector<MachineMemOperand *, 2> MMOs(MI.memoperands_begin(),
37177 MI.memoperands_end());
37178
37179 unsigned DstReg;
37180 unsigned MemOpndSlot = 0;
37181
37182 unsigned CurOp = 0;
37183
37184 DstReg = MI.getOperand(CurOp++).getReg();
37185 const TargetRegisterClass *RC = MRI.getRegClass(DstReg);
37186   assert(TRI->isTypeLegalForClass(*RC, MVT::i32) && "Invalid destination!");
37187 (void)TRI;
37188 Register mainDstReg = MRI.createVirtualRegister(RC);
37189 Register restoreDstReg = MRI.createVirtualRegister(RC);
37190
37191 MemOpndSlot = CurOp;
37192
37193 MVT PVT = getPointerTy(MF->getDataLayout());
37194   assert((PVT == MVT::i64 || PVT == MVT::i32) &&
37195          "Invalid Pointer Size!");
37196
37197 // For v = setjmp(buf), we generate
37198 //
37199 // thisMBB:
37200 // buf[LabelOffset] = restoreMBB <-- takes address of restoreMBB
37201 // SjLjSetup restoreMBB
37202 //
37203 // mainMBB:
37204 // v_main = 0
37205 //
37206 // sinkMBB:
37207 // v = phi(main, restore)
37208 //
37209 // restoreMBB:
37210 // if base pointer being used, load it from frame
37211 // v_restore = 1
37212
37213 MachineBasicBlock *thisMBB = MBB;
37214 MachineBasicBlock *mainMBB = MF->CreateMachineBasicBlock(BB);
37215 MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB);
37216 MachineBasicBlock *restoreMBB = MF->CreateMachineBasicBlock(BB);
37217 MF->insert(I, mainMBB);
37218 MF->insert(I, sinkMBB);
37219 MF->push_back(restoreMBB);
37220 restoreMBB->setMachineBlockAddressTaken();
37221
37222 MachineInstrBuilder MIB;
37223
37224 // Transfer the remainder of BB and its successor edges to sinkMBB.
37225 sinkMBB->splice(sinkMBB->begin(), MBB,
37226 std::next(MachineBasicBlock::iterator(MI)), MBB->end());
37227 sinkMBB->transferSuccessorsAndUpdatePHIs(MBB);
37228
37229 // thisMBB:
37230 unsigned PtrStoreOpc = 0;
37231 unsigned LabelReg = 0;
37232 const int64_t LabelOffset = 1 * PVT.getStoreSize();
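  // LabelOffset is 8 bytes on 64-bit targets and 4 bytes on 32-bit targets,
  // i.e. the resume address below is written into slot 1 of the setjmp buffer.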
37233 bool UseImmLabel = (MF->getTarget().getCodeModel() == CodeModel::Small) &&
37234 !isPositionIndependent();
37235
37236 // Prepare IP either in reg or imm.
37237 if (!UseImmLabel) {
37238 PtrStoreOpc = (PVT == MVT::i64) ? X86::MOV64mr : X86::MOV32mr;
37239 const TargetRegisterClass *PtrRC = getRegClassFor(PVT);
37240 LabelReg = MRI.createVirtualRegister(PtrRC);
37241 if (Subtarget.is64Bit()) {
37242 MIB = BuildMI(*thisMBB, MI, DL, TII->get(X86::LEA64r), LabelReg)
37243 .addReg(X86::RIP)
37244 .addImm(0)
37245 .addReg(0)
37246 .addMBB(restoreMBB)
37247 .addReg(0);
37248 } else {
37249 const X86InstrInfo *XII = static_cast<const X86InstrInfo*>(TII);
37250 MIB = BuildMI(*thisMBB, MI, DL, TII->get(X86::LEA32r), LabelReg)
37251 .addReg(XII->getGlobalBaseReg(MF))
37252 .addImm(0)
37253 .addReg(0)
37254 .addMBB(restoreMBB, Subtarget.classifyBlockAddressReference())
37255 .addReg(0);
37256 }
37257 } else
37258 PtrStoreOpc = (PVT == MVT::i64) ? X86::MOV64mi32 : X86::MOV32mi;
37259 // Store IP
37260 MIB = BuildMI(*thisMBB, MI, DL, TII->get(PtrStoreOpc));
37261 for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
37262 if (i == X86::AddrDisp)
37263 MIB.addDisp(MI.getOperand(MemOpndSlot + i), LabelOffset);
37264 else
37265 MIB.add(MI.getOperand(MemOpndSlot + i));
37266 }
37267 if (!UseImmLabel)
37268 MIB.addReg(LabelReg);
37269 else
37270 MIB.addMBB(restoreMBB);
37271 MIB.setMemRefs(MMOs);
37272
37273 if (MF->getMMI().getModule()->getModuleFlag("cf-protection-return")) {
37274 emitSetJmpShadowStackFix(MI, thisMBB);
37275 }
37276
37277 // Setup
37278 MIB = BuildMI(*thisMBB, MI, DL, TII->get(X86::EH_SjLj_Setup))
37279 .addMBB(restoreMBB);
37280
37281 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
37282 MIB.addRegMask(RegInfo->getNoPreservedMask());
37283 thisMBB->addSuccessor(mainMBB);
37284 thisMBB->addSuccessor(restoreMBB);
37285
37286 // mainMBB:
37287 // EAX = 0
37288 BuildMI(mainMBB, DL, TII->get(X86::MOV32r0), mainDstReg);
37289 mainMBB->addSuccessor(sinkMBB);
37290
37291 // sinkMBB:
37292 BuildMI(*sinkMBB, sinkMBB->begin(), DL,
37293 TII->get(X86::PHI), DstReg)
37294 .addReg(mainDstReg).addMBB(mainMBB)
37295 .addReg(restoreDstReg).addMBB(restoreMBB);
37296
37297 // restoreMBB:
37298 if (RegInfo->hasBasePointer(*MF)) {
37299 const bool Uses64BitFramePtr =
37300 Subtarget.isTarget64BitLP64() || Subtarget.isTargetNaCl64();
37301 X86MachineFunctionInfo *X86FI = MF->getInfo<X86MachineFunctionInfo>();
37302 X86FI->setRestoreBasePointer(MF);
37303 Register FramePtr = RegInfo->getFrameRegister(*MF);
37304 Register BasePtr = RegInfo->getBaseRegister();
37305 unsigned Opm = Uses64BitFramePtr ? X86::MOV64rm : X86::MOV32rm;
37306 addRegOffset(BuildMI(restoreMBB, DL, TII->get(Opm), BasePtr),
37307 FramePtr, true, X86FI->getRestoreBasePointerOffset())
37308 .setMIFlag(MachineInstr::FrameSetup);
37309 }
37310 BuildMI(restoreMBB, DL, TII->get(X86::MOV32ri), restoreDstReg).addImm(1);
37311 BuildMI(restoreMBB, DL, TII->get(X86::JMP_1)).addMBB(sinkMBB);
37312 restoreMBB->addSuccessor(sinkMBB);
37313
37314 MI.eraseFromParent();
37315 return sinkMBB;
37316}
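// Editor's note (illustrative, not part of the original source): the blocks
// built above implement the usual setjmp return-value contract. Sketched in
// C-like pseudocode, with names chosen only for illustration:
//
//   int v = __builtin_setjmp(buf);   // fall-through path: mainMBB sets v = 0
//   ...                              // a later longjmp lands in restoreMBB,
//                                    // which sets v = 1 and jumps to sinkMBB
//   // sinkMBB's PHI merges the two values into DstReg.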
37317
37318/// Fix the shadow stack using the previously saved SSP pointer.
37319/// \sa emitSetJmpShadowStackFix
37320/// \param [in] MI The temporary Machine Instruction for the builtin.
37321/// \param [in] MBB The Machine Basic Block that will be modified.
37322/// \return The sink MBB that will perform the future indirect branch.
37323MachineBasicBlock *
37324X86TargetLowering::emitLongJmpShadowStackFix(MachineInstr &MI,
37325 MachineBasicBlock *MBB) const {
37326 const DebugLoc &DL = MI.getDebugLoc();
37327 MachineFunction *MF = MBB->getParent();
37328 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
37329 MachineRegisterInfo &MRI = MF->getRegInfo();
37330
37331 // Memory Reference
37332 SmallVector<MachineMemOperand *, 2> MMOs(MI.memoperands_begin(),
37333 MI.memoperands_end());
37334
37335 MVT PVT = getPointerTy(MF->getDataLayout());
37336 const TargetRegisterClass *PtrRC = getRegClassFor(PVT);
37337
37338 // checkSspMBB:
37339 // xor vreg1, vreg1
37340 // rdssp vreg1
37341 // test vreg1, vreg1
37342 // je sinkMBB # Jump if Shadow Stack is not supported
37343 // fallMBB:
37344 // mov buf+24/12(%rip), vreg2
37345 // sub vreg1, vreg2
37346 // jbe sinkMBB # No need to fix the Shadow Stack
37347 // fixShadowMBB:
37348 // shr 3/2, vreg2
37349 // incssp vreg2 # fix the SSP according to the lower 8 bits
37350 // shr 8, vreg2
37351 // je sinkMBB
37352 // fixShadowLoopPrepareMBB:
37353 // shl vreg2
37354 // mov 128, vreg3
37355 // fixShadowLoopMBB:
37356 // incssp vreg3
37357 // dec vreg2
37358 // jne fixShadowLoopMBB # Iterate until you finish fixing
37359 // # the Shadow Stack
37360 // sinkMBB:
37361
37362 MachineFunction::iterator I = ++MBB->getIterator();
37363 const BasicBlock *BB = MBB->getBasicBlock();
37364
37365 MachineBasicBlock *checkSspMBB = MF->CreateMachineBasicBlock(BB);
37366 MachineBasicBlock *fallMBB = MF->CreateMachineBasicBlock(BB);
37367 MachineBasicBlock *fixShadowMBB = MF->CreateMachineBasicBlock(BB);
37368 MachineBasicBlock *fixShadowLoopPrepareMBB = MF->CreateMachineBasicBlock(BB);
37369 MachineBasicBlock *fixShadowLoopMBB = MF->CreateMachineBasicBlock(BB);
37370 MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB);
37371 MF->insert(I, checkSspMBB);
37372 MF->insert(I, fallMBB);
37373 MF->insert(I, fixShadowMBB);
37374 MF->insert(I, fixShadowLoopPrepareMBB);
37375 MF->insert(I, fixShadowLoopMBB);
37376 MF->insert(I, sinkMBB);
37377
37378 // Transfer the remainder of BB and its successor edges to sinkMBB.
37379 sinkMBB->splice(sinkMBB->begin(), MBB, MachineBasicBlock::iterator(MI),
37380 MBB->end());
37381 sinkMBB->transferSuccessorsAndUpdatePHIs(MBB);
37382
37383 MBB->addSuccessor(checkSspMBB);
37384
37385 // Initialize a register with zero.
37386 Register ZReg = MRI.createVirtualRegister(&X86::GR32RegClass);
37387 BuildMI(checkSspMBB, DL, TII->get(X86::MOV32r0), ZReg);
37388
37389 if (PVT == MVT::i64) {
37390 Register TmpZReg = MRI.createVirtualRegister(PtrRC);
37391 BuildMI(checkSspMBB, DL, TII->get(X86::SUBREG_TO_REG), TmpZReg)
37392 .addImm(0)
37393 .addReg(ZReg)
37394 .addImm(X86::sub_32bit);
37395 ZReg = TmpZReg;
37396 }
37397
37398 // Read the current SSP Register value to the zeroed register.
37399 Register SSPCopyReg = MRI.createVirtualRegister(PtrRC);
37400 unsigned RdsspOpc = (PVT == MVT::i64) ? X86::RDSSPQ : X86::RDSSPD;
37401 BuildMI(checkSspMBB, DL, TII->get(RdsspOpc), SSPCopyReg).addReg(ZReg);
37402
37403 // Check whether the result of the SSP register is zero and jump directly
37404 // to the sink.
37405 unsigned TestRROpc = (PVT == MVT::i64) ? X86::TEST64rr : X86::TEST32rr;
37406 BuildMI(checkSspMBB, DL, TII->get(TestRROpc))
37407 .addReg(SSPCopyReg)
37408 .addReg(SSPCopyReg);
37409 BuildMI(checkSspMBB, DL, TII->get(X86::JCC_1)).addMBB(sinkMBB).addImm(X86::COND_E);
37410 checkSspMBB->addSuccessor(sinkMBB);
37411 checkSspMBB->addSuccessor(fallMBB);
37412
37413 // Reload the previously saved SSP register value.
37414 Register PrevSSPReg = MRI.createVirtualRegister(PtrRC);
37415 unsigned PtrLoadOpc = (PVT == MVT::i64) ? X86::MOV64rm : X86::MOV32rm;
37416 const int64_t SPPOffset = 3 * PVT.getStoreSize();
37417 MachineInstrBuilder MIB =
37418 BuildMI(fallMBB, DL, TII->get(PtrLoadOpc), PrevSSPReg);
37419 for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
37420 const MachineOperand &MO = MI.getOperand(i);
37421 if (i == X86::AddrDisp)
37422 MIB.addDisp(MO, SPPOffset);
37423 else if (MO.isReg()) // Don't add the whole operand, we don't want to
37424 // preserve kill flags.
37425 MIB.addReg(MO.getReg());
37426 else
37427 MIB.add(MO);
37428 }
37429 MIB.setMemRefs(MMOs);
37430
37431 // Subtract the current SSP from the previous SSP.
37432 Register SspSubReg = MRI.createVirtualRegister(PtrRC);
37433 unsigned SubRROpc = (PVT == MVT::i64) ? X86::SUB64rr : X86::SUB32rr;
37434 BuildMI(fallMBB, DL, TII->get(SubRROpc), SspSubReg)
37435 .addReg(PrevSSPReg)
37436 .addReg(SSPCopyReg);
37437
37438 // Jump to sink in case PrevSSPReg <= SSPCopyReg.
37439 BuildMI(fallMBB, DL, TII->get(X86::JCC_1)).addMBB(sinkMBB).addImm(X86::COND_BE);
37440 fallMBB->addSuccessor(sinkMBB);
37441 fallMBB->addSuccessor(fixShadowMBB);
37442
37443 // Shift right by 2/3 for 32/64 because incssp multiplies the argument by 4/8.
37444 unsigned ShrRIOpc = (PVT == MVT::i64) ? X86::SHR64ri : X86::SHR32ri;
37445 unsigned Offset = (PVT == MVT::i64) ? 3 : 2;
37446 Register SspFirstShrReg = MRI.createVirtualRegister(PtrRC);
37447 BuildMI(fixShadowMBB, DL, TII->get(ShrRIOpc), SspFirstShrReg)
37448 .addReg(SspSubReg)
37449 .addImm(Offset);
37450
37451   // Increase the SSP, looking only at the lower 8 bits of the delta.
37452 unsigned IncsspOpc = (PVT == MVT::i64) ? X86::INCSSPQ : X86::INCSSPD;
37453 BuildMI(fixShadowMBB, DL, TII->get(IncsspOpc)).addReg(SspFirstShrReg);
37454
37455 // Reset the lower 8 bits.
37456 Register SspSecondShrReg = MRI.createVirtualRegister(PtrRC);
37457 BuildMI(fixShadowMBB, DL, TII->get(ShrRIOpc), SspSecondShrReg)
37458 .addReg(SspFirstShrReg)
37459 .addImm(8);
37460
37461 // Jump if the result of the shift is zero.
37462 BuildMI(fixShadowMBB, DL, TII->get(X86::JCC_1)).addMBB(sinkMBB).addImm(X86::COND_E);
37463 fixShadowMBB->addSuccessor(sinkMBB);
37464 fixShadowMBB->addSuccessor(fixShadowLoopPrepareMBB);
37465
37466 // Do a single shift left.
37467 unsigned ShlR1Opc = (PVT == MVT::i64) ? X86::SHL64r1 : X86::SHL32r1;
37468 Register SspAfterShlReg = MRI.createVirtualRegister(PtrRC);
37469 BuildMI(fixShadowLoopPrepareMBB, DL, TII->get(ShlR1Opc), SspAfterShlReg)
37470 .addReg(SspSecondShrReg);
37471
37472 // Save the value 128 to a register (will be used next with incssp).
37473 Register Value128InReg = MRI.createVirtualRegister(PtrRC);
37474 unsigned MovRIOpc = (PVT == MVT::i64) ? X86::MOV64ri32 : X86::MOV32ri;
37475 BuildMI(fixShadowLoopPrepareMBB, DL, TII->get(MovRIOpc), Value128InReg)
37476 .addImm(128);
37477 fixShadowLoopPrepareMBB->addSuccessor(fixShadowLoopMBB);
37478
37479 // Since incssp only looks at the lower 8 bits, we might need to do several
37480 // iterations of incssp until we finish fixing the shadow stack.
37481 Register DecReg = MRI.createVirtualRegister(PtrRC);
37482 Register CounterReg = MRI.createVirtualRegister(PtrRC);
37483 BuildMI(fixShadowLoopMBB, DL, TII->get(X86::PHI), CounterReg)
37484 .addReg(SspAfterShlReg)
37485 .addMBB(fixShadowLoopPrepareMBB)
37486 .addReg(DecReg)
37487 .addMBB(fixShadowLoopMBB);
37488
37489 // Every iteration we increase the SSP by 128.
37490 BuildMI(fixShadowLoopMBB, DL, TII->get(IncsspOpc)).addReg(Value128InReg);
37491
37492 // Every iteration we decrement the counter by 1.
37493 unsigned DecROpc = (PVT == MVT::i64) ? X86::DEC64r : X86::DEC32r;
37494 BuildMI(fixShadowLoopMBB, DL, TII->get(DecROpc), DecReg).addReg(CounterReg);
37495
37496 // Jump if the counter is not zero yet.
37497 BuildMI(fixShadowLoopMBB, DL, TII->get(X86::JCC_1)).addMBB(fixShadowLoopMBB).addImm(X86::COND_NE);
37498 fixShadowLoopMBB->addSuccessor(sinkMBB);
37499 fixShadowLoopMBB->addSuccessor(fixShadowLoopMBB);
37500
37501 return sinkMBB;
37502}
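// Editor's sketch (assumption, not part of the original source): the scalar
// arithmetic that the MIR above implements for the 64-bit case, with the
// INCSSP instruction modeled by a hypothetical callable that advances the
// shadow stack by (n & 0xff) entries. Relies on <cstdint>, which this TU
// already pulls in transitively.
static inline void fixShadowStackSketch(uint64_t PrevSSP, uint64_t CurSSP,
                                        void (*IncSsp)(uint64_t)) {
  uint64_t Delta = (PrevSSP - CurSSP) >> 3; // 8 bytes per shadow-stack entry.
  IncSsp(Delta & 0xff);                     // Fix the low 8 bits first.
  // Each loop iteration advances by 128 entries; the counter is doubled so
  // that (Delta >> 8) * 256 entries are consumed in total.
  for (uint64_t N = (Delta >> 8) << 1; N != 0; --N)
    IncSsp(128);
}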
37503
37504MachineBasicBlock *
37505X86TargetLowering::emitEHSjLjLongJmp(MachineInstr &MI,
37506 MachineBasicBlock *MBB) const {
37507 const DebugLoc &DL = MI.getDebugLoc();
37508 MachineFunction *MF = MBB->getParent();
37509 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
37510 MachineRegisterInfo &MRI = MF->getRegInfo();
37511
37512 // Memory Reference
37513 SmallVector<MachineMemOperand *, 2> MMOs(MI.memoperands_begin(),
37514 MI.memoperands_end());
37515
37516 MVT PVT = getPointerTy(MF->getDataLayout());
37517   assert((PVT == MVT::i64 || PVT == MVT::i32) &&
37518          "Invalid Pointer Size!");
37519
37520 const TargetRegisterClass *RC =
37521 (PVT == MVT::i64) ? &X86::GR64RegClass : &X86::GR32RegClass;
37522 Register Tmp = MRI.createVirtualRegister(RC);
37523 // Since FP is only updated here but NOT referenced, it's treated as GPR.
37524 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
37525 Register FP = (PVT == MVT::i64) ? X86::RBP : X86::EBP;
37526 Register SP = RegInfo->getStackRegister();
37527
37528 MachineInstrBuilder MIB;
37529
37530 const int64_t LabelOffset = 1 * PVT.getStoreSize();
37531 const int64_t SPOffset = 2 * PVT.getStoreSize();
37532
37533 unsigned PtrLoadOpc = (PVT == MVT::i64) ? X86::MOV64rm : X86::MOV32rm;
37534 unsigned IJmpOpc = (PVT == MVT::i64) ? X86::JMP64r : X86::JMP32r;
37535
37536 MachineBasicBlock *thisMBB = MBB;
37537
37538   // When CET and the shadow stack are enabled, we need to fix the Shadow Stack.
37539 if (MF->getMMI().getModule()->getModuleFlag("cf-protection-return")) {
37540 thisMBB = emitLongJmpShadowStackFix(MI, thisMBB);
37541 }
37542
37543 // Reload FP
37544 MIB = BuildMI(*thisMBB, MI, DL, TII->get(PtrLoadOpc), FP);
37545 for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
37546 const MachineOperand &MO = MI.getOperand(i);
37547 if (MO.isReg()) // Don't add the whole operand, we don't want to
37548 // preserve kill flags.
37549 MIB.addReg(MO.getReg());
37550 else
37551 MIB.add(MO);
37552 }
37553 MIB.setMemRefs(MMOs);
37554
37555 // Reload IP
37556 MIB = BuildMI(*thisMBB, MI, DL, TII->get(PtrLoadOpc), Tmp);
37557 for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
37558 const MachineOperand &MO = MI.getOperand(i);
37559 if (i == X86::AddrDisp)
37560 MIB.addDisp(MO, LabelOffset);
37561 else if (MO.isReg()) // Don't add the whole operand, we don't want to
37562 // preserve kill flags.
37563 MIB.addReg(MO.getReg());
37564 else
37565 MIB.add(MO);
37566 }
37567 MIB.setMemRefs(MMOs);
37568
37569 // Reload SP
37570 MIB = BuildMI(*thisMBB, MI, DL, TII->get(PtrLoadOpc), SP);
37571 for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
37572 if (i == X86::AddrDisp)
37573 MIB.addDisp(MI.getOperand(i), SPOffset);
37574 else
37575 MIB.add(MI.getOperand(i)); // We can preserve the kill flags here, it's
37576 // the last instruction of the expansion.
37577 }
37578 MIB.setMemRefs(MMOs);
37579
37580 // Jump
37581 BuildMI(*thisMBB, MI, DL, TII->get(IJmpOpc)).addReg(Tmp);
37582
37583 MI.eraseFromParent();
37584 return thisMBB;
37585}
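// Editor's note (derived from the offsets used in this file; not part of the
// original source): the SjLj buffer consulted above is laid out in
// pointer-sized slots as
//   slot 0: frame pointer        (reloaded into FP)
//   slot 1: resume address       (LabelOffset; reloaded into Tmp, then jumped to)
//   slot 2: stack pointer        (SPOffset)
//   slot 3: previous shadow SSP  (SPPOffset, used by emitLongJmpShadowStackFix
//                                 when "cf-protection-return" is set)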
37586
37587void X86TargetLowering::SetupEntryBlockForSjLj(MachineInstr &MI,
37588 MachineBasicBlock *MBB,
37589 MachineBasicBlock *DispatchBB,
37590 int FI) const {
37591 const DebugLoc &DL = MI.getDebugLoc();
37592 MachineFunction *MF = MBB->getParent();
37593 MachineRegisterInfo *MRI = &MF->getRegInfo();
37594 const X86InstrInfo *TII = Subtarget.getInstrInfo();
37595
37596 MVT PVT = getPointerTy(MF->getDataLayout());
37597   assert((PVT == MVT::i64 || PVT == MVT::i32) && "Invalid Pointer Size!");
37598
37599 unsigned Op = 0;
37600 unsigned VR = 0;
37601
37602 bool UseImmLabel = (MF->getTarget().getCodeModel() == CodeModel::Small) &&
37603 !isPositionIndependent();
37604
37605 if (UseImmLabel) {
37606 Op = (PVT == MVT::i64) ? X86::MOV64mi32 : X86::MOV32mi;
37607 } else {
37608 const TargetRegisterClass *TRC =
37609 (PVT == MVT::i64) ? &X86::GR64RegClass : &X86::GR32RegClass;
37610 VR = MRI->createVirtualRegister(TRC);
37611 Op = (PVT == MVT::i64) ? X86::MOV64mr : X86::MOV32mr;
37612
37613 if (Subtarget.is64Bit())
37614 BuildMI(*MBB, MI, DL, TII->get(X86::LEA64r), VR)
37615 .addReg(X86::RIP)
37616 .addImm(1)
37617 .addReg(0)
37618 .addMBB(DispatchBB)
37619 .addReg(0);
37620 else
37621 BuildMI(*MBB, MI, DL, TII->get(X86::LEA32r), VR)
37622 .addReg(0) /* TII->getGlobalBaseReg(MF) */
37623 .addImm(1)
37624 .addReg(0)
37625 .addMBB(DispatchBB, Subtarget.classifyBlockAddressReference())
37626 .addReg(0);
37627 }
37628
37629 MachineInstrBuilder MIB = BuildMI(*MBB, MI, DL, TII->get(Op));
37630 addFrameReference(MIB, FI, Subtarget.is64Bit() ? 56 : 36);
37631 if (UseImmLabel)
37632 MIB.addMBB(DispatchBB);
37633 else
37634 MIB.addReg(VR);
37635}
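// Editor's note (summary of the function above, not part of the original
// source): the dispatch block's address is written into the function context
// at offset 56 (64-bit) or 36 (32-bit) from FI, either as a direct immediate
// (small, non-PIC code model) or through a virtual register computed with LEA.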
37636
37637MachineBasicBlock *
37638X86TargetLowering::EmitSjLjDispatchBlock(MachineInstr &MI,
37639 MachineBasicBlock *BB) const {
37640 const DebugLoc &DL = MI.getDebugLoc();
37641 MachineFunction *MF = BB->getParent();
37642 MachineRegisterInfo *MRI = &MF->getRegInfo();
37643 const X86InstrInfo *TII = Subtarget.getInstrInfo();
37644 int FI = MF->getFrameInfo().getFunctionContextIndex();
37645
37646 // Get a mapping of the call site numbers to all of the landing pads they're
37647 // associated with.
37648 DenseMap<unsigned, SmallVector<MachineBasicBlock *, 2>> CallSiteNumToLPad;
37649 unsigned MaxCSNum = 0;
37650 for (auto &MBB : *MF) {
37651 if (!MBB.isEHPad())
37652 continue;
37653
37654 MCSymbol *Sym = nullptr;
37655 for (const auto &MI : MBB) {
37656 if (MI.isDebugInstr())
37657 continue;
37658
37659       assert(MI.isEHLabel() && "expected EH_LABEL");
37660 Sym = MI.getOperand(0).getMCSymbol();
37661 break;
37662 }
37663
37664 if (!MF->hasCallSiteLandingPad(Sym))
37665 continue;
37666
37667 for (unsigned CSI : MF->getCallSiteLandingPad(Sym)) {
37668 CallSiteNumToLPad[CSI].push_back(&MBB);
37669 MaxCSNum = std::max(MaxCSNum, CSI);
37670 }
37671 }
37672
37673 // Get an ordered list of the machine basic blocks for the jump table.
37674 std::vector<MachineBasicBlock *> LPadList;
37675 SmallPtrSet<MachineBasicBlock *, 32> InvokeBBs;
37676 LPadList.reserve(CallSiteNumToLPad.size());
37677
37678 for (unsigned CSI = 1; CSI <= MaxCSNum; ++CSI) {
37679 for (auto &LP : CallSiteNumToLPad[CSI]) {
37680 LPadList.push_back(LP);
37681 InvokeBBs.insert(LP->pred_begin(), LP->pred_end());
37682 }
37683 }
37684
37685   assert(!LPadList.empty() &&
37686          "No landing pad destinations for the dispatch jump table!");
37687
37688 // Create the MBBs for the dispatch code.
37689
37690 // Shove the dispatch's address into the return slot in the function context.
37691 MachineBasicBlock *DispatchBB = MF->CreateMachineBasicBlock();
37692 DispatchBB->setIsEHPad(true);
37693
37694 MachineBasicBlock *TrapBB = MF->CreateMachineBasicBlock();
37695 BuildMI(TrapBB, DL, TII->get(X86::TRAP));
37696 DispatchBB->addSuccessor(TrapBB);
37697
37698 MachineBasicBlock *DispContBB = MF->CreateMachineBasicBlock();
37699 DispatchBB->addSuccessor(DispContBB);
37700
37701 // Insert MBBs.
37702 MF->push_back(DispatchBB);
37703 MF->push_back(DispContBB);
37704 MF->push_back(TrapBB);
37705
37706 // Insert code into the entry block that creates and registers the function
37707 // context.
37708 SetupEntryBlockForSjLj(MI, BB, DispatchBB, FI);
37709
37710 // Create the jump table and associated information
37711 unsigned JTE = getJumpTableEncoding();
37712 MachineJumpTableInfo *JTI = MF->getOrCreateJumpTableInfo(JTE);
37713 unsigned MJTI = JTI->createJumpTableIndex(LPadList);
37714
37715 const X86RegisterInfo &RI = TII->getRegisterInfo();
37716 // Add a register mask with no preserved registers. This results in all
37717 // registers being marked as clobbered.
37718 if (RI.hasBasePointer(*MF)) {
37719 const bool FPIs64Bit =
37720 Subtarget.isTarget64BitLP64() || Subtarget.isTargetNaCl64();
37721 X86MachineFunctionInfo *MFI = MF->getInfo<X86MachineFunctionInfo>();
37722 MFI->setRestoreBasePointer(MF);
37723
37724 Register FP = RI.getFrameRegister(*MF);
37725 Register BP = RI.getBaseRegister();
37726 unsigned Op = FPIs64Bit ? X86::MOV64rm : X86::MOV32rm;
37727 addRegOffset(BuildMI(DispatchBB, DL, TII->get(Op), BP), FP, true,
37728 MFI->getRestoreBasePointerOffset())
37729 .addRegMask(RI.getNoPreservedMask());
37730 } else {
37731 BuildMI(DispatchBB, DL, TII->get(X86::NOOP))
37732 .addRegMask(RI.getNoPreservedMask());
37733 }
37734
37735 // IReg is used as an index in a memory operand and therefore can't be SP
37736 Register IReg = MRI->createVirtualRegister(&X86::GR32_NOSPRegClass);
37737 addFrameReference(BuildMI(DispatchBB, DL, TII->get(X86::MOV32rm), IReg), FI,
37738 Subtarget.is64Bit() ? 8 : 4);
37739 BuildMI(DispatchBB, DL, TII->get(X86::CMP32ri))
37740 .addReg(IReg)
37741 .addImm(LPadList.size());
37742 BuildMI(DispatchBB, DL, TII->get(X86::JCC_1)).addMBB(TrapBB).addImm(X86::COND_AE);
37743
37744 if (Subtarget.is64Bit()) {
37745 Register BReg = MRI->createVirtualRegister(&X86::GR64RegClass);
37746 Register IReg64 = MRI->createVirtualRegister(&X86::GR64_NOSPRegClass);
37747
37748 // leaq .LJTI0_0(%rip), BReg
37749 BuildMI(DispContBB, DL, TII->get(X86::LEA64r), BReg)
37750 .addReg(X86::RIP)
37751 .addImm(1)
37752 .addReg(0)
37753 .addJumpTableIndex(MJTI)
37754 .addReg(0);
37755 // movzx IReg64, IReg
37756 BuildMI(DispContBB, DL, TII->get(TargetOpcode::SUBREG_TO_REG), IReg64)
37757 .addImm(0)
37758 .addReg(IReg)
37759 .addImm(X86::sub_32bit);
37760
37761 switch (JTE) {
37762 case MachineJumpTableInfo::EK_BlockAddress:
37763 // jmpq *(BReg,IReg64,8)
37764 BuildMI(DispContBB, DL, TII->get(X86::JMP64m))
37765 .addReg(BReg)
37766 .addImm(8)
37767 .addReg(IReg64)
37768 .addImm(0)
37769 .addReg(0);
37770 break;
37771 case MachineJumpTableInfo::EK_LabelDifference32: {
37772 Register OReg = MRI->createVirtualRegister(&X86::GR32RegClass);
37773 Register OReg64 = MRI->createVirtualRegister(&X86::GR64RegClass);
37774 Register TReg = MRI->createVirtualRegister(&X86::GR64RegClass);
37775
37776 // movl (BReg,IReg64,4), OReg
37777 BuildMI(DispContBB, DL, TII->get(X86::MOV32rm), OReg)
37778 .addReg(BReg)
37779 .addImm(4)
37780 .addReg(IReg64)
37781 .addImm(0)
37782 .addReg(0);
37783 // movsx OReg64, OReg
37784 BuildMI(DispContBB, DL, TII->get(X86::MOVSX64rr32), OReg64).addReg(OReg);
37785 // addq BReg, OReg64, TReg
37786 BuildMI(DispContBB, DL, TII->get(X86::ADD64rr), TReg)
37787 .addReg(OReg64)
37788 .addReg(BReg);
37789 // jmpq *TReg
37790 BuildMI(DispContBB, DL, TII->get(X86::JMP64r)).addReg(TReg);
37791 break;
37792 }
37793 default:
37794       llvm_unreachable("Unexpected jump table encoding");
37795 }
37796 } else {
37797 // jmpl *.LJTI0_0(,IReg,4)
37798 BuildMI(DispContBB, DL, TII->get(X86::JMP32m))
37799 .addReg(0)
37800 .addImm(4)
37801 .addReg(IReg)
37802 .addJumpTableIndex(MJTI)
37803 .addReg(0);
37804 }
37805
37806 // Add the jump table entries as successors to the MBB.
37807 SmallPtrSet<MachineBasicBlock *, 8> SeenMBBs;
37808 for (auto &LP : LPadList)
37809 if (SeenMBBs.insert(LP).second)
37810 DispContBB->addSuccessor(LP);
37811
37812 // N.B. the order the invoke BBs are processed in doesn't matter here.
37813 SmallVector<MachineBasicBlock *, 64> MBBLPads;
37814 const MCPhysReg *SavedRegs = MF->getRegInfo().getCalleeSavedRegs();
37815 for (MachineBasicBlock *MBB : InvokeBBs) {
37816 // Remove the landing pad successor from the invoke block and replace it
37817 // with the new dispatch block.
37818 // Keep a copy of Successors since it's modified inside the loop.
37819 SmallVector<MachineBasicBlock *, 8> Successors(MBB->succ_rbegin(),
37820 MBB->succ_rend());
37821 // FIXME: Avoid quadratic complexity.
37822 for (auto *MBBS : Successors) {
37823 if (MBBS->isEHPad()) {
37824 MBB->removeSuccessor(MBBS);
37825 MBBLPads.push_back(MBBS);
37826 }
37827 }
37828
37829 MBB->addSuccessor(DispatchBB);
37830
37831     // Find the invoke call and mark all of the callee-saved registers as
37832     // 'implicitly defined' so that they're spilled. This prevents code from
37833     // moving instructions before the EH block, where they will never be
37834     // executed.
37835 for (auto &II : reverse(*MBB)) {
37836 if (!II.isCall())
37837 continue;
37838
37839 DenseMap<unsigned, bool> DefRegs;
37840 for (auto &MOp : II.operands())
37841 if (MOp.isReg())
37842 DefRegs[MOp.getReg()] = true;
37843
37844 MachineInstrBuilder MIB(*MF, &II);
37845 for (unsigned RegIdx = 0; SavedRegs[RegIdx]; ++RegIdx) {
37846 unsigned Reg = SavedRegs[RegIdx];
37847 if (!DefRegs[Reg])
37848 MIB.addReg(Reg, RegState::ImplicitDefine | RegState::Dead);
37849 }
37850
37851 break;
37852 }
37853 }
37854
37855 // Mark all former landing pads as non-landing pads. The dispatch is the only
37856 // landing pad now.
37857 for (auto &LP : MBBLPads)
37858 LP->setIsEHPad(false);
37859
37860 // The instruction is gone now.
37861 MI.eraseFromParent();
37862 return BB;
37863}
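// Editor's sketch (illustrative only; 64-bit EK_BlockAddress flavor of the
// dispatch emitted above, with hypothetical label names):
//   movl  8(FuncCtx), %idx          ; call-site index stored by the runtime
//   cmpl  $NumLPads, %idx
//   jae   .Ltrap                    ; out-of-range selector -> ud2
//   leaq  .LJTI0_0(%rip), %base
//   jmpq  *(%base, %idx64, 8)       ; indirect branch to the landing pad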
37864
37865MachineBasicBlock *
37866X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
37867 MachineBasicBlock *BB) const {
37868 MachineFunction *MF = BB->getParent();
37869 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
37870 const DebugLoc &DL = MI.getDebugLoc();
37871
37872 auto TMMImmToTMMReg = [](unsigned Imm) {
37873     assert (Imm < 8 && "Illegal tmm index");
37874 return X86::TMM0 + Imm;
37875 };
37876 switch (MI.getOpcode()) {
37877   default: llvm_unreachable("Unexpected instr type to insert");
37878 case X86::TLS_addr32:
37879 case X86::TLS_addr64:
37880 case X86::TLS_addrX32:
37881 case X86::TLS_base_addr32:
37882 case X86::TLS_base_addr64:
37883 case X86::TLS_base_addrX32:
37884 return EmitLoweredTLSAddr(MI, BB);
37885 case X86::INDIRECT_THUNK_CALL32:
37886 case X86::INDIRECT_THUNK_CALL64:
37887 case X86::INDIRECT_THUNK_TCRETURN32:
37888 case X86::INDIRECT_THUNK_TCRETURN64:
37889 return EmitLoweredIndirectThunk(MI, BB);
37890 case X86::CATCHRET:
37891 return EmitLoweredCatchRet(MI, BB);
37892 case X86::SEG_ALLOCA_32:
37893 case X86::SEG_ALLOCA_64:
37894 return EmitLoweredSegAlloca(MI, BB);
37895 case X86::PROBED_ALLOCA_32:
37896 case X86::PROBED_ALLOCA_64:
37897 return EmitLoweredProbedAlloca(MI, BB);
37898 case X86::TLSCall_32:
37899 case X86::TLSCall_64:
37900 return EmitLoweredTLSCall(MI, BB);
37901 case X86::CMOV_FR16:
37902 case X86::CMOV_FR16X:
37903 case X86::CMOV_FR32:
37904 case X86::CMOV_FR32X:
37905 case X86::CMOV_FR64:
37906 case X86::CMOV_FR64X:
37907 case X86::CMOV_GR8:
37908 case X86::CMOV_GR16:
37909 case X86::CMOV_GR32:
37910 case X86::CMOV_RFP32:
37911 case X86::CMOV_RFP64:
37912 case X86::CMOV_RFP80:
37913 case X86::CMOV_VR64:
37914 case X86::CMOV_VR128:
37915 case X86::CMOV_VR128X:
37916 case X86::CMOV_VR256:
37917 case X86::CMOV_VR256X:
37918 case X86::CMOV_VR512:
37919 case X86::CMOV_VK1:
37920 case X86::CMOV_VK2:
37921 case X86::CMOV_VK4:
37922 case X86::CMOV_VK8:
37923 case X86::CMOV_VK16:
37924 case X86::CMOV_VK32:
37925 case X86::CMOV_VK64:
37926 return EmitLoweredSelect(MI, BB);
37927
37928 case X86::FP80_ADDr:
37929 case X86::FP80_ADDm32: {
37930 // Change the floating point control register to use double extended
37931 // precision when performing the addition.
37932 int OrigCWFrameIdx =
37933 MF->getFrameInfo().CreateStackObject(2, Align(2), false);
37934 addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::FNSTCW16m)),
37935 OrigCWFrameIdx);
37936
37937 // Load the old value of the control word...
37938 Register OldCW = MF->getRegInfo().createVirtualRegister(&X86::GR32RegClass);
37939 addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOVZX32rm16), OldCW),
37940 OrigCWFrameIdx);
37941
37942     // OR 0b11 into bits 8 and 9. 0b11 is the encoding for double-extended
37943     // precision.
37944 Register NewCW = MF->getRegInfo().createVirtualRegister(&X86::GR32RegClass);
37945 BuildMI(*BB, MI, DL, TII->get(X86::OR32ri), NewCW)
37946 .addReg(OldCW, RegState::Kill)
37947 .addImm(0x300);
37948
37949 // Extract to 16 bits.
37950 Register NewCW16 =
37951 MF->getRegInfo().createVirtualRegister(&X86::GR16RegClass);
37952 BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), NewCW16)
37953 .addReg(NewCW, RegState::Kill, X86::sub_16bit);
37954
37955 // Prepare memory for FLDCW.
37956 int NewCWFrameIdx =
37957 MF->getFrameInfo().CreateStackObject(2, Align(2), false);
37958 addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16mr)),
37959 NewCWFrameIdx)
37960 .addReg(NewCW16, RegState::Kill);
37961
37962 // Reload the modified control word now...
37963 addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::FLDCW16m)),
37964 NewCWFrameIdx);
37965
37966 // Do the addition.
37967 if (MI.getOpcode() == X86::FP80_ADDr) {
37968 BuildMI(*BB, MI, DL, TII->get(X86::ADD_Fp80))
37969 .add(MI.getOperand(0))
37970 .add(MI.getOperand(1))
37971 .add(MI.getOperand(2));
37972 } else {
37973 BuildMI(*BB, MI, DL, TII->get(X86::ADD_Fp80m32))
37974 .add(MI.getOperand(0))
37975 .add(MI.getOperand(1))
37976 .add(MI.getOperand(2))
37977 .add(MI.getOperand(3))
37978 .add(MI.getOperand(4))
37979 .add(MI.getOperand(5))
37980 .add(MI.getOperand(6));
37981 }
37982
37983 // Reload the original control word now.
37984 addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::FLDCW16m)),
37985 OrigCWFrameIdx);
37986
37987 MI.eraseFromParent(); // The pseudo instruction is gone now.
37988 return BB;
37989 }
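  // Editor's note (x87 background, not part of the original source): in the
  // FPU control word, bits 8-9 form the precision-control field; OR-ing in
  // 0x300 sets it to 0b11, i.e. 64-bit (double-extended) significand
  // precision, so the 80-bit addition above is not rounded to double.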
37990
37991 case X86::FP32_TO_INT16_IN_MEM:
37992 case X86::FP32_TO_INT32_IN_MEM:
37993 case X86::FP32_TO_INT64_IN_MEM:
37994 case X86::FP64_TO_INT16_IN_MEM:
37995 case X86::FP64_TO_INT32_IN_MEM:
37996 case X86::FP64_TO_INT64_IN_MEM:
37997 case X86::FP80_TO_INT16_IN_MEM:
37998 case X86::FP80_TO_INT32_IN_MEM:
37999 case X86::FP80_TO_INT64_IN_MEM: {
38000 // Change the floating point control register to use "round towards zero"
38001 // mode when truncating to an integer value.
38002 int OrigCWFrameIdx =
38003 MF->getFrameInfo().CreateStackObject(2, Align(2), false);
38004 addFrameReference(BuildMI(*BB, MI, DL,
38005 TII->get(X86::FNSTCW16m)), OrigCWFrameIdx);
38006
38007 // Load the old value of the control word...
38008 Register OldCW = MF->getRegInfo().createVirtualRegister(&X86::GR32RegClass);
38009 addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOVZX32rm16), OldCW),
38010 OrigCWFrameIdx);
38011
38012     // OR 0b11 into bits 10 and 11. 0b11 is the encoding for round toward zero.
38013 Register NewCW = MF->getRegInfo().createVirtualRegister(&X86::GR32RegClass);
38014 BuildMI(*BB, MI, DL, TII->get(X86::OR32ri), NewCW)
38015 .addReg(OldCW, RegState::Kill).addImm(0xC00);
38016
38017 // Extract to 16 bits.
38018 Register NewCW16 =
38019 MF->getRegInfo().createVirtualRegister(&X86::GR16RegClass);
38020 BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), NewCW16)
38021 .addReg(NewCW, RegState::Kill, X86::sub_16bit);
38022
38023 // Prepare memory for FLDCW.
38024 int NewCWFrameIdx =
38025 MF->getFrameInfo().CreateStackObject(2, Align(2), false);
38026 addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16mr)),
38027 NewCWFrameIdx)
38028 .addReg(NewCW16, RegState::Kill);
38029
38030 // Reload the modified control word now...
38031 addFrameReference(BuildMI(*BB, MI, DL,
38032 TII->get(X86::FLDCW16m)), NewCWFrameIdx);
38033
38034 // Get the X86 opcode to use.
38035 unsigned Opc;
38036 switch (MI.getOpcode()) {
38037     default: llvm_unreachable("illegal opcode!");
38038 case X86::FP32_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m32; break;
38039 case X86::FP32_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m32; break;
38040 case X86::FP32_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m32; break;
38041 case X86::FP64_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m64; break;
38042 case X86::FP64_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m64; break;
38043 case X86::FP64_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m64; break;
38044 case X86::FP80_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m80; break;
38045 case X86::FP80_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m80; break;
38046 case X86::FP80_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m80; break;
38047 }
38048
38049 X86AddressMode AM = getAddressFromInstr(&MI, 0);
38050 addFullAddress(BuildMI(*BB, MI, DL, TII->get(Opc)), AM)
38051 .addReg(MI.getOperand(X86::AddrNumOperands).getReg());
38052
38053 // Reload the original control word now.
38054 addFrameReference(BuildMI(*BB, MI, DL,
38055 TII->get(X86::FLDCW16m)), OrigCWFrameIdx);
38056
38057 MI.eraseFromParent(); // The pseudo instruction is gone now.
38058 return BB;
38059 }
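  // Editor's note (x87 background, not part of the original source): bits
  // 10-11 of the FPU control word form the rounding-control field; OR-ing in
  // 0xC00 sets it to 0b11, "round toward zero", which matches C's truncating
  // float-to-integer conversion semantics for the FIST stores above.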
38060
38061 // xbegin
38062 case X86::XBEGIN:
38063 return emitXBegin(MI, BB, Subtarget.getInstrInfo());
38064
38065 case X86::VAARG_64:
38066 case X86::VAARG_X32:
38067 return EmitVAARGWithCustomInserter(MI, BB);
38068
38069 case X86::EH_SjLj_SetJmp32:
38070 case X86::EH_SjLj_SetJmp64:
38071 return emitEHSjLjSetJmp(MI, BB);
38072
38073 case X86::EH_SjLj_LongJmp32:
38074 case X86::EH_SjLj_LongJmp64:
38075 return emitEHSjLjLongJmp(MI, BB);
38076
38077 case X86::Int_eh_sjlj_setup_dispatch:
38078 return EmitSjLjDispatchBlock(MI, BB);
38079
38080 case TargetOpcode::STATEPOINT:
38081 // As an implementation detail, STATEPOINT shares the STACKMAP format at
38082 // this point in the process. We diverge later.
38083 return emitPatchPoint(MI, BB);
38084
38085 case TargetOpcode::STACKMAP:
38086 case TargetOpcode::PATCHPOINT:
38087 return emitPatchPoint(MI, BB);
38088
38089 case TargetOpcode::PATCHABLE_EVENT_CALL:
38090 case TargetOpcode::PATCHABLE_TYPED_EVENT_CALL:
38091 return BB;
38092
38093 case X86::LCMPXCHG8B: {
38094 const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
38095     // In addition to the four E[ABCD] registers implied by the encoding,
38096     // CMPXCHG8B requires a memory operand. If the current architecture is
38097     // i686 and the current function needs a base pointer - which is ESI on
38098     // i686 - the register allocator would not be able to allocate registers
38099     // for an address of the form X(%reg, %reg, Y): there would never be
38100     // enough unreserved registers during regalloc (without the base pointer
38101     // the only option would be X(%edi, %esi, Y)).
38102     // We give the register allocator a hand by precomputing the address in
38103     // a new vreg using LEA.
38104
38105 // If it is not i686 or there is no base pointer - nothing to do here.
38106 if (!Subtarget.is32Bit() || !TRI->hasBasePointer(*MF))
38107 return BB;
38108
38109     // Even though this code does not necessarily need the base pointer to
38110     // be ESI, we check for that. The reason: if this assert fails, something
38111     // has changed in the compiler's base pointer handling, and that change
38112     // most probably has to be addressed here as well.
38113     assert(TRI->getBaseRegister() == X86::ESI &&
38114            "LCMPXCHG8B custom insertion for i686 is written with X86::ESI as a "
38115            "base pointer in mind");
38116
38117 MachineRegisterInfo &MRI = MF->getRegInfo();
38118 MVT SPTy = getPointerTy(MF->getDataLayout());
38119 const TargetRegisterClass *AddrRegClass = getRegClassFor(SPTy);
38120 Register computedAddrVReg = MRI.createVirtualRegister(AddrRegClass);
38121
38122 X86AddressMode AM = getAddressFromInstr(&MI, 0);
38123     // Regalloc does not need any help when the memory operand of CMPXCHG8B
38124     // does not use an index register.
38125 if (AM.IndexReg == X86::NoRegister)
38126 return BB;
38127
38128 // After X86TargetLowering::ReplaceNodeResults CMPXCHG8B is glued to its
38129 // four operand definitions that are E[ABCD] registers. We skip them and
38130 // then insert the LEA.
38131 MachineBasicBlock::reverse_iterator RMBBI(MI.getReverseIterator());
38132 while (RMBBI != BB->rend() && (RMBBI->definesRegister(X86::EAX) ||
38133 RMBBI->definesRegister(X86::EBX) ||
38134 RMBBI->definesRegister(X86::ECX) ||
38135 RMBBI->definesRegister(X86::EDX))) {
38136 ++RMBBI;
38137 }
38138 MachineBasicBlock::iterator MBBI(RMBBI);
38139 addFullAddress(
38140 BuildMI(*BB, *MBBI, DL, TII->get(X86::LEA32r), computedAddrVReg), AM);
38141
38142 setDirectAddressInInstr(&MI, 0, computedAddrVReg);
38143
38144 return BB;
38145 }
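  // Editor's sketch (illustrative, register names hypothetical): the rewrite
  // performed above turns
  //   lock cmpxchg8b X(%reg1, %reg2, Y)
  // into
  //   leal X(%reg1, %reg2, Y), %vreg
  //   lock cmpxchg8b (%vreg)
  // so that regalloc only has to find one free register for the address.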
38146 case X86::LCMPXCHG16B_NO_RBX: {
38147 const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
38148 Register BasePtr = TRI->getBaseRegister();
38149 if (TRI->hasBasePointer(*MF) &&
38150 (BasePtr == X86::RBX || BasePtr == X86::EBX)) {
38151 if (!BB->isLiveIn(BasePtr))
38152 BB->addLiveIn(BasePtr);
38153 // Save RBX into a virtual register.
38154 Register SaveRBX =
38155 MF->getRegInfo().createVirtualRegister(&X86::GR64RegClass);
38156 BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), SaveRBX)
38157 .addReg(X86::RBX);
38158 Register Dst = MF->getRegInfo().createVirtualRegister(&X86::GR64RegClass);
38159 MachineInstrBuilder MIB =
38160 BuildMI(*BB, MI, DL, TII->get(X86::LCMPXCHG16B_SAVE_RBX), Dst);
38161 for (unsigned Idx = 0; Idx < X86::AddrNumOperands; ++Idx)
38162 MIB.add(MI.getOperand(Idx));
38163 MIB.add(MI.getOperand(X86::AddrNumOperands));
38164 MIB.addReg(SaveRBX);
38165 } else {
38166 // Simple case, just copy the virtual register to RBX.
38167 BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), X86::RBX)
38168 .add(MI.getOperand(X86::AddrNumOperands));
38169 MachineInstrBuilder MIB =
38170 BuildMI(*BB, MI, DL, TII->get(X86::LCMPXCHG16B));
38171 for (unsigned Idx = 0; Idx < X86::AddrNumOperands; ++Idx)
38172 MIB.add(MI.getOperand(Idx));
38173 }
38174 MI.eraseFromParent();
38175 return BB;
38176 }
38177 case X86::MWAITX: {
38178 const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
38179 Register BasePtr = TRI->getBaseRegister();
38180 bool IsRBX = (BasePtr == X86::RBX || BasePtr == X86::EBX);
38181     // If there is no need to save the base pointer, we generate MWAITXrrr;
38182     // otherwise we generate the pseudo MWAITX_SAVE_RBX.
38183 if (!IsRBX || !TRI->hasBasePointer(*MF)) {
38184 BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), X86::ECX)
38185 .addReg(MI.getOperand(0).getReg());
38186 BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), X86::EAX)
38187 .addReg(MI.getOperand(1).getReg());
38188 BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), X86::EBX)
38189 .addReg(MI.getOperand(2).getReg());
38190 BuildMI(*BB, MI, DL, TII->get(X86::MWAITXrrr));
38191 MI.eraseFromParent();
38192 } else {
38193 if (!BB->isLiveIn(BasePtr)) {
38194 BB->addLiveIn(BasePtr);
38195 }
38196 // Parameters can be copied into ECX and EAX but not EBX yet.
38197 BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), X86::ECX)
38198 .addReg(MI.getOperand(0).getReg());
38199 BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), X86::EAX)
38200 .addReg(MI.getOperand(1).getReg());
38201       assert(Subtarget.is64Bit() && "Expected 64-bit mode!");
38202 // Save RBX into a virtual register.
38203 Register SaveRBX =
38204 MF->getRegInfo().createVirtualRegister(&X86::GR64RegClass);
38205 BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), SaveRBX)
38206 .addReg(X86::RBX);
38207 // Generate mwaitx pseudo.
38208 Register Dst = MF->getRegInfo().createVirtualRegister(&X86::GR64RegClass);
38209 BuildMI(*BB, MI, DL, TII->get(X86::MWAITX_SAVE_RBX))
38210 .addDef(Dst) // Destination tied in with SaveRBX.
38211 .addReg(MI.getOperand(2).getReg()) // input value of EBX.
38212 .addUse(SaveRBX); // Save of base pointer.
38213 MI.eraseFromParent();
38214 }
38215 return BB;
38216 }
38217 case TargetOpcode::PREALLOCATED_SETUP: {
38218     assert(Subtarget.is32Bit() && "preallocated only used in 32-bit");
38219 auto MFI = MF->getInfo<X86MachineFunctionInfo>();
38220 MFI->setHasPreallocatedCall(true);
38221 int64_t PreallocatedId = MI.getOperand(0).getImm();
38222 size_t StackAdjustment = MFI->getPreallocatedStackSize(PreallocatedId);
38223     assert(StackAdjustment != 0 && "0 stack adjustment");
38224     LLVM_DEBUG(dbgs() << "PREALLOCATED_SETUP stack adjustment "
38225                       << StackAdjustment << "\n");
38226 BuildMI(*BB, MI, DL, TII->get(X86::SUB32ri), X86::ESP)
38227 .addReg(X86::ESP)
38228 .addImm(StackAdjustment);
38229 MI.eraseFromParent();
38230 return BB;
38231 }
38232 case TargetOpcode::PREALLOCATED_ARG: {
38233     assert(Subtarget.is32Bit() && "preallocated calls only used in 32-bit");
38234 int64_t PreallocatedId = MI.getOperand(1).getImm();
38235 int64_t ArgIdx = MI.getOperand(2).getImm();
38236 auto MFI = MF->getInfo<X86MachineFunctionInfo>();
38237 size_t ArgOffset = MFI->getPreallocatedArgOffsets(PreallocatedId)[ArgIdx];
38238     LLVM_DEBUG(dbgs() << "PREALLOCATED_ARG arg index " << ArgIdx
38239                       << ", arg offset " << ArgOffset << "\n");
38240 // stack pointer + offset
38241 addRegOffset(
38242 BuildMI(*BB, MI, DL, TII->get(X86::LEA32r), MI.getOperand(0).getReg()),
38243 X86::ESP, false, ArgOffset);
38244 MI.eraseFromParent();
38245 return BB;
38246 }
38247 case X86::PTDPBSSD:
38248 case X86::PTDPBSUD:
38249 case X86::PTDPBUSD:
38250 case X86::PTDPBUUD:
38251 case X86::PTDPBF16PS:
38252 case X86::PTDPFP16PS: {
38253 unsigned Opc;
38254 switch (MI.getOpcode()) {
38255     default: llvm_unreachable("illegal opcode!");
38256 case X86::PTDPBSSD: Opc = X86::TDPBSSD; break;
38257 case X86::PTDPBSUD: Opc = X86::TDPBSUD; break;
38258 case X86::PTDPBUSD: Opc = X86::TDPBUSD; break;
38259 case X86::PTDPBUUD: Opc = X86::TDPBUUD; break;
38260 case X86::PTDPBF16PS: Opc = X86::TDPBF16PS; break;
38261 case X86::PTDPFP16PS: Opc = X86::TDPFP16PS; break;
38262 }
38263
38264 MachineInstrBuilder MIB = BuildMI(*BB, MI, DL, TII->get(Opc));
38265 MIB.addReg(TMMImmToTMMReg(MI.getOperand(0).getImm()), RegState::Define);
38266 MIB.addReg(TMMImmToTMMReg(MI.getOperand(0).getImm()), RegState::Undef);
38267 MIB.addReg(TMMImmToTMMReg(MI.getOperand(1).getImm()), RegState::Undef);
38268 MIB.addReg(TMMImmToTMMReg(MI.getOperand(2).getImm()), RegState::Undef);
38269
38270 MI.eraseFromParent(); // The pseudo is gone now.
38271 return BB;
38272 }
38273 case X86::PTILEZERO: {
38274 unsigned Imm = MI.getOperand(0).getImm();
38275 BuildMI(*BB, MI, DL, TII->get(X86::TILEZERO), TMMImmToTMMReg(Imm));
38276 MI.eraseFromParent(); // The pseudo is gone now.
38277 return BB;
38278 }
38279 case X86::PTILELOADD:
38280 case X86::PTILELOADDT1:
38281 case X86::PTILESTORED: {
38282 unsigned Opc;
38283 switch (MI.getOpcode()) {
38284     default: llvm_unreachable("illegal opcode!");
38285 case X86::PTILELOADD: Opc = X86::TILELOADD; break;
38286 case X86::PTILELOADDT1: Opc = X86::TILELOADDT1; break;
38287 case X86::PTILESTORED: Opc = X86::TILESTORED; break;
38288 }
38289
38290 MachineInstrBuilder MIB = BuildMI(*BB, MI, DL, TII->get(Opc));
38291 unsigned CurOp = 0;
38292 if (Opc != X86::TILESTORED)
38293 MIB.addReg(TMMImmToTMMReg(MI.getOperand(CurOp++).getImm()),
38294 RegState::Define);
38295
38296 MIB.add(MI.getOperand(CurOp++)); // base
38297 MIB.add(MI.getOperand(CurOp++)); // scale
38298 MIB.add(MI.getOperand(CurOp++)); // index -- stride
38299 MIB.add(MI.getOperand(CurOp++)); // displacement
38300 MIB.add(MI.getOperand(CurOp++)); // segment
38301
38302 if (Opc == X86::TILESTORED)
38303 MIB.addReg(TMMImmToTMMReg(MI.getOperand(CurOp++).getImm()),
38304 RegState::Undef);
38305
38306 MI.eraseFromParent(); // The pseudo is gone now.
38307 return BB;
38308 }
38309 case X86::PTCMMIMFP16PS:
38310 case X86::PTCMMRLFP16PS: {
38311 const DebugLoc &DL = MI.getDebugLoc();
38312 unsigned Opc;
38313 switch (MI.getOpcode()) {
38314     default: llvm_unreachable("Unexpected instruction!");
38315 case X86::PTCMMIMFP16PS: Opc = X86::TCMMIMFP16PS; break;
38316 case X86::PTCMMRLFP16PS: Opc = X86::TCMMRLFP16PS; break;
38317 }
38318 MachineInstrBuilder MIB = BuildMI(*BB, MI, DL, TII->get(Opc));
38319 MIB.addReg(TMMImmToTMMReg(MI.getOperand(0).getImm()), RegState::Define);
38320 MIB.addReg(TMMImmToTMMReg(MI.getOperand(0).getImm()), RegState::Undef);
38321 MIB.addReg(TMMImmToTMMReg(MI.getOperand(1).getImm()), RegState::Undef);
38322 MIB.addReg(TMMImmToTMMReg(MI.getOperand(2).getImm()), RegState::Undef);
38323 MI.eraseFromParent(); // The pseudo is gone now.
38324 return BB;
38325 }
38326 }
38327}
38328
38329//===----------------------------------------------------------------------===//
38330// X86 Optimization Hooks
38331//===----------------------------------------------------------------------===//
38332
38333bool
38334X86TargetLowering::targetShrinkDemandedConstant(SDValue Op,
38335 const APInt &DemandedBits,
38336 const APInt &DemandedElts,
38337 TargetLoweringOpt &TLO) const {
38338 EVT VT = Op.getValueType();
38339 unsigned Opcode = Op.getOpcode();
38340 unsigned EltSize = VT.getScalarSizeInBits();
38341
38342 if (VT.isVector()) {
38343     // If the constant is all sign bits within the active bits, then we
38344     // should sign-extend it to the full element width so that it can act
38345     // as a boolean constant vector.
38346 auto NeedsSignExtension = [&](SDValue V, unsigned ActiveBits) {
38347 if (!ISD::isBuildVectorOfConstantSDNodes(V.getNode()))
38348 return false;
38349 for (unsigned i = 0, e = V.getNumOperands(); i != e; ++i) {
38350 if (!DemandedElts[i] || V.getOperand(i).isUndef())
38351 continue;
38352 const APInt &Val = V.getConstantOperandAPInt(i);
38353 if (Val.getBitWidth() > Val.getNumSignBits() &&
38354 Val.trunc(ActiveBits).getNumSignBits() == ActiveBits)
38355 return true;
38356 }
38357 return false;
38358 };
38359 // For vectors - if we have a constant, then try to sign extend.
38360 // TODO: Handle AND/ANDN cases.
38361 unsigned ActiveBits = DemandedBits.getActiveBits();
38362 if (EltSize > ActiveBits && EltSize > 1 && isTypeLegal(VT) &&
38363 (Opcode == ISD::OR || Opcode == ISD::XOR) &&
38364 NeedsSignExtension(Op.getOperand(1), ActiveBits)) {
38365 EVT ExtSVT = EVT::getIntegerVT(*TLO.DAG.getContext(), ActiveBits);
38366 EVT ExtVT = EVT::getVectorVT(*TLO.DAG.getContext(), ExtSVT,
38367 VT.getVectorNumElements());
38368 SDValue NewC =
38369 TLO.DAG.getNode(ISD::SIGN_EXTEND_INREG, SDLoc(Op), VT,
38370 Op.getOperand(1), TLO.DAG.getValueType(ExtVT));
38371 SDValue NewOp =
38372 TLO.DAG.getNode(Opcode, SDLoc(Op), VT, Op.getOperand(0), NewC);
38373 return TLO.CombineTo(Op, NewOp);
38374 }
38375 return false;
38376 }
38377
38378 // Only optimize Ands to prevent shrinking a constant that could be
38379 // matched by movzx.
38380 if (Opcode != ISD::AND)
38381 return false;
38382
38383 // Make sure the RHS really is a constant.
38384 ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1));
38385 if (!C)
38386 return false;
38387
38388 const APInt &Mask = C->getAPIntValue();
38389
38390 // Clear all non-demanded bits initially.
38391 APInt ShrunkMask = Mask & DemandedBits;
38392
38393 // Find the width of the shrunk mask.
38394 unsigned Width = ShrunkMask.getActiveBits();
38395
38396 // If the mask is all 0s there's nothing to do here.
38397 if (Width == 0)
38398 return false;
38399
38400 // Find the next power of 2 width, rounding up to a byte.
38401 Width = llvm::bit_ceil(std::max(Width, 8U));
38402 // Truncate the width to size to handle illegal types.
38403 Width = std::min(Width, EltSize);
38404
38405 // Calculate a possible zero extend mask for this constant.
38406 APInt ZeroExtendMask = APInt::getLowBitsSet(EltSize, Width);
38407
38408 // If we aren't changing the mask, just return true to keep it and prevent
38409 // the caller from optimizing.
38410 if (ZeroExtendMask == Mask)
38411 return true;
38412
38413 // Make sure the new mask can be represented by a combination of mask bits
38414 // and non-demanded bits.
38415 if (!ZeroExtendMask.isSubsetOf(Mask | ~DemandedBits))
38416 return false;
38417
38418 // Replace the constant with the zero extend mask.
38419 SDLoc DL(Op);
38420 SDValue NewC = TLO.DAG.getConstant(ZeroExtendMask, DL, VT);
38421 SDValue NewOp = TLO.DAG.getNode(ISD::AND, DL, VT, Op.getOperand(0), NewC);
38422 return TLO.CombineTo(Op, NewOp);
38423}
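// Editor's sketch (not part of the original source): one concrete instance of
// the widening above for a 32-bit AND. With Mask = 0x3FFC and
// DemandedBits = 0x0FF0, the shrunk mask 0x0FF0 has 12 active bits, which
// rounds up to a 16-bit width, so the AND constant becomes 0xFFFF -- a mask
// that movzwl can match. The asserts below only restate that arithmetic.
static_assert((0x3FFCu & 0x0FF0u) == 0x0FF0u,
              "shrunk mask keeps exactly the demanded bits in this example");
static_assert((0xFFFFu & ~(0x3FFCu | ~0x0FF0u)) == 0u,
              "0xFFFF only adds bits that are undemanded or already masked");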
38424
38425void X86TargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
38426 KnownBits &Known,
38427 const APInt &DemandedElts,
38428 const SelectionDAG &DAG,
38429 unsigned Depth) const {
38430 unsigned BitWidth = Known.getBitWidth();
38431 unsigned NumElts = DemandedElts.getBitWidth();
38432 unsigned Opc = Op.getOpcode();
38433 EVT VT = Op.getValueType();
38434   assert((Opc >= ISD::BUILTIN_OP_END ||
38435           Opc == ISD::INTRINSIC_WO_CHAIN ||
38436           Opc == ISD::INTRINSIC_W_CHAIN ||
38437           Opc == ISD::INTRINSIC_VOID) &&
38438          "Should use MaskedValueIsZero if you don't know whether Op"
38439          " is a target node!");
38440
38441 Known.resetAll();
38442 switch (Opc) {
38443 default: break;
38444 case X86ISD::MUL_IMM: {
38445 KnownBits Known2;
38446 Known = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
38447 Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
38448 Known = KnownBits::mul(Known, Known2);
38449 break;
38450 }
38451 case X86ISD::SETCC:
38452 Known.Zero.setBitsFrom(1);
38453 break;
38454 case X86ISD::MOVMSK: {
38455 unsigned NumLoBits = Op.getOperand(0).getValueType().getVectorNumElements();
38456 Known.Zero.setBitsFrom(NumLoBits);
38457 break;
38458 }
38459 case X86ISD::PEXTRB:
38460 case X86ISD::PEXTRW: {
38461 SDValue Src = Op.getOperand(0);
38462 EVT SrcVT = Src.getValueType();
38463 APInt DemandedElt = APInt::getOneBitSet(SrcVT.getVectorNumElements(),
38464 Op.getConstantOperandVal(1));
38465 Known = DAG.computeKnownBits(Src, DemandedElt, Depth + 1);
38466 Known = Known.anyextOrTrunc(BitWidth);
38467 Known.Zero.setBitsFrom(SrcVT.getScalarSizeInBits());
38468 break;
38469 }
38470 case X86ISD::VSRAI:
38471 case X86ISD::VSHLI:
38472 case X86ISD::VSRLI: {
38473 unsigned ShAmt = Op.getConstantOperandVal(1);
38474 if (ShAmt >= VT.getScalarSizeInBits()) {
38475 // Out of range logical bit shifts are guaranteed to be zero.
38476 // Out of range arithmetic bit shifts splat the sign bit.
38477 if (Opc != X86ISD::VSRAI) {
38478 Known.setAllZero();
38479 break;
38480 }
38481
38482 ShAmt = VT.getScalarSizeInBits() - 1;
38483 }
38484
38485 Known = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
38486 if (Opc == X86ISD::VSHLI) {
38487 Known.Zero <<= ShAmt;
38488 Known.One <<= ShAmt;
38489 // Low bits are known zero.
38490 Known.Zero.setLowBits(ShAmt);
38491 } else if (Opc == X86ISD::VSRLI) {
38492 Known.Zero.lshrInPlace(ShAmt);
38493 Known.One.lshrInPlace(ShAmt);
38494 // High bits are known zero.
38495 Known.Zero.setHighBits(ShAmt);
38496 } else {
38497 Known.Zero.ashrInPlace(ShAmt);
38498 Known.One.ashrInPlace(ShAmt);
38499 }
38500 break;
38501 }
38502 case X86ISD::PACKUS: {
38503 // PACKUS is just a truncation if the upper half is zero.
38504 APInt DemandedLHS, DemandedRHS;
38505 getPackDemandedElts(VT, DemandedElts, DemandedLHS, DemandedRHS);
38506
38507 Known.One = APInt::getAllOnes(BitWidth * 2);
38508 Known.Zero = APInt::getAllOnes(BitWidth * 2);
38509
38510 KnownBits Known2;
38511 if (!!DemandedLHS) {
38512 Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedLHS, Depth + 1);
38513 Known = KnownBits::commonBits(Known, Known2);
38514 }
38515 if (!!DemandedRHS) {
38516 Known2 = DAG.computeKnownBits(Op.getOperand(1), DemandedRHS, Depth + 1);
38517 Known = KnownBits::commonBits(Known, Known2);
38518 }
38519
38520 if (Known.countMinLeadingZeros() < BitWidth)
38521 Known.resetAll();
38522 Known = Known.trunc(BitWidth);
38523 break;
38524 }
38525 case X86ISD::VBROADCAST: {
38526 SDValue Src = Op.getOperand(0);
38527 if (!Src.getSimpleValueType().isVector()) {
38528 Known = DAG.computeKnownBits(Src, Depth + 1);
38529 return;
38530 }
38531 break;
38532 }
38533 case X86ISD::AND: {
38534 if (Op.getResNo() == 0) {
38535 KnownBits Known2;
38536 Known = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
38537 Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
38538 Known &= Known2;
38539 }
38540 break;
38541 }
38542 case X86ISD::ANDNP: {
38543 KnownBits Known2;
38544 Known = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
38545 Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
38546
38547 // ANDNP = (~X & Y);
38548 Known.One &= Known2.Zero;
38549 Known.Zero |= Known2.One;
38550 break;
38551 }
38552 case X86ISD::FOR: {
38553 KnownBits Known2;
38554 Known = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
38555 Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
38556
38557 Known |= Known2;
38558 break;
38559 }
38560 case X86ISD::PSADBW: {
38561 assert(VT.getScalarType() == MVT::i64 &&
38562 Op.getOperand(0).getValueType().getScalarType() == MVT::i8 &&
38563 "Unexpected PSADBW types");
38564
38565 // PSADBW - fills low 16 bits and zeros upper 48 bits of each i64 result.
38566 Known.Zero.setBitsFrom(16);
38567 break;
38568 }
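// [Editor's note] Illustrative sketch, not part of the analyzed file: the
// "low 16 bits" bound above follows because each i64 PSADBW lane holds the
// sum of eight absolute differences of unsigned bytes, so it can never
// exceed 8 * 255 = 2040.
static_assert(8 * 255 == 2040, "maximum value of one PSADBW i64 lane");
static_assert(2040 < (1 << 16), "so bits 16..63 of each lane are always zero");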
38569 case X86ISD::PMULUDQ: {
38570 KnownBits Known2;
38571 Known = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
38572 Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
38573
38574 Known = Known.trunc(BitWidth / 2).zext(BitWidth);
38575 Known2 = Known2.trunc(BitWidth / 2).zext(BitWidth);
38576 Known = KnownBits::mul(Known, Known2);
38577 break;
38578 }
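// [Editor's note] Illustrative sketch, not part of the analyzed file; the
// helper name is invented. PMULUDQ multiplies only the low 32 bits of each
// 64-bit lane, zero-extended, which is what the trunc(BitWidth / 2) and
// zext(BitWidth) steps above model.
static unsigned long long pmuludqLane(unsigned long long A,
                                      unsigned long long B) {
  // Each factor is at most 2^32 - 1, so the 64-bit product cannot overflow.
  return (A & 0xffffffffULL) * (B & 0xffffffffULL);
}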
38579 case X86ISD::CMOV: {
38580 Known = DAG.computeKnownBits(Op.getOperand(1), Depth + 1);
38581 // If we don't know any bits, early out.
38582 if (Known.isUnknown())
38583 break;
38584 KnownBits Known2 = DAG.computeKnownBits(Op.getOperand(0), Depth + 1);
38585
38586 // Only known if known in both the LHS and RHS.
38587 Known = KnownBits::commonBits(Known, Known2);
38588 break;
38589 }
38590 case X86ISD::BEXTR:
38591 case X86ISD::BEXTRI: {
38592 SDValue Op0 = Op.getOperand(0);
38593 SDValue Op1 = Op.getOperand(1);
38594
38595 if (auto* Cst1 = dyn_cast<ConstantSDNode>(Op1)) {
38596 unsigned Shift = Cst1->getAPIntValue().extractBitsAsZExtValue(8, 0);
38597 unsigned Length = Cst1->getAPIntValue().extractBitsAsZExtValue(8, 8);
38598
38599 // If the length is 0, the result is 0.
38600 if (Length == 0) {
38601 Known.setAllZero();
38602 break;
38603 }
38604
38605 if ((Shift + Length) <= BitWidth) {
38606 Known = DAG.computeKnownBits(Op0, Depth + 1);
38607 Known = Known.extractBits(Length, Shift);
38608 Known = Known.zextOrTrunc(BitWidth);
38609 }
38610 }
38611 break;
38612 }
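// [Editor's note] Illustrative sketch, not part of the analyzed file; the
// helper name is invented. It shows the BEXTR control layout the code above
// decodes: bits [7:0] hold the start (shift) and bits [15:8] the length, and
// a zero length always produces a zero result.
static unsigned long long bextr64(unsigned long long Src,
                                  unsigned long long Control) {
  unsigned Shift = Control & 0xff;
  unsigned Length = (Control >> 8) & 0xff;
  if (Length == 0 || Shift >= 64)
    return 0; // nothing selected, or the field starts past the operand
  unsigned long long Field = Src >> Shift;
  return Length >= 64 ? Field : Field & ((1ULL << Length) - 1);
}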
38613 case X86ISD::PDEP: {
38614 KnownBits Known2;
38615 Known = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
38616 Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
38617 // Zeros are retained from the mask operand. But not ones.
38618 Known.One.clearAllBits();
38619 // The result will have at least as many trailing zeros as the non-mask
38620 // operand since bits can only map to the same or higher bit position.
38621 Known.Zero.setLowBits(Known2.countMinTrailingZeros());
38622 break;
38623 }
38624 case X86ISD::PEXT: {
38625 Known = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
38626 // The result has as many leading zeros as the number of zeroes in the mask.
38627 unsigned Count = Known.Zero.popcount();
38628 Known.Zero = APInt::getHighBitsSet(BitWidth, Count);
38629 Known.One.clearAllBits();
38630 break;
38631 }
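// [Editor's note] Illustrative sketch, not part of the analyzed file; the
// helper name is invented. PEXT packs the mask-selected bits of Src into the
// low-order result bits, so at most popcount(Mask) bits can ever be set and
// everything above them is zero; that is the bound the code above derives
// from the mask operand's known-zero bits.
static unsigned long long pext64(unsigned long long Src,
                                 unsigned long long Mask) {
  unsigned long long Result = 0;
  for (unsigned DstBit = 0; Mask != 0; Mask &= Mask - 1, ++DstBit) {
    unsigned long long LowestMaskBit = Mask & -Mask;
    if (Src & LowestMaskBit)
      Result |= 1ULL << DstBit;
  }
  return Result;
}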
38632 case X86ISD::VTRUNC:
38633 case X86ISD::VTRUNCS:
38634 case X86ISD::VTRUNCUS:
38635 case X86ISD::CVTSI2P:
38636 case X86ISD::CVTUI2P:
38637 case X86ISD::CVTP2SI:
38638 case X86ISD::CVTP2UI:
38639 case X86ISD::MCVTP2SI:
38640 case X86ISD::MCVTP2UI:
38641 case X86ISD::CVTTP2SI:
38642 case X86ISD::CVTTP2UI:
38643 case X86ISD::MCVTTP2SI:
38644 case X86ISD::MCVTTP2UI:
38645 case X86ISD::MCVTSI2P:
38646 case X86ISD::MCVTUI2P:
38647 case X86ISD::VFPROUND:
38648 case X86ISD::VMFPROUND:
38649 case X86ISD::CVTPS2PH:
38650 case X86ISD::MCVTPS2PH: {
38651 // Truncations/Conversions - upper elements are known zero.
38652 EVT SrcVT = Op.getOperand(0).getValueType();
38653 if (SrcVT.isVector()) {
38654 unsigned NumSrcElts = SrcVT.getVectorNumElements();
38655 if (NumElts > NumSrcElts && DemandedElts.countr_zero() >= NumSrcElts)
38656 Known.setAllZero();
38657 }
38658 break;
38659 }
38660 case X86ISD::STRICT_CVTTP2SI:
38661 case X86ISD::STRICT_CVTTP2UI:
38662 case X86ISD::STRICT_CVTSI2P:
38663 case X86ISD::STRICT_CVTUI2P:
38664 case X86ISD::STRICT_VFPROUND:
38665 case X86ISD::STRICT_CVTPS2PH: {
38666 // Strict Conversions - upper elements are known zero.
38667 EVT SrcVT = Op.getOperand(1).getValueType();
38668 if (SrcVT.isVector()) {
38669 unsigned NumSrcElts = SrcVT.getVectorNumElements();
38670 if (NumElts > NumSrcElts && DemandedElts.countr_zero() >= NumSrcElts)
38671 Known.setAllZero();
38672 }
38673 break;
38674 }
38675 case X86ISD::MOVQ2DQ: {
38676 // Move from MMX to XMM. Upper half of XMM should be 0.
38677 if (DemandedElts.countr_zero() >= (NumElts / 2))
38678 Known.setAllZero();
38679 break;
38680 }
38681 case X86ISD::VBROADCAST_LOAD: {
38682 APInt UndefElts;
38683 SmallVector<APInt, 16> EltBits;
38684 if (getTargetConstantBitsFromNode(Op, BitWidth, UndefElts, EltBits,
38685 /*AllowWholeUndefs*/ false,
38686 /*AllowPartialUndefs*/ false)) {
38687 Known.Zero.setAllBits();
38688 Known.One.setAllBits();
38689 for (unsigned I = 0; I != NumElts; ++I) {
38690 if (!DemandedElts[I])
38691 continue;
38692 if (UndefElts[I]) {
38693 Known.resetAll();
38694 break;
38695 }
38696 KnownBits Known2 = KnownBits::makeConstant(EltBits[I]);
38697 Known = KnownBits::commonBits(Known, Known2);
38698 }
38699 return;
38700 }
38701 break;
38702 }
38703 }
38704
38705 // Handle target shuffles.
38706 // TODO - use resolveTargetShuffleInputs once we can limit recursive depth.
38707 if (isTargetShuffle(Opc)) {
38708 SmallVector<int, 64> Mask;
38709 SmallVector<SDValue, 2> Ops;
38710 if (getTargetShuffleMask(Op.getNode(), VT.getSimpleVT(), true, Ops, Mask)) {
38711 unsigned NumOps = Ops.size();
38712 unsigned NumElts = VT.getVectorNumElements();
38713 if (Mask.size() == NumElts) {
38714 SmallVector<APInt, 2> DemandedOps(NumOps, APInt(NumElts, 0));
38715 Known.Zero.setAllBits(); Known.One.setAllBits();
38716 for (unsigned i = 0; i != NumElts; ++i) {
38717 if (!DemandedElts[i])
38718 continue;
38719 int M = Mask[i];
38720 if (M == SM_SentinelUndef) {
38721 // For UNDEF elements, we don't know anything about the common state
38722 // of the shuffle result.
38723 Known.resetAll();
38724 break;
38725 }
38726 if (M == SM_SentinelZero) {
38727 Known.One.clearAllBits();
38728 continue;
38729 }
38730 assert(0 <= M && (unsigned)M < (NumOps * NumElts) &&
38731 "Shuffle index out of range");
38732
38733 unsigned OpIdx = (unsigned)M / NumElts;
38734 unsigned EltIdx = (unsigned)M % NumElts;
38735 if (Ops[OpIdx].getValueType() != VT) {
38736 // TODO - handle target shuffle ops with different value types.
38737 Known.resetAll();
38738 break;
38739 }
38740 DemandedOps[OpIdx].setBit(EltIdx);
38741 }
38742 // Known bits are the values that are shared by every demanded element.
38743 for (unsigned i = 0; i != NumOps && !Known.isUnknown(); ++i) {
38744 if (!DemandedOps[i])
38745 continue;
38746 KnownBits Known2 =
38747 DAG.computeKnownBits(Ops[i], DemandedOps[i], Depth + 1);
38748 Known = KnownBits::commonBits(Known, Known2);
38749 }
38750 }
38751 }
38752 }
38753}
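// [Editor's note] Illustrative sketch, not part of the analyzed file; the
// struct and helper names are invented. The target-shuffle handling above
// keeps a bit "known" only when every contributing operand agrees on it,
// which is what KnownBits::commonBits computes: a per-bit intersection.
struct KnownBits64 {
  unsigned long long Zero = 0; // bits proven to be 0
  unsigned long long One = 0;  // bits proven to be 1
};
static KnownBits64 commonBits64(KnownBits64 A, KnownBits64 B) {
  return {A.Zero & B.Zero, A.One & B.One};
}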
38754
38755unsigned X86TargetLowering::ComputeNumSignBitsForTargetNode(
38756 SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG,
38757 unsigned Depth) const {
38758 EVT VT = Op.getValueType();
38759 unsigned VTBits = VT.getScalarSizeInBits();
38760 unsigned Opcode = Op.getOpcode();
38761 switch (Opcode) {
38762 case X86ISD::SETCC_CARRY:
38763 // SETCC_CARRY sets the dest to ~0 for true or 0 for false.
38764 return VTBits;
38765
38766 case X86ISD::VTRUNC: {
38767 SDValue Src = Op.getOperand(0);
38768 MVT SrcVT = Src.getSimpleValueType();
38769 unsigned NumSrcBits = SrcVT.getScalarSizeInBits();
38770 assert(VTBits < NumSrcBits && "Illegal truncation input type");
38771 APInt DemandedSrc = DemandedElts.zextOrTrunc(SrcVT.getVectorNumElements());
38772 unsigned Tmp = DAG.ComputeNumSignBits(Src, DemandedSrc, Depth + 1);
38773 if (Tmp > (NumSrcBits - VTBits))
38774 return Tmp - (NumSrcBits - VTBits);
38775 return 1;
38776 }
38777
38778 case X86ISD::PACKSS: {
38779 // PACKSS is just a truncation if the sign bits extend to the packed size.
38780 APInt DemandedLHS, DemandedRHS;
38781 getPackDemandedElts(Op.getValueType(), DemandedElts, DemandedLHS,
38782 DemandedRHS);
38783
38784 unsigned SrcBits = Op.getOperand(0).getScalarValueSizeInBits();
38785 unsigned Tmp0 = SrcBits, Tmp1 = SrcBits;
38786 if (!!DemandedLHS)
38787 Tmp0 = DAG.ComputeNumSignBits(Op.getOperand(0), DemandedLHS, Depth + 1);
38788 if (!!DemandedRHS)
38789 Tmp1 = DAG.ComputeNumSignBits(Op.getOperand(1), DemandedRHS, Depth + 1);
38790 unsigned Tmp = std::min(Tmp0, Tmp1);
38791 if (Tmp > (SrcBits - VTBits))
38792 return Tmp - (SrcBits - VTBits);
38793 return 1;
38794 }
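// [Editor's note] Illustrative sketch, not part of the analyzed file: a
// worked instance of the arithmetic above. Packing 32-bit source elements
// that have at least 20 known sign bits down to 16-bit results leaves
// 20 - (32 - 16) = 4 known sign bits in each packed element.
static_assert(20 - (32 - 16) == 4, "sign bits surviving the PACKSS pack");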
38795
38796 case X86ISD::VBROADCAST: {
38797 SDValue Src = Op.getOperand(0);
38798 if (!Src.getSimpleValueType().isVector())
38799 return DAG.ComputeNumSignBits(Src, Depth + 1);
38800 break;
38801 }
38802
38803 case X86ISD::VSHLI: {
38804 SDValue Src = Op.getOperand(0);
38805 const APInt &ShiftVal = Op.getConstantOperandAPInt(1);
38806 if (ShiftVal.uge(VTBits))
38807 return VTBits; // Shifted all bits out --> zero.
38808 unsigned Tmp = DAG.ComputeNumSignBits(Src, DemandedElts, Depth + 1);
38809 if (ShiftVal.uge(Tmp))
38810 return 1; // Shifted all sign bits out --> unknown.
38811 return Tmp - ShiftVal.getZExtValue();
38812 }
38813
38814 case X86ISD::VSRAI: {
38815 SDValue Src = Op.getOperand(0);
38816 APInt ShiftVal = Op.getConstantOperandAPInt(1);
38817 if (ShiftVal.uge(VTBits - 1))
38818 return VTBits; // Sign splat.
38819 unsigned Tmp = DAG.ComputeNumSignBits(Src, DemandedElts, Depth + 1);
38820 ShiftVal += Tmp;
38821 return ShiftVal.uge(VTBits) ? VTBits : ShiftVal.getZExtValue();
38822 }
38823
38824 case X86ISD::FSETCC:
38825 // cmpss/cmpsd return zero/all-bits result values in the bottom element.
38826 if (VT == MVT::f32 || VT == MVT::f64 ||
38827 ((VT == MVT::v4f32 || VT == MVT::v2f64) && DemandedElts == 1))
38828 return VTBits;
38829 break;
38830
38831 case X86ISD::PCMPGT:
38832 case X86ISD::PCMPEQ:
38833 case X86ISD::CMPP:
38834 case X86ISD::VPCOM:
38835 case X86ISD::VPCOMU:
38836 // Vector compares return zero/all-bits result values.
38837 return VTBits;
38838
38839 case X86ISD::ANDNP: {
38840 unsigned Tmp0 =
38841 DAG.ComputeNumSignBits(Op.getOperand(0), DemandedElts, Depth + 1);
38842 if (Tmp0 == 1) return 1; // Early out.
38843 unsigned Tmp1 =
38844 DAG.ComputeNumSignBits(Op.getOperand(1), DemandedElts, Depth + 1);
38845 return std::min(Tmp0, Tmp1);
38846 }
38847
38848 case X86ISD::CMOV: {
38849 unsigned Tmp0 = DAG.ComputeNumSignBits(Op.getOperand(0), Depth+1);
38850 if (Tmp0 == 1) return 1; // Early out.
38851 unsigned Tmp1 = DAG.ComputeNumSignBits(Op.getOperand(1), Depth+1);
38852 return std::min(Tmp0, Tmp1);
38853 }
38854 }
38855
38856 // Handle target shuffles.
38857 // TODO - use resolveTargetShuffleInputs once we can limit recursive depth.
38858 if (isTargetShuffle(Opcode)) {
38859 SmallVector<int, 64> Mask;
38860 SmallVector<SDValue, 2> Ops;
38861 if (getTargetShuffleMask(Op.getNode(), VT.getSimpleVT(), true, Ops, Mask)) {
38862 unsigned NumOps = Ops.size();
38863 unsigned NumElts = VT.getVectorNumElements();
38864 if (Mask.size() == NumElts) {
38865 SmallVector<APInt, 2> DemandedOps(NumOps, APInt(NumElts, 0));
38866 for (unsigned i = 0; i != NumElts; ++i) {
38867 if (!DemandedElts[i])
38868 continue;
38869 int M = Mask[i];
38870 if (M == SM_SentinelUndef) {
38871 // For UNDEF elements, we don't know anything about the common state
38872 // of the shuffle result.
38873 return 1;
38874 } else if (M == SM_SentinelZero) {
38875 // Zero = all sign bits.
38876 continue;
38877 }
38878 assert(0 <= M && (unsigned)M < (NumOps * NumElts) &&
38879 "Shuffle index out of range");
38880
38881 unsigned OpIdx = (unsigned)M / NumElts;
38882 unsigned EltIdx = (unsigned)M % NumElts;
38883 if (Ops[OpIdx].getValueType() != VT) {
38884 // TODO - handle target shuffle ops with different value types.
38885 return 1;
38886 }
38887 DemandedOps[OpIdx].setBit(EltIdx);
38888 }
38889 unsigned Tmp0 = VTBits;
38890 for (unsigned i = 0; i != NumOps && Tmp0 > 1; ++i) {
38891 if (!DemandedOps[i])
38892 continue;
38893 unsigned Tmp1 =
38894 DAG.ComputeNumSignBits(Ops[i], DemandedOps[i], Depth + 1);
38895 Tmp0 = std::min(Tmp0, Tmp1);
38896 }
38897 return Tmp0;
38898 }
38899 }
38900 }
38901
38902 // Fallback case.
38903 return 1;
38904}
38905
38906SDValue X86TargetLowering::unwrapAddress(SDValue N) const {
38907 if (N->getOpcode() == X86ISD::Wrapper || N->getOpcode() == X86ISD::WrapperRIP)
38908 return N->getOperand(0);
38909 return N;
38910}
38911
38912// Helper to look for a normal load that can be narrowed into a vzload with the
38913// specified VT and memory VT. Returns SDValue() on failure.
38914static SDValue narrowLoadToVZLoad(LoadSDNode *LN, MVT MemVT, MVT VT,
38915 SelectionDAG &DAG) {
38916 // Can't if the load is volatile or atomic.
38917 if (!LN->isSimple())
38918 return SDValue();
38919
38920 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
38921 SDValue Ops[] = {LN->getChain(), LN->getBasePtr()};
38922 return DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, SDLoc(LN), Tys, Ops, MemVT,
38923 LN->getPointerInfo(), LN->getOriginalAlign(),
38924 LN->getMemOperand()->getFlags());
38925}
38926
38927// Attempt to match a combined shuffle mask against supported unary shuffle
38928// instructions.
38929// TODO: Investigate sharing more of this with shuffle lowering.
38930static bool matchUnaryShuffle(MVT MaskVT, ArrayRef<int> Mask,
38931 bool AllowFloatDomain, bool AllowIntDomain,
38932 SDValue V1, const SelectionDAG &DAG,
38933 const X86Subtarget &Subtarget, unsigned &Shuffle,
38934 MVT &SrcVT, MVT &DstVT) {
38935 unsigned NumMaskElts = Mask.size();
38936 unsigned MaskEltSize = MaskVT.getScalarSizeInBits();
38937
38938 // Match against a VZEXT_MOVL vXi32 and vXi16 zero-extending instruction.
38939 if (Mask[0] == 0 &&
38940 (MaskEltSize == 32 || (MaskEltSize == 16 && Subtarget.hasFP16()))) {
38941 if ((isUndefOrZero(Mask[1]) && isUndefInRange(Mask, 2, NumMaskElts - 2)) ||
38942 (V1.getOpcode() == ISD::SCALAR_TO_VECTOR &&
38943 isUndefOrZeroInRange(Mask, 1, NumMaskElts - 1))) {
38944 Shuffle = X86ISD::VZEXT_MOVL;
38945 if (MaskEltSize == 16)
38946 SrcVT = DstVT = MaskVT.changeVectorElementType(MVT::f16);
38947 else
38948 SrcVT = DstVT = !Subtarget.hasSSE2() ? MVT::v4f32 : MaskVT;
38949 return true;
38950 }
38951 }
38952
38953 // Match against an ANY/ZERO_EXTEND_VECTOR_INREG instruction.
38954 // TODO: Add 512-bit vector support (split AVX512F and AVX512BW).
38955 if (AllowIntDomain && ((MaskVT.is128BitVector() && Subtarget.hasSSE41()) ||
38956 (MaskVT.is256BitVector() && Subtarget.hasInt256()))) {
38957 unsigned MaxScale = 64 / MaskEltSize;
38958 for (unsigned Scale = 2; Scale <= MaxScale; Scale *= 2) {
38959 bool MatchAny = true;
38960 bool MatchZero = true;
38961 unsigned NumDstElts = NumMaskElts / Scale;
38962 for (unsigned i = 0; i != NumDstElts && (MatchAny || MatchZero); ++i) {
38963 if (!isUndefOrEqual(Mask[i * Scale], (int)i)) {
38964 MatchAny = MatchZero = false;
38965 break;
38966 }
38967 MatchAny &= isUndefInRange(Mask, (i * Scale) + 1, Scale - 1);
38968 MatchZero &= isUndefOrZeroInRange(Mask, (i * Scale) + 1, Scale - 1);
38969 }
38970 if (MatchAny || MatchZero) {
38971 assert(MatchZero && "Failed to match zext but matched aext?");
38972 unsigned SrcSize = std::max(128u, NumDstElts * MaskEltSize);
38973 MVT ScalarTy = MaskVT.isInteger() ? MaskVT.getScalarType() :
38974 MVT::getIntegerVT(MaskEltSize);
38975 SrcVT = MVT::getVectorVT(ScalarTy, SrcSize / MaskEltSize);
38976
38977 Shuffle = unsigned(MatchAny ? ISD::ANY_EXTEND : ISD::ZERO_EXTEND);
38978 if (SrcVT.getVectorNumElements() != NumDstElts)
38979 Shuffle = DAG.getOpcode_EXTEND_VECTOR_INREG(Shuffle);
38980
38981 DstVT = MVT::getIntegerVT(Scale * MaskEltSize);
38982 DstVT = MVT::getVectorVT(DstVT, NumDstElts);
38983 return true;
38984 }
38985 }
38986 }
38987
38988 // Match against a VZEXT_MOVL instruction; SSE1 only supports 32 bits (MOVSS).
38989 if (((MaskEltSize == 32) || (MaskEltSize == 64 && Subtarget.hasSSE2()) ||
38990 (MaskEltSize == 16 && Subtarget.hasFP16())) &&
38991 isUndefOrEqual(Mask[0], 0) &&
38992 isUndefOrZeroInRange(Mask, 1, NumMaskElts - 1)) {
38993 Shuffle = X86ISD::VZEXT_MOVL;
38994 if (MaskEltSize == 16)
38995 SrcVT = DstVT = MaskVT.changeVectorElementType(MVT::f16);
38996 else
38997 SrcVT = DstVT = !Subtarget.hasSSE2() ? MVT::v4f32 : MaskVT;
38998 return true;
38999 }
39000
39001 // Check if we have SSE3, which will let us use MOVDDUP etc. These
39002 // instructions are no slower than UNPCKLPD but have the option to
39003 // fold the input operand into even an unaligned memory load.
39004 if (MaskVT.is128BitVector() && Subtarget.hasSSE3() && AllowFloatDomain) {
39005 if (isTargetShuffleEquivalent(MaskVT, Mask, {0, 0}, DAG, V1)) {
39006 Shuffle = X86ISD::MOVDDUP;
39007 SrcVT = DstVT = MVT::v2f64;
39008 return true;
39009 }
39010 if (isTargetShuffleEquivalent(MaskVT, Mask, {0, 0, 2, 2}, DAG, V1)) {
39011 Shuffle = X86ISD::MOVSLDUP;
39012 SrcVT = DstVT = MVT::v4f32;
39013 return true;
39014 }
39015 if (isTargetShuffleEquivalent(MaskVT, Mask, {1, 1, 3, 3}, DAG, V1)) {
39016 Shuffle = X86ISD::MOVSHDUP;
39017 SrcVT = DstVT = MVT::v4f32;
39018 return true;
39019 }
39020 }
39021
39022 if (MaskVT.is256BitVector() && AllowFloatDomain) {
39023 assert(Subtarget.hasAVX() && "AVX required for 256-bit vector shuffles");
39024 if (isTargetShuffleEquivalent(MaskVT, Mask, {0, 0, 2, 2}, DAG, V1)) {
39025 Shuffle = X86ISD::MOVDDUP;
39026 SrcVT = DstVT = MVT::v4f64;
39027 return true;
39028 }
39029 if (isTargetShuffleEquivalent(MaskVT, Mask, {0, 0, 2, 2, 4, 4, 6, 6}, DAG,
39030 V1)) {
39031 Shuffle = X86ISD::MOVSLDUP;
39032 SrcVT = DstVT = MVT::v8f32;
39033 return true;
39034 }
39035 if (isTargetShuffleEquivalent(MaskVT, Mask, {1, 1, 3, 3, 5, 5, 7, 7}, DAG,
39036 V1)) {
39037 Shuffle = X86ISD::MOVSHDUP;
39038 SrcVT = DstVT = MVT::v8f32;
39039 return true;
39040 }
39041 }
39042
39043 if (MaskVT.is512BitVector() && AllowFloatDomain) {
39044 assert(Subtarget.hasAVX512() &&
39045 "AVX512 required for 512-bit vector shuffles");
39046 if (isTargetShuffleEquivalent(MaskVT, Mask, {0, 0, 2, 2, 4, 4, 6, 6}, DAG,
39047 V1)) {
39048 Shuffle = X86ISD::MOVDDUP;
39049 SrcVT = DstVT = MVT::v8f64;
39050 return true;
39051 }
39052 if (isTargetShuffleEquivalent(
39053 MaskVT, Mask,
39054 {0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14}, DAG, V1)) {
39055 Shuffle = X86ISD::MOVSLDUP;
39056 SrcVT = DstVT = MVT::v16f32;
39057 return true;
39058 }
39059 if (isTargetShuffleEquivalent(
39060 MaskVT, Mask,
39061 {1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15}, DAG, V1)) {
39062 Shuffle = X86ISD::MOVSHDUP;
39063 SrcVT = DstVT = MVT::v16f32;
39064 return true;
39065 }
39066 }
39067
39068 return false;
39069}
39070
39071// Attempt to match a combined shuffle mask against supported unary immediate
39072// permute instructions.
39073// TODO: Investigate sharing more of this with shuffle lowering.
39074static bool matchUnaryPermuteShuffle(MVT MaskVT, ArrayRef<int> Mask,
39075 const APInt &Zeroable,
39076 bool AllowFloatDomain, bool AllowIntDomain,
39077 const SelectionDAG &DAG,
39078 const X86Subtarget &Subtarget,
39079 unsigned &Shuffle, MVT &ShuffleVT,
39080 unsigned &PermuteImm) {
39081 unsigned NumMaskElts = Mask.size();
39082 unsigned InputSizeInBits = MaskVT.getSizeInBits();
39083 unsigned MaskScalarSizeInBits = InputSizeInBits / NumMaskElts;
39084 MVT MaskEltVT = MVT::getIntegerVT(MaskScalarSizeInBits);
39085 bool ContainsZeros = isAnyZero(Mask);
39086
39087 // Handle VPERMI/VPERMILPD vXi64/vXf64 patterns.
39088 if (!ContainsZeros && MaskScalarSizeInBits == 64) {
39089 // Check for lane crossing permutes.
39090 if (is128BitLaneCrossingShuffleMask(MaskEltVT, Mask)) {
39091 // PERMPD/PERMQ permutes within a 256-bit vector (AVX2+).
39092 if (Subtarget.hasAVX2() && MaskVT.is256BitVector()) {
39093 Shuffle = X86ISD::VPERMI;
39094 ShuffleVT = (AllowFloatDomain ? MVT::v4f64 : MVT::v4i64);
39095 PermuteImm = getV4X86ShuffleImm(Mask);
39096 return true;
39097 }
39098 if (Subtarget.hasAVX512() && MaskVT.is512BitVector()) {
39099 SmallVector<int, 4> RepeatedMask;
39100 if (is256BitLaneRepeatedShuffleMask(MVT::v8f64, Mask, RepeatedMask)) {
39101 Shuffle = X86ISD::VPERMI;
39102 ShuffleVT = (AllowFloatDomain ? MVT::v8f64 : MVT::v8i64);
39103 PermuteImm = getV4X86ShuffleImm(RepeatedMask);
39104 return true;
39105 }
39106 }
39107 } else if (AllowFloatDomain && Subtarget.hasAVX()) {
39108 // VPERMILPD can permute with a non-repeating shuffle.
39109 Shuffle = X86ISD::VPERMILPI;
39110 ShuffleVT = MVT::getVectorVT(MVT::f64, Mask.size());
39111 PermuteImm = 0;
39112 for (int i = 0, e = Mask.size(); i != e; ++i) {
39113 int M = Mask[i];
39114 if (M == SM_SentinelUndef)
39115 continue;
39116 assert(((M / 2) == (i / 2)) && "Out of range shuffle mask index");
39117 PermuteImm |= (M & 1) << i;
39118 }
39119 return true;
39120 }
39121 }
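// [Editor's note] Illustrative sketch, not part of the analyzed file; the
// helper name is invented. It mirrors how the loop above packs the VPERMILPD
// immediate: element i contributes the low bit of its mask index at bit i,
// so the mask {1, 0, 3, 2} encodes to 0b0101 = 0x5.
static unsigned encodeVPermilPdImm(const int *Mask, unsigned NumElts) {
  unsigned Imm = 0;
  for (unsigned I = 0; I != NumElts; ++I)
    if (Mask[I] >= 0)            // undef (negative) entries contribute nothing
      Imm |= (Mask[I] & 1u) << I;
  return Imm;
}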
39122
39123 // We are checking for shuffle match or shift match. Loop twice so we can
39124 // order which we try to match first depending on target preference.
39125 for (unsigned Order = 0; Order < 2; ++Order) {
39126 if (Subtarget.preferLowerShuffleAsShift() ? (Order == 1) : (Order == 0)) {
39127 // Handle PSHUFD/VPERMILPI vXi32/vXf32 repeated patterns.
39128 // AVX introduced the VPERMILPD/VPERMILPS float permutes, before then we
39129 // had to use 2-input SHUFPD/SHUFPS shuffles (not handled here).
39130 if ((MaskScalarSizeInBits == 64 || MaskScalarSizeInBits == 32) &&
39131 !ContainsZeros && (AllowIntDomain || Subtarget.hasAVX())) {
39132 SmallVector<int, 4> RepeatedMask;
39133 if (is128BitLaneRepeatedShuffleMask(MaskEltVT, Mask, RepeatedMask)) {
39134 // Narrow the repeated mask to create 32-bit element permutes.
39135 SmallVector<int, 4> WordMask = RepeatedMask;
39136 if (MaskScalarSizeInBits == 64)
39137 narrowShuffleMaskElts(2, RepeatedMask, WordMask);
39138
39139 Shuffle = (AllowIntDomain ? X86ISD::PSHUFD : X86ISD::VPERMILPI);
39140 ShuffleVT = (AllowIntDomain ? MVT::i32 : MVT::f32);
39141 ShuffleVT = MVT::getVectorVT(ShuffleVT, InputSizeInBits / 32);
39142 PermuteImm = getV4X86ShuffleImm(WordMask);
39143 return true;
39144 }
39145 }
39146
39147 // Handle PSHUFLW/PSHUFHW vXi16 repeated patterns.
39148 if (!ContainsZeros && AllowIntDomain && MaskScalarSizeInBits == 16 &&
39149 ((MaskVT.is128BitVector() && Subtarget.hasSSE2()) ||
39150 (MaskVT.is256BitVector() && Subtarget.hasAVX2()) ||
39151 (MaskVT.is512BitVector() && Subtarget.hasBWI()))) {
39152 SmallVector<int, 4> RepeatedMask;
39153 if (is128BitLaneRepeatedShuffleMask(MaskEltVT, Mask, RepeatedMask)) {
39154 ArrayRef<int> LoMask(RepeatedMask.data() + 0, 4);
39155 ArrayRef<int> HiMask(RepeatedMask.data() + 4, 4);
39156
39157 // PSHUFLW: permute lower 4 elements only.
39158 if (isUndefOrInRange(LoMask, 0, 4) &&
39159 isSequentialOrUndefInRange(HiMask, 0, 4, 4)) {
39160 Shuffle = X86ISD::PSHUFLW;
39161 ShuffleVT = MVT::getVectorVT(MVT::i16, InputSizeInBits / 16);
39162 PermuteImm = getV4X86ShuffleImm(LoMask);
39163 return true;
39164 }
39165
39166 // PSHUFHW: permute upper 4 elements only.
39167 if (isUndefOrInRange(HiMask, 4, 8) &&
39168 isSequentialOrUndefInRange(LoMask, 0, 4, 0)) {
39169 // Offset the HiMask so that we can create the shuffle immediate.
39170 int OffsetHiMask[4];
39171 for (int i = 0; i != 4; ++i)
39172 OffsetHiMask[i] = (HiMask[i] < 0 ? HiMask[i] : HiMask[i] - 4);
39173
39174 Shuffle = X86ISD::PSHUFHW;
39175 ShuffleVT = MVT::getVectorVT(MVT::i16, InputSizeInBits / 16);
39176 PermuteImm = getV4X86ShuffleImm(OffsetHiMask);
39177 return true;
39178 }
39179 }
39180 }
39181 } else {
39182 // Attempt to match against bit rotates.
39183 if (!ContainsZeros && AllowIntDomain && MaskScalarSizeInBits < 64 &&
39184 ((MaskVT.is128BitVector() && Subtarget.hasXOP()) ||
39185 Subtarget.hasAVX512())) {
39186 int RotateAmt = matchShuffleAsBitRotate(ShuffleVT, MaskScalarSizeInBits,
39187 Subtarget, Mask);
39188 if (0 < RotateAmt) {
39189 Shuffle = X86ISD::VROTLI;
39190 PermuteImm = (unsigned)RotateAmt;
39191 return true;
39192 }
39193 }
39194 }
39195 // Attempt to match against byte/bit shifts.
39196 if (AllowIntDomain &&
39197 ((MaskVT.is128BitVector() && Subtarget.hasSSE2()) ||
39198 (MaskVT.is256BitVector() && Subtarget.hasAVX2()) ||
39199 (MaskVT.is512BitVector() && Subtarget.hasAVX512()))) {
39200 int ShiftAmt =
39201 matchShuffleAsShift(ShuffleVT, Shuffle, MaskScalarSizeInBits, Mask, 0,
39202 Zeroable, Subtarget);
39203 if (0 < ShiftAmt && (!ShuffleVT.is512BitVector() || Subtarget.hasBWI() ||
39204 32 <= ShuffleVT.getScalarSizeInBits())) {
39205 // Byte shifts can be slower so only match them on second attempt.
39206 if (Order == 0 &&
39207 (Shuffle == X86ISD::VSHLDQ || Shuffle == X86ISD::VSRLDQ))
39208 continue;
39209
39210 PermuteImm = (unsigned)ShiftAmt;
39211 return true;
39212 }
39213
39214 }
39215 }
39216
39217 return false;
39218}
39219
39220// Attempt to match a combined unary shuffle mask against supported binary
39221// shuffle instructions.
39222// TODO: Investigate sharing more of this with shuffle lowering.
39223static bool matchBinaryShuffle(MVT MaskVT, ArrayRef<int> Mask,
39224 bool AllowFloatDomain, bool AllowIntDomain,
39225 SDValue &V1, SDValue &V2, const SDLoc &DL,
39226 SelectionDAG &DAG, const X86Subtarget &Subtarget,
39227 unsigned &Shuffle, MVT &SrcVT, MVT &DstVT,
39228 bool IsUnary) {
39229 unsigned NumMaskElts = Mask.size();
39230 unsigned EltSizeInBits = MaskVT.getScalarSizeInBits();
39231 unsigned SizeInBits = MaskVT.getSizeInBits();
39232
39233 if (MaskVT.is128BitVector()) {
39234 if (isTargetShuffleEquivalent(MaskVT, Mask, {0, 0}, DAG) &&
39235 AllowFloatDomain) {
39236 V2 = V1;
39237 V1 = (SM_SentinelUndef == Mask[0] ? DAG.getUNDEF(MVT::v4f32) : V1);
39238 Shuffle = Subtarget.hasSSE2() ? X86ISD::UNPCKL : X86ISD::MOVLHPS;
39239 SrcVT = DstVT = Subtarget.hasSSE2() ? MVT::v2f64 : MVT::v4f32;
39240 return true;
39241 }
39242 if (isTargetShuffleEquivalent(MaskVT, Mask, {1, 1}, DAG) &&
39243 AllowFloatDomain) {
39244 V2 = V1;
39245 Shuffle = Subtarget.hasSSE2() ? X86ISD::UNPCKH : X86ISD::MOVHLPS;
39246 SrcVT = DstVT = Subtarget.hasSSE2() ? MVT::v2f64 : MVT::v4f32;
39247 return true;
39248 }
39249 if (isTargetShuffleEquivalent(MaskVT, Mask, {0, 3}, DAG) &&
39250 Subtarget.hasSSE2() && (AllowFloatDomain || !Subtarget.hasSSE41())) {
39251 std::swap(V1, V2);
39252 Shuffle = X86ISD::MOVSD;
39253 SrcVT = DstVT = MVT::v2f64;
39254 return true;
39255 }
39256 if (isTargetShuffleEquivalent(MaskVT, Mask, {4, 1, 2, 3}, DAG) &&
39257 (AllowFloatDomain || !Subtarget.hasSSE41())) {
39258 Shuffle = X86ISD::MOVSS;
39259 SrcVT = DstVT = MVT::v4f32;
39260 return true;
39261 }
39262 if (isTargetShuffleEquivalent(MaskVT, Mask, {8, 1, 2, 3, 4, 5, 6, 7},
39263 DAG) &&
39264 Subtarget.hasFP16()) {
39265 Shuffle = X86ISD::MOVSH;
39266 SrcVT = DstVT = MVT::v8f16;
39267 return true;
39268 }
39269 }
39270
39271 // Attempt to match against either a unary or binary PACKSS/PACKUS shuffle.
39272 if (((MaskVT == MVT::v8i16 || MaskVT == MVT::v16i8) && Subtarget.hasSSE2()) ||
39273 ((MaskVT == MVT::v16i16 || MaskVT == MVT::v32i8) && Subtarget.hasInt256()) ||
39274 ((MaskVT == MVT::v32i16 || MaskVT == MVT::v64i8) && Subtarget.hasBWI())) {
39275 if (matchShuffleWithPACK(MaskVT, SrcVT, V1, V2, Shuffle, Mask, DAG,
39276 Subtarget)) {
39277 DstVT = MaskVT;
39278 return true;
39279 }
39280 }
39281
39282 // Attempt to match against either a unary or binary UNPCKL/UNPCKH shuffle.
39283 if ((MaskVT == MVT::v4f32 && Subtarget.hasSSE1()) ||
39284 (MaskVT.is128BitVector() && Subtarget.hasSSE2()) ||
39285 (MaskVT.is256BitVector() && 32 <= EltSizeInBits && Subtarget.hasAVX()) ||
39286 (MaskVT.is256BitVector() && Subtarget.hasAVX2()) ||
39287 (MaskVT.is512BitVector() && Subtarget.hasAVX512())) {
39288 if (matchShuffleWithUNPCK(MaskVT, V1, V2, Shuffle, IsUnary, Mask, DL, DAG,
39289 Subtarget)) {
39290 SrcVT = DstVT = MaskVT;
39291 if (MaskVT.is256BitVector() && !Subtarget.hasAVX2())
39292 SrcVT = DstVT = (32 == EltSizeInBits ? MVT::v8f32 : MVT::v4f64);
39293 return true;
39294 }
39295 }
39296
39297 // Attempt to match against an OR if we're performing a blend shuffle and the
39298 // non-blended source element is zero in each case.
39299 // TODO: Handle cases where V1/V2 sizes don't match SizeInBits.
39300 if (SizeInBits == V1.getValueSizeInBits() &&
39301 SizeInBits == V2.getValueSizeInBits() &&
39302 (EltSizeInBits % V1.getScalarValueSizeInBits()) == 0 &&
39303 (EltSizeInBits % V2.getScalarValueSizeInBits()) == 0) {
39304 bool IsBlend = true;
39305 unsigned NumV1Elts = V1.getValueType().getVectorNumElements();
39306 unsigned NumV2Elts = V2.getValueType().getVectorNumElements();
39307 unsigned Scale1 = NumV1Elts / NumMaskElts;
39308 unsigned Scale2 = NumV2Elts / NumMaskElts;
39309 APInt DemandedZeroV1 = APInt::getZero(NumV1Elts);
39310 APInt DemandedZeroV2 = APInt::getZero(NumV2Elts);
39311 for (unsigned i = 0; i != NumMaskElts; ++i) {
39312 int M = Mask[i];
39313 if (M == SM_SentinelUndef)
39314 continue;
39315 if (M == SM_SentinelZero) {
39316 DemandedZeroV1.setBits(i * Scale1, (i + 1) * Scale1);
39317 DemandedZeroV2.setBits(i * Scale2, (i + 1) * Scale2);
39318 continue;
39319 }
39320 if (M == (int)i) {
39321 DemandedZeroV2.setBits(i * Scale2, (i + 1) * Scale2);
39322 continue;
39323 }
39324 if (M == (int)(i + NumMaskElts)) {
39325 DemandedZeroV1.setBits(i * Scale1, (i + 1) * Scale1);
39326 continue;
39327 }
39328 IsBlend = false;
39329 break;
39330 }
39331 if (IsBlend) {
39332 if (DAG.MaskedVectorIsZero(V1, DemandedZeroV1) &&
39333 DAG.MaskedVectorIsZero(V2, DemandedZeroV2)) {
39334 Shuffle = ISD::OR;
39335 SrcVT = DstVT = MaskVT.changeTypeToInteger();
39336 return true;
39337 }
39338 if (NumV1Elts == NumV2Elts && NumV1Elts == NumMaskElts) {
39339 // FIXME: handle mismatched sizes?
39340 // TODO: investigate if `ISD::OR` handling in
39341 // `TargetLowering::SimplifyDemandedVectorElts` can be improved instead.
39342 auto computeKnownBitsElementWise = [&DAG](SDValue V) {
39343 unsigned NumElts = V.getValueType().getVectorNumElements();
39344 KnownBits Known(NumElts);
39345 for (unsigned EltIdx = 0; EltIdx != NumElts; ++EltIdx) {
39346 APInt Mask = APInt::getOneBitSet(NumElts, EltIdx);
39347 KnownBits PeepholeKnown = DAG.computeKnownBits(V, Mask);
39348 if (PeepholeKnown.isZero())
39349 Known.Zero.setBit(EltIdx);
39350 if (PeepholeKnown.isAllOnes())
39351 Known.One.setBit(EltIdx);
39352 }
39353 return Known;
39354 };
39355
39356 KnownBits V1Known = computeKnownBitsElementWise(V1);
39357 KnownBits V2Known = computeKnownBitsElementWise(V2);
39358
39359 for (unsigned i = 0; i != NumMaskElts && IsBlend; ++i) {
39360 int M = Mask[i];
39361 if (M == SM_SentinelUndef)
39362 continue;
39363 if (M == SM_SentinelZero) {
39364 IsBlend &= V1Known.Zero[i] && V2Known.Zero[i];
39365 continue;
39366 }
39367 if (M == (int)i) {
39368 IsBlend &= V2Known.Zero[i] || V1Known.One[i];
39369 continue;
39370 }
39371 if (M == (int)(i + NumMaskElts)) {
39372 IsBlend &= V1Known.Zero[i] || V2Known.One[i];
39373 continue;
39374 }
39375 llvm_unreachable("will not get here.");
39376 }
39377 if (IsBlend) {
39378 Shuffle = ISD::OR;
39379 SrcVT = DstVT = MaskVT.changeTypeToInteger();
39380 return true;
39381 }
39382 }
39383 }
39384 }
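// [Editor's note] Illustrative sketch, not part of the analyzed file: the
// blend-to-OR rewrite above relies on OR with a lane that is known zero
// being the identity, element by element.
static_assert((0x1234u | 0x0u) == 0x1234u,
              "lane taken from V1 while V2 is zero there");
static_assert((0x0u | 0xabcdu) == 0xabcdu,
              "lane taken from V2 while V1 is zero there");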
39385
39386 return false;
39387}
39388
39389static bool matchBinaryPermuteShuffle(
39390 MVT MaskVT, ArrayRef<int> Mask, const APInt &Zeroable,
39391 bool AllowFloatDomain, bool AllowIntDomain, SDValue &V1, SDValue &V2,
39392 const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget,
39393 unsigned &Shuffle, MVT &ShuffleVT, unsigned &PermuteImm) {
39394 unsigned NumMaskElts = Mask.size();
39395 unsigned EltSizeInBits = MaskVT.getScalarSizeInBits();
39396
39397 // Attempt to match against VALIGND/VALIGNQ rotate.
39398 if (AllowIntDomain && (EltSizeInBits == 64 || EltSizeInBits == 32) &&
39399 ((MaskVT.is128BitVector() && Subtarget.hasVLX()) ||
39400 (MaskVT.is256BitVector() && Subtarget.hasVLX()) ||
39401 (MaskVT.is512BitVector() && Subtarget.hasAVX512()))) {
39402 if (!isAnyZero(Mask)) {
39403 int Rotation = matchShuffleAsElementRotate(V1, V2, Mask);
39404 if (0 < Rotation) {
39405 Shuffle = X86ISD::VALIGN;
39406 if (EltSizeInBits == 64)
39407 ShuffleVT = MVT::getVectorVT(MVT::i64, MaskVT.getSizeInBits() / 64);
39408 else
39409 ShuffleVT = MVT::getVectorVT(MVT::i32, MaskVT.getSizeInBits() / 32);
39410 PermuteImm = Rotation;
39411 return true;
39412 }
39413 }
39414 }
39415
39416 // Attempt to match against PALIGNR byte rotate.
39417 if (AllowIntDomain && ((MaskVT.is128BitVector() && Subtarget.hasSSSE3()) ||
39418 (MaskVT.is256BitVector() && Subtarget.hasAVX2()) ||
39419 (MaskVT.is512BitVector() && Subtarget.hasBWI()))) {
39420 int ByteRotation = matchShuffleAsByteRotate(MaskVT, V1, V2, Mask);
39421 if (0 < ByteRotation) {
39422 Shuffle = X86ISD::PALIGNR;
39423 ShuffleVT = MVT::getVectorVT(MVT::i8, MaskVT.getSizeInBits() / 8);
39424 PermuteImm = ByteRotation;
39425 return true;
39426 }
39427 }
39428
39429 // Attempt to combine to X86ISD::BLENDI.
39430 if ((NumMaskElts <= 8 && ((Subtarget.hasSSE41() && MaskVT.is128BitVector()) ||
39431 (Subtarget.hasAVX() && MaskVT.is256BitVector()))) ||
39432 (MaskVT == MVT::v16i16 && Subtarget.hasAVX2())) {
39433 uint64_t BlendMask = 0;
39434 bool ForceV1Zero = false, ForceV2Zero = false;
39435 SmallVector<int, 8> TargetMask(Mask);
39436 if (matchShuffleAsBlend(MaskVT, V1, V2, TargetMask, Zeroable, ForceV1Zero,
39437 ForceV2Zero, BlendMask)) {
39438 if (MaskVT == MVT::v16i16) {
39439 // We can only use v16i16 PBLENDW if the lanes are repeated.
39440 SmallVector<int, 8> RepeatedMask;
39441 if (isRepeatedTargetShuffleMask(128, MaskVT, TargetMask,
39442 RepeatedMask)) {
39443 assert(RepeatedMask.size() == 8 &&
39444 "Repeated mask size doesn't match!");
39445 PermuteImm = 0;
39446 for (int i = 0; i < 8; ++i)
39447 if (RepeatedMask[i] >= 8)
39448 PermuteImm |= 1 << i;
39449 V1 = ForceV1Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V1;
39450 V2 = ForceV2Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V2;
39451 Shuffle = X86ISD::BLENDI;
39452 ShuffleVT = MaskVT;
39453 return true;
39454 }
39455 } else {
39456 V1 = ForceV1Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V1;
39457 V2 = ForceV2Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V2;
39458 PermuteImm = (unsigned)BlendMask;
39459 Shuffle = X86ISD::BLENDI;
39460 ShuffleVT = MaskVT;
39461 return true;
39462 }
39463 }
39464 }
39465
39466 // Attempt to combine to INSERTPS, but only if it has elements that need to
39467 // be set to zero.
39468 if (AllowFloatDomain && EltSizeInBits == 32 && Subtarget.hasSSE41() &&
39469 MaskVT.is128BitVector() && isAnyZero(Mask) &&
39470 matchShuffleAsInsertPS(V1, V2, PermuteImm, Zeroable, Mask, DAG)) {
39471 Shuffle = X86ISD::INSERTPS;
39472 ShuffleVT = MVT::v4f32;
39473 return true;
39474 }
39475
39476 // Attempt to combine to SHUFPD.
39477 if (AllowFloatDomain && EltSizeInBits == 64 &&
39478 ((MaskVT.is128BitVector() && Subtarget.hasSSE2()) ||
39479 (MaskVT.is256BitVector() && Subtarget.hasAVX()) ||
39480 (MaskVT.is512BitVector() && Subtarget.hasAVX512()))) {
39481 bool ForceV1Zero = false, ForceV2Zero = false;
39482 if (matchShuffleWithSHUFPD(MaskVT, V1, V2, ForceV1Zero, ForceV2Zero,
39483 PermuteImm, Mask, Zeroable)) {
39484 V1 = ForceV1Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V1;
39485 V2 = ForceV2Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V2;
39486 Shuffle = X86ISD::SHUFP;
39487 ShuffleVT = MVT::getVectorVT(MVT::f64, MaskVT.getSizeInBits() / 64);
39488 return true;
39489 }
39490 }
39491
39492 // Attempt to combine to SHUFPS.
39493 if (AllowFloatDomain && EltSizeInBits == 32 &&
39494 ((MaskVT.is128BitVector() && Subtarget.hasSSE1()) ||
39495 (MaskVT.is256BitVector() && Subtarget.hasAVX()) ||
39496 (MaskVT.is512BitVector() && Subtarget.hasAVX512()))) {
39497 SmallVector<int, 4> RepeatedMask;
39498 if (isRepeatedTargetShuffleMask(128, MaskVT, Mask, RepeatedMask)) {
39499 // Match each half of the repeated mask to determine if it's just
39500 // referencing one of the vectors, is zeroable, or is entirely undef.
39501 auto MatchHalf = [&](unsigned Offset, int &S0, int &S1) {
39502 int M0 = RepeatedMask[Offset];
39503 int M1 = RepeatedMask[Offset + 1];
39504
39505 if (isUndefInRange(RepeatedMask, Offset, 2)) {
39506 return DAG.getUNDEF(MaskVT);
39507 } else if (isUndefOrZeroInRange(RepeatedMask, Offset, 2)) {
39508 S0 = (SM_SentinelUndef == M0 ? -1 : 0);
39509 S1 = (SM_SentinelUndef == M1 ? -1 : 1);
39510 return getZeroVector(MaskVT, Subtarget, DAG, DL);
39511 } else if (isUndefOrInRange(M0, 0, 4) && isUndefOrInRange(M1, 0, 4)) {
39512 S0 = (SM_SentinelUndef == M0 ? -1 : M0 & 3);
39513 S1 = (SM_SentinelUndef == M1 ? -1 : M1 & 3);
39514 return V1;
39515 } else if (isUndefOrInRange(M0, 4, 8) && isUndefOrInRange(M1, 4, 8)) {
39516 S0 = (SM_SentinelUndef == M0 ? -1 : M0 & 3);
39517 S1 = (SM_SentinelUndef == M1 ? -1 : M1 & 3);
39518 return V2;
39519 }
39520
39521 return SDValue();
39522 };
39523
39524 int ShufMask[4] = {-1, -1, -1, -1};
39525 SDValue Lo = MatchHalf(0, ShufMask[0], ShufMask[1]);
39526 SDValue Hi = MatchHalf(2, ShufMask[2], ShufMask[3]);
39527
39528 if (Lo && Hi) {
39529 V1 = Lo;
39530 V2 = Hi;
39531 Shuffle = X86ISD::SHUFP;
39532 ShuffleVT = MVT::getVectorVT(MVT::f32, MaskVT.getSizeInBits() / 32);
39533 PermuteImm = getV4X86ShuffleImm(ShufMask);
39534 return true;
39535 }
39536 }
39537 }
39538
39539 // Attempt to combine to INSERTPS more generally if X86ISD::SHUFP failed.
39540 if (AllowFloatDomain && EltSizeInBits == 32 && Subtarget.hasSSE41() &&
39541 MaskVT.is128BitVector() &&
39542 matchShuffleAsInsertPS(V1, V2, PermuteImm, Zeroable, Mask, DAG)) {
39543 Shuffle = X86ISD::INSERTPS;
39544 ShuffleVT = MVT::v4f32;
39545 return true;
39546 }
39547
39548 return false;
39549}
39550
39551static SDValue combineX86ShuffleChainWithExtract(
39552 ArrayRef<SDValue> Inputs, SDValue Root, ArrayRef<int> BaseMask, int Depth,
39553 bool HasVariableMask, bool AllowVariableCrossLaneMask,
39554 bool AllowVariablePerLaneMask, SelectionDAG &DAG,
39555 const X86Subtarget &Subtarget);
39556
39557/// Combine an arbitrary chain of shuffles into a single instruction if
39558/// possible.
39559///
39560/// This is the leaf of the recursive combine below. When we have found some
39561/// chain of single-use x86 shuffle instructions and accumulated the combined
39562/// shuffle mask represented by them, this will try to pattern match that mask
39563/// into either a single instruction if there is a special purpose instruction
39564/// for this operation, or into a PSHUFB instruction which is a fully general
39565/// instruction but should only be used to replace chains over a certain depth.
39566static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
39567 ArrayRef<int> BaseMask, int Depth,
39568 bool HasVariableMask,
39569 bool AllowVariableCrossLaneMask,
39570 bool AllowVariablePerLaneMask,
39571 SelectionDAG &DAG,
39572 const X86Subtarget &Subtarget) {
39573 assert(!BaseMask.empty() && "Cannot combine an empty shuffle mask!");
39574 assert((Inputs.size() == 1 || Inputs.size() == 2) &&
39575 "Unexpected number of shuffle inputs!");
39576
39577 SDLoc DL(Root);
39578 MVT RootVT = Root.getSimpleValueType();
39579 unsigned RootSizeInBits = RootVT.getSizeInBits();
39580 unsigned NumRootElts = RootVT.getVectorNumElements();
39581
39582 // Canonicalize shuffle input op to the requested type.
39583 auto CanonicalizeShuffleInput = [&](MVT VT, SDValue Op) {
39584 if (VT.getSizeInBits() > Op.getValueSizeInBits())
39585 Op = widenSubVector(Op, false, Subtarget, DAG, DL, VT.getSizeInBits());
39586 else if (VT.getSizeInBits() < Op.getValueSizeInBits())
39587 Op = extractSubVector(Op, 0, DAG, DL, VT.getSizeInBits());
39588 return DAG.getBitcast(VT, Op);
39589 };
39590
39591 // Find the inputs that enter the chain. Note that multiple uses are OK
39592 // here; we're not going to remove the operands we find.
39593 bool UnaryShuffle = (Inputs.size() == 1);
39594 SDValue V1 = peekThroughBitcasts(Inputs[0]);
39595 SDValue V2 = (UnaryShuffle ? DAG.getUNDEF(V1.getValueType())
39596 : peekThroughBitcasts(Inputs[1]));
39597
39598 MVT VT1 = V1.getSimpleValueType();
39599 MVT VT2 = V2.getSimpleValueType();
39600 assert((RootSizeInBits % VT1.getSizeInBits()) == 0 &&
39601 (RootSizeInBits % VT2.getSizeInBits()) == 0 && "Vector size mismatch");
39602
39603 SDValue Res;
39604
39605 unsigned NumBaseMaskElts = BaseMask.size();
39606 if (NumBaseMaskElts == 1) {
39607 assert(BaseMask[0] == 0 && "Invalid shuffle index found!");
39608 return CanonicalizeShuffleInput(RootVT, V1);
39609 }
39610
39611 bool OptForSize = DAG.shouldOptForSize();
39612 unsigned BaseMaskEltSizeInBits = RootSizeInBits / NumBaseMaskElts;
39613 bool FloatDomain = VT1.isFloatingPoint() || VT2.isFloatingPoint() ||
39614 (RootVT.isFloatingPoint() && Depth >= 1) ||
39615 (RootVT.is256BitVector() && !Subtarget.hasAVX2());
39616
39617 // Don't combine if we are an AVX512/EVEX target and the mask element size
39618 // is different from the root element size - this would prevent writemasks
39619 // from being reused.
39620 bool IsMaskedShuffle = false;
39621 if (RootSizeInBits == 512 || (Subtarget.hasVLX() && RootSizeInBits >= 128)) {
39622 if (Root.hasOneUse() && Root->use_begin()->getOpcode() == ISD::VSELECT &&
39623 Root->use_begin()->getOperand(0).getScalarValueSizeInBits() == 1) {
39624 IsMaskedShuffle = true;
39625 }
39626 }
39627
39628 // If we are shuffling a splat (and not introducing zeros) then we can just
39629 // use it directly. This works for smaller elements as well as they already
39630 // repeat across each mask element.
39631 if (UnaryShuffle && !isAnyZero(BaseMask) &&
39632 V1.getValueSizeInBits() >= RootSizeInBits &&
39633 (BaseMaskEltSizeInBits % V1.getScalarValueSizeInBits()) == 0 &&
39634 DAG.isSplatValue(V1, /*AllowUndefs*/ false)) {
39635 return CanonicalizeShuffleInput(RootVT, V1);
39636 }
39637
39638 SmallVector<int, 64> Mask(BaseMask);
39639
39640 // See if the shuffle is a hidden identity shuffle - repeated args in HOPs
39641 // etc. can be simplified.
39642 if (VT1 == VT2 && VT1.getSizeInBits() == RootSizeInBits && VT1.isVector()) {
39643 SmallVector<int> ScaledMask, IdentityMask;
39644 unsigned NumElts = VT1.getVectorNumElements();
39645 if (Mask.size() <= NumElts &&
39646 scaleShuffleElements(Mask, NumElts, ScaledMask)) {
39647 for (unsigned i = 0; i != NumElts; ++i)
39648 IdentityMask.push_back(i);
39649 if (isTargetShuffleEquivalent(RootVT, ScaledMask, IdentityMask, DAG, V1,
39650 V2))
39651 return CanonicalizeShuffleInput(RootVT, V1);
39652 }
39653 }
39654
39655 // Handle 128/256-bit lane shuffles of 512-bit vectors.
39656 if (RootVT.is512BitVector() &&
39657 (NumBaseMaskElts == 2 || NumBaseMaskElts == 4)) {
39658 // If the upper subvectors are zeroable, then an extract+insert is more
39659 // optimal than using X86ISD::SHUF128. The insertion is free, even if it has
39660 // to zero the upper subvectors.
39661 if (isUndefOrZeroInRange(Mask, 1, NumBaseMaskElts - 1)) {
39662 if (Depth == 0 && Root.getOpcode() == ISD::INSERT_SUBVECTOR)
39663 return SDValue(); // Nothing to do!
39664 assert(isInRange(Mask[0], 0, NumBaseMaskElts) &&
39665 "Unexpected lane shuffle");
39666 Res = CanonicalizeShuffleInput(RootVT, V1);
39667 unsigned SubIdx = Mask[0] * (NumRootElts / NumBaseMaskElts);
39668 bool UseZero = isAnyZero(Mask);
39669 Res = extractSubVector(Res, SubIdx, DAG, DL, BaseMaskEltSizeInBits);
39670 return widenSubVector(Res, UseZero, Subtarget, DAG, DL, RootSizeInBits);
39671 }
39672
39673 // Narrow shuffle mask to v4x128.
39674 SmallVector<int, 4> ScaledMask;
39675 assert((BaseMaskEltSizeInBits % 128) == 0 && "Illegal mask size");
39676 narrowShuffleMaskElts(BaseMaskEltSizeInBits / 128, Mask, ScaledMask);
39677
39678 // Try to lower to vshuf64x2/vshuf32x4.
39679 auto MatchSHUF128 = [&](MVT ShuffleVT, const SDLoc &DL,
39680 ArrayRef<int> ScaledMask, SDValue V1, SDValue V2,
39681 SelectionDAG &DAG) {
39682 unsigned PermMask = 0;
39683 // Ensure elements came from the same Op.
39684 SDValue Ops[2] = {DAG.getUNDEF(ShuffleVT), DAG.getUNDEF(ShuffleVT)};
39685 for (int i = 0; i < 4; ++i) {
39686 assert(ScaledMask[i] >= -1 && "Illegal shuffle sentinel value");
39687 if (ScaledMask[i] < 0)
39688 continue;
39689
39690 SDValue Op = ScaledMask[i] >= 4 ? V2 : V1;
39691 unsigned OpIndex = i / 2;
39692 if (Ops[OpIndex].isUndef())
39693 Ops[OpIndex] = Op;
39694 else if (Ops[OpIndex] != Op)
39695 return SDValue();
39696
39697 // Convert the 128-bit shuffle mask selection values into 128-bit
39698 // selection bits defined by a vshuf64x2 instruction's immediate control
39699 // byte.
39700 PermMask |= (ScaledMask[i] % 4) << (i * 2);
39701 }
39702
39703 return DAG.getNode(X86ISD::SHUF128, DL, ShuffleVT,
39704 CanonicalizeShuffleInput(ShuffleVT, Ops[0]),
39705 CanonicalizeShuffleInput(ShuffleVT, Ops[1]),
39706 DAG.getTargetConstant(PermMask, DL, MVT::i8));
39707 };
39708
39709 // FIXME: Is there a better way to do this? is256BitLaneRepeatedShuffleMask
39710 // doesn't work because our mask is for 128 bits and we don't have an MVT
39711 // to match that.
39712 bool PreferPERMQ = UnaryShuffle && isUndefOrInRange(ScaledMask[0], 0, 2) &&
39713 isUndefOrInRange(ScaledMask[1], 0, 2) &&
39714 isUndefOrInRange(ScaledMask[2], 2, 4) &&
39715 isUndefOrInRange(ScaledMask[3], 2, 4) &&
39716 (ScaledMask[0] < 0 || ScaledMask[2] < 0 ||
39717 ScaledMask[0] == (ScaledMask[2] % 2)) &&
39718 (ScaledMask[1] < 0 || ScaledMask[3] < 0 ||
39719 ScaledMask[1] == (ScaledMask[3] % 2));
39720
39721 if (!isAnyZero(ScaledMask) && !PreferPERMQ) {
39722 if (Depth == 0 && Root.getOpcode() == X86ISD::SHUF128)
39723 return SDValue(); // Nothing to do!
39724 MVT ShuffleVT = (FloatDomain ? MVT::v8f64 : MVT::v8i64);
39725 if (SDValue V = MatchSHUF128(ShuffleVT, DL, ScaledMask, V1, V2, DAG))
39726 return DAG.getBitcast(RootVT, V);
39727 }
39728 }
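// [Editor's note] Illustrative sketch, not part of the analyzed file; the
// helper name is invented. MatchSHUF128 above builds the vshuf64x2/vshuf32x4
// immediate by giving each of the four destination 128-bit lanes a 2-bit
// selector at bit position i * 2; the identity lane mask {0, 1, 2, 3}
// therefore encodes to 0b11100100 = 0xE4.
static unsigned encodeShuf128Imm(const int (&LaneMask)[4]) {
  unsigned Imm = 0;
  for (int I = 0; I != 4; ++I)
    if (LaneMask[I] >= 0)        // negative (undef) entries are skipped
      Imm |= (LaneMask[I] % 4) << (I * 2);
  return Imm;
}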
39729
39730 // Handle 128-bit lane shuffles of 256-bit vectors.
39731 if (RootVT.is256BitVector() && NumBaseMaskElts == 2) {
39732 // If the upper half is zeroable, then an extract+insert is more optimal
39733 // than using X86ISD::VPERM2X128. The insertion is free, even if it has to
39734 // zero the upper half.
39735 if (isUndefOrZero(Mask[1])) {
39736 if (Depth == 0 && Root.getOpcode() == ISD::INSERT_SUBVECTOR)
39737 return SDValue(); // Nothing to do!
39738 assert(isInRange(Mask[0], 0, 2) && "Unexpected lane shuffle");
39739 Res = CanonicalizeShuffleInput(RootVT, V1);
39740 Res = extract128BitVector(Res, Mask[0] * (NumRootElts / 2), DAG, DL);
39741 return widenSubVector(Res, Mask[1] == SM_SentinelZero, Subtarget, DAG, DL,
39742 256);
39743 }
39744
39745 // If we're inserting the low subvector, an insert-subvector 'concat'
39746 // pattern is quicker than VPERM2X128.
39747 // TODO: Add AVX2 support instead of VPERMQ/VPERMPD.
39748 if (BaseMask[0] == 0 && (BaseMask[1] == 0 || BaseMask[1] == 2) &&
39749 !Subtarget.hasAVX2()) {
39750 if (Depth == 0 && Root.getOpcode() == ISD::INSERT_SUBVECTOR)
39751 return SDValue(); // Nothing to do!
39752 SDValue Lo = CanonicalizeShuffleInput(RootVT, V1);
39753 SDValue Hi = CanonicalizeShuffleInput(RootVT, BaseMask[1] == 0 ? V1 : V2);
39754 Hi = extractSubVector(Hi, 0, DAG, DL, 128);
39755 return insertSubVector(Lo, Hi, NumRootElts / 2, DAG, DL, 128);
39756 }
39757
39758 if (Depth == 0 && Root.getOpcode() == X86ISD::VPERM2X128)
39759 return SDValue(); // Nothing to do!
39760
39761 // If we have AVX2, prefer to use VPERMQ/VPERMPD for unary shuffles unless
39762 // we need to use the zeroing feature.
39763 // Prefer blends for sequential shuffles unless we are optimizing for size.
39764 if (UnaryShuffle &&
39765 !(Subtarget.hasAVX2() && isUndefOrInRange(Mask, 0, 2)) &&
39766 (OptForSize || !isSequentialOrUndefOrZeroInRange(Mask, 0, 2, 0))) {
39767 unsigned PermMask = 0;
39768 PermMask |= ((Mask[0] < 0 ? 0x8 : (Mask[0] & 1)) << 0);
39769 PermMask |= ((Mask[1] < 0 ? 0x8 : (Mask[1] & 1)) << 4);
39770 return DAG.getNode(
39771 X86ISD::VPERM2X128, DL, RootVT, CanonicalizeShuffleInput(RootVT, V1),
39772 DAG.getUNDEF(RootVT), DAG.getTargetConstant(PermMask, DL, MVT::i8));
39773 }
39774
39775 if (Depth == 0 && Root.getOpcode() == X86ISD::SHUF128)
39776 return SDValue(); // Nothing to do!
39777
39778 // TODO - handle AVX512VL cases with X86ISD::SHUF128.
39779 if (!UnaryShuffle && !IsMaskedShuffle) {
39780 assert(llvm::all_of(Mask, [](int M) { return 0 <= M && M < 4; }) &&
39781 "Unexpected shuffle sentinel value");
39782 // Prefer blends to X86ISD::VPERM2X128.
39783 if (!((Mask[0] == 0 && Mask[1] == 3) || (Mask[0] == 2 && Mask[1] == 1))) {
39784 unsigned PermMask = 0;
39785 PermMask |= ((Mask[0] & 3) << 0);
39786 PermMask |= ((Mask[1] & 3) << 4);
39787 SDValue LHS = isInRange(Mask[0], 0, 2) ? V1 : V2;
39788 SDValue RHS = isInRange(Mask[1], 0, 2) ? V1 : V2;
39789 return DAG.getNode(X86ISD::VPERM2X128, DL, RootVT,
39790 CanonicalizeShuffleInput(RootVT, LHS),
39791 CanonicalizeShuffleInput(RootVT, RHS),
39792 DAG.getTargetConstant(PermMask, DL, MVT::i8));
39793 }
39794 }
39795 }
39796
39797 // For masks that have been widened to 128-bit elements or more,
39798 // narrow back down to 64-bit elements.
39799 if (BaseMaskEltSizeInBits > 64) {
39800 assert((BaseMaskEltSizeInBits % 64) == 0 && "Illegal mask size");
39801 int MaskScale = BaseMaskEltSizeInBits / 64;
39802 SmallVector<int, 64> ScaledMask;
39803 narrowShuffleMaskElts(MaskScale, Mask, ScaledMask);
39804 Mask = std::move(ScaledMask);
39805 }
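// Editor's illustrative sketch, not part of X86ISelLowering.cpp: the narrowing
// above replaces each wide mask element with MaskScale sequential narrow
// elements, e.g. a 128-bit-element mask {1,0} scaled by 2 becomes the
// 64-bit-element mask {2,3,0,1}. A hypothetical standalone approximation of
// that behaviour (negative undef/zero sentinels are simply replicated):
static void narrowMaskSketch(int Scale, ArrayRef<int> Mask,
                             SmallVectorImpl<int> &Scaled) {
  Scaled.clear();
  for (int M : Mask)
    for (int J = 0; J != Scale; ++J)
      Scaled.push_back(M < 0 ? M : M * Scale + J);
}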
39806
39807 // For masked shuffles, we're trying to match the root width for better
39808 // writemask folding; attempt to scale the mask.
39809 // TODO - variable shuffles might need this to be widened again.
39810 if (IsMaskedShuffle && NumRootElts > Mask.size()) {
39811 assert((NumRootElts % Mask.size()) == 0 && "Illegal mask size");
39812 int MaskScale = NumRootElts / Mask.size();
39813 SmallVector<int, 64> ScaledMask;
39814 narrowShuffleMaskElts(MaskScale, Mask, ScaledMask);
39815 Mask = std::move(ScaledMask);
39816 }
39817
39818 unsigned NumMaskElts = Mask.size();
39819 unsigned MaskEltSizeInBits = RootSizeInBits / NumMaskElts;
39820
39821 // Determine the effective mask value type.
39822 FloatDomain &= (32 <= MaskEltSizeInBits);
39823 MVT MaskVT = FloatDomain ? MVT::getFloatingPointVT(MaskEltSizeInBits)
39824 : MVT::getIntegerVT(MaskEltSizeInBits);
39825 MaskVT = MVT::getVectorVT(MaskVT, NumMaskElts);
39826
39827 // Only allow legal mask types.
39828 if (!DAG.getTargetLoweringInfo().isTypeLegal(MaskVT))
39829 return SDValue();
39830
39831 // Attempt to match the mask against known shuffle patterns.
39832 MVT ShuffleSrcVT, ShuffleVT;
39833 unsigned Shuffle, PermuteImm;
39834
39835 // Which shuffle domains are permitted?
39836 // Permit domain crossing at higher combine depths.
39837 // TODO: Should we indicate which domain is preferred if both are allowed?
39838 bool AllowFloatDomain = FloatDomain || (Depth >= 3);
39839 bool AllowIntDomain = (!FloatDomain || (Depth >= 3)) && Subtarget.hasSSE2() &&
39840 (!MaskVT.is256BitVector() || Subtarget.hasAVX2());
39841
39842 // Determine zeroable mask elements.
39843 APInt KnownUndef, KnownZero;
39844 resolveZeroablesFromTargetShuffle(Mask, KnownUndef, KnownZero);
39845 APInt Zeroable = KnownUndef | KnownZero;
39846
39847 if (UnaryShuffle) {
39848 // Attempt to match against broadcast-from-vector.
39849 // Limit AVX1 to cases where we're loading+broadcasting a scalar element.
39850 if ((Subtarget.hasAVX2() ||
39851 (Subtarget.hasAVX() && 32 <= MaskEltSizeInBits)) &&
39852 (!IsMaskedShuffle || NumRootElts == NumMaskElts)) {
39853 if (isUndefOrEqual(Mask, 0)) {
39854 if (V1.getValueType() == MaskVT &&
39855 V1.getOpcode() == ISD::SCALAR_TO_VECTOR &&
39856 X86::mayFoldLoad(V1.getOperand(0), Subtarget)) {
39857 if (Depth == 0 && Root.getOpcode() == X86ISD::VBROADCAST)
39858 return SDValue(); // Nothing to do!
39859 Res = V1.getOperand(0);
39860 Res = DAG.getNode(X86ISD::VBROADCAST, DL, MaskVT, Res);
39861 return DAG.getBitcast(RootVT, Res);
39862 }
39863 if (Subtarget.hasAVX2()) {
39864 if (Depth == 0 && Root.getOpcode() == X86ISD::VBROADCAST)
39865 return SDValue(); // Nothing to do!
39866 Res = CanonicalizeShuffleInput(MaskVT, V1);
39867 Res = DAG.getNode(X86ISD::VBROADCAST, DL, MaskVT, Res);
39868 return DAG.getBitcast(RootVT, Res);
39869 }
39870 }
39871 }
39872
39873 if (matchUnaryShuffle(MaskVT, Mask, AllowFloatDomain, AllowIntDomain, V1,
39874 DAG, Subtarget, Shuffle, ShuffleSrcVT, ShuffleVT) &&
39875 (!IsMaskedShuffle ||
39876 (NumRootElts == ShuffleVT.getVectorNumElements()))) {
39877 if (Depth == 0 && Root.getOpcode() == Shuffle)
39878 return SDValue(); // Nothing to do!
39879 Res = CanonicalizeShuffleInput(ShuffleSrcVT, V1);
39880 Res = DAG.getNode(Shuffle, DL, ShuffleVT, Res);
39881 return DAG.getBitcast(RootVT, Res);
39882 }
39883
39884 if (matchUnaryPermuteShuffle(MaskVT, Mask, Zeroable, AllowFloatDomain,
39885 AllowIntDomain, DAG, Subtarget, Shuffle, ShuffleVT,
39886 PermuteImm) &&
39887 (!IsMaskedShuffle ||
39888 (NumRootElts == ShuffleVT.getVectorNumElements()))) {
39889 if (Depth == 0 && Root.getOpcode() == Shuffle)
39890 return SDValue(); // Nothing to do!
39891 Res = CanonicalizeShuffleInput(ShuffleVT, V1);
39892 Res = DAG.getNode(Shuffle, DL, ShuffleVT, Res,
39893 DAG.getTargetConstant(PermuteImm, DL, MVT::i8));
39894 return DAG.getBitcast(RootVT, Res);
39895 }
39896 }
39897
39898 // Attempt to combine to INSERTPS, but only if the inserted element has come
39899 // from a scalar.
39900 // TODO: Handle other insertions here as well?
39901 if (!UnaryShuffle && AllowFloatDomain && RootSizeInBits == 128 &&
39902 Subtarget.hasSSE41() &&
39903 !isTargetShuffleEquivalent(MaskVT, Mask, {4, 1, 2, 3}, DAG)) {
39904 if (MaskEltSizeInBits == 32) {
39905 SDValue SrcV1 = V1, SrcV2 = V2;
39906 if (matchShuffleAsInsertPS(SrcV1, SrcV2, PermuteImm, Zeroable, Mask,
39907 DAG) &&
39908 SrcV2.getOpcode() == ISD::SCALAR_TO_VECTOR) {
39909 if (Depth == 0 && Root.getOpcode() == X86ISD::INSERTPS)
39910 return SDValue(); // Nothing to do!
39911 Res = DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32,
39912 CanonicalizeShuffleInput(MVT::v4f32, SrcV1),
39913 CanonicalizeShuffleInput(MVT::v4f32, SrcV2),
39914 DAG.getTargetConstant(PermuteImm, DL, MVT::i8));
39915 return DAG.getBitcast(RootVT, Res);
39916 }
39917 }
39918 if (MaskEltSizeInBits == 64 &&
39919 isTargetShuffleEquivalent(MaskVT, Mask, {0, 2}, DAG) &&
39920 V2.getOpcode() == ISD::SCALAR_TO_VECTOR &&
39921 V2.getScalarValueSizeInBits() <= 32) {
39922 if (Depth == 0 && Root.getOpcode() == X86ISD::INSERTPS)
39923 return SDValue(); // Nothing to do!
39924 PermuteImm = (/*DstIdx*/ 2 << 4) | (/*SrcIdx*/ 0 << 0);
39925 Res = DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32,
39926 CanonicalizeShuffleInput(MVT::v4f32, V1),
39927 CanonicalizeShuffleInput(MVT::v4f32, V2),
39928 DAG.getTargetConstant(PermuteImm, DL, MVT::i8));
39929 return DAG.getBitcast(RootVT, Res);
39930 }
39931 }
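// Editor's illustrative sketch, not part of X86ISelLowering.cpp: for the
// register form of INSERTPS, imm[7:6] selects the source f32 element,
// imm[5:4] the destination element to overwrite, and imm[3:0] is a zero mask
// applied to the result. A hypothetical helper showing the encoding:
static unsigned encodeInsertPSImm(unsigned SrcIdx, unsigned DstIdx,
                                  unsigned ZeroMask) {
  return ((SrcIdx & 0x3) << 6) | ((DstIdx & 0x3) << 4) | (ZeroMask & 0xF);
}
// e.g. encodeInsertPSImm(0, 2, 0) == 0x20, matching the immediate built above.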
39932
39933 SDValue NewV1 = V1; // Save operands in case early exit happens.
39934 SDValue NewV2 = V2;
39935 if (matchBinaryShuffle(MaskVT, Mask, AllowFloatDomain, AllowIntDomain, NewV1,
39936 NewV2, DL, DAG, Subtarget, Shuffle, ShuffleSrcVT,
39937 ShuffleVT, UnaryShuffle) &&
39938 (!IsMaskedShuffle || (NumRootElts == ShuffleVT.getVectorNumElements()))) {
39939 if (Depth == 0 && Root.getOpcode() == Shuffle)
39940 return SDValue(); // Nothing to do!
39941 NewV1 = CanonicalizeShuffleInput(ShuffleSrcVT, NewV1);
39942 NewV2 = CanonicalizeShuffleInput(ShuffleSrcVT, NewV2);
39943 Res = DAG.getNode(Shuffle, DL, ShuffleVT, NewV1, NewV2);
39944 return DAG.getBitcast(RootVT, Res);
39945 }
39946
39947 NewV1 = V1; // Save operands in case early exit happens.
39948 NewV2 = V2;
39949 if (matchBinaryPermuteShuffle(MaskVT, Mask, Zeroable, AllowFloatDomain,
39950 AllowIntDomain, NewV1, NewV2, DL, DAG,
39951 Subtarget, Shuffle, ShuffleVT, PermuteImm) &&
39952 (!IsMaskedShuffle || (NumRootElts == ShuffleVT.getVectorNumElements()))) {
39953 if (Depth == 0 && Root.getOpcode() == Shuffle)
39954 return SDValue(); // Nothing to do!
39955 NewV1 = CanonicalizeShuffleInput(ShuffleVT, NewV1);
39956 NewV2 = CanonicalizeShuffleInput(ShuffleVT, NewV2);
39957 Res = DAG.getNode(Shuffle, DL, ShuffleVT, NewV1, NewV2,
39958 DAG.getTargetConstant(PermuteImm, DL, MVT::i8));
39959 return DAG.getBitcast(RootVT, Res);
39960 }
39961
39962 // Typically from here on, we need an integer version of MaskVT.
39963 MVT IntMaskVT = MVT::getIntegerVT(MaskEltSizeInBits);
39964 IntMaskVT = MVT::getVectorVT(IntMaskVT, NumMaskElts);
39965
39966 // Annoyingly, SSE4A instructions don't map into the above match helpers.
39967 if (Subtarget.hasSSE4A() && AllowIntDomain && RootSizeInBits == 128) {
39968 uint64_t BitLen, BitIdx;
39969 if (matchShuffleAsEXTRQ(IntMaskVT, V1, V2, Mask, BitLen, BitIdx,
39970 Zeroable)) {
39971 if (Depth == 0 && Root.getOpcode() == X86ISD::EXTRQI)
39972 return SDValue(); // Nothing to do!
39973 V1 = CanonicalizeShuffleInput(IntMaskVT, V1);
39974 Res = DAG.getNode(X86ISD::EXTRQI, DL, IntMaskVT, V1,
39975 DAG.getTargetConstant(BitLen, DL, MVT::i8),
39976 DAG.getTargetConstant(BitIdx, DL, MVT::i8));
39977 return DAG.getBitcast(RootVT, Res);
39978 }
39979
39980 if (matchShuffleAsINSERTQ(IntMaskVT, V1, V2, Mask, BitLen, BitIdx)) {
39981 if (Depth == 0 && Root.getOpcode() == X86ISD::INSERTQI)
39982 return SDValue(); // Nothing to do!
39983 V1 = CanonicalizeShuffleInput(IntMaskVT, V1);
39984 V2 = CanonicalizeShuffleInput(IntMaskVT, V2);
39985 Res = DAG.getNode(X86ISD::INSERTQI, DL, IntMaskVT, V1, V2,
39986 DAG.getTargetConstant(BitLen, DL, MVT::i8),
39987 DAG.getTargetConstant(BitIdx, DL, MVT::i8));
39988 return DAG.getBitcast(RootVT, Res);
39989 }
39990 }
39991
39992 // Match shuffle against TRUNCATE patterns.
39993 if (AllowIntDomain && MaskEltSizeInBits < 64 && Subtarget.hasAVX512()) {
39994 // Match against a VTRUNC instruction, accounting for src/dst sizes.
39995 if (matchShuffleAsVTRUNC(ShuffleSrcVT, ShuffleVT, IntMaskVT, Mask, Zeroable,
39996 Subtarget)) {
39997 bool IsTRUNCATE = ShuffleVT.getVectorNumElements() ==
39998 ShuffleSrcVT.getVectorNumElements();
39999 unsigned Opc =
40000 IsTRUNCATE ? (unsigned)ISD::TRUNCATE : (unsigned)X86ISD::VTRUNC;
40001 if (Depth == 0 && Root.getOpcode() == Opc)
40002 return SDValue(); // Nothing to do!
40003 V1 = CanonicalizeShuffleInput(ShuffleSrcVT, V1);
40004 Res = DAG.getNode(Opc, DL, ShuffleVT, V1);
40005 if (ShuffleVT.getSizeInBits() < RootSizeInBits)
40006 Res = widenSubVector(Res, true, Subtarget, DAG, DL, RootSizeInBits);
40007 return DAG.getBitcast(RootVT, Res);
40008 }
40009
40010 // Do we need a more general binary truncation pattern?
40011 if (RootSizeInBits < 512 &&
40012 ((RootVT.is256BitVector() && Subtarget.useAVX512Regs()) ||
40013 (RootVT.is128BitVector() && Subtarget.hasVLX())) &&
40014 (MaskEltSizeInBits > 8 || Subtarget.hasBWI()) &&
40015 isSequentialOrUndefInRange(Mask, 0, NumMaskElts, 0, 2)) {
40016 // Bail if this was already a truncation or PACK node.
40017 // We sometimes fail to match PACK if we demand known undef elements.
40018 if (Depth == 0 && (Root.getOpcode() == ISD::TRUNCATE ||
40019 Root.getOpcode() == X86ISD::PACKSS ||
40020 Root.getOpcode() == X86ISD::PACKUS))
40021 return SDValue(); // Nothing to do!
40022 ShuffleSrcVT = MVT::getIntegerVT(MaskEltSizeInBits * 2);
40023 ShuffleSrcVT = MVT::getVectorVT(ShuffleSrcVT, NumMaskElts / 2);
40024 V1 = CanonicalizeShuffleInput(ShuffleSrcVT, V1);
40025 V2 = CanonicalizeShuffleInput(ShuffleSrcVT, V2);
40026 ShuffleSrcVT = MVT::getIntegerVT(MaskEltSizeInBits * 2);
40027 ShuffleSrcVT = MVT::getVectorVT(ShuffleSrcVT, NumMaskElts);
40028 Res = DAG.getNode(ISD::CONCAT_VECTORS, DL, ShuffleSrcVT, V1, V2);
40029 Res = DAG.getNode(ISD::TRUNCATE, DL, IntMaskVT, Res);
40030 return DAG.getBitcast(RootVT, Res);
40031 }
40032 }
40033
40034 // Don't try to re-form single instruction chains under any circumstances now
40035 // that we've done encoding canonicalization for them.
40036 if (Depth < 1)
40037 return SDValue();
40038
40039 // Depth threshold above which we can efficiently use variable mask shuffles.
40040 int VariableCrossLaneShuffleDepth =
40041 Subtarget.hasFastVariableCrossLaneShuffle() ? 1 : 2;
40042 int VariablePerLaneShuffleDepth =
40043 Subtarget.hasFastVariablePerLaneShuffle() ? 1 : 2;
40044 AllowVariableCrossLaneMask &=
40045 (Depth >= VariableCrossLaneShuffleDepth) || HasVariableMask;
40046 AllowVariablePerLaneMask &=
40047 (Depth >= VariablePerLaneShuffleDepth) || HasVariableMask;
40048 // VPERMI2W/VPERMI2B are 3 uops on Skylake and Icelake so we require a
40049 // higher depth before combining them.
40050 bool AllowBWIVPERMV3 =
40051 (Depth >= (VariableCrossLaneShuffleDepth + 2) || HasVariableMask);
40052
40053 bool MaskContainsZeros = isAnyZero(Mask);
40054
40055 if (is128BitLaneCrossingShuffleMask(MaskVT, Mask)) {
40056 // If we have a single input lane-crossing shuffle then lower to VPERMV.
40057 if (UnaryShuffle && AllowVariableCrossLaneMask && !MaskContainsZeros) {
40058 if (Subtarget.hasAVX2() &&
40059 (MaskVT == MVT::v8f32 || MaskVT == MVT::v8i32)) {
40060 SDValue VPermMask = getConstVector(Mask, IntMaskVT, DAG, DL, true);
40061 Res = CanonicalizeShuffleInput(MaskVT, V1);
40062 Res = DAG.getNode(X86ISD::VPERMV, DL, MaskVT, VPermMask, Res);
40063 return DAG.getBitcast(RootVT, Res);
40064 }
40065 // AVX512 variants (non-VLX will pad to 512-bit shuffles).
40066 if ((Subtarget.hasAVX512() &&
40067 (MaskVT == MVT::v8f64 || MaskVT == MVT::v8i64 ||
40068 MaskVT == MVT::v16f32 || MaskVT == MVT::v16i32)) ||
40069 (Subtarget.hasBWI() &&
40070 (MaskVT == MVT::v16i16 || MaskVT == MVT::v32i16)) ||
40071 (Subtarget.hasVBMI() &&
40072 (MaskVT == MVT::v32i8 || MaskVT == MVT::v64i8))) {
40073 V1 = CanonicalizeShuffleInput(MaskVT, V1);
40074 V2 = DAG.getUNDEF(MaskVT);
40075 Res = lowerShuffleWithPERMV(DL, MaskVT, Mask, V1, V2, Subtarget, DAG);
40076 return DAG.getBitcast(RootVT, Res);
40077 }
40078 }
40079
40080 // Lower a unary+zero lane-crossing shuffle as VPERMV3 with a zero
40081 // vector as the second source (non-VLX will pad to 512-bit shuffles).
40082 if (UnaryShuffle && AllowVariableCrossLaneMask &&
40083 ((Subtarget.hasAVX512() &&
40084 (MaskVT == MVT::v8f64 || MaskVT == MVT::v8i64 ||
40085 MaskVT == MVT::v4f64 || MaskVT == MVT::v4i64 ||
40086 MaskVT == MVT::v8f32 || MaskVT == MVT::v8i32 ||
40087 MaskVT == MVT::v16f32 || MaskVT == MVT::v16i32)) ||
40088 (Subtarget.hasBWI() && AllowBWIVPERMV3 &&
40089 (MaskVT == MVT::v16i16 || MaskVT == MVT::v32i16)) ||
40090 (Subtarget.hasVBMI() && AllowBWIVPERMV3 &&
40091 (MaskVT == MVT::v32i8 || MaskVT == MVT::v64i8)))) {
40092 // Adjust shuffle mask - replace SM_SentinelZero with second source index.
40093 for (unsigned i = 0; i != NumMaskElts; ++i)
40094 if (Mask[i] == SM_SentinelZero)
40095 Mask[i] = NumMaskElts + i;
40096 V1 = CanonicalizeShuffleInput(MaskVT, V1);
40097 V2 = getZeroVector(MaskVT, Subtarget, DAG, DL);
40098 Res = lowerShuffleWithPERMV(DL, MaskVT, Mask, V1, V2, Subtarget, DAG);
40099 return DAG.getBitcast(RootVT, Res);
40100 }
40101
40102 // If that failed and either input is extracted then try to combine as a
40103 // shuffle with the larger type.
40104 if (SDValue WideShuffle = combineX86ShuffleChainWithExtract(
40105 Inputs, Root, BaseMask, Depth, HasVariableMask,
40106 AllowVariableCrossLaneMask, AllowVariablePerLaneMask, DAG,
40107 Subtarget))
40108 return WideShuffle;
40109
40110 // If we have a dual input lane-crossing shuffle then lower to VPERMV3,
40111 // (non-VLX will pad to 512-bit shuffles).
40112 if (AllowVariableCrossLaneMask && !MaskContainsZeros &&
40113 ((Subtarget.hasAVX512() &&
40114 (MaskVT == MVT::v8f64 || MaskVT == MVT::v8i64 ||
40115 MaskVT == MVT::v4f64 || MaskVT == MVT::v4i64 ||
40116 MaskVT == MVT::v16f32 || MaskVT == MVT::v16i32 ||
40117 MaskVT == MVT::v8f32 || MaskVT == MVT::v8i32)) ||
40118 (Subtarget.hasBWI() && AllowBWIVPERMV3 &&
40119 (MaskVT == MVT::v16i16 || MaskVT == MVT::v32i16)) ||
40120 (Subtarget.hasVBMI() && AllowBWIVPERMV3 &&
40121 (MaskVT == MVT::v32i8 || MaskVT == MVT::v64i8)))) {
40122 V1 = CanonicalizeShuffleInput(MaskVT, V1);
40123 V2 = CanonicalizeShuffleInput(MaskVT, V2);
40124 Res = lowerShuffleWithPERMV(DL, MaskVT, Mask, V1, V2, Subtarget, DAG);
40125 return DAG.getBitcast(RootVT, Res);
40126 }
40127 return SDValue();
40128 }
40129
40130 // See if we can combine a single input shuffle with zeros to a bit-mask,
40131 // which is much simpler than any shuffle.
40132 if (UnaryShuffle && MaskContainsZeros && AllowVariablePerLaneMask &&
40133 isSequentialOrUndefOrZeroInRange(Mask, 0, NumMaskElts, 0) &&
40134 DAG.getTargetLoweringInfo().isTypeLegal(MaskVT)) {
40135 APInt Zero = APInt::getZero(MaskEltSizeInBits);
40136 APInt AllOnes = APInt::getAllOnes(MaskEltSizeInBits);
40137 APInt UndefElts(NumMaskElts, 0);
40138 SmallVector<APInt, 64> EltBits(NumMaskElts, Zero);
40139 for (unsigned i = 0; i != NumMaskElts; ++i) {
40140 int M = Mask[i];
40141 if (M == SM_SentinelUndef) {
40142 UndefElts.setBit(i);
40143 continue;
40144 }
40145 if (M == SM_SentinelZero)
40146 continue;
40147 EltBits[i] = AllOnes;
40148 }
40149 SDValue BitMask = getConstVector(EltBits, UndefElts, MaskVT, DAG, DL);
40150 Res = CanonicalizeShuffleInput(MaskVT, V1);
40151 unsigned AndOpcode =
40152 MaskVT.isFloatingPoint() ? unsigned(X86ISD::FAND) : unsigned(ISD::AND);
40153 Res = DAG.getNode(AndOpcode, DL, MaskVT, Res, BitMask);
40154 return DAG.getBitcast(RootVT, Res);
40155 }
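// Editor's illustrative sketch, not part of X86ISelLowering.cpp: when the mask
// is an identity shuffle that only zeroes some elements, the whole shuffle is
// equivalent to an AND with a constant - all-ones where an element is kept,
// zero where it is cleared (undef entries are treated as zero here for
// simplicity). A scalar model of the fold performed above:
static void bitMaskFoldSketch(ArrayRef<int> Mask, ArrayRef<uint64_t> Src,
                              SmallVectorImpl<uint64_t> &Out) {
  Out.clear();
  for (size_t I = 0, E = Mask.size(); I != E; ++I)
    Out.push_back(Src[I] & (Mask[I] == int(I) ? ~0ULL : 0ULL));
}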
40156
40157 // If we have a single input shuffle with different shuffle patterns in
40158 // the 128-bit lanes, use a variable mask with VPERMILPS.
40159 // TODO: Combine other mask types at higher depths.
40160 if (UnaryShuffle && AllowVariablePerLaneMask && !MaskContainsZeros &&
40161 ((MaskVT == MVT::v8f32 && Subtarget.hasAVX()) ||
40162 (MaskVT == MVT::v16f32 && Subtarget.hasAVX512()))) {
40163 SmallVector<SDValue, 16> VPermIdx;
40164 for (int M : Mask) {
40165 SDValue Idx =
40166 M < 0 ? DAG.getUNDEF(MVT::i32) : DAG.getConstant(M % 4, DL, MVT::i32);
40167 VPermIdx.push_back(Idx);
40168 }
40169 SDValue VPermMask = DAG.getBuildVector(IntMaskVT, DL, VPermIdx);
40170 Res = CanonicalizeShuffleInput(MaskVT, V1);
40171 Res = DAG.getNode(X86ISD::VPERMILPV, DL, MaskVT, Res, VPermMask);
40172 return DAG.getBitcast(RootVT, Res);
40173 }
40174
40175 // With XOP, binary shuffles of 128/256-bit floating point vectors can combine
40176 // to VPERMIL2PD/VPERMIL2PS.
40177 if (AllowVariablePerLaneMask && Subtarget.hasXOP() &&
40178 (MaskVT == MVT::v2f64 || MaskVT == MVT::v4f64 || MaskVT == MVT::v4f32 ||
40179 MaskVT == MVT::v8f32)) {
40180 // VPERMIL2 Operation.
40181 // Bits[3] - Match Bit.
40182 // Bits[2:1] - (Per Lane) PD Shuffle Mask.
40183 // Bits[2:0] - (Per Lane) PS Shuffle Mask.
40184 unsigned NumLanes = MaskVT.getSizeInBits() / 128;
40185 unsigned NumEltsPerLane = NumMaskElts / NumLanes;
40186 SmallVector<int, 8> VPerm2Idx;
40187 unsigned M2ZImm = 0;
40188 for (int M : Mask) {
40189 if (M == SM_SentinelUndef) {
40190 VPerm2Idx.push_back(-1);
40191 continue;
40192 }
40193 if (M == SM_SentinelZero) {
40194 M2ZImm = 2;
40195 VPerm2Idx.push_back(8);
40196 continue;
40197 }
40198 int Index = (M % NumEltsPerLane) + ((M / NumMaskElts) * NumEltsPerLane);
40199 Index = (MaskVT.getScalarSizeInBits() == 64 ? Index << 1 : Index);
40200 VPerm2Idx.push_back(Index);
40201 }
40202 V1 = CanonicalizeShuffleInput(MaskVT, V1);
40203 V2 = CanonicalizeShuffleInput(MaskVT, V2);
40204 SDValue VPerm2MaskOp = getConstVector(VPerm2Idx, IntMaskVT, DAG, DL, true);
40205 Res = DAG.getNode(X86ISD::VPERMIL2, DL, MaskVT, V1, V2, VPerm2MaskOp,
40206 DAG.getTargetConstant(M2ZImm, DL, MVT::i8));
40207 return DAG.getBitcast(RootVT, Res);
40208 }
40209
40210 // If we have 3 or more shuffle instructions or a chain involving a variable
40211 // mask, we can replace them with a single PSHUFB instruction profitably.
40212 // Intel's manuals suggest only using PSHUFB if doing so replaces 5
40213 // instructions, but in practice PSHUFB tends to be *very* fast, so we're
40214 // more aggressive.
40215 if (UnaryShuffle && AllowVariablePerLaneMask &&
40216 ((RootVT.is128BitVector() && Subtarget.hasSSSE3()) ||
40217 (RootVT.is256BitVector() && Subtarget.hasAVX2()) ||
40218 (RootVT.is512BitVector() && Subtarget.hasBWI()))) {
40219 SmallVector<SDValue, 16> PSHUFBMask;
40220 int NumBytes = RootVT.getSizeInBits() / 8;
40221 int Ratio = NumBytes / NumMaskElts;
40222 for (int i = 0; i < NumBytes; ++i) {
40223 int M = Mask[i / Ratio];
40224 if (M == SM_SentinelUndef) {
40225 PSHUFBMask.push_back(DAG.getUNDEF(MVT::i8));
40226 continue;
40227 }
40228 if (M == SM_SentinelZero) {
40229 PSHUFBMask.push_back(DAG.getConstant(0x80, DL, MVT::i8));
40230 continue;
40231 }
40232 M = Ratio * M + i % Ratio;
40233 assert((M / 16) == (i / 16) && "Lane crossing detected");
40234 PSHUFBMask.push_back(DAG.getConstant(M, DL, MVT::i8));
40235 }
40236 MVT ByteVT = MVT::getVectorVT(MVT::i8, NumBytes);
40237 Res = CanonicalizeShuffleInput(ByteVT, V1);
40238 SDValue PSHUFBMaskOp = DAG.getBuildVector(ByteVT, DL, PSHUFBMask);
40239 Res = DAG.getNode(X86ISD::PSHUFB, DL, ByteVT, Res, PSHUFBMaskOp);
40240 return DAG.getBitcast(RootVT, Res);
40241 }
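// Editor's illustrative sketch, not part of X86ISelLowering.cpp: PSHUFB's
// control bytes behave as modelled below - if bit 7 of the control byte is
// set (hence the 0x80 constants above) the result byte is zero, otherwise the
// low four bits index a byte within the same 16-byte lane.
static uint8_t pshufbByteSketch(const uint8_t Lane[16], uint8_t Control) {
  return (Control & 0x80) ? 0 : Lane[Control & 0x0F];
}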
40242
40243 // With XOP, if we have a 128-bit binary input shuffle we can always combine
40244 // to VPPERM. We match the depth requirement of PSHUFB - VPPERM is never
40245 // slower than PSHUFB on targets that support both.
40246 if (AllowVariablePerLaneMask && RootVT.is128BitVector() &&
40247 Subtarget.hasXOP()) {
40248 // VPPERM Mask Operation
40249 // Bits[4:0] - Byte Index (0 - 31)
40250 // Bits[7:5] - Permute Operation (0 - Source byte, 4 - ZERO)
40251 SmallVector<SDValue, 16> VPPERMMask;
40252 int NumBytes = 16;
40253 int Ratio = NumBytes / NumMaskElts;
40254 for (int i = 0; i < NumBytes; ++i) {
40255 int M = Mask[i / Ratio];
40256 if (M == SM_SentinelUndef) {
40257 VPPERMMask.push_back(DAG.getUNDEF(MVT::i8));
40258 continue;
40259 }
40260 if (M == SM_SentinelZero) {
40261 VPPERMMask.push_back(DAG.getConstant(0x80, DL, MVT::i8));
40262 continue;
40263 }
40264 M = Ratio * M + i % Ratio;
40265 VPPERMMask.push_back(DAG.getConstant(M, DL, MVT::i8));
40266 }
40267 MVT ByteVT = MVT::v16i8;
40268 V1 = CanonicalizeShuffleInput(ByteVT, V1);
40269 V2 = CanonicalizeShuffleInput(ByteVT, V2);
40270 SDValue VPPERMMaskOp = DAG.getBuildVector(ByteVT, DL, VPPERMMask);
40271 Res = DAG.getNode(X86ISD::VPPERM, DL, ByteVT, V1, V2, VPPERMMaskOp);
40272 return DAG.getBitcast(RootVT, Res);
40273 }
40274
40275 // If that failed and either input is extracted then try to combine as a
40276 // shuffle with the larger type.
40277 if (SDValue WideShuffle = combineX86ShuffleChainWithExtract(
40278 Inputs, Root, BaseMask, Depth, HasVariableMask,
40279 AllowVariableCrossLaneMask, AllowVariablePerLaneMask, DAG, Subtarget))
40280 return WideShuffle;
40281
40282 // If we have a dual input shuffle then lower to VPERMV3,
40283 // (non-VLX will pad to 512-bit shuffles)
40284 if (!UnaryShuffle && AllowVariablePerLaneMask && !MaskContainsZeros &&
40285 ((Subtarget.hasAVX512() &&
40286 (MaskVT == MVT::v2f64 || MaskVT == MVT::v4f64 || MaskVT == MVT::v8f64 ||
40287 MaskVT == MVT::v2i64 || MaskVT == MVT::v4i64 || MaskVT == MVT::v8i64 ||
40288 MaskVT == MVT::v4f32 || MaskVT == MVT::v4i32 || MaskVT == MVT::v8f32 ||
40289 MaskVT == MVT::v8i32 || MaskVT == MVT::v16f32 ||
40290 MaskVT == MVT::v16i32)) ||
40291 (Subtarget.hasBWI() && AllowBWIVPERMV3 &&
40292 (MaskVT == MVT::v8i16 || MaskVT == MVT::v16i16 ||
40293 MaskVT == MVT::v32i16)) ||
40294 (Subtarget.hasVBMI() && AllowBWIVPERMV3 &&
40295 (MaskVT == MVT::v16i8 || MaskVT == MVT::v32i8 ||
40296 MaskVT == MVT::v64i8)))) {
40297 V1 = CanonicalizeShuffleInput(MaskVT, V1);
40298 V2 = CanonicalizeShuffleInput(MaskVT, V2);
40299 Res = lowerShuffleWithPERMV(DL, MaskVT, Mask, V1, V2, Subtarget, DAG);
40300 return DAG.getBitcast(RootVT, Res);
40301 }
40302
40303 // Failed to find any combines.
40304 return SDValue();
40305}
40306
40307// Combine an arbitrary chain of shuffles + extract_subvectors into a single
40308// instruction if possible.
40309//
40310// Wrapper for combineX86ShuffleChain that extends the shuffle mask to a larger
40311// type size to attempt to combine:
40312// shuffle(extract_subvector(x,c1),extract_subvector(y,c2),m1)
40313// -->
40314// extract_subvector(shuffle(x,y,m2),0)
40315static SDValue combineX86ShuffleChainWithExtract(
40316 ArrayRef<SDValue> Inputs, SDValue Root, ArrayRef<int> BaseMask, int Depth,
40317 bool HasVariableMask, bool AllowVariableCrossLaneMask,
40318 bool AllowVariablePerLaneMask, SelectionDAG &DAG,
40319 const X86Subtarget &Subtarget) {
40320 unsigned NumMaskElts = BaseMask.size();
40321 unsigned NumInputs = Inputs.size();
40322 if (NumInputs == 0)
40323 return SDValue();
40324
40325 EVT RootVT = Root.getValueType();
40326 unsigned RootSizeInBits = RootVT.getSizeInBits();
40327 unsigned RootEltSizeInBits = RootSizeInBits / NumMaskElts;
40328 assert((RootSizeInBits % NumMaskElts) == 0 && "Unexpected root shuffle mask");
40329
40330 // Peek through extract_subvector to find widest legal vector.
40331 // TODO: Handle ISD::TRUNCATE
40332 unsigned WideSizeInBits = RootSizeInBits;
40333 for (unsigned I = 0; I != NumInputs; ++I) {
40334 SDValue Input = peekThroughBitcasts(Inputs[I]);
40335 while (Input.getOpcode() == ISD::EXTRACT_SUBVECTOR)
40336 Input = peekThroughBitcasts(Input.getOperand(0));
40337 if (DAG.getTargetLoweringInfo().isTypeLegal(Input.getValueType()) &&
40338 WideSizeInBits < Input.getValueSizeInBits())
40339 WideSizeInBits = Input.getValueSizeInBits();
40340 }
40341
40342 // Bail if we fail to find a source larger than the existing root.
40343 unsigned Scale = WideSizeInBits / RootSizeInBits;
40344 if (WideSizeInBits <= RootSizeInBits ||
40345 (WideSizeInBits % RootSizeInBits) != 0)
40346 return SDValue();
40347
40348 // Create new mask for larger type.
40349 SmallVector<int, 64> WideMask(BaseMask);
40350 for (int &M : WideMask) {
40351 if (M < 0)
40352 continue;
40353 M = (M % NumMaskElts) + ((M / NumMaskElts) * Scale * NumMaskElts);
40354 }
40355 WideMask.append((Scale - 1) * NumMaskElts, SM_SentinelUndef);
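  // Editor's note (illustrative, not part of X86ISelLowering.cpp): with
  // NumMaskElts == 4 and Scale == 2, a base mask {0,5,2,7} is remapped into
  // the widened index space as {0,9,2,11}, and four SM_SentinelUndef entries
  // are then appended for the not-yet-referenced upper half.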
40356
40357 // Attempt to peek through inputs and adjust mask when we extract from an
40358 // upper subvector.
40359 int AdjustedMasks = 0;
40360 SmallVector<SDValue, 4> WideInputs(Inputs.begin(), Inputs.end());
40361 for (unsigned I = 0; I != NumInputs; ++I) {
40362 SDValue &Input = WideInputs[I];
40363 Input = peekThroughBitcasts(Input);
40364 while (Input.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
40365 Input.getOperand(0).getValueSizeInBits() <= WideSizeInBits) {
40366 uint64_t Idx = Input.getConstantOperandVal(1);
40367 if (Idx != 0) {
40368 ++AdjustedMasks;
40369 unsigned InputEltSizeInBits = Input.getScalarValueSizeInBits();
40370 Idx = (Idx * InputEltSizeInBits) / RootEltSizeInBits;
40371
40372 int lo = I * WideMask.size();
40373 int hi = (I + 1) * WideMask.size();
40374 for (int &M : WideMask)
40375 if (lo <= M && M < hi)
40376 M += Idx;
40377 }
40378 Input = peekThroughBitcasts(Input.getOperand(0));
40379 }
40380 }
40381
40382 // Remove unused/repeated shuffle source ops.
40383 resolveTargetShuffleInputsAndMask(WideInputs, WideMask);
40384 assert(!WideInputs.empty() && "Shuffle with no inputs detected");
40385
40386 // Bail if we're always extracting from the lowest subvectors
40387 // (combineX86ShuffleChain should match this for the current width), or if
40388 // the shuffle still references too many inputs.
40389 if (AdjustedMasks == 0 || WideInputs.size() > 2)
40390 return SDValue();
40391
40392 // Minor canonicalization of the accumulated shuffle mask to make it easier
40393 // to match below. All this does is detect masks with sequential pairs of
40394 // elements, and shrink them to the half-width mask. It does this in a loop
40395 // so it will reduce the size of the mask to the minimal width mask which
40396 // performs an equivalent shuffle.
40397 while (WideMask.size() > 1) {
40398 SmallVector<int, 64> WidenedMask;
40399 if (!canWidenShuffleElements(WideMask, WidenedMask))
40400 break;
40401 WideMask = std::move(WidenedMask);
40402 }
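// Editor's illustrative sketch, not part of X86ISelLowering.cpp: one widening
// step succeeds when every adjacent pair of mask elements references an
// even-aligned, adjacent pair of source elements, e.g. {0,1,6,7} widens to
// {0,3}. A simplified standalone check (the real canWidenShuffleElements also
// handles undef/zero sentinels):
static bool widenMaskPairsSketch(ArrayRef<int> Mask,
                                 SmallVectorImpl<int> &Widened) {
  Widened.clear();
  for (size_t I = 0, E = Mask.size(); I + 1 < E; I += 2) {
    int M0 = Mask[I], M1 = Mask[I + 1];
    if (M0 < 0 || (M0 % 2) != 0 || M1 != M0 + 1)
      return false;
    Widened.push_back(M0 / 2);
  }
  return true;
}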
40403
40404 // Canonicalization of binary shuffle masks to improve pattern matching by
40405 // commuting the inputs.
40406 if (WideInputs.size() == 2 && canonicalizeShuffleMaskWithCommute(WideMask)) {
40407 ShuffleVectorSDNode::commuteMask(WideMask);
40408 std::swap(WideInputs[0], WideInputs[1]);
40409 }
40410
40411 // Increase depth for every upper subvector we've peeked through.
40412 Depth += AdjustedMasks;
40413
40414 // Attempt to combine wider chain.
40415 // TODO: Can we use a better Root?
40416 SDValue WideRoot = WideInputs.front().getValueSizeInBits() >
40417 WideInputs.back().getValueSizeInBits()
40418 ? WideInputs.front()
40419 : WideInputs.back();
40420 assert(WideRoot.getValueSizeInBits() == WideSizeInBits &&
40421        "WideRootSize mismatch");
40422
40423 if (SDValue WideShuffle =
40424 combineX86ShuffleChain(WideInputs, WideRoot, WideMask, Depth,
40425 HasVariableMask, AllowVariableCrossLaneMask,
40426 AllowVariablePerLaneMask, DAG, Subtarget)) {
40427 WideShuffle =
40428 extractSubVector(WideShuffle, 0, DAG, SDLoc(Root), RootSizeInBits);
40429 return DAG.getBitcast(RootVT, WideShuffle);
40430 }
40431
40432 return SDValue();
40433}
40434
40435// Canonicalize the combined shuffle mask chain with horizontal ops.
40436// NOTE: This may update the Ops and Mask.
40437static SDValue canonicalizeShuffleMaskWithHorizOp(
40438 MutableArrayRef<SDValue> Ops, MutableArrayRef<int> Mask,
40439 unsigned RootSizeInBits, const SDLoc &DL, SelectionDAG &DAG,
40440 const X86Subtarget &Subtarget) {
40441 if (Mask.empty() || Ops.empty())
40442 return SDValue();
40443
40444 SmallVector<SDValue> BC;
40445 for (SDValue Op : Ops)
40446 BC.push_back(peekThroughBitcasts(Op));
40447
40448 // All ops must be the same horizop + type.
40449 SDValue BC0 = BC[0];
40450 EVT VT0 = BC0.getValueType();
40451 unsigned Opcode0 = BC0.getOpcode();
40452 if (VT0.getSizeInBits() != RootSizeInBits || llvm::any_of(BC, [&](SDValue V) {
40453 return V.getOpcode() != Opcode0 || V.getValueType() != VT0;
40454 }))
40455 return SDValue();
40456
40457 bool isHoriz = (Opcode0 == X86ISD::FHADD || Opcode0 == X86ISD::HADD ||
40458 Opcode0 == X86ISD::FHSUB || Opcode0 == X86ISD::HSUB);
40459 bool isPack = (Opcode0 == X86ISD::PACKSS || Opcode0 == X86ISD::PACKUS);
40460 if (!isHoriz && !isPack)
40461 return SDValue();
40462
40463 // Do all ops have a single use?
40464 bool OneUseOps = llvm::all_of(Ops, [](SDValue Op) {
40465 return Op.hasOneUse() &&
40466 peekThroughBitcasts(Op) == peekThroughOneUseBitcasts(Op);
40467 });
40468
40469 int NumElts = VT0.getVectorNumElements();
40470 int NumLanes = VT0.getSizeInBits() / 128;
40471 int NumEltsPerLane = NumElts / NumLanes;
40472 int NumHalfEltsPerLane = NumEltsPerLane / 2;
40473 MVT SrcVT = BC0.getOperand(0).getSimpleValueType();
40474 unsigned EltSizeInBits = RootSizeInBits / Mask.size();
40475
40476 if (NumEltsPerLane >= 4 &&
40477 (isPack || shouldUseHorizontalOp(Ops.size() == 1, DAG, Subtarget))) {
40478 SmallVector<int> LaneMask, ScaledMask;
40479 if (isRepeatedTargetShuffleMask(128, EltSizeInBits, Mask, LaneMask) &&
40480 scaleShuffleElements(LaneMask, 4, ScaledMask)) {
40481 // See if we can remove the shuffle by re-sorting the HOP chain so that
40482 // the HOP args are pre-shuffled.
40483 // TODO: Generalize to any sized/depth chain.
40484 // TODO: Add support for PACKSS/PACKUS.
40485 if (isHoriz) {
40486 // Attempt to find a HOP(HOP(X,Y),HOP(Z,W)) source operand.
40487 auto GetHOpSrc = [&](int M) {
40488 if (M == SM_SentinelUndef)
40489 return DAG.getUNDEF(VT0);
40490 if (M == SM_SentinelZero)
40491 return getZeroVector(VT0.getSimpleVT(), Subtarget, DAG, DL);
40492 SDValue Src0 = BC[M / 4];
40493 SDValue Src1 = Src0.getOperand((M % 4) >= 2);
40494 if (Src1.getOpcode() == Opcode0 && Src0->isOnlyUserOf(Src1.getNode()))
40495 return Src1.getOperand(M % 2);
40496 return SDValue();
40497 };
40498 SDValue M0 = GetHOpSrc(ScaledMask[0]);
40499 SDValue M1 = GetHOpSrc(ScaledMask[1]);
40500 SDValue M2 = GetHOpSrc(ScaledMask[2]);
40501 SDValue M3 = GetHOpSrc(ScaledMask[3]);
40502 if (M0 && M1 && M2 && M3) {
40503 SDValue LHS = DAG.getNode(Opcode0, DL, SrcVT, M0, M1);
40504 SDValue RHS = DAG.getNode(Opcode0, DL, SrcVT, M2, M3);
40505 return DAG.getNode(Opcode0, DL, VT0, LHS, RHS);
40506 }
40507 }
40508 // shuffle(hop(x,y),hop(z,w)) -> permute(hop(x,z)) etc.
40509 if (Ops.size() >= 2) {
40510 SDValue LHS, RHS;
40511 auto GetHOpSrc = [&](int M, int &OutM) {
40512 // TODO: Support SM_SentinelZero
40513 if (M < 0)
40514 return M == SM_SentinelUndef;
40515 SDValue Src = BC[M / 4].getOperand((M % 4) >= 2);
40516 if (!LHS || LHS == Src) {
40517 LHS = Src;
40518 OutM = (M % 2);
40519 return true;
40520 }
40521 if (!RHS || RHS == Src) {
40522 RHS = Src;
40523 OutM = (M % 2) + 2;
40524 return true;
40525 }
40526 return false;
40527 };
40528 int PostMask[4] = {-1, -1, -1, -1};
40529 if (GetHOpSrc(ScaledMask[0], PostMask[0]) &&
40530 GetHOpSrc(ScaledMask[1], PostMask[1]) &&
40531 GetHOpSrc(ScaledMask[2], PostMask[2]) &&
40532 GetHOpSrc(ScaledMask[3], PostMask[3])) {
40533 LHS = DAG.getBitcast(SrcVT, LHS);
40534 RHS = DAG.getBitcast(SrcVT, RHS ? RHS : LHS);
40535 SDValue Res = DAG.getNode(Opcode0, DL, VT0, LHS, RHS);
40536 // Use SHUFPS for the permute so this will work on SSE3 targets,
40537 // shuffle combining and domain handling will simplify this later on.
40538 MVT ShuffleVT = MVT::getVectorVT(MVT::f32, RootSizeInBits / 32);
40539 Res = DAG.getBitcast(ShuffleVT, Res);
40540 return DAG.getNode(X86ISD::SHUFP, DL, ShuffleVT, Res, Res,
40541 getV4X86ShuffleImm8ForMask(PostMask, DL, DAG));
40542 }
40543 }
40544 }
40545 }
40546
40547 if (2 < Ops.size())
40548 return SDValue();
40549
40550 SDValue BC1 = BC[BC.size() - 1];
40551 if (Mask.size() == VT0.getVectorNumElements()) {
40552 // Canonicalize binary shuffles of horizontal ops that use the
40553 // same sources to a unary shuffle.
40554 // TODO: Try to perform this fold even if the shuffle remains.
40555 if (Ops.size() == 2) {
40556 auto ContainsOps = [](SDValue HOp, SDValue Op) {
40557 return Op == HOp.getOperand(0) || Op == HOp.getOperand(1);
40558 };
40559 // Commute if all BC0's ops are contained in BC1.
40560 if (ContainsOps(BC1, BC0.getOperand(0)) &&
40561 ContainsOps(BC1, BC0.getOperand(1))) {
40562 ShuffleVectorSDNode::commuteMask(Mask);
40563 std::swap(Ops[0], Ops[1]);
40564 std::swap(BC0, BC1);
40565 }
40566
40567 // If BC1 can be represented by BC0, then convert to unary shuffle.
40568 if (ContainsOps(BC0, BC1.getOperand(0)) &&
40569 ContainsOps(BC0, BC1.getOperand(1))) {
40570 for (int &M : Mask) {
40571 if (M < NumElts) // BC0 element or UNDEF/Zero sentinel.
40572 continue;
40573 int SubLane = ((M % NumEltsPerLane) >= NumHalfEltsPerLane) ? 1 : 0;
40574 M -= NumElts + (SubLane * NumHalfEltsPerLane);
40575 if (BC1.getOperand(SubLane) != BC0.getOperand(0))
40576 M += NumHalfEltsPerLane;
40577 }
40578 }
40579 }
40580
40581 // Canonicalize unary horizontal ops to only refer to lower halves.
40582 for (int i = 0; i != NumElts; ++i) {
40583 int &M = Mask[i];
40584 if (isUndefOrZero(M))
40585 continue;
40586 if (M < NumElts && BC0.getOperand(0) == BC0.getOperand(1) &&
40587 (M % NumEltsPerLane) >= NumHalfEltsPerLane)
40588 M -= NumHalfEltsPerLane;
40589 if (NumElts <= M && BC1.getOperand(0) == BC1.getOperand(1) &&
40590 (M % NumEltsPerLane) >= NumHalfEltsPerLane)
40591 M -= NumHalfEltsPerLane;
40592 }
40593 }
40594
40595 // Combine binary shuffle of 2 similar 'Horizontal' instructions into a
40596 // single instruction. Attempt to match a v2X64 repeating shuffle pattern that
40597 // represents the LHS/RHS inputs for the lower/upper halves.
40598 SmallVector<int, 16> TargetMask128, WideMask128;
40599 if (isRepeatedTargetShuffleMask(128, EltSizeInBits, Mask, TargetMask128) &&
40600 scaleShuffleElements(TargetMask128, 2, WideMask128)) {
40601 assert(isUndefOrZeroOrInRange(WideMask128, 0, 4) && "Illegal shuffle");
40602 bool SingleOp = (Ops.size() == 1);
40603 if (isPack || OneUseOps ||
40604 shouldUseHorizontalOp(SingleOp, DAG, Subtarget)) {
40605 SDValue Lo = isInRange(WideMask128[0], 0, 2) ? BC0 : BC1;
40606 SDValue Hi = isInRange(WideMask128[1], 0, 2) ? BC0 : BC1;
40607 Lo = Lo.getOperand(WideMask128[0] & 1);
40608 Hi = Hi.getOperand(WideMask128[1] & 1);
40609 if (SingleOp) {
40610 SDValue Undef = DAG.getUNDEF(SrcVT);
40611 SDValue Zero = getZeroVector(SrcVT, Subtarget, DAG, DL);
40612 Lo = (WideMask128[0] == SM_SentinelZero ? Zero : Lo);
40613 Hi = (WideMask128[1] == SM_SentinelZero ? Zero : Hi);
40614 Lo = (WideMask128[0] == SM_SentinelUndef ? Undef : Lo);
40615 Hi = (WideMask128[1] == SM_SentinelUndef ? Undef : Hi);
40616 }
40617 return DAG.getNode(Opcode0, DL, VT0, Lo, Hi);
40618 }
40619 }
40620
40621 return SDValue();
40622}
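// Editor's illustrative sketch, not part of X86ISelLowering.cpp: the folds
// above rely on the horizontal-op element layout modelled here for a 4 x f32
// HADD - the low half of the result holds pairwise sums of the first operand,
// the high half pairwise sums of the second. A shuffle of two such results can
// therefore often be re-expressed as a single HADD of re-picked operands.
static void haddV4F32Sketch(const float L[4], const float R[4], float Out[4]) {
  Out[0] = L[0] + L[1];
  Out[1] = L[2] + L[3];
  Out[2] = R[0] + R[1];
  Out[3] = R[2] + R[3];
}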
40623
40624// Attempt to constant fold all of the constant source ops.
40625// Returns true if the entire shuffle is folded to a constant.
40626// TODO: Extend this to merge multiple constant Ops and update the mask.
40627static SDValue combineX86ShufflesConstants(ArrayRef<SDValue> Ops,
40628 ArrayRef<int> Mask, SDValue Root,
40629 bool HasVariableMask,
40630 SelectionDAG &DAG,
40631 const X86Subtarget &Subtarget) {
40632 MVT VT = Root.getSimpleValueType();
40633
40634 unsigned SizeInBits = VT.getSizeInBits();
40635 unsigned NumMaskElts = Mask.size();
40636 unsigned MaskSizeInBits = SizeInBits / NumMaskElts;
40637 unsigned NumOps = Ops.size();
40638
40639 // Extract constant bits from each source op.
40640 SmallVector<APInt, 16> UndefEltsOps(NumOps);
40641 SmallVector<SmallVector<APInt, 16>, 16> RawBitsOps(NumOps);
40642 for (unsigned I = 0; I != NumOps; ++I)
40643 if (!getTargetConstantBitsFromNode(Ops[I], MaskSizeInBits, UndefEltsOps[I],
40644 RawBitsOps[I]))
40645 return SDValue();
40646
40647 // If we're optimizing for size, only fold if at least one of the constants is
40648 // only used once or the combined shuffle has included a variable mask
40649 // shuffle; this is to avoid constant pool bloat.
40650 bool IsOptimizingSize = DAG.shouldOptForSize();
40651 if (IsOptimizingSize && !HasVariableMask &&
40652 llvm::none_of(Ops, [](SDValue SrcOp) { return SrcOp->hasOneUse(); }))
40653 return SDValue();
40654
40655 // Shuffle the constant bits according to the mask.
40656 SDLoc DL(Root);
40657 APInt UndefElts(NumMaskElts, 0);
40658 APInt ZeroElts(NumMaskElts, 0);
40659 APInt ConstantElts(NumMaskElts, 0);
40660 SmallVector<APInt, 8> ConstantBitData(NumMaskElts,
40661 APInt::getZero(MaskSizeInBits));
40662 for (unsigned i = 0; i != NumMaskElts; ++i) {
40663 int M = Mask[i];
40664 if (M == SM_SentinelUndef) {
40665 UndefElts.setBit(i);
40666 continue;
40667 } else if (M == SM_SentinelZero) {
40668 ZeroElts.setBit(i);
40669 continue;
40670 }
40671 assert(0 <= M && M < (int)(NumMaskElts * NumOps));
40672
40673 unsigned SrcOpIdx = (unsigned)M / NumMaskElts;
40674 unsigned SrcMaskIdx = (unsigned)M % NumMaskElts;
40675
40676 auto &SrcUndefElts = UndefEltsOps[SrcOpIdx];
40677 if (SrcUndefElts[SrcMaskIdx]) {
40678 UndefElts.setBit(i);
40679 continue;
40680 }
40681
40682 auto &SrcEltBits = RawBitsOps[SrcOpIdx];
40683 APInt &Bits = SrcEltBits[SrcMaskIdx];
40684 if (!Bits) {
40685 ZeroElts.setBit(i);
40686 continue;
40687 }
40688
40689 ConstantElts.setBit(i);
40690 ConstantBitData[i] = Bits;
40691 }
40692 assert((UndefElts | ZeroElts | ConstantElts).isAllOnes());
40693
40694 // Attempt to create a zero vector.
40695 if ((UndefElts | ZeroElts).isAllOnes())
40696 return getZeroVector(Root.getSimpleValueType(), Subtarget, DAG, DL);
40697
40698 // Create the constant data.
40699 MVT MaskSVT;
40700 if (VT.isFloatingPoint() && (MaskSizeInBits == 32 || MaskSizeInBits == 64))
40701 MaskSVT = MVT::getFloatingPointVT(MaskSizeInBits);
40702 else
40703 MaskSVT = MVT::getIntegerVT(MaskSizeInBits);
40704
40705 MVT MaskVT = MVT::getVectorVT(MaskSVT, NumMaskElts);
40706 if (!DAG.getTargetLoweringInfo().isTypeLegal(MaskVT))
40707 return SDValue();
40708
40709 SDValue CstOp = getConstVector(ConstantBitData, UndefElts, MaskVT, DAG, DL);
40710 return DAG.getBitcast(VT, CstOp);
40711}
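// Editor's illustrative sketch, not part of X86ISelLowering.cpp: once every
// source operand is constant, the shuffle can be evaluated element by element
// as modelled below - an undef sentinel stays undef, a zero sentinel becomes
// zero, and anything else is looked up as element (M % NumElts) of source op
// (M / NumElts). Hypothetical helper:
static APInt foldConstShuffleEltSketch(int M, unsigned NumElts,
                                       ArrayRef<SmallVector<APInt, 16>> SrcBits,
                                       unsigned EltSizeInBits, bool &IsUndef) {
  IsUndef = (M == SM_SentinelUndef);
  if (M < 0) // SM_SentinelUndef or SM_SentinelZero - no constant data to copy.
    return APInt::getZero(EltSizeInBits);
  return SrcBits[unsigned(M) / NumElts][unsigned(M) % NumElts];
}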
40712
40713namespace llvm {
40714 namespace X86 {
40715 enum {
40716 MaxShuffleCombineDepth = 8
40717 };
40718 }
40719} // namespace llvm
40720
40721/// Fully generic combining of x86 shuffle instructions.
40722///
40723/// This should be the last combine run over the x86 shuffle instructions. Once
40724/// they have been fully optimized, this will recursively consider all chains
40725/// of single-use shuffle instructions, build a generic model of the cumulative
40726/// shuffle operation, and check for simpler instructions which implement this
40727/// operation. We use this primarily for two purposes:
40728///
40729/// 1) Collapse generic shuffles to specialized single instructions when
40730/// equivalent. In most cases, this is just an encoding size win, but
40731/// sometimes we will collapse multiple generic shuffles into a single
40732/// special-purpose shuffle.
40733/// 2) Look for sequences of shuffle instructions with 3 or more total
40734/// instructions, and replace them with the slightly more expensive SSSE3
40735/// PSHUFB instruction if available. We do this as the last combining step
40736/// to ensure we avoid using PSHUFB if we can implement the shuffle with
40737/// a suitable short sequence of other instructions. The PSHUFB will either
40738/// use a register or have to read from memory and so is slightly (but only
40739/// slightly) more expensive than the other shuffle instructions.
40740///
40741/// Because this is inherently a quadratic operation (for each shuffle in
40742/// a chain, we recurse up the chain), the depth is limited to 8 instructions.
40743/// This should never be an issue in practice as the shuffle lowering doesn't
40744/// produce sequences of more than 8 instructions.
40745///
40746/// FIXME: We will currently miss some cases where the redundant shuffling
40747/// would simplify under the threshold for PSHUFB formation because of
40748/// combine-ordering. To fix this, we should do the redundant instruction
40749/// combining in this recursive walk.
40750static SDValue combineX86ShufflesRecursively(
40751 ArrayRef<SDValue> SrcOps, int SrcOpIndex, SDValue Root,
40752 ArrayRef<int> RootMask, ArrayRef<const SDNode *> SrcNodes, unsigned Depth,
40753 unsigned MaxDepth, bool HasVariableMask, bool AllowVariableCrossLaneMask,
40754 bool AllowVariablePerLaneMask, SelectionDAG &DAG,
40755 const X86Subtarget &Subtarget) {
40756 assert(!RootMask.empty() &&
40757        (RootMask.size() > 1 || (RootMask[0] == 0 && SrcOpIndex == 0)) &&
40758        "Illegal shuffle root mask");
40759 MVT RootVT = Root.getSimpleValueType();
40760 assert(RootVT.isVector() && "Shuffles operate on vector types!");
40761 unsigned RootSizeInBits = RootVT.getSizeInBits();
40762
40763 // Bound the depth of our recursive combine because this is ultimately
40764 // quadratic in nature.
40765 if (Depth >= MaxDepth)
40766 return SDValue();
40767
40768 // Directly rip through bitcasts to find the underlying operand.
40769 SDValue Op = SrcOps[SrcOpIndex];
40770 Op = peekThroughOneUseBitcasts(Op);
40771
40772 EVT VT = Op.getValueType();
40773 if (!VT.isVector() || !VT.isSimple())
40774 return SDValue(); // Bail if we hit a non-simple non-vector.
40775
40776 // FIXME: Just bail on f16 for now.
40777 if (VT.getVectorElementType() == MVT::f16)
40778 return SDValue();
40779
40780 assert((RootSizeInBits % VT.getSizeInBits()) == 0 &&
40781        "Can only combine shuffles upto size of the root op.");
40782
40783 // Create a demanded elts mask from the referenced elements of Op.
40784 APInt OpDemandedElts = APInt::getZero(RootMask.size());
40785 for (int M : RootMask) {
40786 int BaseIdx = RootMask.size() * SrcOpIndex;
40787 if (isInRange(M, BaseIdx, BaseIdx + RootMask.size()))
40788 OpDemandedElts.setBit(M - BaseIdx);
40789 }
40790 if (RootSizeInBits != VT.getSizeInBits()) {
40791 // Op is smaller than Root - extract the demanded elts for the subvector.
40792 unsigned Scale = RootSizeInBits / VT.getSizeInBits();
40793 unsigned NumOpMaskElts = RootMask.size() / Scale;
40794 assert((RootMask.size() % Scale) == 0 && "Root mask size mismatch");
40795 assert(OpDemandedElts
40796            .extractBits(RootMask.size() - NumOpMaskElts, NumOpMaskElts)
40797            .isZero() &&
40798        "Out of range elements referenced in root mask");
40799 OpDemandedElts = OpDemandedElts.extractBits(NumOpMaskElts, 0);
40800 }
40801 OpDemandedElts =
40802 APIntOps::ScaleBitMask(OpDemandedElts, VT.getVectorNumElements());
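  // Editor's note (illustrative, not part of X86ISelLowering.cpp): scaling the
  // demanded-elements mask up replicates each bit across the corresponding
  // narrower elements, e.g. demanding elements {0,2} of a 4-element op becomes
  // demanding elements {0,1,4,5} once the op is viewed as 8 elements.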
40803
40804 // Extract target shuffle mask and resolve sentinels and inputs.
40805 SmallVector<int, 64> OpMask;
40806 SmallVector<SDValue, 2> OpInputs;
40807 APInt OpUndef, OpZero;
40808 bool IsOpVariableMask = isTargetShuffleVariableMask(Op.getOpcode());
40809 if (getTargetShuffleInputs(Op, OpDemandedElts, OpInputs, OpMask, OpUndef,
40810 OpZero, DAG, Depth, false)) {
40811 // Shuffle inputs must not be larger than the shuffle result.
40812 // TODO: Relax this for single input faux shuffles (e.g. trunc).
40813 if (llvm::any_of(OpInputs, [VT](SDValue OpInput) {
40814 return OpInput.getValueSizeInBits() > VT.getSizeInBits();
40815 }))
40816 return SDValue();
40817 } else if (Op.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
40818 (RootSizeInBits % Op.getOperand(0).getValueSizeInBits()) == 0 &&
40819 !isNullConstant(Op.getOperand(1))) {
40820 SDValue SrcVec = Op.getOperand(0);
40821 int ExtractIdx = Op.getConstantOperandVal(1);
40822 unsigned NumElts = VT.getVectorNumElements();
40823 OpInputs.assign({SrcVec});
40824 OpMask.assign(NumElts, SM_SentinelUndef);
40825 std::iota(OpMask.begin(), OpMask.end(), ExtractIdx);
40826 OpZero = OpUndef = APInt::getZero(NumElts);
40827 } else {
40828 return SDValue();
40829 }
40830
40831 // If the shuffle result was smaller than the root, we need to adjust the
40832 // mask indices and pad the mask with undefs.
40833 if (RootSizeInBits > VT.getSizeInBits()) {
40834 unsigned NumSubVecs = RootSizeInBits / VT.getSizeInBits();
40835 unsigned OpMaskSize = OpMask.size();
40836 if (OpInputs.size() > 1) {
40837 unsigned PaddedMaskSize = NumSubVecs * OpMaskSize;
40838 for (int &M : OpMask) {
40839 if (M < 0)
40840 continue;
40841 int EltIdx = M % OpMaskSize;
40842 int OpIdx = M / OpMaskSize;
40843 M = (PaddedMaskSize * OpIdx) + EltIdx;
40844 }
40845 }
40846 OpZero = OpZero.zext(NumSubVecs * OpMaskSize);
40847 OpUndef = OpUndef.zext(NumSubVecs * OpMaskSize);
40848 OpMask.append((NumSubVecs - 1) * OpMaskSize, SM_SentinelUndef);
40849 }
40850
40851 SmallVector<int, 64> Mask;
40852 SmallVector<SDValue, 16> Ops;
40853
40854 // We don't need to merge masks if the root is empty.
40855 bool EmptyRoot = (Depth == 0) && (RootMask.size() == 1);
40856 if (EmptyRoot) {
40857 // Only resolve zeros if it will remove an input, otherwise we might end
40858 // up in an infinite loop.
40859 bool ResolveKnownZeros = true;
40860 if (!OpZero.isZero()) {
40861 APInt UsedInputs = APInt::getZero(OpInputs.size());
40862 for (int i = 0, e = OpMask.size(); i != e; ++i) {
40863 int M = OpMask[i];
40864 if (OpUndef[i] || OpZero[i] || isUndefOrZero(M))
40865 continue;
40866 UsedInputs.setBit(M / OpMask.size());
40867 if (UsedInputs.isAllOnes()) {
40868 ResolveKnownZeros = false;
40869 break;
40870 }
40871 }
40872 }
40873 resolveTargetShuffleFromZeroables(OpMask, OpUndef, OpZero,
40874 ResolveKnownZeros);
40875
40876 Mask = OpMask;
40877 Ops.append(OpInputs.begin(), OpInputs.end());
40878 } else {
40879 resolveTargetShuffleFromZeroables(OpMask, OpUndef, OpZero);
40880
40881 // Add the inputs to the Ops list, avoiding duplicates.
40882 Ops.append(SrcOps.begin(), SrcOps.end());
40883
40884 auto AddOp = [&Ops](SDValue Input, int InsertionPoint) -> int {
40885 // Attempt to find an existing match.
40886 SDValue InputBC = peekThroughBitcasts(Input);
40887 for (int i = 0, e = Ops.size(); i < e; ++i)
40888 if (InputBC == peekThroughBitcasts(Ops[i]))
40889 return i;
40890 // Match failed - should we replace an existing Op?
40891 if (InsertionPoint >= 0) {
40892 Ops[InsertionPoint] = Input;
40893 return InsertionPoint;
40894 }
40895 // Add to the end of the Ops list.
40896 Ops.push_back(Input);
40897 return Ops.size() - 1;
40898 };
40899
40900 SmallVector<int, 2> OpInputIdx;
40901 for (SDValue OpInput : OpInputs)
40902 OpInputIdx.push_back(
40903 AddOp(OpInput, OpInputIdx.empty() ? SrcOpIndex : -1));
40904
40905 assert(((RootMask.size() > OpMask.size() &&
40906          RootMask.size() % OpMask.size() == 0) ||
40907         (OpMask.size() > RootMask.size() &&
40908          OpMask.size() % RootMask.size() == 0) ||
40909         OpMask.size() == RootMask.size()) &&
40910        "The smaller number of elements must divide the larger.");
40911
40912 // This function can be performance-critical, so we rely on the power-of-2
40913 // knowledge that we have about the mask sizes to replace div/rem ops with
40914 // bit-masks and shifts.
40915 assert(llvm::has_single_bit<uint32_t>(RootMask.size()) &&
40916 "Non-power-of-2 shuffle mask sizes");
40917 assert(llvm::has_single_bit<uint32_t>(OpMask.size()) &&
40918 "Non-power-of-2 shuffle mask sizes");
40919 unsigned RootMaskSizeLog2 = llvm::countr_zero(RootMask.size());
40920 unsigned OpMaskSizeLog2 = llvm::countr_zero(OpMask.size());
40921
40922 unsigned MaskWidth = std::max<unsigned>(OpMask.size(), RootMask.size());
40923 unsigned RootRatio =
40924 std::max<unsigned>(1, OpMask.size() >> RootMaskSizeLog2);
40925 unsigned OpRatio = std::max<unsigned>(1, RootMask.size() >> OpMaskSizeLog2);
40926 assert((RootRatio == 1 || OpRatio == 1) &&
40927 "Must not have a ratio for both incoming and op masks!");
40928
40929 assert(isPowerOf2_32(MaskWidth) && "Non-power-of-2 shuffle mask sizes");
40930 assert(isPowerOf2_32(RootRatio) && "Non-power-of-2 shuffle mask sizes");
40931 assert(isPowerOf2_32(OpRatio) && "Non-power-of-2 shuffle mask sizes");
40932 unsigned RootRatioLog2 = llvm::countr_zero(RootRatio);
40933 unsigned OpRatioLog2 = llvm::countr_zero(OpRatio);
40934
40935 Mask.resize(MaskWidth, SM_SentinelUndef);
40936
40937 // Merge this shuffle operation's mask into our accumulated mask. Note that
40938 // this shuffle's mask will be the first applied to the input, followed by
40939 // the root mask to get us all the way to the root value arrangement. The
40940 // reason for this order is that we are recursing up the operation chain.
40941 for (unsigned i = 0; i < MaskWidth; ++i) {
40942 unsigned RootIdx = i >> RootRatioLog2;
40943 if (RootMask[RootIdx] < 0) {
40944 // This is a zero or undef lane; we're done.
40945 Mask[i] = RootMask[RootIdx];
40946 continue;
40947 }
40948
40949 unsigned RootMaskedIdx =
40950 RootRatio == 1
40951 ? RootMask[RootIdx]
40952 : (RootMask[RootIdx] << RootRatioLog2) + (i & (RootRatio - 1));
40953
40954 // Just insert the scaled root mask value if it references an input other
40955 // than the SrcOp we're currently inserting.
40956 if ((RootMaskedIdx < (SrcOpIndex * MaskWidth)) ||
40957 (((SrcOpIndex + 1) * MaskWidth) <= RootMaskedIdx)) {
40958 Mask[i] = RootMaskedIdx;
40959 continue;
40960 }
40961
40962 RootMaskedIdx = RootMaskedIdx & (MaskWidth - 1);
40963 unsigned OpIdx = RootMaskedIdx >> OpRatioLog2;
40964 if (OpMask[OpIdx] < 0) {
40965 // The incoming lanes are zero or undef, so it doesn't matter which ones
40966 // we are using.
40967 Mask[i] = OpMask[OpIdx];
40968 continue;
40969 }
40970
40971 // Ok, we have non-zero lanes, map them through to one of the Op's inputs.
40972 unsigned OpMaskedIdx = OpRatio == 1 ? OpMask[OpIdx]
40973 : (OpMask[OpIdx] << OpRatioLog2) +
40974 (RootMaskedIdx & (OpRatio - 1));
40975
40976 OpMaskedIdx = OpMaskedIdx & (MaskWidth - 1);
40977 int InputIdx = OpMask[OpIdx] / (int)OpMask.size();
40978 assert(0 <= OpInputIdx[InputIdx] && "Unknown target shuffle input");
40979 OpMaskedIdx += OpInputIdx[InputIdx] * MaskWidth;
40980
40981 Mask[i] = OpMaskedIdx;
40982 }
40983 }
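// Illustrative sketch (not part of the original source): a simplified model of
// the merge loop above, assuming both masks are power-of-2 sized and reference
// a single input, so the multi-input bookkeeping can be dropped. The divisions
// and remainders written out here are exactly what the shifts and AND-masks
// above compute.
//
//   static std::vector<int> composeMasks(const std::vector<int> &RootMask,
//                                        const std::vector<int> &OpMask) {
//     unsigned RootSize = RootMask.size(), OpSize = OpMask.size();
//     unsigned Width = std::max(RootSize, OpSize);
//     unsigned RootRatio = Width / RootSize; // == 1u << RootRatioLog2
//     unsigned OpRatio = Width / OpSize;     // == 1u << OpRatioLog2
//     std::vector<int> Mask(Width);
//     for (unsigned i = 0; i != Width; ++i) {
//       int R = RootMask[i / RootRatio];
//       if (R < 0) { Mask[i] = R; continue; }           // undef/zero sentinel
//       unsigned Scaled = R * RootRatio + (i % RootRatio);
//       int O = OpMask[Scaled / OpRatio];
//       Mask[i] = O < 0 ? O : int(O * OpRatio + (Scaled % OpRatio));
//     }
//     return Mask;
//   }
//
// For example, RootMask = {1,0} (a v2i64 half swap) composed over
// OpMask = {2,3,0,1} (a v4i32 rotate by two) yields {0,1,2,3}: the two
// shuffles cancel out once expressed at the common 4-element width.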
40984
40985 // Peek through vector widenings and set out of bounds mask indices to undef.
40986 // TODO: Can resolveTargetShuffleInputsAndMask do some of this?
40987 for (unsigned I = 0, E = Ops.size(); I != E; ++I) {
40988 SDValue &Op = Ops[I];
40989 if (Op.getOpcode() == ISD::INSERT_SUBVECTOR && Op.getOperand(0).isUndef() &&
40990 isNullConstant(Op.getOperand(2))) {
40991 Op = Op.getOperand(1);
40992 unsigned Scale = RootSizeInBits / Op.getValueSizeInBits();
40993 int Lo = I * Mask.size();
40994 int Hi = (I + 1) * Mask.size();
40995 int NewHi = Lo + (Mask.size() / Scale);
40996 for (int &M : Mask) {
40997 if (Lo <= M && NewHi <= M && M < Hi)
40998 M = SM_SentinelUndef;
40999 }
41000 }
41001 }
41002
41003 // Peek through any free extract_subvector nodes back to root size.
41004 for (SDValue &Op : Ops)
41005 while (Op.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
41006 (RootSizeInBits % Op.getOperand(0).getValueSizeInBits()) == 0 &&
41007 isNullConstant(Op.getOperand(1)))
41008 Op = Op.getOperand(0);
41009
41010 // Remove unused/repeated shuffle source ops.
41011 resolveTargetShuffleInputsAndMask(Ops, Mask);
41012
41013 // Handle the all undef/zero/ones cases early.
41014 if (all_of(Mask, [](int Idx) { return Idx == SM_SentinelUndef; }))
41015 return DAG.getUNDEF(RootVT);
41016 if (all_of(Mask, [](int Idx) { return Idx < 0; }))
41017 return getZeroVector(RootVT, Subtarget, DAG, SDLoc(Root));
41018 if (Ops.size() == 1 && ISD::isBuildVectorAllOnes(Ops[0].getNode()) &&
41019 !llvm::is_contained(Mask, SM_SentinelZero))
41020 return getOnesVector(RootVT, DAG, SDLoc(Root));
41021
41022 assert(!Ops.empty() && "Shuffle with no inputs detected");
41023 HasVariableMask |= IsOpVariableMask;
41024
41025 // Update the list of shuffle nodes that have been combined so far.
41026 SmallVector<const SDNode *, 16> CombinedNodes(SrcNodes.begin(),
41027 SrcNodes.end());
41028 CombinedNodes.push_back(Op.getNode());
41029
41030 // See if we can recurse into each shuffle source op (if it's a target
41031 // shuffle). The source op should generally only be combined if it either has
41032 // a single use (i.e. current Op) or all its users have already been combined;
41033 // if not, we can still combine but should prevent generation of variable
41034 // shuffles to avoid constant pool bloat.
41035 // Don't recurse if we already have more source ops than we can combine in
41036 // the remaining recursion depth.
41037 if (Ops.size() < (MaxDepth - Depth)) {
41038 for (int i = 0, e = Ops.size(); i < e; ++i) {
41039 // For empty roots, we need to resolve zeroable elements before combining
41040 // them with other shuffles.
41041 SmallVector<int, 64> ResolvedMask = Mask;
41042 if (EmptyRoot)
41043 resolveTargetShuffleFromZeroables(ResolvedMask, OpUndef, OpZero);
41044 bool AllowCrossLaneVar = false;
41045 bool AllowPerLaneVar = false;
41046 if (Ops[i].getNode()->hasOneUse() ||
41047 SDNode::areOnlyUsersOf(CombinedNodes, Ops[i].getNode())) {
41048 AllowCrossLaneVar = AllowVariableCrossLaneMask;
41049 AllowPerLaneVar = AllowVariablePerLaneMask;
41050 }
41051 if (SDValue Res = combineX86ShufflesRecursively(
41052 Ops, i, Root, ResolvedMask, CombinedNodes, Depth + 1, MaxDepth,
41053 HasVariableMask, AllowCrossLaneVar, AllowPerLaneVar, DAG,
41054 Subtarget))
41055 return Res;
41056 }
41057 }
41058
41059 // Attempt to constant fold all of the constant source ops.
41060 if (SDValue Cst = combineX86ShufflesConstants(
41061 Ops, Mask, Root, HasVariableMask, DAG, Subtarget))
41062 return Cst;
41063
41064 // If constant folding failed and we only have constants, then we have
41065 // multiple uses by a single non-variable shuffle - just bail.
41066 if (Depth == 0 && llvm::all_of(Ops, [&](SDValue Op) {
41067 APInt UndefElts;
41068 SmallVector<APInt> RawBits;
41069 unsigned EltSizeInBits = RootSizeInBits / Mask.size();
41070 return getTargetConstantBitsFromNode(Op, EltSizeInBits, UndefElts,
41071 RawBits);
41072 })) {
41073 return SDValue();
41074 }
41075
41076 // Canonicalize the combined shuffle mask chain with horizontal ops.
41077 // NOTE: This will update the Ops and Mask.
41078 if (SDValue HOp = canonicalizeShuffleMaskWithHorizOp(
41079 Ops, Mask, RootSizeInBits, SDLoc(Root), DAG, Subtarget))
41080 return DAG.getBitcast(RootVT, HOp);
41081
41082 // Try to refine our inputs given our knowledge of target shuffle mask.
41083 for (auto I : enumerate(Ops)) {
41084 int OpIdx = I.index();
41085 SDValue &Op = I.value();
41086
41087 // What range of shuffle mask element values results in picking from Op?
41088 int Lo = OpIdx * Mask.size();
41089 int Hi = Lo + Mask.size();
41090
41091 // Which elements of Op do we demand, given the mask's granularity?
41092 APInt OpDemandedElts(Mask.size(), 0);
41093 for (int MaskElt : Mask) {
41094 if (isInRange(MaskElt, Lo, Hi)) { // Picks from Op?
41095 int OpEltIdx = MaskElt - Lo;
41096 OpDemandedElts.setBit(OpEltIdx);
41097 }
41098 }
41099
41100 // Is the shuffle result smaller than the root?
41101 if (Op.getValueSizeInBits() < RootSizeInBits) {
41102 // We padded the mask with undefs. But we now need to undo that.
41103 unsigned NumExpectedVectorElts = Mask.size();
41104 unsigned EltSizeInBits = RootSizeInBits / NumExpectedVectorElts;
41105 unsigned NumOpVectorElts = Op.getValueSizeInBits() / EltSizeInBits;
41106 assert(!OpDemandedElts.extractBits(
41107 NumExpectedVectorElts - NumOpVectorElts, NumOpVectorElts) &&
41108 "Demanding the virtual undef widening padding?");
41109 OpDemandedElts = OpDemandedElts.trunc(NumOpVectorElts); // NUW
41110 }
41111
41112 // The Op itself may be of different VT, so we need to scale the mask.
41113 unsigned NumOpElts = Op.getValueType().getVectorNumElements();
41114 APInt OpScaledDemandedElts = APIntOps::ScaleBitMask(OpDemandedElts, NumOpElts);
41115
41116 // Can this operand be simplified any further, given its demanded elements?
41117 if (SDValue NewOp =
41118 DAG.getTargetLoweringInfo().SimplifyMultipleUseDemandedVectorElts(
41119 Op, OpScaledDemandedElts, DAG))
41120 Op = NewOp;
41121 }
41122 // FIXME: should we rerun resolveTargetShuffleInputsAndMask() now?
41123
41124 // Widen any subvector shuffle inputs we've collected.
41125 // TODO: Remove this to avoid generating temporary nodes, we should only
41126 // widen once combineX86ShuffleChain has found a match.
41127 if (any_of(Ops, [RootSizeInBits](SDValue Op) {
41128 return Op.getValueSizeInBits() < RootSizeInBits;
41129 })) {
41130 for (SDValue &Op : Ops)
41131 if (Op.getValueSizeInBits() < RootSizeInBits)
41132 Op = widenSubVector(Op, false, Subtarget, DAG, SDLoc(Op),
41133 RootSizeInBits);
41134 // Reresolve - we might have repeated subvector sources.
41135 resolveTargetShuffleInputsAndMask(Ops, Mask);
41136 }
41137
41138 // We can only combine unary and binary shuffle mask cases.
41139 if (Ops.size() <= 2) {
41140 // Minor canonicalization of the accumulated shuffle mask to make it easier
41141 // to match below. All this does is detect masks with sequential pairs of
41142 // elements, and shrink them to the half-width mask. It does this in a loop
41143 // so it will reduce the size of the mask to the minimal width mask which
41144 // performs an equivalent shuffle.
41145 while (Mask.size() > 1) {
41146 SmallVector<int, 64> WidenedMask;
41147 if (!canWidenShuffleElements(Mask, WidenedMask))
41148 break;
41149 Mask = std::move(WidenedMask);
41150 }
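// Illustrative example (not from the original source): a v8i16 mask
// {2,3, 0,1, 6,7, 4,5} widens once to the v4i32 mask {1, 0, 3, 2} and then
// stops, because {1,0} is not a sequential pair; {1,0,3,2} is the
// minimal-width mask performing the equivalent shuffle.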
41151
41152 // Canonicalization of binary shuffle masks to improve pattern matching by
41153 // commuting the inputs.
41154 if (Ops.size() == 2 && canonicalizeShuffleMaskWithCommute(Mask)) {
41155 ShuffleVectorSDNode::commuteMask(Mask);
41156 std::swap(Ops[0], Ops[1]);
41157 }
41158
41159 // Try to combine into a single shuffle instruction.
41160 if (SDValue Shuffle = combineX86ShuffleChain(
41161 Ops, Root, Mask, Depth, HasVariableMask, AllowVariableCrossLaneMask,
41162 AllowVariablePerLaneMask, DAG, Subtarget))
41163 return Shuffle;
41164
41165 // If all the operands come from the same larger vector, fallthrough and try
41166 // to use combineX86ShuffleChainWithExtract.
41167 SDValue LHS = peekThroughBitcasts(Ops.front());
41168 SDValue RHS = peekThroughBitcasts(Ops.back());
41169 if (Ops.size() != 2 || !Subtarget.hasAVX2() || RootSizeInBits != 128 ||
41170 (RootSizeInBits / Mask.size()) != 64 ||
41171 LHS.getOpcode() != ISD::EXTRACT_SUBVECTOR ||
41172 RHS.getOpcode() != ISD::EXTRACT_SUBVECTOR ||
41173 LHS.getOperand(0) != RHS.getOperand(0))
41174 return SDValue();
41175 }
41176
41177 // If that failed and any input is extracted then try to combine as a
41178 // shuffle with the larger type.
41179 return combineX86ShuffleChainWithExtract(
41180 Ops, Root, Mask, Depth, HasVariableMask, AllowVariableCrossLaneMask,
41181 AllowVariablePerLaneMask, DAG, Subtarget);
41182}
41183
41184/// Helper entry wrapper to combineX86ShufflesRecursively.
41185static SDValue combineX86ShufflesRecursively(SDValue Op, SelectionDAG &DAG,
41186 const X86Subtarget &Subtarget) {
41187 return combineX86ShufflesRecursively(
41188 {Op}, 0, Op, {0}, {}, /*Depth*/ 0, X86::MaxShuffleCombineDepth,
41189 /*HasVarMask*/ false,
41190 /*AllowCrossLaneVarMask*/ true, /*AllowPerLaneVarMask*/ true, DAG,
41191 Subtarget);
41192}
41193
41194/// Get the PSHUF-style mask from PSHUF node.
41195///
41196 /// This is a very minor wrapper around getTargetShuffleMask to ease forming v4
41197/// PSHUF-style masks that can be reused with such instructions.
41198static SmallVector<int, 4> getPSHUFShuffleMask(SDValue N) {
41199 MVT VT = N.getSimpleValueType();
41200 SmallVector<int, 4> Mask;
41201 SmallVector<SDValue, 2> Ops;
41202 bool HaveMask =
41203 getTargetShuffleMask(N.getNode(), VT, false, Ops, Mask);
41204 (void)HaveMask;
41205 assert(HaveMask);
41206
41207 // If we have more than 128 bits, only the low 128 bits of the shuffle mask
41208 // matter. Check that the upper masks are repeats and remove them.
41209 if (VT.getSizeInBits() > 128) {
41210 int LaneElts = 128 / VT.getScalarSizeInBits();
41211#ifndef NDEBUG
41212 for (int i = 1, NumLanes = VT.getSizeInBits() / 128; i < NumLanes; ++i)
41213 for (int j = 0; j < LaneElts; ++j)
41214 assert(Mask[j] == Mask[i * LaneElts + j] - (LaneElts * i) &&
41215 "Mask doesn't repeat in high 128-bit lanes!");
41216#endif
41217 Mask.resize(LaneElts);
41218 }
41219
41220 switch (N.getOpcode()) {
41221 case X86ISD::PSHUFD:
41222 return Mask;
41223 case X86ISD::PSHUFLW:
41224 Mask.resize(4);
41225 return Mask;
41226 case X86ISD::PSHUFHW:
41227 Mask.erase(Mask.begin(), Mask.begin() + 4);
41228 for (int &M : Mask)
41229 M -= 4;
41230 return Mask;
41231 default:
41232 llvm_unreachable("No valid shuffle instruction found!")::llvm::llvm_unreachable_internal("No valid shuffle instruction found!"
, "llvm/lib/Target/X86/X86ISelLowering.cpp", 41232)
;
41233 }
41234}
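// Illustrative example (not from the original source): for a v8i16
// X86ISD::PSHUFHW with immediate 0xB1 (fields {1,0,3,2}), the decoded target
// shuffle mask is {0,1,2,3, 5,4,7,6}; getPSHUFShuffleMask drops the identity
// low half and rebases the rest, returning the reusable 4-element mask
// {1,0,3,2}.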
41235
41236/// Search for a combinable shuffle across a chain ending in pshufd.
41237///
41238/// We walk up the chain and look for a combinable shuffle, skipping over
41239/// shuffles that we could hoist this shuffle's transformation past without
41240/// altering anything.
41241static SDValue
41242combineRedundantDWordShuffle(SDValue N, MutableArrayRef<int> Mask,
41243 SelectionDAG &DAG) {
41244 assert(N.getOpcode() == X86ISD::PSHUFD &&
41245 "Called with something other than an x86 128-bit half shuffle!");
41246 SDLoc DL(N);
41247
41248 // Walk up a single-use chain looking for a combinable shuffle. Keep a stack
41249 // of the shuffles in the chain so that we can form a fresh chain to replace
41250 // this one.
41251 SmallVector<SDValue, 8> Chain;
41252 SDValue V = N.getOperand(0);
41253 for (; V.hasOneUse(); V = V.getOperand(0)) {
41254 switch (V.getOpcode()) {
41255 default:
41256 return SDValue(); // Nothing combined!
41257
41258 case ISD::BITCAST:
41259 // Skip bitcasts as we always know the type for the target specific
41260 // instructions.
41261 continue;
41262
41263 case X86ISD::PSHUFD:
41264 // Found another dword shuffle.
41265 break;
41266
41267 case X86ISD::PSHUFLW:
41268 // Check that the low words (being shuffled) are the identity in the
41269 // dword shuffle, and the high words are self-contained.
41270 if (Mask[0] != 0 || Mask[1] != 1 ||
41271 !(Mask[2] >= 2 && Mask[2] < 4 && Mask[3] >= 2 && Mask[3] < 4))
41272 return SDValue();
41273
41274 Chain.push_back(V);
41275 continue;
41276
41277 case X86ISD::PSHUFHW:
41278 // Check that the high words (being shuffled) are the identity in the
41279 // dword shuffle, and the low words are self-contained.
41280 if (Mask[2] != 2 || Mask[3] != 3 ||
41281 !(Mask[0] >= 0 && Mask[0] < 2 && Mask[1] >= 0 && Mask[1] < 2))
41282 return SDValue();
41283
41284 Chain.push_back(V);
41285 continue;
41286
41287 case X86ISD::UNPCKL:
41288 case X86ISD::UNPCKH:
41289 // For either i8 -> i16 or i16 -> i32 unpacks, we can combine a dword
41290 // shuffle into a preceding word shuffle.
41291 if (V.getSimpleValueType().getVectorElementType() != MVT::i8 &&
41292 V.getSimpleValueType().getVectorElementType() != MVT::i16)
41293 return SDValue();
41294
41295 // Search for a half-shuffle which we can combine with.
41296 unsigned CombineOp =
41297 V.getOpcode() == X86ISD::UNPCKL ? X86ISD::PSHUFLW : X86ISD::PSHUFHW;
41298 if (V.getOperand(0) != V.getOperand(1) ||
41299 !V->isOnlyUserOf(V.getOperand(0).getNode()))
41300 return SDValue();
41301 Chain.push_back(V);
41302 V = V.getOperand(0);
41303 do {
41304 switch (V.getOpcode()) {
41305 default:
41306 return SDValue(); // Nothing to combine.
41307
41308 case X86ISD::PSHUFLW:
41309 case X86ISD::PSHUFHW:
41310 if (V.getOpcode() == CombineOp)
41311 break;
41312
41313 Chain.push_back(V);
41314
41315 [[fallthrough]];
41316 case ISD::BITCAST:
41317 V = V.getOperand(0);
41318 continue;
41319 }
41320 break;
41321 } while (V.hasOneUse());
41322 break;
41323 }
41324 // Break out of the loop if we break out of the switch.
41325 break;
41326 }
41327
41328 if (!V.hasOneUse())
41329 // We fell out of the loop without finding a viable combining instruction.
41330 return SDValue();
41331
41332 // Merge this node's mask and our incoming mask.
41333 SmallVector<int, 4> VMask = getPSHUFShuffleMask(V);
41334 for (int &M : Mask)
41335 M = VMask[M];
41336 V = DAG.getNode(V.getOpcode(), DL, V.getValueType(), V.getOperand(0),
41337 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
41338
41339 // Rebuild the chain around this new shuffle.
41340 while (!Chain.empty()) {
41341 SDValue W = Chain.pop_back_val();
41342
41343 if (V.getValueType() != W.getOperand(0).getValueType())
41344 V = DAG.getBitcast(W.getOperand(0).getValueType(), V);
41345
41346 switch (W.getOpcode()) {
41347 default:
41348 llvm_unreachable("Only PSHUF and UNPCK instructions get here!")::llvm::llvm_unreachable_internal("Only PSHUF and UNPCK instructions get here!"
, "llvm/lib/Target/X86/X86ISelLowering.cpp", 41348)
;
41349
41350 case X86ISD::UNPCKL:
41351 case X86ISD::UNPCKH:
41352 V = DAG.getNode(W.getOpcode(), DL, W.getValueType(), V, V);
41353 break;
41354
41355 case X86ISD::PSHUFD:
41356 case X86ISD::PSHUFLW:
41357 case X86ISD::PSHUFHW:
41358 V = DAG.getNode(W.getOpcode(), DL, W.getValueType(), V, W.getOperand(1));
41359 break;
41360 }
41361 }
41362 if (V.getValueType() != N.getValueType())
41363 V = DAG.getBitcast(N.getValueType(), V);
41364
41365 // Return the new chain to replace N.
41366 return V;
41367}
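// Illustrative example (not from the original source): the "merge this node's
// mask" step above is plain mask composition. If the found PSHUFD uses
// VMask = {2,3,0,1} and the incoming Mask is {1,0,3,2}, rewriting each element
// as Mask[i] = VMask[Mask[i]] gives {3,2,1,0}, and a single dword shuffle with
// that mask replaces the pair before the skipped chain is rebuilt on top of it.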
41368
41369// Attempt to commute shufps LHS loads:
41370// permilps(shufps(load(),x)) --> permilps(shufps(x,load()))
41371static SDValue combineCommutableSHUFP(SDValue N, MVT VT, const SDLoc &DL,
41372 SelectionDAG &DAG) {
41373 // TODO: Add vXf64 support.
41374 if (VT != MVT::v4f32 && VT != MVT::v8f32 && VT != MVT::v16f32)
41375 return SDValue();
41376
41377 // SHUFP(LHS, RHS) -> SHUFP(RHS, LHS) iff LHS is foldable + RHS is not.
41378 auto commuteSHUFP = [&VT, &DL, &DAG](SDValue Parent, SDValue V) {
41379 if (V.getOpcode() != X86ISD::SHUFP || !Parent->isOnlyUserOf(V.getNode()))
41380 return SDValue();
41381 SDValue N0 = V.getOperand(0);
41382 SDValue N1 = V.getOperand(1);
41383 unsigned Imm = V.getConstantOperandVal(2);
41384 const X86Subtarget &Subtarget = DAG.getSubtarget<X86Subtarget>();
41385 if (!X86::mayFoldLoad(peekThroughOneUseBitcasts(N0), Subtarget) ||
41386 X86::mayFoldLoad(peekThroughOneUseBitcasts(N1), Subtarget))
41387 return SDValue();
41388 Imm = ((Imm & 0x0F) << 4) | ((Imm & 0xF0) >> 4);
41389 return DAG.getNode(X86ISD::SHUFP, DL, VT, N1, N0,
41390 DAG.getTargetConstant(Imm, DL, MVT::i8));
41391 };
41392
41393 switch (N.getOpcode()) {
41394 case X86ISD::VPERMILPI:
41395 if (SDValue NewSHUFP = commuteSHUFP(N, N.getOperand(0))) {
41396 unsigned Imm = N.getConstantOperandVal(1);
41397 return DAG.getNode(X86ISD::VPERMILPI, DL, VT, NewSHUFP,
41398 DAG.getTargetConstant(Imm ^ 0xAA, DL, MVT::i8));
41399 }
41400 break;
41401 case X86ISD::SHUFP: {
41402 SDValue N0 = N.getOperand(0);
41403 SDValue N1 = N.getOperand(1);
41404 unsigned Imm = N.getConstantOperandVal(2);
41405 if (N0 == N1) {
41406 if (SDValue NewSHUFP = commuteSHUFP(N, N0))
41407 return DAG.getNode(X86ISD::SHUFP, DL, VT, NewSHUFP, NewSHUFP,
41408 DAG.getTargetConstant(Imm ^ 0xAA, DL, MVT::i8));
41409 } else if (SDValue NewSHUFP = commuteSHUFP(N, N0)) {
41410 return DAG.getNode(X86ISD::SHUFP, DL, VT, NewSHUFP, N1,
41411 DAG.getTargetConstant(Imm ^ 0x0A, DL, MVT::i8));
41412 } else if (SDValue NewSHUFP = commuteSHUFP(N, N1)) {
41413 return DAG.getNode(X86ISD::SHUFP, DL, VT, N0, NewSHUFP,
41414 DAG.getTargetConstant(Imm ^ 0xA0, DL, MVT::i8));
41415 }
41416 break;
41417 }
41418 }
41419
41420 return SDValue();
41421}
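// Illustrative sketch (not part of the original source; a standalone program,
// compiled separately) showing why the nibble swap plus the 0xAA / 0x0A / 0xA0
// immediate fix-ups above are sound: SHUFP(A,B,Imm) takes its low half from A
// and its high half from B, so SHUFP(B,A,nibble-swapped Imm) produces the same
// four elements with the two 64-bit halves exchanged, and the callers above
// compensate by flipping the high bit of the affected 2-bit selectors in the
// outer shuffle's immediate.
#include <array>
#include <cassert>
static std::array<float, 4> shufps(const std::array<float, 4> &A,
                                   const std::array<float, 4> &B,
                                   unsigned Imm) {
  return {A[Imm & 3], A[(Imm >> 2) & 3], B[(Imm >> 4) & 3], B[(Imm >> 6) & 3]};
}
int main() {
  std::array<float, 4> A{0, 1, 2, 3}, B{4, 5, 6, 7};
  unsigned Imm = 0x1B; // arbitrary selector
  unsigned Swapped = ((Imm & 0x0F) << 4) | ((Imm & 0xF0) >> 4);
  std::array<float, 4> R0 = shufps(A, B, Imm);
  std::array<float, 4> R1 = shufps(B, A, Swapped);
  // R1 equals R0 with its two 64-bit halves exchanged.
  assert(R1[0] == R0[2] && R1[1] == R0[3] && R1[2] == R0[0] && R1[3] == R0[1]);
  return 0;
}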
41422
41423// Canonicalize SHUFFLE(BINOP(X,Y)) -> BINOP(SHUFFLE(X),SHUFFLE(Y)).
41424static SDValue canonicalizeShuffleWithBinOps(SDValue N, SelectionDAG &DAG,
41425 const SDLoc &DL) {
41426 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
41427 EVT ShuffleVT = N.getValueType();
41428
41429 auto IsMergeableWithShuffle = [&DAG](SDValue Op, bool FoldLoad = false) {
41430 // AllZeros/AllOnes constants are freely shuffled and will peek through
41431 // bitcasts. Other constant build vectors do not peek through bitcasts. Only
41432 // merge with target shuffles if the op has one use so shuffle combining is
41433 // likely to kick in. Shuffles of splats are expected to be removed.
41434 return ISD::isBuildVectorAllOnes(Op.getNode()) ||
41435 ISD::isBuildVectorAllZeros(Op.getNode()) ||
41436 ISD::isBuildVectorOfConstantSDNodes(Op.getNode()) ||
41437 ISD::isBuildVectorOfConstantFPSDNodes(Op.getNode()) ||
41438 (Op.getOpcode() == ISD::INSERT_SUBVECTOR && Op->hasOneUse()) ||
41439 (isTargetShuffle(Op.getOpcode()) && Op->hasOneUse()) ||
41440 (FoldLoad && isShuffleFoldableLoad(Op)) ||
41441 DAG.isSplatValue(Op, /*AllowUndefs*/ false);
41442 };
41443 auto IsSafeToMoveShuffle = [ShuffleVT](SDValue Op, unsigned BinOp) {
41444 // Ensure we only shuffle whole vector src elements, unless it's a logical
41445 // binop, where we can more aggressively move shuffles from dst to src.
41446 return BinOp == ISD::AND || BinOp == ISD::OR || BinOp == ISD::XOR ||
41447 BinOp == X86ISD::ANDNP ||
41448 (Op.getScalarValueSizeInBits() <= ShuffleVT.getScalarSizeInBits());
41449 };
41450
41451 unsigned Opc = N.getOpcode();
41452 switch (Opc) {
41453 // Unary and Unary+Permute Shuffles.
41454 case X86ISD::PSHUFB: {
41455 // Don't merge PSHUFB if it contains zero'd elements.
41456 SmallVector<int> Mask;
41457 SmallVector<SDValue> Ops;
41458 if (!getTargetShuffleMask(N.getNode(), ShuffleVT.getSimpleVT(), false, Ops,
41459 Mask))
41460 break;
41461 [[fallthrough]];
41462 }
41463 case X86ISD::VBROADCAST:
41464 case X86ISD::MOVDDUP:
41465 case X86ISD::PSHUFD:
41466 case X86ISD::PSHUFHW:
41467 case X86ISD::PSHUFLW:
41468 case X86ISD::VPERMI:
41469 case X86ISD::VPERMILPI: {
41470 if (N.getOperand(0).getValueType() == ShuffleVT &&
41471 N->isOnlyUserOf(N.getOperand(0).getNode())) {
41472 SDValue N0 = peekThroughOneUseBitcasts(N.getOperand(0));
41473 unsigned SrcOpcode = N0.getOpcode();
41474 if (TLI.isBinOp(SrcOpcode) && IsSafeToMoveShuffle(N0, SrcOpcode)) {
41475 SDValue Op00 = peekThroughOneUseBitcasts(N0.getOperand(0));
41476 SDValue Op01 = peekThroughOneUseBitcasts(N0.getOperand(1));
41477 if (IsMergeableWithShuffle(Op00, Opc != X86ISD::PSHUFB) ||
41478 IsMergeableWithShuffle(Op01, Opc != X86ISD::PSHUFB)) {
41479 SDValue LHS, RHS;
41480 Op00 = DAG.getBitcast(ShuffleVT, Op00);
41481 Op01 = DAG.getBitcast(ShuffleVT, Op01);
41482 if (N.getNumOperands() == 2) {
41483 LHS = DAG.getNode(Opc, DL, ShuffleVT, Op00, N.getOperand(1));
41484 RHS = DAG.getNode(Opc, DL, ShuffleVT, Op01, N.getOperand(1));
41485 } else {
41486 LHS = DAG.getNode(Opc, DL, ShuffleVT, Op00);
41487 RHS = DAG.getNode(Opc, DL, ShuffleVT, Op01);
41488 }
41489 EVT OpVT = N0.getValueType();
41490 return DAG.getBitcast(ShuffleVT,
41491 DAG.getNode(SrcOpcode, DL, OpVT,
41492 DAG.getBitcast(OpVT, LHS),
41493 DAG.getBitcast(OpVT, RHS)));
41494 }
41495 }
41496 }
41497 break;
41498 }
41499 // Binary and Binary+Permute Shuffles.
41500 case X86ISD::INSERTPS: {
41501 // Don't merge INSERTPS if it contains zero'd elements.
41502 unsigned InsertPSMask = N.getConstantOperandVal(2);
41503 unsigned ZeroMask = InsertPSMask & 0xF;
41504 if (ZeroMask != 0)
41505 break;
41506 [[fallthrough]];
41507 }
41508 case X86ISD::MOVSD:
41509 case X86ISD::MOVSS:
41510 case X86ISD::BLENDI:
41511 case X86ISD::SHUFP:
41512 case X86ISD::UNPCKH:
41513 case X86ISD::UNPCKL: {
41514 if (N->isOnlyUserOf(N.getOperand(0).getNode()) &&
41515 N->isOnlyUserOf(N.getOperand(1).getNode())) {
41516 SDValue N0 = peekThroughOneUseBitcasts(N.getOperand(0));
41517 SDValue N1 = peekThroughOneUseBitcasts(N.getOperand(1));
41518 unsigned SrcOpcode = N0.getOpcode();
41519 if (TLI.isBinOp(SrcOpcode) && N1.getOpcode() == SrcOpcode &&
41520 IsSafeToMoveShuffle(N0, SrcOpcode) &&
41521 IsSafeToMoveShuffle(N1, SrcOpcode)) {
41522 SDValue Op00 = peekThroughOneUseBitcasts(N0.getOperand(0));
41523 SDValue Op10 = peekThroughOneUseBitcasts(N1.getOperand(0));
41524 SDValue Op01 = peekThroughOneUseBitcasts(N0.getOperand(1));
41525 SDValue Op11 = peekThroughOneUseBitcasts(N1.getOperand(1));
41526 // Ensure the total number of shuffles doesn't increase by folding this
41527 // shuffle through to the source ops.
41528 if (((IsMergeableWithShuffle(Op00) && IsMergeableWithShuffle(Op10)) ||
41529 (IsMergeableWithShuffle(Op01) && IsMergeableWithShuffle(Op11))) ||
41530 ((IsMergeableWithShuffle(Op00) || IsMergeableWithShuffle(Op10)) &&
41531 (IsMergeableWithShuffle(Op01) || IsMergeableWithShuffle(Op11)))) {
41532 SDValue LHS, RHS;
41533 Op00 = DAG.getBitcast(ShuffleVT, Op00);
41534 Op10 = DAG.getBitcast(ShuffleVT, Op10);
41535 Op01 = DAG.getBitcast(ShuffleVT, Op01);
41536 Op11 = DAG.getBitcast(ShuffleVT, Op11);
41537 if (N.getNumOperands() == 3) {
41538 LHS = DAG.getNode(Opc, DL, ShuffleVT, Op00, Op10, N.getOperand(2));
41539 RHS = DAG.getNode(Opc, DL, ShuffleVT, Op01, Op11, N.getOperand(2));
41540 } else {
41541 LHS = DAG.getNode(Opc, DL, ShuffleVT, Op00, Op10);
41542 RHS = DAG.getNode(Opc, DL, ShuffleVT, Op01, Op11);
41543 }
41544 EVT OpVT = N0.getValueType();
41545 return DAG.getBitcast(ShuffleVT,
41546 DAG.getNode(SrcOpcode, DL, OpVT,
41547 DAG.getBitcast(OpVT, LHS),
41548 DAG.getBitcast(OpVT, RHS)));
41549 }
41550 }
41551 }
41552 break;
41553 }
41554 }
41555 return SDValue();
41556}
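// Illustrative example (not from the original source): a typical payoff of
// this canonicalization, with C a build_vector constant, is
//   PSHUFD(AND(X, C), imm) -> AND(PSHUFD(X, imm), PSHUFD(C, imm))
// which costs nothing because the shuffle of the constant folds away, while
// the shuffle of X is exposed to the shuffle combines above.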
41557
41558/// Attempt to fold vpermf128(op(),op()) -> op(vpermf128(),vpermf128()).
41559static SDValue canonicalizeLaneShuffleWithRepeatedOps(SDValue V,
41560 SelectionDAG &DAG,
41561 const SDLoc &DL) {
41562 assert(V.getOpcode() == X86ISD::VPERM2X128 && "Unknown lane shuffle");
41563
41564 MVT VT = V.getSimpleValueType();
41565 SDValue Src0 = peekThroughBitcasts(V.getOperand(0));
41566 SDValue Src1 = peekThroughBitcasts(V.getOperand(1));
41567 unsigned SrcOpc0 = Src0.getOpcode();
41568 unsigned SrcOpc1 = Src1.getOpcode();
41569 EVT SrcVT0 = Src0.getValueType();
41570 EVT SrcVT1 = Src1.getValueType();
41571
41572 if (!Src1.isUndef() && (SrcVT0 != SrcVT1 || SrcOpc0 != SrcOpc1))
41573 return SDValue();
41574
41575 switch (SrcOpc0) {
41576 case X86ISD::MOVDDUP: {
41577 SDValue LHS = Src0.getOperand(0);
41578 SDValue RHS = Src1.isUndef() ? Src1 : Src1.getOperand(0);
41579 SDValue Res =
41580 DAG.getNode(X86ISD::VPERM2X128, DL, SrcVT0, LHS, RHS, V.getOperand(2));
41581 Res = DAG.getNode(SrcOpc0, DL, SrcVT0, Res);
41582 return DAG.getBitcast(VT, Res);
41583 }
41584 case X86ISD::VPERMILPI:
41585 // TODO: Handle v4f64 permutes with different low/high lane masks.
41586 if (SrcVT0 == MVT::v4f64) {
41587 uint64_t Mask = Src0.getConstantOperandVal(1);
41588 if ((Mask & 0x3) != ((Mask >> 2) & 0x3))
41589 break;
41590 }
41591 [[fallthrough]];
41592 case X86ISD::VSHLI:
41593 case X86ISD::VSRLI:
41594 case X86ISD::VSRAI:
41595 case X86ISD::PSHUFD:
41596 if (Src1.isUndef() || Src0.getOperand(1) == Src1.getOperand(1)) {
41597 SDValue LHS = Src0.getOperand(0);
41598 SDValue RHS = Src1.isUndef() ? Src1 : Src1.getOperand(0);
41599 SDValue Res = DAG.getNode(X86ISD::VPERM2X128, DL, SrcVT0, LHS, RHS,
41600 V.getOperand(2));
41601 Res = DAG.getNode(SrcOpc0, DL, SrcVT0, Res, Src0.getOperand(1));
41602 return DAG.getBitcast(VT, Res);
41603 }
41604 break;
41605 }
41606
41607 return SDValue();
41608}
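// Illustrative example (not from the original source): when both 128-bit
// sources apply the same per-lane shuffle,
//   VPERM2X128(VPERMILPI(x, m), VPERMILPI(y, m), sel)
//     -> VPERMILPI(VPERM2X128(x, y, sel), m)
// which leaves a single per-lane shuffle for later combines to fold further.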
41609
41610/// Try to combine x86 target specific shuffles.
41611static SDValue combineTargetShuffle(SDValue N, SelectionDAG &DAG,
41612 TargetLowering::DAGCombinerInfo &DCI,
41613 const X86Subtarget &Subtarget) {
41614 SDLoc DL(N);
41615 MVT VT = N.getSimpleValueType();
41616 SmallVector<int, 4> Mask;
41617 unsigned Opcode = N.getOpcode();
41618
41619 if (SDValue R = combineCommutableSHUFP(N, VT, DL, DAG))
41620 return R;
41621
41622 // Handle specific target shuffles.
41623 switch (Opcode) {
41624 case X86ISD::MOVDDUP: {
41625 SDValue Src = N.getOperand(0);
41626 // Turn a 128-bit MOVDDUP of a full vector load into movddup+vzload.
41627 if (VT == MVT::v2f64 && Src.hasOneUse() &&
41628 ISD::isNormalLoad(Src.getNode())) {
41629 LoadSDNode *LN = cast<LoadSDNode>(Src);
41630 if (SDValue VZLoad = narrowLoadToVZLoad(LN, MVT::f64, MVT::v2f64, DAG)) {
41631 SDValue Movddup = DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v2f64, VZLoad);
41632 DCI.CombineTo(N.getNode(), Movddup);
41633 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), VZLoad.getValue(1));
41634 DCI.recursivelyDeleteUnusedNodes(LN);
41635 return N; // Return N so it doesn't get rechecked!
41636 }
41637 }
41638
41639 return SDValue();
41640 }
41641 case X86ISD::VBROADCAST: {
41642 SDValue Src = N.getOperand(0);
41643 SDValue BC = peekThroughBitcasts(Src);
41644 EVT SrcVT = Src.getValueType();
41645 EVT BCVT = BC.getValueType();
41646
41647 // If broadcasting from another shuffle, attempt to simplify it.
41648 // TODO - we really need a general SimplifyDemandedVectorElts mechanism.
41649 if (isTargetShuffle(BC.getOpcode()) &&
41650 VT.getScalarSizeInBits() % BCVT.getScalarSizeInBits() == 0) {
41651 unsigned Scale = VT.getScalarSizeInBits() / BCVT.getScalarSizeInBits();
41652 SmallVector<int, 16> DemandedMask(BCVT.getVectorNumElements(),
41653 SM_SentinelUndef);
41654 for (unsigned i = 0; i != Scale; ++i)
41655 DemandedMask[i] = i;
41656 if (SDValue Res = combineX86ShufflesRecursively(
41657 {BC}, 0, BC, DemandedMask, {}, /*Depth*/ 0,
41658 X86::MaxShuffleCombineDepth,
41659 /*HasVarMask*/ false, /*AllowCrossLaneVarMask*/ true,
41660 /*AllowPerLaneVarMask*/ true, DAG, Subtarget))
41661 return DAG.getNode(X86ISD::VBROADCAST, DL, VT,
41662 DAG.getBitcast(SrcVT, Res));
41663 }
41664
41665 // broadcast(bitcast(src)) -> bitcast(broadcast(src))
41666 // 32-bit targets have to bitcast i64 to f64, so better to bitcast upward.
41667 if (Src.getOpcode() == ISD::BITCAST &&
41668 SrcVT.getScalarSizeInBits() == BCVT.getScalarSizeInBits() &&
41669 DAG.getTargetLoweringInfo().isTypeLegal(BCVT) &&
41670 FixedVectorType::isValidElementType(
41671 BCVT.getScalarType().getTypeForEVT(*DAG.getContext()))) {
41672 EVT NewVT = EVT::getVectorVT(*DAG.getContext(), BCVT.getScalarType(),
41673 VT.getVectorNumElements());
41674 return DAG.getBitcast(VT, DAG.getNode(X86ISD::VBROADCAST, DL, NewVT, BC));
41675 }
41676
41677 // vbroadcast(bitcast(vbroadcast(src))) -> bitcast(vbroadcast(src))
41678 // If we're re-broadcasting a smaller type then broadcast with that type and
41679 // bitcast.
41680 // TODO: Do this for any splat?
41681 if (Src.getOpcode() == ISD::BITCAST &&
41682 (BC.getOpcode() == X86ISD::VBROADCAST ||
41683 BC.getOpcode() == X86ISD::VBROADCAST_LOAD) &&
41684 (VT.getScalarSizeInBits() % BCVT.getScalarSizeInBits()) == 0 &&
41685 (VT.getSizeInBits() % BCVT.getSizeInBits()) == 0) {
41686 MVT NewVT =
41687 MVT::getVectorVT(BCVT.getSimpleVT().getScalarType(),
41688 VT.getSizeInBits() / BCVT.getScalarSizeInBits());
41689 return DAG.getBitcast(VT, DAG.getNode(X86ISD::VBROADCAST, DL, NewVT, BC));
41690 }
41691
41692 // Reduce broadcast source vector to lowest 128-bits.
41693 if (SrcVT.getSizeInBits() > 128)
41694 return DAG.getNode(X86ISD::VBROADCAST, DL, VT,
41695 extract128BitVector(Src, 0, DAG, DL));
41696
41697 // broadcast(scalar_to_vector(x)) -> broadcast(x).
41698 if (Src.getOpcode() == ISD::SCALAR_TO_VECTOR)
41699 return DAG.getNode(X86ISD::VBROADCAST, DL, VT, Src.getOperand(0));
41700
41701 // broadcast(extract_vector_elt(x, 0)) -> broadcast(x).
41702 if (Src.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
41703 isNullConstant(Src.getOperand(1)) &&
41704 DAG.getTargetLoweringInfo().isTypeLegal(
41705 Src.getOperand(0).getValueType()))
41706 return DAG.getNode(X86ISD::VBROADCAST, DL, VT, Src.getOperand(0));
41707
41708 // Share broadcast with the longest vector and extract low subvector (free).
41709 // Ensure the same SDValue from the SDNode use is being used.
41710 for (SDNode *User : Src->uses())
41711 if (User != N.getNode() && User->getOpcode() == X86ISD::VBROADCAST &&
41712 Src == User->getOperand(0) &&
41713 User->getValueSizeInBits(0).getFixedValue() >
41714 VT.getFixedSizeInBits()) {
41715 return extractSubVector(SDValue(User, 0), 0, DAG, DL,
41716 VT.getSizeInBits());
41717 }
41718
41719 // vbroadcast(scalarload X) -> vbroadcast_load X
41720 // For float loads, extract other uses of the scalar from the broadcast.
41721 if (!SrcVT.isVector() && (Src.hasOneUse() || VT.isFloatingPoint()) &&
41722 ISD::isNormalLoad(Src.getNode())) {
41723 LoadSDNode *LN = cast<LoadSDNode>(Src);
41724 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
41725 SDValue Ops[] = { LN->getChain(), LN->getBasePtr() };
41726 SDValue BcastLd =
41727 DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, DL, Tys, Ops,
41728 LN->getMemoryVT(), LN->getMemOperand());
41729 // If the load value is used only by N, replace it via CombineTo N.
41730 bool NoReplaceExtract = Src.hasOneUse();
41731 DCI.CombineTo(N.getNode(), BcastLd);
41732 if (NoReplaceExtract) {
41733 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BcastLd.getValue(1));
41734 DCI.recursivelyDeleteUnusedNodes(LN);
41735 } else {
41736 SDValue Scl = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, SrcVT, BcastLd,
41737 DAG.getIntPtrConstant(0, DL));
41738 DCI.CombineTo(LN, Scl, BcastLd.getValue(1));
41739 }
41740 return N; // Return N so it doesn't get rechecked!
41741 }
41742
41743 // Due to isTypeDesirableForOp, we won't always shrink a load truncated to
41744 // i16. So shrink it ourselves if we can make a broadcast_load.
41745 if (SrcVT == MVT::i16 && Src.getOpcode() == ISD::TRUNCATE &&
41746 Src.hasOneUse() && Src.getOperand(0).hasOneUse()) {
41747 assert(Subtarget.hasAVX2() && "Expected AVX2");
41748 SDValue TruncIn = Src.getOperand(0);
41749
41750 // If this is a truncate of a non extending load we can just narrow it to
41751 // use a broadcast_load.
41752 if (ISD::isNormalLoad(TruncIn.getNode())) {
41753 LoadSDNode *LN = cast<LoadSDNode>(TruncIn);
41754 // Unless its volatile or atomic.
41755 if (LN->isSimple()) {
41756 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
41757 SDValue Ops[] = { LN->getChain(), LN->getBasePtr() };
41758 SDValue BcastLd = DAG.getMemIntrinsicNode(
41759 X86ISD::VBROADCAST_LOAD, DL, Tys, Ops, MVT::i16,
41760 LN->getPointerInfo(), LN->getOriginalAlign(),
41761 LN->getMemOperand()->getFlags());
41762 DCI.CombineTo(N.getNode(), BcastLd);
41763 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BcastLd.getValue(1));
41764 DCI.recursivelyDeleteUnusedNodes(Src.getNode());
41765 return N; // Return N so it doesn't get rechecked!
41766 }
41767 }
41768
41769 // If this is a truncate of an i16 extload, we can directly replace it.
41770 if (ISD::isUNINDEXEDLoad(Src.getOperand(0).getNode()) &&
41771 ISD::isEXTLoad(Src.getOperand(0).getNode())) {
41772 LoadSDNode *LN = cast<LoadSDNode>(Src.getOperand(0));
41773 if (LN->getMemoryVT().getSizeInBits() == 16) {
41774 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
41775 SDValue Ops[] = { LN->getChain(), LN->getBasePtr() };
41776 SDValue BcastLd =
41777 DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, DL, Tys, Ops,
41778 LN->getMemoryVT(), LN->getMemOperand());
41779 DCI.CombineTo(N.getNode(), BcastLd);
41780 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BcastLd.getValue(1));
41781 DCI.recursivelyDeleteUnusedNodes(Src.getNode());
41782 return N; // Return N so it doesn't get rechecked!
41783 }
41784 }
41785
41786 // If this is a truncate of a load that has been shifted right, we can
41787 // offset the pointer and use a narrower load.
41788 if (TruncIn.getOpcode() == ISD::SRL &&
41789 TruncIn.getOperand(0).hasOneUse() &&
41790 isa<ConstantSDNode>(TruncIn.getOperand(1)) &&
41791 ISD::isNormalLoad(TruncIn.getOperand(0).getNode())) {
41792 LoadSDNode *LN = cast<LoadSDNode>(TruncIn.getOperand(0));
41793 unsigned ShiftAmt = TruncIn.getConstantOperandVal(1);
41794 // Make sure the shift amount and the load size are divisible by 16.
41795 // Don't do this if the load is volatile or atomic.
41796 if (ShiftAmt % 16 == 0 && TruncIn.getValueSizeInBits() % 16 == 0 &&
41797 LN->isSimple()) {
41798 unsigned Offset = ShiftAmt / 8;
41799 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
41800 SDValue Ptr = DAG.getMemBasePlusOffset(LN->getBasePtr(),
41801 TypeSize::Fixed(Offset), DL);
41802 SDValue Ops[] = { LN->getChain(), Ptr };
41803 SDValue BcastLd = DAG.getMemIntrinsicNode(
41804 X86ISD::VBROADCAST_LOAD, DL, Tys, Ops, MVT::i16,
41805 LN->getPointerInfo().getWithOffset(Offset),
41806 LN->getOriginalAlign(),
41807 LN->getMemOperand()->getFlags());
41808 DCI.CombineTo(N.getNode(), BcastLd);
41809 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BcastLd.getValue(1));
41810 DCI.recursivelyDeleteUnusedNodes(Src.getNode());
41811 return N; // Return N so it doesn't get rechecked!
41812 }
41813 }
41814 }
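// Illustrative example (not from the original source) for the shifted-load
// case above: on little-endian x86, vbroadcast (i16 (trunc (srl (i32 load %p),
// 16))) reads bits [31:16] of the i32 at %p, i.e. the i16 stored at byte
// offset ShiftAmt / 8 = 2, so it becomes an i16 VBROADCAST_LOAD from %p + 2.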
41815
41816 // vbroadcast(vzload X) -> vbroadcast_load X
41817 if (Src.getOpcode() == X86ISD::VZEXT_LOAD && Src.hasOneUse()) {
41818 MemSDNode *LN = cast<MemIntrinsicSDNode>(Src);
41819 if (LN->getMemoryVT().getSizeInBits() == VT.getScalarSizeInBits()) {
41820 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
41821 SDValue Ops[] = { LN->getChain(), LN->getBasePtr() };
41822 SDValue BcastLd =
41823 DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, DL, Tys, Ops,
41824 LN->getMemoryVT(), LN->getMemOperand());
41825 DCI.CombineTo(N.getNode(), BcastLd);
41826 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BcastLd.getValue(1));
41827 DCI.recursivelyDeleteUnusedNodes(LN);
41828 return N; // Return N so it doesn't get rechecked!
41829 }
41830 }
41831
41832 // vbroadcast(vector load X) -> vbroadcast_load
41833 if ((SrcVT == MVT::v2f64 || SrcVT == MVT::v4f32 || SrcVT == MVT::v2i64 ||
41834 SrcVT == MVT::v4i32) &&
41835 Src.hasOneUse() && ISD::isNormalLoad(Src.getNode())) {
41836 LoadSDNode *LN = cast<LoadSDNode>(Src);
41837 // Unless the load is volatile or atomic.
41838 if (LN->isSimple()) {
41839 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
41840 SDValue Ops[] = {LN->getChain(), LN->getBasePtr()};
41841 SDValue BcastLd = DAG.getMemIntrinsicNode(
41842 X86ISD::VBROADCAST_LOAD, DL, Tys, Ops, SrcVT.getScalarType(),
41843 LN->getPointerInfo(), LN->getOriginalAlign(),
41844 LN->getMemOperand()->getFlags());
41845 DCI.CombineTo(N.getNode(), BcastLd);
41846 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BcastLd.getValue(1));
41847 DCI.recursivelyDeleteUnusedNodes(LN);
41848 return N; // Return N so it doesn't get rechecked!
41849 }
41850 }
41851
41852 return SDValue();
41853 }
41854 case X86ISD::VZEXT_MOVL: {
41855 SDValue N0 = N.getOperand(0);
41856
41857 // If this is a vzmovl of a full vector load, replace it with a vzload, unless
41858 // the load is volatile.
41859 if (N0.hasOneUse() && ISD::isNormalLoad(N0.getNode())) {
41860 auto *LN = cast<LoadSDNode>(N0);
41861 if (SDValue VZLoad =
41862 narrowLoadToVZLoad(LN, VT.getVectorElementType(), VT, DAG)) {
41863 DCI.CombineTo(N.getNode(), VZLoad);
41864 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), VZLoad.getValue(1));
41865 DCI.recursivelyDeleteUnusedNodes(LN);
41866 return N;
41867 }
41868 }
41869
41870 // If this is a VZEXT_MOVL of a VBROADCAST_LOAD, we don't need the broadcast
41871 // and can just use a VZEXT_LOAD.
41872 // FIXME: Is there some way to do this with SimplifyDemandedVectorElts?
41873 if (N0.hasOneUse() && N0.getOpcode() == X86ISD::VBROADCAST_LOAD) {
41874 auto *LN = cast<MemSDNode>(N0);
41875 if (VT.getScalarSizeInBits() == LN->getMemoryVT().getSizeInBits()) {
41876 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
41877 SDValue Ops[] = {LN->getChain(), LN->getBasePtr()};
41878 SDValue VZLoad =
41879 DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, DL, Tys, Ops,
41880 LN->getMemoryVT(), LN->getMemOperand());
41881 DCI.CombineTo(N.getNode(), VZLoad);
41882 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), VZLoad.getValue(1));
41883 DCI.recursivelyDeleteUnusedNodes(LN);
41884 return N;
41885 }
41886 }
41887
41888 // Turn (v2i64 (vzext_movl (scalar_to_vector (i64 X)))) into
41889 // (v2i64 (bitcast (v4i32 (vzext_movl (scalar_to_vector (i32 (trunc X)))))))
41890 // if the upper bits of the i64 are zero.
41891 if (N0.hasOneUse() && N0.getOpcode() == ISD::SCALAR_TO_VECTOR &&
41892 N0.getOperand(0).hasOneUse() &&
41893 N0.getOperand(0).getValueType() == MVT::i64) {
41894 SDValue In = N0.getOperand(0);
41895 APInt Mask = APInt::getHighBitsSet(64, 32);
41896 if (DAG.MaskedValueIsZero(In, Mask)) {
41897 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, In);
41898 MVT VecVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() * 2);
41899 SDValue SclVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Trunc);
41900 SDValue Movl = DAG.getNode(X86ISD::VZEXT_MOVL, DL, VecVT, SclVec);
41901 return DAG.getBitcast(VT, Movl);
41902 }
41903 }
41904
41905 // Load a scalar integer constant directly to XMM instead of transferring an
41906 // immediate value from GPR.
41907 // vzext_movl (scalar_to_vector C) --> load [C,0...]
41908 if (N0.getOpcode() == ISD::SCALAR_TO_VECTOR) {
41909 if (auto *C = dyn_cast<ConstantSDNode>(N0.getOperand(0))) {
41910 // Create a vector constant - scalar constant followed by zeros.
41911 EVT ScalarVT = N0.getOperand(0).getValueType();
41912 Type *ScalarTy = ScalarVT.getTypeForEVT(*DAG.getContext());
41913 unsigned NumElts = VT.getVectorNumElements();
41914 Constant *Zero = ConstantInt::getNullValue(ScalarTy);
41915 SmallVector<Constant *, 32> ConstantVec(NumElts, Zero);
41916 ConstantVec[0] = const_cast<ConstantInt *>(C->getConstantIntValue());
41917
41918 // Load the vector constant from constant pool.
41919 MVT PVT = DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout());
41920 SDValue CP = DAG.getConstantPool(ConstantVector::get(ConstantVec), PVT);
41921 MachinePointerInfo MPI =
41922 MachinePointerInfo::getConstantPool(DAG.getMachineFunction());
41923 Align Alignment = cast<ConstantPoolSDNode>(CP)->getAlign();
41924 return DAG.getLoad(VT, DL, DAG.getEntryNode(), CP, MPI, Alignment,
41925 MachineMemOperand::MOLoad);
41926 }
41927 }
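// Illustrative example (not from the original source): for v4i32,
// vzext_movl (scalar_to_vector (i32 42)) becomes a 16-byte load of the
// constant-pool vector {42, 0, 0, 0}, avoiding a GPR-to-XMM transfer of the
// immediate.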
41928
41929 // Pull subvector inserts into undef through VZEXT_MOVL by making it an
41930 // insert into a zero vector. This helps get VZEXT_MOVL closer to
41931 // scalar_to_vectors where 256/512 are canonicalized to an insert and a
41932 // 128-bit scalar_to_vector. This reduces the number of isel patterns.
41933 if (!DCI.isBeforeLegalizeOps() && N0.hasOneUse()) {
41934 SDValue V = peekThroughOneUseBitcasts(N0);
41935
41936 if (V.getOpcode() == ISD::INSERT_SUBVECTOR && V.getOperand(0).isUndef() &&
41937 isNullConstant(V.getOperand(2))) {
41938 SDValue In = V.getOperand(1);
41939 MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(),
41940 In.getValueSizeInBits() /
41941 VT.getScalarSizeInBits());
41942 In = DAG.getBitcast(SubVT, In);
41943 SDValue Movl = DAG.getNode(X86ISD::VZEXT_MOVL, DL, SubVT, In);
41944 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT,
41945 getZeroVector(VT, Subtarget, DAG, DL), Movl,
41946 V.getOperand(2));
41947 }
41948 }
41949
41950 return SDValue();
41951 }
41952 case X86ISD::BLENDI: {
41953 SDValue N0 = N.getOperand(0);
41954 SDValue N1 = N.getOperand(1);
41955
41956 // blend(bitcast(x),bitcast(y)) -> bitcast(blend(x,y)) to narrower types.
41957 // TODO: Handle MVT::v16i16 repeated blend mask.
41958 if (N0.getOpcode() == ISD::BITCAST && N1.getOpcode() == ISD::BITCAST &&
41959 N0.getOperand(0).getValueType() == N1.getOperand(0).getValueType()) {
41960 MVT SrcVT = N0.getOperand(0).getSimpleValueType();
41961 if ((VT.getScalarSizeInBits() % SrcVT.getScalarSizeInBits()) == 0 &&
41962 SrcVT.getScalarSizeInBits() >= 32) {
41963 unsigned BlendMask = N.getConstantOperandVal(2);
41964 unsigned Size = VT.getVectorNumElements();
41965 unsigned Scale = VT.getScalarSizeInBits() / SrcVT.getScalarSizeInBits();
41966 BlendMask = scaleVectorShuffleBlendMask(BlendMask, Size, Scale);
41967 return DAG.getBitcast(
41968 VT, DAG.getNode(X86ISD::BLENDI, DL, SrcVT, N0.getOperand(0),
41969 N1.getOperand(0),
41970 DAG.getTargetConstant(BlendMask, DL, MVT::i8)));
41971 }
41972 }
41973 return SDValue();
41974 }
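// Illustrative example (not from the original source): a v4f64 BLENDI with
// mask 0b0101 whose operands are bitcasts of v8f32 values becomes a v8f32
// BLENDI with each mask bit duplicated per 32-bit half, i.e. mask 0b00110011,
// followed by a bitcast back to v4f64.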
41975 case X86ISD::SHUFP: {
41976 // Fold shufps(shuffle(x),shuffle(y)) -> shufps(x,y).
41977 // This is a more relaxed shuffle combiner that can ignore oneuse limits.
41978 // TODO: Support types other than v4f32.
41979 if (VT == MVT::v4f32) {
41980 bool Updated = false;
41981 SmallVector<int> Mask;
41982 SmallVector<SDValue> Ops;
41983 if (getTargetShuffleMask(N.getNode(), VT, false, Ops, Mask) &&
41984 Ops.size() == 2) {
41985 for (int i = 0; i != 2; ++i) {
41986 SmallVector<SDValue> SubOps;
41987 SmallVector<int> SubMask, SubScaledMask;
41988 SDValue Sub = peekThroughBitcasts(Ops[i]);
41989 // TODO: Scaling might be easier if we specify the demanded elts.
41990 if (getTargetShuffleInputs(Sub, SubOps, SubMask, DAG, 0, false) &&
41991 scaleShuffleElements(SubMask, 4, SubScaledMask) &&
41992 SubOps.size() == 1 && isUndefOrInRange(SubScaledMask, 0, 4)) {
41993 int Ofs = i * 2;
41994 Mask[Ofs + 0] = SubScaledMask[Mask[Ofs + 0] % 4] + (i * 4);
41995 Mask[Ofs + 1] = SubScaledMask[Mask[Ofs + 1] % 4] + (i * 4);
41996 Ops[i] = DAG.getBitcast(VT, SubOps[0]);
41997 Updated = true;
41998 }
41999 }
42000 }
42001 if (Updated) {
42002 for (int &M : Mask)
42003 M %= 4;
42004 Ops.push_back(getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
42005 return DAG.getNode(X86ISD::SHUFP, DL, VT, Ops);
42006 }
42007 }
42008 return SDValue();
42009 }
42010 case X86ISD::VPERMI: {
42011 // vpermi(bitcast(x)) -> bitcast(vpermi(x)) for same number of elements.
42012 // TODO: Remove when we have preferred domains in combineX86ShuffleChain.
42013 SDValue N0 = N.getOperand(0);
42014 SDValue N1 = N.getOperand(1);
42015 unsigned EltSizeInBits = VT.getScalarSizeInBits();
42016 if (N0.getOpcode() == ISD::BITCAST &&
42017 N0.getOperand(0).getScalarValueSizeInBits() == EltSizeInBits) {
42018 SDValue Src = N0.getOperand(0);
42019 EVT SrcVT = Src.getValueType();
42020 SDValue Res = DAG.getNode(X86ISD::VPERMI, DL, SrcVT, Src, N1);
42021 return DAG.getBitcast(VT, Res);
42022 }
42023 return SDValue();
42024 }
42025 case X86ISD::VPERM2X128: {
42026 // Fold vperm2x128(bitcast(x),bitcast(y),c) -> bitcast(vperm2x128(x,y,c)).
42027 SDValue LHS = N->getOperand(0);
42028 SDValue RHS = N->getOperand(1);
42029 if (LHS.getOpcode() == ISD::BITCAST &&
42030 (RHS.getOpcode() == ISD::BITCAST || RHS.isUndef())) {
42031 EVT SrcVT = LHS.getOperand(0).getValueType();
42032 if (RHS.isUndef() || SrcVT == RHS.getOperand(0).getValueType()) {
42033 return DAG.getBitcast(VT, DAG.getNode(X86ISD::VPERM2X128, DL, SrcVT,
42034 DAG.getBitcast(SrcVT, LHS),
42035 DAG.getBitcast(SrcVT, RHS),
42036 N->getOperand(2)));
42037 }
42038 }
42039
42040 // Fold vperm2x128(op(),op()) -> op(vperm2x128(),vperm2x128()).
42041 if (SDValue Res = canonicalizeLaneShuffleWithRepeatedOps(N, DAG, DL))
42042 return Res;
42043
42044 // Fold vperm2x128 subvector shuffle with an inner concat pattern.
42045 // vperm2x128(concat(X,Y),concat(Z,W)) --> concat X,Y etc.
42046 auto FindSubVector128 = [&](unsigned Idx) {
42047 if (Idx > 3)
42048 return SDValue();
42049 SDValue Src = peekThroughBitcasts(N.getOperand(Idx < 2 ? 0 : 1));
42050 SmallVector<SDValue> SubOps;
42051 if (collectConcatOps(Src.getNode(), SubOps, DAG) && SubOps.size() == 2)
42052 return SubOps[Idx & 1];
42053 unsigned NumElts = Src.getValueType().getVectorNumElements();
42054 if ((Idx & 1) == 1 && Src.getOpcode() == ISD::INSERT_SUBVECTOR &&
42055 Src.getOperand(1).getValueSizeInBits() == 128 &&
42056 Src.getConstantOperandAPInt(2) == (NumElts / 2)) {
42057 return Src.getOperand(1);
42058 }
42059 return SDValue();
42060 };
42061 unsigned Imm = N.getConstantOperandVal(2);
42062 if (SDValue SubLo = FindSubVector128(Imm & 0x0F)) {
42063 if (SDValue SubHi = FindSubVector128((Imm & 0xF0) >> 4)) {
42064 MVT SubVT = VT.getHalfNumVectorElementsVT();
42065 SubLo = DAG.getBitcast(SubVT, SubLo);
42066 SubHi = DAG.getBitcast(SubVT, SubHi);
42067 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, SubLo, SubHi);
42068 }
42069 }
42070 return SDValue();
42071 }
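For readers tracing the FindSubVector128 lambda above: each nibble of the vperm2x128 immediate selects one 128-bit half of the two inputs (values 0-1 pick a half of the first operand, 2-3 a half of the second), and bit 3 of a nibble zeroes that output half, which is why indices above 3 make the fold bail out. A minimal standalone model of that selection, with plain ints standing in for 128-bit halves (Vec256, selectVPerm2X128Half and main are illustrative names, not LLVM or intrinsic APIs):

#include <cassert>
#include <cstdint>

// Model each 256-bit input as two tagged 128-bit halves so the selection
// logic is visible. Bits [1:0] of a nibble pick one of {A.Lo, A.Hi, B.Lo,
// B.Hi}; bit 3 forces that output half to zero.
struct Vec256 {
  int Lo, Hi; // stand-ins for the two 128-bit halves
};

static int selectVPerm2X128Half(uint8_t Field, const Vec256 &A, const Vec256 &B) {
  if (Field & 0x8)
    return 0; // zeroed output half
  switch (Field & 0x3) {
  case 0: return A.Lo;
  case 1: return A.Hi;
  case 2: return B.Lo;
  default: return B.Hi;
  }
}

int main() {
  Vec256 A = {1, 2}, B = {3, 4};
  uint8_t Imm = 0x31; // low result = A's high half, high result = B's high half
  assert(selectVPerm2X128Half(Imm & 0x0F, A, B) == 2);
  assert(selectVPerm2X128Half((Imm & 0xF0) >> 4, A, B) == 4);
  return 0;
}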
42072 case X86ISD::PSHUFD:
42073 case X86ISD::PSHUFLW:
42074 case X86ISD::PSHUFHW:
42075 Mask = getPSHUFShuffleMask(N);
42076 assert(Mask.size() == 4);
42077 break;
42078 case X86ISD::MOVSD:
42079 case X86ISD::MOVSH:
42080 case X86ISD::MOVSS: {
42081 SDValue N0 = N.getOperand(0);
42082 SDValue N1 = N.getOperand(1);
42083
42084 // Canonicalize scalar FPOps:
42085 // MOVS*(N0, OP(N0, N1)) --> MOVS*(N0, SCALAR_TO_VECTOR(OP(N0[0], N1[0])))
42086 // If commutable, allow OP(N1[0], N0[0]).
42087 unsigned Opcode1 = N1.getOpcode();
42088 if (Opcode1 == ISD::FADD || Opcode1 == ISD::FMUL || Opcode1 == ISD::FSUB ||
42089 Opcode1 == ISD::FDIV) {
42090 SDValue N10 = N1.getOperand(0);
42091 SDValue N11 = N1.getOperand(1);
42092 if (N10 == N0 ||
42093 (N11 == N0 && (Opcode1 == ISD::FADD || Opcode1 == ISD::FMUL))) {
42094 if (N10 != N0)
42095 std::swap(N10, N11);
42096 MVT SVT = VT.getVectorElementType();
42097 SDValue ZeroIdx = DAG.getIntPtrConstant(0, DL);
42098 N10 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, SVT, N10, ZeroIdx);
42099 N11 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, SVT, N11, ZeroIdx);
42100 SDValue Scl = DAG.getNode(Opcode1, DL, SVT, N10, N11);
42101 SDValue SclVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VT, Scl);
42102 return DAG.getNode(Opcode, DL, VT, N0, SclVec);
42103 }
42104 }
42105
42106 return SDValue();
42107 }
42108 case X86ISD::INSERTPS: {
42109 assert(VT == MVT::v4f32 && "INSERTPS ValueType must be MVT::v4f32");
42110 SDValue Op0 = N.getOperand(0);
42111 SDValue Op1 = N.getOperand(1);
42112 unsigned InsertPSMask = N.getConstantOperandVal(2);
42113 unsigned SrcIdx = (InsertPSMask >> 6) & 0x3;
42114 unsigned DstIdx = (InsertPSMask >> 4) & 0x3;
42115 unsigned ZeroMask = InsertPSMask & 0xF;
42116
42117 // If we zero out all elements from Op0 then we don't need to reference it.
42118 if (((ZeroMask | (1u << DstIdx)) == 0xF) && !Op0.isUndef())
42119 return DAG.getNode(X86ISD::INSERTPS, DL, VT, DAG.getUNDEF(VT), Op1,
42120 DAG.getTargetConstant(InsertPSMask, DL, MVT::i8));
42121
42122 // If we zero out the element from Op1 then we don't need to reference it.
42123 if ((ZeroMask & (1u << DstIdx)) && !Op1.isUndef())
42124 return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, DAG.getUNDEF(VT),
42125 DAG.getTargetConstant(InsertPSMask, DL, MVT::i8));
42126
42127 // Attempt to merge insertps Op1 with an inner target shuffle node.
42128 SmallVector<int, 8> TargetMask1;
42129 SmallVector<SDValue, 2> Ops1;
42130 APInt KnownUndef1, KnownZero1;
42131 if (getTargetShuffleAndZeroables(Op1, TargetMask1, Ops1, KnownUndef1,
42132 KnownZero1)) {
42133 if (KnownUndef1[SrcIdx] || KnownZero1[SrcIdx]) {
42134 // Zero/UNDEF insertion - zero out element and remove dependency.
42135 InsertPSMask |= (1u << DstIdx);
42136 return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, DAG.getUNDEF(VT),
42137 DAG.getTargetConstant(InsertPSMask, DL, MVT::i8));
42138 }
42139 // Update insertps mask srcidx and reference the source input directly.
42140 int M = TargetMask1[SrcIdx];
42141 assert(0 <= M && M < 8 && "Shuffle index out of range");
42142 InsertPSMask = (InsertPSMask & 0x3f) | ((M & 0x3) << 6);
42143 Op1 = Ops1[M < 4 ? 0 : 1];
42144 return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, Op1,
42145 DAG.getTargetConstant(InsertPSMask, DL, MVT::i8));
42146 }
42147
42148 // Attempt to merge insertps Op0 with an inner target shuffle node.
42149 SmallVector<int, 8> TargetMask0;
42150 SmallVector<SDValue, 2> Ops0;
42151 APInt KnownUndef0, KnownZero0;
42152 if (getTargetShuffleAndZeroables(Op0, TargetMask0, Ops0, KnownUndef0,
42153 KnownZero0)) {
42154 bool Updated = false;
42155 bool UseInput00 = false;
42156 bool UseInput01 = false;
42157 for (int i = 0; i != 4; ++i) {
42158 if ((InsertPSMask & (1u << i)) || (i == (int)DstIdx)) {
42159 // No change if element is already zero or the inserted element.
42160 continue;
42161 }
42162
42163 if (KnownUndef0[i] || KnownZero0[i]) {
42164 // If the target mask is undef/zero then we must zero the element.
42165 InsertPSMask |= (1u << i);
42166 Updated = true;
42167 continue;
42168 }
42169
42170 // The input vector element must be inline.
42171 int M = TargetMask0[i];
42172 if (M != i && M != (i + 4))
42173 return SDValue();
42174
42175 // Determine which inputs of the target shuffle we're using.
42176 UseInput00 |= (0 <= M && M < 4);
42177 UseInput01 |= (4 <= M);
42178 }
42179
42180 // If we're not using both inputs of the target shuffle then use the
42181 // referenced input directly.
42182 if (UseInput00 && !UseInput01) {
42183 Updated = true;
42184 Op0 = Ops0[0];
42185 } else if (!UseInput00 && UseInput01) {
42186 Updated = true;
42187 Op0 = Ops0[1];
42188 }
42189
42190 if (Updated)
42191 return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, Op1,
42192 DAG.getTargetConstant(InsertPSMask, DL, MVT::i8));
42193 }
42194
42195 // If we're inserting an element from a vbroadcast load, fold the
42196 // load into the X86insertps instruction. We need to convert the scalar
42197 // load to a vector and clear the source lane of the INSERTPS control.
42198 if (Op1.getOpcode() == X86ISD::VBROADCAST_LOAD && Op1.hasOneUse()) {
42199 auto *MemIntr = cast<MemIntrinsicSDNode>(Op1);
42200 if (MemIntr->getMemoryVT().getScalarSizeInBits() == 32) {
42201 SDValue Load = DAG.getLoad(MVT::f32, DL, MemIntr->getChain(),
42202 MemIntr->getBasePtr(),
42203 MemIntr->getMemOperand());
42204 SDValue Insert = DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0,
42205 DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VT,
42206 Load),
42207 DAG.getTargetConstant(InsertPSMask & 0x3f, DL, MVT::i8));
42208 DAG.ReplaceAllUsesOfValueWith(SDValue(MemIntr, 1), Load.getValue(1));
42209 return Insert;
42210 }
42211 }
42212
42213 return SDValue();
42214 }
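The INSERTPS case above repeatedly decodes and re-packs the 8-bit control byte: bits [7:6] give the source lane, bits [5:4] the destination lane, and bits [3:0] the destination lanes forced to zero, exactly as the SrcIdx/DstIdx/ZeroMask extraction shows. A small standalone encode/decode sketch of just that byte (InsertPSImm and the two helpers are illustrative names, not LLVM APIs):

#include <cassert>
#include <cstdint>

// [7:6] = source element index, [5:4] = destination element index,
// [3:0] = mask of destination lanes zeroed after the insertion.
struct InsertPSImm {
  unsigned SrcIdx, DstIdx, ZeroMask;
};

static InsertPSImm decodeInsertPS(uint8_t Imm) {
  return {(unsigned)((Imm >> 6) & 0x3), (unsigned)((Imm >> 4) & 0x3),
          (unsigned)(Imm & 0xF)};
}

static uint8_t encodeInsertPS(const InsertPSImm &I) {
  return (uint8_t)((I.SrcIdx << 6) | (I.DstIdx << 4) | I.ZeroMask);
}

int main() {
  InsertPSImm I = decodeInsertPS(0x9C); // SrcIdx = 2, DstIdx = 1, ZeroMask = 0xC
  assert(I.SrcIdx == 2 && I.DstIdx == 1 && I.ZeroMask == 0xC);
  assert(encodeInsertPS(I) == 0x9C);    // round-trips
  return 0;
}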
42215 default:
42216 return SDValue();
42217 }
42218
42219 // Nuke no-op shuffles that show up after combining.
42220 if (isNoopShuffleMask(Mask))
42221 return N.getOperand(0);
42222
42223 // Look for simplifications involving one or two shuffle instructions.
42224 SDValue V = N.getOperand(0);
42225 switch (N.getOpcode()) {
42226 default:
42227 break;
42228 case X86ISD::PSHUFLW:
42229 case X86ISD::PSHUFHW:
42230 assert(VT.getVectorElementType() == MVT::i16 && "Bad word shuffle type!");
42231
42232 // See if this reduces to a PSHUFD which is no more expensive and can
42233 // combine with more operations. Note that it has to at least flip the
42234 // dwords as otherwise it would have been removed as a no-op.
42235 if (ArrayRef(Mask).equals({2, 3, 0, 1})) {
42236 int DMask[] = {0, 1, 2, 3};
42237 int DOffset = N.getOpcode() == X86ISD::PSHUFLW ? 0 : 2;
42238 DMask[DOffset + 0] = DOffset + 1;
42239 DMask[DOffset + 1] = DOffset + 0;
42240 MVT DVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() / 2);
42241 V = DAG.getBitcast(DVT, V);
42242 V = DAG.getNode(X86ISD::PSHUFD, DL, DVT, V,
42243 getV4X86ShuffleImm8ForMask(DMask, DL, DAG));
42244 return DAG.getBitcast(VT, V);
42245 }
42246
42247 // Look for shuffle patterns which can be implemented as a single unpack.
42248 // FIXME: This doesn't handle the location of the PSHUFD generically, and
42249 // only works when we have a PSHUFD followed by two half-shuffles.
42250 if (Mask[0] == Mask[1] && Mask[2] == Mask[3] &&
42251 (V.getOpcode() == X86ISD::PSHUFLW ||
42252 V.getOpcode() == X86ISD::PSHUFHW) &&
42253 V.getOpcode() != N.getOpcode() &&
42254 V.hasOneUse() && V.getOperand(0).hasOneUse()) {
42255 SDValue D = peekThroughOneUseBitcasts(V.getOperand(0));
42256 if (D.getOpcode() == X86ISD::PSHUFD) {
42257 SmallVector<int, 4> VMask = getPSHUFShuffleMask(V);
42258 SmallVector<int, 4> DMask = getPSHUFShuffleMask(D);
42259 int NOffset = N.getOpcode() == X86ISD::PSHUFLW ? 0 : 4;
42260 int VOffset = V.getOpcode() == X86ISD::PSHUFLW ? 0 : 4;
42261 int WordMask[8];
42262 for (int i = 0; i < 4; ++i) {
42263 WordMask[i + NOffset] = Mask[i] + NOffset;
42264 WordMask[i + VOffset] = VMask[i] + VOffset;
42265 }
42266 // Map the word mask through the DWord mask.
42267 int MappedMask[8];
42268 for (int i = 0; i < 8; ++i)
42269 MappedMask[i] = 2 * DMask[WordMask[i] / 2] + WordMask[i] % 2;
42270 if (ArrayRef(MappedMask).equals({0, 0, 1, 1, 2, 2, 3, 3}) ||
42271 ArrayRef(MappedMask).equals({4, 4, 5, 5, 6, 6, 7, 7})) {
42272 // We can replace all three shuffles with an unpack.
42273 V = DAG.getBitcast(VT, D.getOperand(0));
42274 return DAG.getNode(MappedMask[0] == 0 ? X86ISD::UNPCKL
42275 : X86ISD::UNPCKH,
42276 DL, VT, V, V);
42277 }
42278 }
42279 }
42280
42281 break;
42282
42283 case X86ISD::PSHUFD:
42284 if (SDValue NewN = combineRedundantDWordShuffle(N, Mask, DAG))
42285 return NewN;
42286
42287 break;
42288 }
42289
42290 return SDValue();
42291}
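The PSHUFLW/PSHUFHW fold near the end of this function composes an 8-entry word mask with the 4-entry dword mask of an inner PSHUFD via MappedMask[i] = 2 * DMask[WordMask[i] / 2] + WordMask[i] % 2; when the composition lands on {0,0,1,1,2,2,3,3} or {4,4,5,5,6,6,7,7}, the whole chain is a single unpack of the PSHUFD input with itself. A standalone sketch of just that composition step (mapWordsThroughDWords and the example masks are illustrative, not taken from an LLVM test case):

#include <array>
#include <cassert>

// Word w of the dword shuffle's output is word 2*DMask[w/2] + (w%2) of its
// input, so composing the outer word mask with DMask yields word indices
// relative to the PSHUFD input.
static std::array<int, 8> mapWordsThroughDWords(const std::array<int, 8> &WordMask,
                                                const std::array<int, 4> &DMask) {
  std::array<int, 8> Mapped;
  for (int i = 0; i != 8; ++i)
    Mapped[i] = 2 * DMask[WordMask[i] / 2] + WordMask[i] % 2;
  return Mapped;
}

int main() {
  // PSHUFLW {0,0,1,1} and PSHUFHW {0,0,1,1} over a PSHUFD {0,2,1,3} compose
  // to {0,0,1,1,2,2,3,3}, i.e. UNPCKLWD of the PSHUFD input with itself.
  std::array<int, 8> WordMask = {0, 0, 1, 1, 4, 4, 5, 5};
  std::array<int, 4> DMask = {0, 2, 1, 3};
  std::array<int, 8> Expected = {0, 0, 1, 1, 2, 2, 3, 3};
  assert(mapWordsThroughDWords(WordMask, DMask) == Expected);
  return 0;
}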
42292
42293/// Checks if the shuffle mask takes subsequent elements
42294/// alternately from two vectors.
42295/// For example <0, 5, 2, 7> or <8, 1, 10, 3, 12, 5, 14, 7> are both correct.
42296static bool isAddSubOrSubAddMask(ArrayRef<int> Mask, bool &Op0Even) {
42297
42298 int ParitySrc[2] = {-1, -1};
42299 unsigned Size = Mask.size();
42300 for (unsigned i = 0; i != Size; ++i) {
42301 int M = Mask[i];
42302 if (M < 0)
42303 continue;
42304
42305 // Make sure we are using the matching element from the input.
42306 if ((M % Size) != i)
42307 return false;
42308
42309 // Make sure we use the same input for all elements of the same parity.
42310 int Src = M / Size;
42311 if (ParitySrc[i % 2] >= 0 && ParitySrc[i % 2] != Src)
42312 return false;
42313 ParitySrc[i % 2] = Src;
42314 }
42315
42316 // Make sure each input is used.
42317 if (ParitySrc[0] < 0 || ParitySrc[1] < 0 || ParitySrc[0] == ParitySrc[1])
42318 return false;
42319
42320 Op0Even = ParitySrc[0] == 0;
42321 return true;
42322}
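As a compact standalone restatement of the rule just defined, checked against the two masks named in the comment: <0, 5, 2, 7> takes even lanes from the first input and odd lanes from the second, so Op0Even is true, while <8, 1, 10, 3, 12, 5, 14, 7> swaps the parities and Op0Even is false. The helper below mirrors the logic above outside of LLVM's types; isAddSubLikeMask and main are illustrative names.

#include <array>
#include <cassert>
#include <vector>

// Each defined mask element must select the matching lane of one input
// (M % Size == i), and all even lanes must come from one input, all odd
// lanes from the other.
static bool isAddSubLikeMask(const std::vector<int> &Mask, bool &Op0Even) {
  std::array<int, 2> ParitySrc = {-1, -1};
  int Size = (int)Mask.size();
  for (int i = 0; i != Size; ++i) {
    int M = Mask[i];
    if (M < 0)
      continue;                 // undef lane: no constraint
    if ((M % Size) != i)
      return false;             // must use the matching element
    int Src = M / Size;         // 0 = first input, 1 = second input
    if (ParitySrc[i % 2] >= 0 && ParitySrc[i % 2] != Src)
      return false;
    ParitySrc[i % 2] = Src;
  }
  if (ParitySrc[0] < 0 || ParitySrc[1] < 0 || ParitySrc[0] == ParitySrc[1])
    return false;
  Op0Even = ParitySrc[0] == 0;
  return true;
}

int main() {
  bool Op0Even = false;
  std::vector<int> M4 = {0, 5, 2, 7};
  std::vector<int> M8 = {8, 1, 10, 3, 12, 5, 14, 7};
  assert(isAddSubLikeMask(M4, Op0Even) && Op0Even);
  assert(isAddSubLikeMask(M8, Op0Even) && !Op0Even);
  return 0;
}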
42323
42324/// Returns true iff the shuffle node \p N can be replaced with ADDSUB(SUBADD)
42325/// operation. If true is returned then the operands of ADDSUB(SUBADD) operation
42326/// are written to the parameters \p Opnd0 and \p Opnd1.
42327///
42328/// We combine shuffle to ADDSUB(SUBADD) directly on the abstract vector shuffle nodes
42329/// so it is easier to generically match. We also insert dummy vector shuffle
42330/// nodes for the operands which explicitly discard the lanes which are unused
42331/// by this operation to try to flow through the rest of the combiner
42332/// the fact that they're unused.
42333static bool isAddSubOrSubAdd(SDNode *N, const X86Subtarget &Subtarget,
42334 SelectionDAG &DAG, SDValue &Opnd0, SDValue &Opnd1,
42335 bool &IsSubAdd) {
42336
42337 EVT VT = N->getValueType(0);
42338 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
42339 if (!Subtarget.hasSSE3() || !TLI.isTypeLegal(VT) ||
42340 !VT.getSimpleVT().isFloatingPoint())
42341 return false;
42342
42343 // We only handle target-independent shuffles.
42344 // FIXME: It would be easy and harmless to use the target shuffle mask
42345 // extraction tool to support more.
42346 if (N->getOpcode() != ISD::VECTOR_SHUFFLE)
42347 return false;
42348
42349 SDValue V1 = N->getOperand(0);
42350 SDValue V2 = N->getOperand(1);
42351
42352 // Make sure we have an FADD and an FSUB.
42353 if ((V1.getOpcode() != ISD::FADD && V1.getOpcode() != ISD::FSUB) ||
42354 (V2.getOpcode() != ISD::FADD && V2.getOpcode() != ISD::FSUB) ||
42355 V1.getOpcode() == V2.getOpcode())
42356 return false;
42357
42358 // If there are other uses of these operations we can't fold them.
42359 if (!V1->hasOneUse() || !V2->hasOneUse())
42360 return false;
42361
42362 // Ensure that both operations have the same operands. Note that we can
42363 // commute the FADD operands.
42364 SDValue LHS, RHS;
42365 if (V1.getOpcode() == ISD::FSUB) {
42366 LHS = V1->getOperand(0); RHS = V1->getOperand(1);
42367 if ((V2->getOperand(0) != LHS || V2->getOperand(1) != RHS) &&
42368 (V2->getOperand(0) != RHS || V2->getOperand(1) != LHS))
42369 return false;
42370 } else {
42371 assert(V2.getOpcode() == ISD::FSUB && "Unexpected opcode");
42372 LHS = V2->getOperand(0); RHS = V2->getOperand(1);
42373 if ((V1->getOperand(0) != LHS || V1->getOperand(1) != RHS) &&
42374 (V1->getOperand(0) != RHS || V1->getOperand(1) != LHS))
42375 return false;
42376 }
42377
42378 ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(N)->getMask();
42379 bool Op0Even;
42380 if (!isAddSubOrSubAddMask(Mask, Op0Even))
42381 return false;
42382
42383 // It's a subadd if the vector in the even parity is an FADD.
42384 IsSubAdd = Op0Even ? V1->getOpcode() == ISD::FADD
42385 : V2->getOpcode() == ISD::FADD;
42386
42387 Opnd0 = LHS;
42388 Opnd1 = RHS;
42389 return true;
42390}
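For reference on what the recognizer above feeds: ADDSUB subtracts in the even lanes and adds in the odd lanes (the ADDSUBPS/PD behaviour), and SUBADD is the reverse, which is why a shuffle whose even parity comes from the FADD marks the SUBADD form. A scalar reference sketch under that reading (addsub and main are illustrative names, not LLVM code):

#include <cassert>
#include <cstddef>
#include <vector>

// ADDSUB: a[i] - b[i] in even lanes, a[i] + b[i] in odd lanes.
// SUBADD: the two parities are flipped.
static std::vector<double> addsub(const std::vector<double> &A,
                                  const std::vector<double> &B, bool IsSubAdd) {
  std::vector<double> R(A.size());
  for (size_t i = 0; i != A.size(); ++i) {
    bool DoAdd = IsSubAdd ? (i % 2 == 0) : (i % 2 != 0);
    R[i] = DoAdd ? A[i] + B[i] : A[i] - B[i];
  }
  return R;
}

int main() {
  std::vector<double> A = {10, 10, 10, 10}, B = {1, 2, 3, 4};
  std::vector<double> AS = addsub(A, B, /*IsSubAdd=*/false);
  assert(AS[0] == 9 && AS[1] == 12 && AS[2] == 7 && AS[3] == 14);
  return 0;
}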
42391
42392/// Combine shuffle of two fma nodes into FMAddSub or FMSubAdd.
42393static SDValue combineShuffleToFMAddSub(SDNode *N,
42394 const X86Subtarget &Subtarget,
42395 SelectionDAG &DAG) {
42396 // We only handle target-independent shuffles.
42397 // FIXME: It would be easy and harmless to use the target shuffle mask
42398 // extraction tool to support more.
42399 if (N->getOpcode() != ISD::VECTOR_SHUFFLE)
42400 return SDValue();
42401
42402 MVT VT = N->getSimpleValueType(0);
42403 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
42404 if (!Subtarget.hasAnyFMA() || !TLI.isTypeLegal(VT))
42405 return SDValue();
42406
42407 // We're trying to match (shuffle fma(a, b, c), X86Fmsub(a, b, c)).
42408 SDValue Op0 = N->getOperand(0);
42409 SDValue Op1 = N->getOperand(1);
42410 SDValue FMAdd = Op0, FMSub = Op1;
42411 if (FMSub.getOpcode() != X86ISD::FMSUB)
42412 std::swap(FMAdd, FMSub);
42413
42414 if (FMAdd.getOpcode() != ISD::FMA || FMSub.getOpcode() != X86ISD::FMSUB ||
42415 FMAdd.getOperand(0) != FMSub.getOperand(0) || !FMAdd.hasOneUse() ||
42416 FMAdd.getOperand(1) != FMSub.getOperand(1) || !FMSub.hasOneUse() ||
42417 FMAdd.getOperand(2) != FMSub.getOperand(2))
42418 return SDValue();
42419
42420 // Check for correct shuffle mask.
42421 ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(N)->getMask();
42422 bool Op0Even;
42423 if (!isAddSubOrSubAddMask(Mask, Op0Even))
42424 return SDValue();
42425
42426 // FMAddSub takes zeroth operand from FMSub node.
42427 SDLoc DL(N);
42428 bool IsSubAdd = Op0Even ? Op0 == FMAdd : Op1 == FMAdd;
42429 unsigned Opcode = IsSubAdd ? X86ISD::FMSUBADD : X86ISD::FMADDSUB;
42430 return DAG.getNode(Opcode, DL, VT, FMAdd.getOperand(0), FMAdd.getOperand(1),
42431 FMAdd.getOperand(2));
42432}
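The fused forms built here follow the same parity convention with a multiply folded in: FMADDSUB computes a*b - c in the even lanes and a*b + c in the odd lanes, and FMSUBADD swaps the two, which is what the IsSubAdd selection above distinguishes. A scalar reference sketch (fmaddsub and main are illustrative names, not an intrinsic API):

#include <cassert>
#include <cstddef>
#include <vector>

// FMADDSUB: a*b - c in even lanes, a*b + c in odd lanes; FMSUBADD flips them.
static std::vector<double> fmaddsub(const std::vector<double> &A,
                                    const std::vector<double> &B,
                                    const std::vector<double> &C,
                                    bool IsSubAdd) {
  std::vector<double> R(A.size());
  for (size_t i = 0; i != A.size(); ++i) {
    bool AddLane = IsSubAdd ? (i % 2 == 0) : (i % 2 != 0);
    R[i] = AddLane ? A[i] * B[i] + C[i] : A[i] * B[i] - C[i];
  }
  return R;
}

int main() {
  std::vector<double> A = {2, 2, 2, 2}, B = {3, 3, 3, 3}, C = {1, 1, 1, 1};
  std::vector<double> R = fmaddsub(A, B, C, /*IsSubAdd=*/false); // FMADDSUB
  assert(R[0] == 5 && R[1] == 7 && R[2] == 5 && R[3] == 7);
  return 0;
}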
42433
42434/// Try to combine a shuffle into a target-specific add-sub or
42435/// mul-add-sub node.
42436static SDValue combineShuffleToAddSubOrFMAddSub(SDNode *N,
42437 const X86Subtarget &Subtarget,
42438 SelectionDAG &DAG) {
42439 if (SDValue V = combineShuffleToFMAddSub(N, Subtarget, DAG))
42440 return V;
42441
42442 SDValue Opnd0, Opnd1;
42443 bool IsSubAdd;
42444 if (!isAddSubOrSubAdd(N, Subtarget, DAG, Opnd0, Opnd1, IsSubAdd))
42445 return SDValue();
42446
42447 MVT VT = N->getSimpleValueType(0);
42448 SDLoc DL(N);
42449
42450 // Try to generate X86ISD::FMADDSUB node here.
42451 SDValue Opnd2;
42452 if (isFMAddSubOrFMSubAdd(Subtarget, DAG, Opnd0, Opnd1, Opnd2, 2)) {
42453 unsigned Opc = IsSubAdd ? X86ISD::FMSUBADD : X86ISD::FMADDSUB;
42454 return DAG.getNode(Opc, DL, VT, Opnd0, Opnd1, Opnd2);
42455 }
42456
42457 if (IsSubAdd)
42458 return SDValue();
42459
42460 // Do not generate X86ISD::ADDSUB node for 512-bit types even though
42461 // the ADDSUB idiom has been successfully recognized. There are no known
42462 // X86 targets with 512-bit ADDSUB instructions!
42463 if (VT.is512BitVector())
42464 return SDValue();
42465
42466 // Do not generate X86ISD::ADDSUB node for FP16's vector types even though
42467 // the ADDSUB idiom has been successfully recognized. There are no known
42468 // X86 targets with FP16 ADDSUB instructions!
42469 if (VT.getVectorElementType() == MVT::f16)
42470 return SDValue();
42471
42472 return DAG.getNode(X86ISD::ADDSUB, DL, VT, Opnd0, Opnd1);
42473}
42474
42475// We are looking for a shuffle where both sources are concatenated with undef
42476// and have a width that is half of the output's width. AVX2 has VPERMD/Q, so
42477// if we can express this as a single-source shuffle, that's preferable.
42478static SDValue combineShuffleOfConcatUndef(SDNode *N, SelectionDAG &DAG,
42479 const X86Subtarget &Subtarget) {
42480 if (!Subtarget.hasAVX2() || !isa<ShuffleVectorSDNode>(N))
42481 return SDValue();
42482
42483 EVT VT = N->getValueType(0);
42484
42485 // We only care about shuffles of 128/256-bit vectors of 32/64-bit values.
42486 if (!VT.is128BitVector() && !VT.is256BitVector())
42487 return SDValue();
42488
42489 if (VT.getVectorElementType() != MVT::i32 &&
42490 VT.getVectorElementType() != MVT::i64 &&
42491 VT.getVectorElementType() != MVT::f32 &&
42492 VT.getVectorElementType() != MVT::f64)
42493 return SDValue();
42494
42495 SDValue N0 = N->getOperand(0);
42496 SDValue N1 = N->getOperand(1);
42497
42498 // Check that both sources are concats with undef.
42499 if (N0.getOpcode() != ISD::CONCAT_VECTORS ||
42500 N1.getOpcode() != ISD::CONCAT_VECTORS || N0.getNumOperands() != 2 ||
42501 N1.getNumOperands() != 2 || !N0.getOperand(1).isUndef() ||
42502 !N1.getOperand(1).isUndef())
42503 return SDValue();
42504
42505 // Construct the new shuffle mask. Elements from the first source retain their
42506 // index, but elements from the second source no longer need to skip an undef.
42507 SmallVector<int, 8> Mask;
42508 int NumElts = VT.getVectorNumElements();
42509
42510 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
42511 for (int Elt : SVOp->getMask())
42512 Mask.push_back(Elt < NumElts ? Elt : (Elt - NumElts / 2));
42513
42514 SDLoc DL(N);
42515 SDValue Concat = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, N0.getOperand(0),
42516 N1.getOperand(0));
42517 return DAG.getVectorShuffle(VT, DL, Concat, DAG.getUNDEF(VT), Mask);
42518}
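The remap in the loop above only has to account for the dropped undef half of the second source: indices below NumElts already point into the new concat, and indices into the second source move down by NumElts / 2 because t2 now sits immediately after t1. A small worked example for an 8-element shuffle, with plain ints standing in for the operands (remapConcatUndefMask and main are illustrative names):

#include <cassert>
#include <vector>

// Old operands: concat(t1, undef) and concat(t2, undef), each NumElts wide,
// so t2's lanes live at indices [NumElts, NumElts + NumElts/2) of the old
// mask space. New operand: concat(t1, t2), so those lanes shift down by
// NumElts/2. Undef (-1) and first-source lanes are left untouched.
static std::vector<int> remapConcatUndefMask(const std::vector<int> &Mask,
                                             int NumElts) {
  std::vector<int> New;
  for (int Elt : Mask)
    New.push_back(Elt < NumElts ? Elt : Elt - NumElts / 2);
  return New;
}

int main() {
  // Interleave t1 and t2: the old mask only references lanes 0..3 and 8..11.
  std::vector<int> Old = {0, 8, 1, 9, 2, 10, 3, 11};
  std::vector<int> Expected = {0, 4, 1, 5, 2, 6, 3, 7};
  assert(remapConcatUndefMask(Old, /*NumElts=*/8) == Expected);
  return 0;
}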
42519
42520/// If we have a shuffle of AVX/AVX512 (256/512 bit) vectors that only uses the
42521/// low half of each source vector and does not set any high half elements in
42522/// the destination vector, narrow the shuffle to half its original size.
42523static SDValue narrowShuffle(ShuffleVectorSDNode *Shuf, SelectionDAG &DAG) {
42524 if (!Shuf->getValueType(0).isSimple())
42525 return SDValue();
42526 MVT VT = Shuf->getSimpleValueType(0);
42527 if (!VT.is256BitVector() && !VT.is512BitVector())
42528 return SDValue();
42529
42530 // See if we can ignore all of the high elements of the shuffle.
42531 ArrayRef<int> Mask = Shuf->getMask();
42532 if (!isUndefUpperHalf(Mask))
42533 return SDValue();
42534
42535 // Check if the shuffle mask accesses only the low half of each input vector
42536 // (half-index output is 0 or 2).
42537 int HalfIdx1, HalfIdx2;
42538 SmallVector<int, 8> HalfMask(Mask.size() / 2);
42539 if (!getHalfShuffleMask(Mask, HalfMask, HalfIdx1, HalfIdx2) ||
42540 (HalfIdx1 % 2 == 1) || (HalfIdx2 % 2 == 1))
42541 return SDValue();
42542
42543 // Create a half-width shuffle to replace the unnecessarily wide shuffle.
42544 // The trick is knowing that all of the insert/extract are actually free
42545 // subregister (zmm<->ymm or ymm<->xmm) ops. That leaves us with a shuffle
42546 // of narrow inputs into a narrow output, and that is always cheaper than
42547 // the wide shuffle that we started with.
42548 return getShuffleHalfVectors(SDLoc(Shuf), Shuf->getOperand(0),
42549 Shuf->getOperand(1), HalfMask, HalfIdx1,
42550 HalfIdx2, false, DAG, /*UseConcat*/true);
42551}
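Stripped of the DAG plumbing, narrowShuffle needs two properties of the mask: the upper half of the result must be entirely undef, and every defined element in the lower half must read from the low half of whichever source it uses. The sketch below checks exactly that on a plain mask; canNarrowToHalf is an illustrative name and a simplification, since getHalfShuffleMask above also produces the narrowed mask and the half indices the rebuild needs.

#include <cassert>
#include <vector>

// True if a shuffle with this mask only writes the low half of the result
// and only reads the low half of each (equal-sized) source.
static bool canNarrowToHalf(const std::vector<int> &Mask) {
  int NumElts = (int)Mask.size(), Half = NumElts / 2;
  for (int i = Half; i != NumElts; ++i)
    if (Mask[i] >= 0)
      return false;               // a high-half result lane is defined
  for (int i = 0; i != Half; ++i) {
    int M = Mask[i];
    if (M < 0)
      continue;                   // undef lane
    if ((M % NumElts) >= Half)
      return false;               // reads a high-half source lane
  }
  return true;
}

int main() {
  std::vector<int> Good = {0, 9, 2, 11, -1, -1, -1, -1};
  std::vector<int> Bad = {4, 9, 2, 11, -1, -1, -1, -1}; // lane 4 is a high lane
  assert(canNarrowToHalf(Good));
  assert(!canNarrowToHalf(Bad));
  return 0;
}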
42552
42553static SDValue combineShuffle(SDNode *N, SelectionDAG &DAG,
42554 TargetLowering::DAGCombinerInfo &DCI,
42555 const X86Subtarget &Subtarget) {
42556 if (auto *Shuf = dyn_cast<ShuffleVectorSDNode>(N))
42557 if (SDValue V = narrowShuffle(Shuf, DAG))
42558 return V;
42559
42560 // If we have legalized the vector types, look for blends of FADD and FSUB
42561 // nodes that we can fuse into an ADDSUB, FMADDSUB, or FMSUBADD node.
42562 SDLoc dl(N);
42563 EVT VT = N->getValueType(0);
42564 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
42565 if (TLI.isTypeLegal(VT))
42566 if (SDValue AddSub = combineShuffleToAddSubOrFMAddSub(N, Subtarget, DAG))
42567 return AddSub;
42568
42569 // Attempt to combine into a vector load/broadcast.
42570 if (SDValue LD = combineToConsecutiveLoads(
42571 VT, SDValue(N, 0), dl, DAG, Subtarget, /*IsAfterLegalize*/ true))
42572 return LD;
42573
42574 // For AVX2, we sometimes want to combine
42575 // (vector_shuffle <mask> (concat_vectors t1, undef)
42576 // (concat_vectors t2, undef))
42577 // Into:
42578 // (vector_shuffle <mask> (concat_vectors t1, t2), undef)
42579 // Since the latter can be efficiently lowered with VPERMD/VPERMQ
42580 if (SDValue ShufConcat = combineShuffleOfConcatUndef(N, DAG, Subtarget))
42581 return ShufConcat;
42582
42583 if (isTargetShuffle(N->getOpcode())) {
42584 SDValue Op(N, 0);
42585 if (SDValue Shuffle = combineTargetShuffle(Op, DAG, DCI, Subtarget))
42586 return Shuffle;
42587
42588 // Try recursively combining arbitrary sequences of x86 shuffle
42589 // instructions into higher-order shuffles. We do this after combining
42590 // specific PSHUF instruction sequences into their minimal form so that we
42591 // can evaluate how many specialized shuffle instructions are involved in
42592 // a particular chain.
42593 if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
42594 return Res;
42595
42596 // Simplify source operands based on shuffle mask.
42597 // TODO - merge this into combineX86ShufflesRecursively.
42598 APInt DemandedElts = APInt::getAllOnes(VT.getVectorNumElements());
42599 if (TLI.SimplifyDemandedVectorElts(Op, DemandedElts, DCI))
42600 return SDValue(N, 0);
42601
42602 // Canonicalize SHUFFLE(BINOP(X,Y)) -> BINOP(SHUFFLE(X),SHUFFLE(Y)).
42603 // Perform this after other shuffle combines to allow inner shuffles to be
42604 // combined away first.
42605 if (SDValue BinOp = canonicalizeShuffleWithBinOps(Op, DAG, dl))
42606 return BinOp;
42607 }
42608
42609 return SDValue();
42610}
42611
42612// Simplify variable target shuffle masks based on the demanded elements.
42613// TODO: Handle DemandedBits in mask indices as well?
42614bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetShuffle(
42615 SDValue Op, const APInt &DemandedElts, unsigned MaskIndex,
42616 TargetLowering::TargetLoweringOpt &TLO, unsigned Depth) const {
42617 // If we're demanding all elements don't bother trying to simplify the mask.
42618 unsigned NumElts = DemandedElts.getBitWidth();
42619 if (DemandedElts.isAllOnes())
42620 return false;
42621
42622 SDValue Mask = Op.getOperand(MaskIndex);
42623 if (!Mask.hasOneUse())
42624 return false;
42625
42626 // Attempt to generically simplify the variable shuffle mask.
42627 APInt MaskUndef, MaskZero;
42628 if (SimplifyDemandedVectorElts(Mask, DemandedElts, MaskUndef, MaskZero, TLO,
42629 Depth + 1))
42630 return true;
42631
42632 // Attempt to extract+simplify a (constant pool load) shuffle mask.
42633 // TODO: Support other types from getTargetShuffleMaskIndices?
42634 SDValue BC = peekThroughOneUseBitcasts(Mask);
42635 EVT BCVT = BC.getValueType();
42636 auto *Load = dyn_cast<LoadSDNode>(BC);
42637 if (!Load)
42638 return false;
42639
42640 const Constant *C = getTargetConstantFromNode(Load);
42641 if (!C)
42642 return false;
42643
42644 Type *CTy = C->getType();
42645 if (!CTy->isVectorTy() ||
42646 CTy->getPrimitiveSizeInBits() != Mask.getValueSizeInBits())
42647 return false;
42648
42649 // Handle scaling for i64 elements on 32-bit targets.
42650 unsigned NumCstElts = cast<FixedVectorType>(CTy)->getNumElements();
42651 if (NumCstElts != NumElts && NumCstElts != (NumElts * 2))
42652 return false;
42653 unsigned Scale = NumCstElts / NumElts;
42654
42655 // Simplify mask if we have an undemanded element that is not undef.
42656 bool Simplified = false;
42657 SmallVector<Constant *, 32> ConstVecOps;
42658 for (unsigned i = 0; i != NumCstElts; ++i) {
42659 Constant *Elt = C->getAggregateElement(i);
42660 if (!DemandedElts[i / Scale] && !isa<UndefValue>(Elt)) {
42661 ConstVecOps.push_back(UndefValue::get(Elt->getType()));
42662 Simplified = true;
42663 continue;
42664 }
42665 ConstVecOps.push_back(Elt);
42666 }
42667 if (!Simplified)
42668 return false;
42669
42670 // Generate new constant pool entry + legalize immediately for the load.
42671 SDLoc DL(Op);
42672 SDValue CV = TLO.DAG.getConstantPool(ConstantVector::get(ConstVecOps), BCVT);
42673 SDValue LegalCV = LowerConstantPool(CV, TLO.DAG);
42674 SDValue NewMask = TLO.DAG.getLoad(
42675 BCVT, DL, TLO.DAG.getEntryNode(), LegalCV,
42676 MachinePointerInfo::getConstantPool(TLO.DAG.getMachineFunction()),
42677 Load->getAlign());
42678 return TLO.CombineTo(Mask, TLO.DAG.getBitcast(Mask.getValueType(), NewMask));
42679}
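The constant-pool rewrite above touches only mask elements whose owning output lane is never demanded, turning them into undef so later combines have fewer constraints; the Scale factor covers the case where the constant vector has twice as many (narrower) elements as the shuffle. A small standalone model of that loop, using std::optional in place of Constant*/UndefValue (undefUndemandedMaskElts and main are illustrative names):

#include <cassert>
#include <optional>
#include <vector>

// A constant mask vector with Scale * NumElts entries: entries whose owning
// output lane (i / Scale) is not demanded are rewritten to "undef" (nullopt).
// Returns true if anything actually changed.
static bool undefUndemandedMaskElts(std::vector<std::optional<int>> &Cst,
                                    const std::vector<bool> &DemandedElts) {
  unsigned Scale = (unsigned)(Cst.size() / DemandedElts.size());
  bool Simplified = false;
  for (unsigned i = 0; i != Cst.size(); ++i) {
    if (!DemandedElts[i / Scale] && Cst[i].has_value()) {
      Cst[i].reset();
      Simplified = true;
    }
  }
  return Simplified;
}

int main() {
  // 4 output lanes, 8 constant elements (Scale = 2); lanes 1 and 3 undemanded.
  std::vector<std::optional<int>> Cst = {0, 1, 2, 3, 4, 5, 6, 7};
  std::vector<bool> Demanded = {true, false, true, false};
  assert(undefUndemandedMaskElts(Cst, Demanded));
  assert(!Cst[2].has_value() && !Cst[3].has_value() && Cst[4].has_value());
  return 0;
}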
42680
42681bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetNode(
42682 SDValue Op, const APInt &DemandedElts, APInt &KnownUndef, APInt &KnownZero,
42683 TargetLoweringOpt &TLO, unsigned Depth) const {
42684 int NumElts = DemandedElts.getBitWidth();
42685 unsigned Opc = Op.getOpcode();
42686 EVT VT = Op.getValueType();
42687
42688 // Handle special case opcodes.
42689 switch (Opc) {
42690 case X86ISD::PMULDQ:
42691 case X86ISD::PMULUDQ: {
42692 APInt LHSUndef, LHSZero;
42693 APInt RHSUndef, RHSZero;
42694 SDValue LHS = Op.getOperand(0);
42695 SDValue RHS = Op.getOperand(1);
42696 if (SimplifyDemandedVectorElts(LHS, DemandedElts, LHSUndef, LHSZero, TLO,
42697 Depth + 1))
42698 return true;
42699 if (SimplifyDemandedVectorElts(RHS, DemandedElts, RHSUndef, RHSZero, TLO,
42700 Depth + 1))
42701 return true;
42702 // Multiply by zero.
42703 KnownZero = LHSZero | RHSZero;
42704 break;
42705 }
42706 case X86ISD::VPMADDWD: {
42707 APInt LHSUndef, LHSZero;
42708 APInt RHSUndef, RHSZero;
42709 SDValue LHS = Op.getOperand(0);
42710 SDValue RHS = Op.getOperand(1);
42711 APInt DemandedSrcElts = APIntOps::ScaleBitMask(DemandedElts, 2 * NumElts);
42712
42713 if (SimplifyDemandedVectorElts(LHS, DemandedSrcElts, LHSUndef, LHSZero, TLO,
42714 Depth + 1))
42715 return true;
42716 if (SimplifyDemandedVectorElts(RHS, DemandedSrcElts, RHSUndef, RHSZero, TLO,
42717 Depth + 1))
42718 return true;
42719
42720 // TODO: Multiply by zero.
42721
42722 // If RHS/LHS elements are known zero then we don't need the LHS/RHS equivalent.
42723 APInt DemandedLHSElts = DemandedSrcElts & ~RHSZero;
42724 if (SimplifyDemandedVectorElts(LHS, DemandedLHSElts, LHSUndef, LHSZero, TLO,
42725 Depth + 1))
42726 return true;
42727 APInt DemandedRHSElts = DemandedSrcElts & ~LHSZero;
42728 if (SimplifyDemandedVectorElts(RHS, DemandedRHSElts, RHSUndef, RHSZero, TLO,
42729 Depth + 1))
42730 return true;
42731 break;
42732 }
42733 case X86ISD::PSADBW: {
42734 SDValue LHS = Op.getOperand(0);
42735 SDValue RHS = Op.getOperand(1);
42736 assert(VT.getScalarType() == MVT::i64 &&(static_cast <bool> (VT.getScalarType() == MVT::i64 &&
LHS.getValueType() == RHS.getValueType() && LHS.getValueType
().getScalarType() == MVT::i8 && "Unexpected PSADBW types"
) ? void (0) : __assert_fail ("VT.getScalarType() == MVT::i64 && LHS.getValueType() == RHS.getValueType() && LHS.getValueType().getScalarType() == MVT::i8 && \"Unexpected PSADBW types\""
, "llvm/lib/Target/X86/X86ISelLowering.cpp", 42739, __extension__
__PRETTY_FUNCTION__))
42737 LHS.getValueType() == RHS.getValueType() &&(static_cast <bool> (VT.getScalarType() == MVT::i64 &&
LHS.getValueType() == RHS.getValueType() && LHS.getValueType
().getScalarType() == MVT::i8 && "Unexpected PSADBW types"
) ? void (0) : __assert_fail ("VT.getScalarType() == MVT::i64 && LHS.getValueType() == RHS.getValueType() && LHS.getValueType().getScalarType() == MVT::i8 && \"Unexpected PSADBW types\""
, "llvm/lib/Target/X86/X86ISelLowering.cpp", 42739, __extension__
__PRETTY_FUNCTION__))
42738 LHS.getValueType().getScalarType() == MVT::i8 &&(static_cast <bool> (VT.getScalarType() == MVT::i64 &&
LHS.getValueType() == RHS.getValueType() && LHS.getValueType
().getScalarType() == MVT::i8 && "Unexpected PSADBW types"
) ? void (0) : __assert_fail ("VT.getScalarType() == MVT::i64 && LHS.getValueType() == RHS.getValueType() && LHS.getValueType().getScalarType() == MVT::i8 && \"Unexpected PSADBW types\""
, "llvm/lib/Target/X86/X86ISelLowering.cpp", 42739, __extension__
__PRETTY_FUNCTION__))
42739 "Unexpected PSADBW types")(static_cast <bool> (VT.getScalarType() == MVT::i64 &&
LHS.getValueType() == RHS.getValueType() && LHS.getValueType
().getScalarType() == MVT::i8 && "Unexpected PSADBW types"
) ? void (0) : __assert_fail ("VT.getScalarType() == MVT::i64 && LHS.getValueType() == RHS.getValueType() && LHS.getValueType().getScalarType() == MVT::i8 && \"Unexpected PSADBW types\""
, "llvm/lib/Target/X86/X86ISelLowering.cpp", 42739, __extension__
__PRETTY_FUNCTION__))
;
42740
42741 // Aggressively peek through ops to get at the demanded elts.
42742 if (!DemandedElts.isAllOnes()) {
42743 unsigned NumSrcElts = LHS.getValueType().getVectorNumElements();
42744 APInt DemandedSrcElts = APIntOps::ScaleBitMask(DemandedElts, NumSrcElts);
42745 SDValue NewLHS = SimplifyMultipleUseDemandedVectorElts(
42746 LHS, DemandedSrcElts, TLO.DAG, Depth + 1);
42747 SDValue NewRHS = SimplifyMultipleUseDemandedVectorElts(
42748 RHS, DemandedSrcElts, TLO.DAG, Depth + 1);
42749 if (NewLHS || NewRHS) {
42750 NewLHS = NewLHS ? NewLHS : LHS;
42751 NewRHS = NewRHS ? NewRHS : RHS;
42752 return TLO.CombineTo(
42753 Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, NewLHS, NewRHS));
42754 }
42755 }
42756 break;
42757 }
42758 case X86ISD::VSHL:
42759 case X86ISD::VSRL:
42760 case X86ISD::VSRA: {
42761 // We only need the bottom 64-bits of the (128-bit) shift amount.
42762 SDValue Amt = Op.getOperand(1);
42763 MVT AmtVT = Amt.getSimpleValueType();
42764 assert(AmtVT.is128BitVector() && "Unexpected value type");
42765
42766 // If we reuse the shift amount just for sse shift amounts then we know that
42767 // only the bottom 64-bits are ever used.
42768 bool AssumeSingleUse = llvm::all_of(Amt->uses(), [&Amt](SDNode *Use) {
42769 unsigned UseOpc = Use->getOpcode();
42770 return (UseOpc == X86ISD::VSHL || UseOpc == X86ISD::VSRL ||
42771 UseOpc == X86ISD::VSRA) &&
42772 Use->getOperand(0) != Amt;
42773 });
42774
42775 APInt AmtUndef, AmtZero;
42776 unsigned NumAmtElts = AmtVT.getVectorNumElements();
42777 APInt AmtElts = APInt::getLowBitsSet(NumAmtElts, NumAmtElts / 2);
42778 if (SimplifyDemandedVectorElts(Amt, AmtElts, AmtUndef, AmtZero, TLO,
42779 Depth + 1, AssumeSingleUse))
42780 return true;
42781 [[fallthrough]];
42782 }
42783 case X86ISD::VSHLI:
42784 case X86ISD::VSRLI:
42785 case X86ISD::VSRAI: {
42786 SDValue Src = Op.getOperand(0);
42787 APInt SrcUndef;
42788 if (SimplifyDemandedVectorElts(Src, DemandedElts, SrcUndef, KnownZero, TLO,
42789 Depth + 1))
42790 return true;
42791
42792 // Fold shift(0,x) -> 0
42793 if (DemandedElts.isSubsetOf(KnownZero))
42794 return TLO.CombineTo(
42795 Op, getZeroVector(VT.getSimpleVT(), Subtarget, TLO.DAG, SDLoc(Op)));
42796
42797 // Aggressively peek through ops to get at the demanded elts.
42798 if (!DemandedElts.isAllOnes())
42799 if (SDValue NewSrc = SimplifyMultipleUseDemandedVectorElts(
42800 Src, DemandedElts, TLO.DAG, Depth + 1))
42801 return TLO.CombineTo(
42802 Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, NewSrc, Op.getOperand(1)));
42803 break;
42804 }
42805 case X86ISD::VPSHA:
42806 case X86ISD::VPSHL:
42807 case X86ISD::VSHLV:
42808 case X86ISD::VSRLV:
42809 case X86ISD::VSRAV: {
42810 APInt LHSUndef, LHSZero;
42811 APInt RHSUndef, RHSZero;
42812 SDValue LHS = Op.getOperand(0);
42813 SDValue RHS = Op.getOperand(1);
42814 if (SimplifyDemandedVectorElts(LHS, DemandedElts, LHSUndef, LHSZero, TLO,
42815 Depth + 1))
42816 return true;
42817
42818 // Fold shift(0,x) -> 0
42819 if (DemandedElts.isSubsetOf(LHSZero))
42820 return TLO.CombineTo(
42821 Op, getZeroVector(VT.getSimpleVT(), Subtarget, TLO.DAG, SDLoc(Op)));
42822
42823 if (SimplifyDemandedVectorElts(RHS, DemandedElts, RHSUndef, RHSZero, TLO,
42824 Depth + 1))
42825 return true;
42826
42827 KnownZero = LHSZero;
42828 break;
42829 }
42830 case X86ISD::KSHIFTL: {
42831 SDValue Src = Op.getOperand(0);
42832 auto *Amt = cast<ConstantSDNode>(Op.getOperand(1));
42833 assert(Amt->getAPIntValue().ult(NumElts) && "Out of range shift amount");
42834 unsigned ShiftAmt = Amt->getZExtValue();
42835
42836 if (ShiftAmt == 0)
42837 return TLO.CombineTo(Op, Src);
42838
42839 // If this is ((X >>u C1) << ShAmt), see if we can simplify this into a
42840 // single shift. We can do this if the bottom bits (which are shifted
42841 // out) are never demanded.
42842 if (Src.getOpcode() == X86ISD::KSHIFTR) {
42843 if (!DemandedElts.intersects(APInt::getLowBitsSet(NumElts, ShiftAmt))) {
42844 unsigned C1 = Src.getConstantOperandVal(1);
42845 unsigned NewOpc = X86ISD::KSHIFTL;
42846 int Diff = ShiftAmt - C1;
42847 if (Diff < 0) {
42848 Diff = -Diff;
42849 NewOpc = X86ISD::KSHIFTR;
42850 }
42851
42852 SDLoc dl(Op);
42853 SDValue NewSA = TLO.DAG.getTargetConstant(Diff, dl, MVT::i8);
42854 return TLO.CombineTo(
42855 Op, TLO.DAG.getNode(NewOpc, dl, VT, Src.getOperand(0), NewSA));
42856 }
42857 }
42858
42859 APInt DemandedSrc = DemandedElts.lshr(ShiftAmt);
42860 if (SimplifyDemandedVectorElts(Src, DemandedSrc, KnownUndef, KnownZero, TLO,
42861 Depth + 1))
42862 return true;
42863
42864 KnownUndef <<= ShiftAmt;
42865 KnownZero <<= ShiftAmt;
42866 KnownZero.setLowBits(ShiftAmt);
42867 break;
42868 }
42869 case X86ISD::KSHIFTR: {
42870 SDValue Src = Op.getOperand(0);
42871 auto *Amt = cast<ConstantSDNode>(Op.getOperand(1));
42872 assert(Amt->getAPIntValue().ult(NumElts) && "Out of range shift amount");
42873 unsigned ShiftAmt = Amt->getZExtValue();
42874
42875 if (ShiftAmt == 0)
42876 return TLO.CombineTo(Op, Src);
42877
42878 // If this is ((X << C1) >>u ShAmt), see if we can simplify this into a
42879 // single shift. We can do this if the top bits (which are shifted
42880 // out) are never demanded.
42881 if (Src.getOpcode() == X86ISD::KSHIFTL) {
42882 if (!DemandedElts.intersects(APInt::getHighBitsSet(NumElts, ShiftAmt))) {
42883 unsigned C1 = Src.getConstantOperandVal(1);
42884 unsigned NewOpc = X86ISD::KSHIFTR;
42885 int Diff = ShiftAmt - C1;
42886 if (Diff < 0) {
42887 Diff = -Diff;
42888 NewOpc = X86ISD::KSHIFTL;
42889 }
42890
42891 SDLoc dl(Op);
42892 SDValue NewSA = TLO.DAG.getTargetConstant(Diff, dl, MVT::i8);
42893 return TLO.CombineTo(
42894 Op, TLO.DAG.getNode(NewOpc, dl, VT, Src.getOperand(0), NewSA));
42895 }
42896 }
42897
42898 APInt DemandedSrc = DemandedElts.shl(ShiftAmt);
42899 if (SimplifyDemandedVectorElts(Src, DemandedSrc, KnownUndef, KnownZero, TLO,
42900 Depth + 1))
42901 return true;
42902
42903 KnownUndef.lshrInPlace(ShiftAmt);
42904 KnownZero.lshrInPlace(ShiftAmt);
42905 KnownZero.setHighBits(ShiftAmt);
42906 break;
42907 }
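Both KSHIFT cases above merge a left and a right shift of a mask register with the same arithmetic: shift once by the absolute difference of the two amounts, in the direction of whichever amount was larger. This is only valid because the demanded-elements checks guarantee the discarded bits are never used. A tiny sketch of just that merge on plain values (MergedShift and mergeMaskShifts are illustrative names):

#include <cassert>
#include <cstdlib>

// Merge ((X >> C1) << ShAmt) into one shift by |ShAmt - C1|: left if
// ShAmt > C1, right otherwise (the KSHIFTL case; KSHIFTR is symmetric).
struct MergedShift {
  bool IsLeft;
  unsigned Amount;
};

static MergedShift mergeMaskShifts(unsigned ShAmt, unsigned C1) {
  int Diff = (int)ShAmt - (int)C1;
  return {Diff >= 0, (unsigned)std::abs(Diff)};
}

int main() {
  MergedShift M = mergeMaskShifts(/*ShAmt=*/3, /*C1=*/1);
  assert(M.IsLeft && M.Amount == 2);  // (X >> 1) << 3  ==>  X << 2
  M = mergeMaskShifts(/*ShAmt=*/1, /*C1=*/3);
  assert(!M.IsLeft && M.Amount == 2); // (X >> 3) << 1  ==>  X >> 2
  return 0;
}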
42908 case X86ISD::ANDNP: {
42909 // ANDNP = (~LHS & RHS);
42910 SDValue LHS = Op.getOperand(0);
42911 SDValue RHS = Op.getOperand(1);
42912
42913 auto GetDemandedMasks = [&](SDValue Op, bool Invert = false) {
42914 APInt UndefElts;
42915 SmallVector<APInt> EltBits;
42916 int NumElts = VT.getVectorNumElements();
42917 int EltSizeInBits = VT.getScalarSizeInBits();
42918 APInt OpBits = APInt::getAllOnes(EltSizeInBits);
42919 APInt OpElts = DemandedElts;
42920 if (getTargetConstantBitsFromNode(Op, EltSizeInBits, UndefElts,
42921 EltBits)) {
42922 OpBits.clearAllBits();
42923 OpElts.clearAllBits();
42924 for (int I = 0; I != NumElts; ++I) {
42925 if (!DemandedElts[I])
42926 continue;
42927 if (UndefElts[I]) {
42928 // We can't assume an undef src element gives an undef dst - the
42929 // other src might be zero.
42930 OpBits.setAllBits();
42931 OpElts.setBit(I);
42932 } else if ((Invert && !EltBits[I].isAllOnes()) ||
42933 (!Invert && !EltBits[I].isZero())) {
42934 OpBits |= Invert ? ~EltBits[I] : EltBits[I];
42935 OpElts.setBit(I);
42936 }
42937 }
42938 }
42939 return std::make_pair(OpBits, OpElts);
42940 };
42941 APInt BitsLHS, EltsLHS;
42942 APInt BitsRHS, EltsRHS;
42943 std::tie(BitsLHS, EltsLHS) = GetDemandedMasks(RHS);
42944 std::tie(BitsRHS, EltsRHS) = GetDemandedMasks(LHS, true);
42945
42946 APInt LHSUndef, LHSZero;
42947 APInt RHSUndef, RHSZero;
42948 if (SimplifyDemandedVectorElts(LHS, EltsLHS, LHSUndef, LHSZero, TLO,
42949 Depth + 1))
42950 return true;
42951 if (SimplifyDemandedVectorElts(RHS, EltsRHS, RHSUndef, RHSZero, TLO,
42952 Depth + 1))
42953 return true;
42954
42955 if (!DemandedElts.isAllOnes()) {
42956 SDValue NewLHS = SimplifyMultipleUseDemandedBits(LHS, BitsLHS, EltsLHS,
42957 TLO.DAG, Depth + 1);
42958 SDValue NewRHS = SimplifyMultipleUseDemandedBits(RHS, BitsRHS, EltsRHS,
42959 TLO.DAG, Depth + 1);
42960 if (NewLHS || NewRHS) {
42961 NewLHS = NewLHS ? NewLHS : LHS;
42962 NewRHS = NewRHS ? NewRHS : RHS;
42963 return TLO.CombineTo(
42964 Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, NewLHS, NewRHS));
42965 }
42966 }
42967 break;
42968 }
42969 case X86ISD::CVTSI2P:
42970 case X86ISD::CVTUI2P: {
42971 SDValue Src = Op.getOperand(0);
42972 MVT SrcVT = Src.getSimpleValueType();
42973 APInt SrcUndef, SrcZero;
42974 APInt SrcElts = DemandedElts.zextOrTrunc(SrcVT.getVectorNumElements());
42975 if (SimplifyDemandedVectorElts(Src, SrcElts, SrcUndef, SrcZero, TLO,
42976 Depth + 1))
42977 return true;
42978 break;
42979 }
42980 case X86ISD::PACKSS:
42981 case X86ISD::PACKUS: {
42982 SDValue N0 = Op.getOperand(0);
42983 SDValue N1 = Op.getOperand(1);
42984
42985 APInt DemandedLHS, DemandedRHS;
42986 getPackDemandedElts(VT, DemandedElts, DemandedLHS, DemandedRHS);
42987
42988 APInt LHSUndef, LHSZero;
42989 if (SimplifyDemandedVectorElts(N0, DemandedLHS, LHSUndef, LHSZero, TLO,
42990 Depth + 1))
42991 return true;
42992 APInt RHSUndef, RHSZero;
42993 if (SimplifyDemandedVectorElts(N1, DemandedRHS, RHSUndef, RHSZero, TLO,
42994 Depth + 1))
42995 return true;
42996
42997 // TODO - pass on known zero/undef.
42998
42999 // Aggressively peek through ops to get at the demanded elts.
43000 // TODO - we should do this for all target/faux shuffles ops.
43001 if (!DemandedElts.isAllOnes()) {
43002 SDValue NewN0 = SimplifyMultipleUseDemandedVectorElts(N0, DemandedLHS,
43003 TLO.DAG, Depth + 1);
43004 SDValue NewN1 = SimplifyMultipleUseDemandedVectorElts(N1, DemandedRHS,
43005 TLO.DAG, Depth + 1);
43006 if (NewN0 || NewN1) {
43007 NewN0 = NewN0 ? NewN0 : N0;
43008 NewN1 = NewN1 ? NewN1 : N1;
43009 return TLO.CombineTo(Op,
43010 TLO.DAG.getNode(Opc, SDLoc(Op), VT, NewN0, NewN1));
43011 }
43012 }
43013 break;
43014 }
43015 case X86ISD::HADD:
43016 case X86ISD::HSUB:
43017 case X86ISD::FHADD:
43018 case X86ISD::FHSUB: {
43019 SDValue N0 = Op.getOperand(0);
43020 SDValue N1 = Op.getOperand(1);
43021
43022 APInt DemandedLHS, DemandedRHS;
43023 getHorizDemandedElts(VT, DemandedElts, DemandedLHS, DemandedRHS);
43024
43025 APInt LHSUndef, LHSZero;
43026 if (SimplifyDemandedVectorElts(N0, DemandedLHS, LHSUndef, LHSZero, TLO,
43027 Depth + 1))
43028 return true;
43029 APInt RHSUndef, RHSZero;
43030 if (SimplifyDemandedVectorElts(N1, DemandedRHS, RHSUndef, RHSZero, TLO,
43031 Depth + 1))
43032 return true;
43033
43034 // TODO - pass on known zero/undef.
43035
43036 // Aggressively peek through ops to get at the demanded elts.
43037 // TODO: Handle repeated operands.
43038 if (N0 != N1 && !DemandedElts.isAllOnes()) {
43039 SDValue NewN0 = SimplifyMultipleUseDemandedVectorElts(N0, DemandedLHS,
43040 TLO.DAG, Depth + 1);
43041 SDValue NewN1 = SimplifyMultipleUseDemandedVectorElts(N1, DemandedRHS,
43042 TLO.DAG, Depth + 1);
43043 if (NewN0 || NewN1) {
43044 NewN0 = NewN0 ? NewN0 : N0;
43045 NewN1 = NewN1 ? NewN1 : N1;
43046 return TLO.CombineTo(Op,
43047 TLO.DAG.getNode(Opc, SDLoc(Op), VT, NewN0, NewN1));
43048 }
43049 }
43050 break;
43051 }
43052 case X86ISD::VTRUNC:
43053 case X86ISD::VTRUNCS:
43054 case X86ISD::VTRUNCUS: {
43055 SDValue Src = Op.getOperand(0);
43056 MVT SrcVT = Src.getSimpleValueType();
43057 APInt DemandedSrc = DemandedElts.zextOrTrunc(SrcVT.getVectorNumElements());
43058 APInt SrcUndef, SrcZero;
43059 if (SimplifyDemandedVectorElts(Src, DemandedSrc, SrcUndef, SrcZero, TLO,
43060 Depth + 1))
43061 return true;
43062 KnownZero = SrcZero.zextOrTrunc(NumElts);
43063 KnownUndef = SrcUndef.zextOrTrunc(NumElts);
43064 break;
43065 }
43066 case X86ISD::BLENDV: {
43067 APInt SelUndef, SelZero;
43068 if (SimplifyDemandedVectorElts(Op.getOperand(0), DemandedElts, SelUndef,
43069 SelZero, TLO, Depth + 1))
43070 return true;
43071
43072 // TODO: Use SelZero to adjust LHS/RHS DemandedElts.
43073 APInt LHSUndef, LHSZero;
43074 if (SimplifyDemandedVectorElts(Op.getOperand(1), DemandedElts, LHSUndef,
43075 LHSZero, TLO, Depth + 1))
43076 return true;
43077
43078 APInt RHSUndef, RHSZero;
43079 if (SimplifyDemandedVectorElts(Op.getOperand(2), DemandedElts, RHSUndef,
43080 RHSZero, TLO, Depth + 1))
43081 return true;
43082
43083 KnownZero = LHSZero & RHSZero;
43084 KnownUndef = LHSUndef & RHSUndef;
43085 break;
43086 }
43087 case X86ISD::VZEXT_MOVL: {
43088 // If upper demanded elements are already zero then we have nothing to do.
43089 SDValue Src = Op.getOperand(0);
43090 APInt DemandedUpperElts = DemandedElts;
43091 DemandedUpperElts.clearLowBits(1);
43092 if (TLO.DAG.MaskedVectorIsZero(Src, DemandedUpperElts, Depth + 1))
43093 return TLO.CombineTo(Op, Src);
43094 break;
43095 }
43096 case X86ISD::VBROADCAST: {
43097 SDValue Src = Op.getOperand(0);
43098 MVT SrcVT = Src.getSimpleValueType();
43099 if (!SrcVT.isVector())
43100 break;
43101 // Don't bother broadcasting if we just need the 0'th element.
43102 if (DemandedElts == 1) {
43103 if (Src.getValueType() != VT)
43104 Src = widenSubVector(VT.getSimpleVT(), Src, false, Subtarget, TLO.DAG,
43105 SDLoc(Op));
43106 return TLO.CombineTo(Op, Src);
43107 }
43108 APInt SrcUndef, SrcZero;
43109 APInt SrcElts = APInt::getOneBitSet(SrcVT.getVectorNumElements(), 0);
43110 if (SimplifyDemandedVectorElts(Src, SrcElts, SrcUndef, SrcZero, TLO,
43111 Depth + 1))
43112 return true;
43113 // Aggressively peek through src to get at the demanded elt.
43114 // TODO - we should do this for all target/faux shuffles ops.
43115 if (SDValue NewSrc = SimplifyMultipleUseDemandedVectorElts(
43116 Src, SrcElts, TLO.DAG, Depth + 1))
43117 return TLO.CombineTo(Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, NewSrc));
43118 break;
43119 }
43120 case X86ISD::VPERMV:
43121 if (SimplifyDemandedVectorEltsForTargetShuffle(Op, DemandedElts, 0, TLO,
43122 Depth))
43123 return true;
43124 break;
43125 case X86ISD::PSHUFB:
43126 case X86ISD::VPERMV3:
43127 case X86ISD::VPERMILPV:
43128 if (SimplifyDemandedVectorEltsForTargetShuffle(Op, DemandedElts, 1, TLO,
43129 Depth))
43130 return true;
43131 break;
43132 case X86ISD::VPPERM:
43133 case X86ISD::VPERMIL2:
43134 if (SimplifyDemandedVectorEltsForTargetShuffle(Op, DemandedElts, 2, TLO,
43135 Depth))
43136 return true;
43137 break;
43138 }
43139
43140 // For 256/512-bit ops that are 128/256-bit ops glued together, if we do not
43141 // demand any of the high elements, then narrow the op to 128/256-bits: e.g.
43142 // (op ymm0, ymm1) --> insert undef, (op xmm0, xmm1), 0
43143 if ((VT.is256BitVector() || VT.is512BitVector()) &&
43144 DemandedElts.lshr(NumElts / 2) == 0) {
43145 unsigned SizeInBits = VT.getSizeInBits();
43146 unsigned ExtSizeInBits = SizeInBits / 2;
43147
43148 // See if 512-bit ops only use the bottom 128-bits.
43149 if (VT.is512BitVector() && DemandedElts.lshr(NumElts / 4) == 0)
43150 ExtSizeInBits = SizeInBits / 4;
43151
43152 switch (Opc) {
43153 // Scalar broadcast.
43154 case X86ISD::VBROADCAST: {
43155 SDLoc DL(Op);
43156 SDValue Src = Op.getOperand(0);
43157 if (Src.getValueSizeInBits() > ExtSizeInBits)
43158 Src = extractSubVector(Src, 0, TLO.DAG, DL, ExtSizeInBits);
43159 EVT BcstVT = EVT::getVectorVT(*TLO.DAG.getContext(), VT.getScalarType(),
43160 ExtSizeInBits / VT.getScalarSizeInBits());
43161 SDValue Bcst = TLO.DAG.getNode(X86ISD::VBROADCAST, DL, BcstVT, Src);
43162 return TLO.CombineTo(Op, insertSubVector(TLO.DAG.getUNDEF(VT), Bcst, 0,
43163 TLO.DAG, DL, ExtSizeInBits));
43164 }
43165 case X86ISD::VBROADCAST_LOAD: {
43166 SDLoc DL(Op);
43167 auto *MemIntr = cast<MemIntrinsicSDNode>(Op);
43168 EVT BcstVT = EVT::getVectorVT(*TLO.DAG.getContext(), VT.getScalarType(),
43169 ExtSizeInBits / VT.getScalarSizeInBits());
43170 SDVTList Tys = TLO.DAG.getVTList(BcstVT, MVT::Other);
43171 SDValue Ops[] = {MemIntr->getOperand(0), MemIntr->getOperand(1)};
43172 SDValue Bcst = TLO.DAG.getMemIntrinsicNode(
43173 X86ISD::VBROADCAST_LOAD, DL, Tys, Ops, MemIntr->getMemoryVT(),
43174 MemIntr->getMemOperand());
43175 TLO.DAG.makeEquivalentMemoryOrdering(SDValue(MemIntr, 1),
43176 Bcst.getValue(1));
43177 return TLO.CombineTo(Op, insertSubVector(TLO.DAG.getUNDEF(VT), Bcst, 0,
43178 TLO.DAG, DL, ExtSizeInBits));
43179 }
43180 // Subvector broadcast.
43181 case X86ISD::SUBV_BROADCAST_LOAD: {
43182 auto *MemIntr = cast<MemIntrinsicSDNode>(Op);
43183 EVT MemVT = MemIntr->getMemoryVT();
43184 if (ExtSizeInBits == MemVT.getStoreSizeInBits()) {
43185 SDLoc DL(Op);
43186 SDValue Ld =
43187 TLO.DAG.getLoad(MemVT, DL, MemIntr->getChain(),
43188 MemIntr->getBasePtr(), MemIntr->getMemOperand());
43189 TLO.DAG.makeEquivalentMemoryOrdering(SDValue(MemIntr, 1),
43190 Ld.getValue(1));
43191 return TLO.CombineTo(Op, insertSubVector(TLO.DAG.getUNDEF(VT), Ld, 0,
43192 TLO.DAG, DL, ExtSizeInBits));
43193 } else if ((ExtSizeInBits % MemVT.getStoreSizeInBits()) == 0) {
43194 SDLoc DL(Op);
43195 EVT BcstVT = EVT::getVectorVT(*TLO.DAG.getContext(), VT.getScalarType(),
43196 ExtSizeInBits / VT.getScalarSizeInBits());
43197 if (SDValue BcstLd =
43198 getBROADCAST_LOAD(Opc, DL, BcstVT, MemVT, MemIntr, 0, TLO.DAG))
43199 return TLO.CombineTo(Op,
43200 insertSubVector(TLO.DAG.getUNDEF(VT), BcstLd, 0,
43201 TLO.DAG, DL, ExtSizeInBits));
43202 }
43203 break;
43204 }
43205 // Byte shifts by immediate.
43206 case X86ISD::VSHLDQ:
43207 case X86ISD::VSRLDQ:
43208 // Shift by uniform.
43209 case X86ISD::VSHL:
43210 case X86ISD::VSRL:
43211 case X86ISD::VSRA:
43212 // Shift by immediate.
43213 case X86ISD::VSHLI:
43214 case X86ISD::VSRLI:
43215 case X86ISD::VSRAI: {
43216 SDLoc DL(Op);
43217 SDValue Ext0 =
43218 extractSubVector(Op.getOperand(0), 0, TLO.DAG, DL, ExtSizeInBits);
43219 SDValue ExtOp =
43220 TLO.DAG.getNode(Opc, DL, Ext0.getValueType(), Ext0, Op.getOperand(1));
43221 SDValue UndefVec = TLO.DAG.getUNDEF(VT);
43222 SDValue Insert =
43223 insertSubVector(UndefVec, ExtOp, 0, TLO.DAG, DL, ExtSizeInBits);
43224 return TLO.CombineTo(Op, Insert);
43225 }
43226 case X86ISD::VPERMI: {
43227 // Simplify PERMPD/PERMQ to extract_subvector.
43228 // TODO: This should be done in shuffle combining.
43229 if (VT == MVT::v4f64 || VT == MVT::v4i64) {
43230 SmallVector<int, 4> Mask;
43231 DecodeVPERMMask(NumElts, Op.getConstantOperandVal(1), Mask);
43232 if (isUndefOrEqual(Mask[0], 2) && isUndefOrEqual(Mask[1], 3)) {
43233 SDLoc DL(Op);
43234 SDValue Ext = extractSubVector(Op.getOperand(0), 2, TLO.DAG, DL, 128);
43235 SDValue UndefVec = TLO.DAG.getUNDEF(VT);
43236 SDValue Insert = insertSubVector(UndefVec, Ext, 0, TLO.DAG, DL, 128);
43237 return TLO.CombineTo(Op, Insert);
43238 }
43239 }
43240 break;
43241 }
43242 case X86ISD::VPERM2X128: {
43243 // Simplify VPERM2F128/VPERM2I128 to extract_subvector.
43244 SDLoc DL(Op);
43245 unsigned LoMask = Op.getConstantOperandVal(2) & 0xF;
43246 if (LoMask & 0x8)
43247 return TLO.CombineTo(
43248 Op, getZeroVector(VT.getSimpleVT(), Subtarget, TLO.DAG, DL));
43249 unsigned EltIdx = (LoMask & 0x1) * (NumElts / 2);
43250 unsigned SrcIdx = (LoMask & 0x2) >> 1;
43251 SDValue ExtOp =
43252 extractSubVector(Op.getOperand(SrcIdx), EltIdx, TLO.DAG, DL, 128);
43253 SDValue UndefVec = TLO.DAG.getUNDEF(VT);
43254 SDValue Insert =
43255 insertSubVector(UndefVec, ExtOp, 0, TLO.DAG, DL, ExtSizeInBits);
43256 return TLO.CombineTo(Op, Insert);
43257 }
43258 // Zero upper elements.
43259 case X86ISD::VZEXT_MOVL:
43260 // Target unary shuffles by immediate:
43261 case X86ISD::PSHUFD:
43262 case X86ISD::PSHUFLW:
43263 case X86ISD::PSHUFHW:
43264 case X86ISD::VPERMILPI:
43265 // (Non-Lane Crossing) Target Shuffles.
43266 case X86ISD::VPERMILPV:
43267 case X86ISD::VPERMIL2:
43268 case X86ISD::PSHUFB:
43269 case X86ISD::UNPCKL:
43270 case X86ISD::UNPCKH:
43271 case X86ISD::BLENDI:
43272 // Integer ops.
43273 case X86ISD::PACKSS:
43274 case X86ISD::PACKUS:
43275 // Horizontal Ops.
43276 case X86ISD::HADD:
43277 case X86ISD::HSUB:
43278 case X86ISD::FHADD:
43279 case X86ISD::FHSUB: {
43280 SDLoc DL(Op);
43281 SmallVector<SDValue, 4> Ops;
43282 for (unsigned i = 0, e = Op.getNumOperands(); i != e; ++i) {
43283 SDValue SrcOp = Op.getOperand(i);
43284 EVT SrcVT = SrcOp.getValueType();
43285 assert((!SrcVT.isVector() || SrcVT.getSizeInBits() == SizeInBits) &&
43286        "Unsupported vector size");
43287 Ops.push_back(SrcVT.isVector() ? extractSubVector(SrcOp, 0, TLO.DAG, DL,
43288 ExtSizeInBits)
43289 : SrcOp);
43290 }
43291 MVT ExtVT = VT.getSimpleVT();
43292 ExtVT = MVT::getVectorVT(ExtVT.getScalarType(),
43293 ExtSizeInBits / ExtVT.getScalarSizeInBits());
43294 SDValue ExtOp = TLO.DAG.getNode(Opc, DL, ExtVT, Ops);
43295 SDValue UndefVec = TLO.DAG.getUNDEF(VT);
43296 SDValue Insert =
43297 insertSubVector(UndefVec, ExtOp, 0, TLO.DAG, DL, ExtSizeInBits);
43298 return TLO.CombineTo(Op, Insert);
43299 }
43300 }
43301 }
43302
43303 // For splats, unless we *only* demand the 0'th element,
43304 // stop attempts at simplification here; we aren't going to improve things,
43305 // and this is better than any potential shuffle.
43306 if (!DemandedElts.isOne() && TLO.DAG.isSplatValue(Op, /*AllowUndefs*/false))
43307 return false;
43308
43309 // Get target/faux shuffle mask.
43310 APInt OpUndef, OpZero;
43311 SmallVector<int, 64> OpMask;
43312 SmallVector<SDValue, 2> OpInputs;
43313 if (!getTargetShuffleInputs(Op, DemandedElts, OpInputs, OpMask, OpUndef,
43314 OpZero, TLO.DAG, Depth, false))
43315 return false;
43316
43317 // Shuffle inputs must be the same size as the result.
43318 if (OpMask.size() != (unsigned)NumElts ||
43319 llvm::any_of(OpInputs, [VT](SDValue V) {
43320 return VT.getSizeInBits() != V.getValueSizeInBits() ||
43321 !V.getValueType().isVector();
43322 }))
43323 return false;
43324
43325 KnownZero = OpZero;
43326 KnownUndef = OpUndef;
43327
43328 // Check if shuffle mask can be simplified to undef/zero/identity.
43329 int NumSrcs = OpInputs.size();
43330 for (int i = 0; i != NumElts; ++i)
43331 if (!DemandedElts[i])
43332 OpMask[i] = SM_SentinelUndef;
43333
43334 if (isUndefInRange(OpMask, 0, NumElts)) {
43335 KnownUndef.setAllBits();
43336 return TLO.CombineTo(Op, TLO.DAG.getUNDEF(VT));
43337 }
43338 if (isUndefOrZeroInRange(OpMask, 0, NumElts)) {
43339 KnownZero.setAllBits();
43340 return TLO.CombineTo(
43341 Op, getZeroVector(VT.getSimpleVT(), Subtarget, TLO.DAG, SDLoc(Op)));
43342 }
43343 for (int Src = 0; Src != NumSrcs; ++Src)
43344 if (isSequentialOrUndefInRange(OpMask, 0, NumElts, Src * NumElts))
43345 return TLO.CombineTo(Op, TLO.DAG.getBitcast(VT, OpInputs[Src]));
43346
43347 // Attempt to simplify inputs.
43348 for (int Src = 0; Src != NumSrcs; ++Src) {
43349 // TODO: Support inputs of different types.
43350 if (OpInputs[Src].getValueType() != VT)
43351 continue;
43352
43353 int Lo = Src * NumElts;
43354 APInt SrcElts = APInt::getZero(NumElts);
43355 for (int i = 0; i != NumElts; ++i)
43356 if (DemandedElts[i]) {
43357 int M = OpMask[i] - Lo;
43358 if (0 <= M && M < NumElts)
43359 SrcElts.setBit(M);
43360 }
43361
43362 // TODO - Propagate input undef/zero elts.
43363 APInt SrcUndef, SrcZero;
43364 if (SimplifyDemandedVectorElts(OpInputs[Src], SrcElts, SrcUndef, SrcZero,
43365 TLO, Depth + 1))
43366 return true;
43367 }
43368
43369 // If we don't demand all elements, then attempt to combine to a simpler
43370 // shuffle.
43371 // We need to convert the depth to something combineX86ShufflesRecursively
43372 // can handle - so pretend its Depth == 0 again, and reduce the max depth
43373 // to match. This prevents combineX86ShuffleChain from returning a
43374 // combined shuffle that's the same as the original root, causing an
43375 // infinite loop.
43376 if (!DemandedElts.isAllOnes()) {
43377 assert(Depth < X86::MaxShuffleCombineDepth && "Depth out of range");
43378
43379 SmallVector<int, 64> DemandedMask(NumElts, SM_SentinelUndef);
43380 for (int i = 0; i != NumElts; ++i)
43381 if (DemandedElts[i])
43382 DemandedMask[i] = i;
43383
43384 SDValue NewShuffle = combineX86ShufflesRecursively(
43385 {Op}, 0, Op, DemandedMask, {}, 0, X86::MaxShuffleCombineDepth - Depth,
43386 /*HasVarMask*/ false,
43387 /*AllowCrossLaneVarMask*/ true, /*AllowPerLaneVarMask*/ true, TLO.DAG,
43388 Subtarget);
43389 if (NewShuffle)
43390 return TLO.CombineTo(Op, NewShuffle);
43391 }
43392
43393 return false;
43394}
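After masking out undemanded lanes, the tail of the routine above classifies the remaining shuffle mask as all-undef, all undef-or-zero, or an identity of one of the inputs before attempting anything more expensive. A standalone classifier sketch, assuming -1 stands for SM_SentinelUndef and -2 for SM_SentinelZero (classifyMask, MaskKind and main are illustrative names):

#include <cassert>
#include <vector>

enum MaskKind { AllUndef, AllUndefOrZero, IdentityOfSrc0, IdentityOfSrc1, Other };

// -1 models an undef lane, -2 a lane known to be zero. An identity of source
// Src requires every defined, non-zero element i to equal i + Src * NumElts.
static MaskKind classifyMask(const std::vector<int> &Mask) {
  int NumElts = (int)Mask.size();
  bool AllU = true, AllUZ = true, Ident0 = true, Ident1 = true;
  for (int i = 0; i != NumElts; ++i) {
    int M = Mask[i];
    if (M == -1)
      continue;                   // undef: compatible with every category
    AllU = false;
    if (M == -2) {
      Ident0 = Ident1 = false;    // zero: rules out an identity
      continue;
    }
    AllUZ = false;
    Ident0 &= (M == i);
    Ident1 &= (M == i + NumElts);
  }
  if (AllU)   return AllUndef;
  if (AllUZ)  return AllUndefOrZero;
  if (Ident0) return IdentityOfSrc0;
  if (Ident1) return IdentityOfSrc1;
  return Other;
}

int main() {
  std::vector<int> U = {-1, -1, -1, -1};
  std::vector<int> Z = {-2, -1, -2, -2};
  std::vector<int> Id0 = {0, -1, 2, 3};
  std::vector<int> Id1 = {4, 5, -1, 7};
  assert(classifyMask(U) == AllUndef);
  assert(classifyMask(Z) == AllUndefOrZero);
  assert(classifyMask(Id0) == IdentityOfSrc0);
  assert(classifyMask(Id1) == IdentityOfSrc1);
  return 0;
}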
43395
43396bool X86TargetLowering::SimplifyDemandedBitsForTargetNode(
43397 SDValue Op, const APInt &OriginalDemandedBits,
43398 const APInt &OriginalDemandedElts, KnownBits &Known, TargetLoweringOpt &TLO,
43399 unsigned Depth) const {
43400 EVT VT = Op.getValueType();
43401 unsigned BitWidth = OriginalDemandedBits.getBitWidth();
43402 unsigned Opc = Op.getOpcode();
43403 switch(Opc) {
43404 case X86ISD::VTRUNC: {
43405 KnownBits KnownOp;
43406 SDValue Src = Op.getOperand(0);
43407 MVT SrcVT = Src.getSimpleValueType();
43408
43409 // Simplify the input, using demanded bit information.
43410 APInt TruncMask = OriginalDemandedBits.zext(SrcVT.getScalarSizeInBits());
43411 APInt DemandedElts = OriginalDemandedElts.trunc(SrcVT.getVectorNumElements());
43412 if (SimplifyDemandedBits(Src, TruncMask, DemandedElts, KnownOp, TLO, Depth + 1))
43413 return true;
43414 break;
43415 }
43416 case X86ISD::PMULDQ:
43417 case X86ISD::PMULUDQ: {
43418 // PMULDQ/PMULUDQ only uses lower 32 bits from each vector element.
43419 KnownBits KnownLHS, KnownRHS;
43420 SDValue LHS = Op.getOperand(0);
43421 SDValue RHS = Op.getOperand(1);
43422
43423 // Don't mask bits on 32-bit AVX512 targets which might lose a broadcast.
43424 // FIXME: Can we bound this better?
43425 APInt DemandedMask = APInt::getLowBitsSet(64, 32);
43426 APInt DemandedMaskLHS = APInt::getAllOnes(64);
43427 APInt DemandedMaskRHS = APInt::getAllOnes(64);
43428
43429 bool Is32BitAVX512 = !Subtarget.is64Bit() && Subtarget.hasAVX512();
43430 if (!Is32BitAVX512 || !TLO.DAG.isSplatValue(LHS))
43431 DemandedMaskLHS = DemandedMask;
43432 if (!Is32BitAVX512 || !TLO.DAG.isSplatValue(RHS))
43433 DemandedMaskRHS = DemandedMask;
43434
43435 if (SimplifyDemandedBits(LHS, DemandedMaskLHS, OriginalDemandedElts,
43436 KnownLHS, TLO, Depth + 1))
43437 return true;
43438 if (SimplifyDemandedBits(RHS, DemandedMaskRHS, OriginalDemandedElts,
43439 KnownRHS, TLO, Depth + 1))
43440 return true;
43441
43442 // PMULUDQ(X,1) -> AND(X,(1<<32)-1) 'getZeroExtendInReg'.
43443 KnownRHS = KnownRHS.trunc(32);
43444 if (Opc == X86ISD::PMULUDQ && KnownRHS.isConstant() &&
43445 KnownRHS.getConstant().isOne()) {
43446 SDLoc DL(Op);
43447 SDValue Mask = TLO.DAG.getConstant(DemandedMask, DL, VT);
43448 return TLO.CombineTo(Op, TLO.DAG.getNode(ISD::AND, DL, VT, LHS, Mask));
43449 }
43450
43451 // Aggressively peek through ops to get at the demanded low bits.
43452 SDValue DemandedLHS = SimplifyMultipleUseDemandedBits(
43453 LHS, DemandedMaskLHS, OriginalDemandedElts, TLO.DAG, Depth + 1);
43454 SDValue DemandedRHS = SimplifyMultipleUseDemandedBits(
43455 RHS, DemandedMaskRHS, OriginalDemandedElts, TLO.DAG, Depth + 1);
43456 if (DemandedLHS || DemandedRHS) {
43457 DemandedLHS = DemandedLHS ? DemandedLHS : LHS;
43458 DemandedRHS = DemandedRHS ? DemandedRHS : RHS;
43459 return TLO.CombineTo(
43460 Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, DemandedLHS, DemandedRHS));
43461 }
43462 break;
43463 }
43464 case X86ISD::VSHLI: {
43465 SDValue Op0 = Op.getOperand(0);
43466
43467 unsigned ShAmt = Op.getConstantOperandVal(1);
43468 if (ShAmt >= BitWidth)
43469 break;
43470
43471 APInt DemandedMask = OriginalDemandedBits.lshr(ShAmt);
43472
43473 // If this is ((X >>u C1) << ShAmt), see if we can simplify this into a
43474 // single shift. We can do this if the bottom bits (which are shifted
43475 // out) are never demanded.
43476 if (Op0.getOpcode() == X86ISD::VSRLI &&
43477 OriginalDemandedBits.countr_zero() >= ShAmt) {
43478 unsigned Shift2Amt = Op0.getConstantOperandVal(1);
43479 if (Shift2Amt < BitWidth) {
43480 int Diff = ShAmt - Shift2Amt;
43481 if (Diff == 0)
43482 return TLO.CombineTo(Op, Op0.getOperand(0));
43483
43484 unsigned NewOpc = Diff < 0 ? X86ISD::VSRLI : X86ISD::VSHLI;
43485 SDValue NewShift = TLO.DAG.getNode(
43486 NewOpc, SDLoc(Op), VT, Op0.getOperand(0),
43487 TLO.DAG.getTargetConstant(std::abs(Diff), SDLoc(Op), MVT::i8));
43488 return TLO.CombineTo(Op, NewShift);
43489 }
43490 }
43491
43492 // If we are only demanding sign bits then we can use the shift source directly.
43493 unsigned NumSignBits =
43494 TLO.DAG.ComputeNumSignBits(Op0, OriginalDemandedElts, Depth + 1);
43495 unsigned UpperDemandedBits = BitWidth - OriginalDemandedBits.countr_zero();
43496 if (NumSignBits > ShAmt && (NumSignBits - ShAmt) >= UpperDemandedBits)
43497 return TLO.CombineTo(Op, Op0);
43498
43499 if (SimplifyDemandedBits(Op0, DemandedMask, OriginalDemandedElts, Known,
43500 TLO, Depth + 1))
43501 return true;
43502
43503    assert(!Known.hasConflict() && "Bits known to be one AND zero?");
43504 Known.Zero <<= ShAmt;
43505 Known.One <<= ShAmt;
43506
43507 // Low bits known zero.
43508 Known.Zero.setLowBits(ShAmt);
43509 return false;
43510 }
43511 case X86ISD::VSRLI: {
43512 unsigned ShAmt = Op.getConstantOperandVal(1);
43513 if (ShAmt >= BitWidth)
43514 break;
43515
43516 APInt DemandedMask = OriginalDemandedBits << ShAmt;
43517
43518 if (SimplifyDemandedBits(Op.getOperand(0), DemandedMask,
43519 OriginalDemandedElts, Known, TLO, Depth + 1))
43520 return true;
43521
43522    assert(!Known.hasConflict() && "Bits known to be one AND zero?");
43523 Known.Zero.lshrInPlace(ShAmt);
43524 Known.One.lshrInPlace(ShAmt);
43525
43526 // High bits known zero.
43527 Known.Zero.setHighBits(ShAmt);
43528 return false;
43529 }
43530 case X86ISD::VSRAI: {
43531 SDValue Op0 = Op.getOperand(0);
43532 SDValue Op1 = Op.getOperand(1);
43533
43534 unsigned ShAmt = cast<ConstantSDNode>(Op1)->getZExtValue();
43535 if (ShAmt >= BitWidth)
43536 break;
43537
43538 APInt DemandedMask = OriginalDemandedBits << ShAmt;
43539
43540 // If we just want the sign bit then we don't need to shift it.
43541 if (OriginalDemandedBits.isSignMask())
43542 return TLO.CombineTo(Op, Op0);
43543
43544 // fold (VSRAI (VSHLI X, C1), C1) --> X iff NumSignBits(X) > C1
43545 if (Op0.getOpcode() == X86ISD::VSHLI &&
43546 Op.getOperand(1) == Op0.getOperand(1)) {
43547 SDValue Op00 = Op0.getOperand(0);
43548 unsigned NumSignBits =
43549 TLO.DAG.ComputeNumSignBits(Op00, OriginalDemandedElts);
43550 if (ShAmt < NumSignBits)
43551 return TLO.CombineTo(Op, Op00);
43552 }
43553
43554 // If any of the demanded bits are produced by the sign extension, we also
43555 // demand the input sign bit.
43556 if (OriginalDemandedBits.countl_zero() < ShAmt)
43557 DemandedMask.setSignBit();
43558
43559 if (SimplifyDemandedBits(Op0, DemandedMask, OriginalDemandedElts, Known,
43560 TLO, Depth + 1))
43561 return true;
43562
43563    assert(!Known.hasConflict() && "Bits known to be one AND zero?");
43564 Known.Zero.lshrInPlace(ShAmt);
43565 Known.One.lshrInPlace(ShAmt);
43566
43567 // If the input sign bit is known to be zero, or if none of the top bits
43568 // are demanded, turn this into an unsigned shift right.
43569 if (Known.Zero[BitWidth - ShAmt - 1] ||
43570 OriginalDemandedBits.countl_zero() >= ShAmt)
43571 return TLO.CombineTo(
43572 Op, TLO.DAG.getNode(X86ISD::VSRLI, SDLoc(Op), VT, Op0, Op1));
43573
43574 // High bits are known one.
43575 if (Known.One[BitWidth - ShAmt - 1])
43576 Known.One.setHighBits(ShAmt);
43577 return false;
43578 }
43579 case X86ISD::BLENDV: {
43580 SDValue Sel = Op.getOperand(0);
43581 SDValue LHS = Op.getOperand(1);
43582 SDValue RHS = Op.getOperand(2);
43583
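         // BLENDV selects between LHS and RHS per element using only the sign
         // bit of the selector, so only that bit is demanded from Sel.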
43584 APInt SignMask = APInt::getSignMask(BitWidth);
43585 SDValue NewSel = SimplifyMultipleUseDemandedBits(
43586 Sel, SignMask, OriginalDemandedElts, TLO.DAG, Depth + 1);
43587 SDValue NewLHS = SimplifyMultipleUseDemandedBits(
43588 LHS, OriginalDemandedBits, OriginalDemandedElts, TLO.DAG, Depth + 1);
43589 SDValue NewRHS = SimplifyMultipleUseDemandedBits(
43590 RHS, OriginalDemandedBits, OriginalDemandedElts, TLO.DAG, Depth + 1);
43591
43592 if (NewSel || NewLHS || NewRHS) {
43593 NewSel = NewSel ? NewSel : Sel;
43594 NewLHS = NewLHS ? NewLHS : LHS;
43595 NewRHS = NewRHS ? NewRHS : RHS;
43596 return TLO.CombineTo(Op, TLO.DAG.getNode(X86ISD::BLENDV, SDLoc(Op), VT,
43597 NewSel, NewLHS, NewRHS));
43598 }
43599 break;
43600 }
43601 case X86ISD::PEXTRB:
43602 case X86ISD::PEXTRW: {
43603 SDValue Vec = Op.getOperand(0);
43604 auto *CIdx = dyn_cast<ConstantSDNode>(Op.getOperand(1));
43605 MVT VecVT = Vec.getSimpleValueType();
43606 unsigned NumVecElts = VecVT.getVectorNumElements();
43607
43608 if (CIdx && CIdx->getAPIntValue().ult(NumVecElts)) {
43609 unsigned Idx = CIdx->getZExtValue();
43610 unsigned VecBitWidth = VecVT.getScalarSizeInBits();
43611
43612 // If we demand no bits from the vector then we must have demanded
43613      // bits from the implicit zext - simplify to zero.
43614 APInt DemandedVecBits = OriginalDemandedBits.trunc(VecBitWidth);
43615 if (DemandedVecBits == 0)
43616 return TLO.CombineTo(Op, TLO.DAG.getConstant(0, SDLoc(Op), VT));
43617
43618 APInt KnownUndef, KnownZero;
43619 APInt DemandedVecElts = APInt::getOneBitSet(NumVecElts, Idx);
43620 if (SimplifyDemandedVectorElts(Vec, DemandedVecElts, KnownUndef,
43621 KnownZero, TLO, Depth + 1))
43622 return true;
43623
43624 KnownBits KnownVec;
43625 if (SimplifyDemandedBits(Vec, DemandedVecBits, DemandedVecElts,
43626 KnownVec, TLO, Depth + 1))
43627 return true;
43628
43629 if (SDValue V = SimplifyMultipleUseDemandedBits(
43630 Vec, DemandedVecBits, DemandedVecElts, TLO.DAG, Depth + 1))
43631 return TLO.CombineTo(
43632 Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, V, Op.getOperand(1)));
43633
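           // The extracted element is implicitly zero-extended to the scalar
           // result type, so the upper bits of the result are known zero.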
43634 Known = KnownVec.zext(BitWidth);
43635 return false;
43636 }
43637 break;
43638 }
43639 case X86ISD::PINSRB:
43640 case X86ISD::PINSRW: {
43641 SDValue Vec = Op.getOperand(0);
43642 SDValue Scl = Op.getOperand(1);
43643 auto *CIdx = dyn_cast<ConstantSDNode>(Op.getOperand(2));
43644 MVT VecVT = Vec.getSimpleValueType();
43645
43646 if (CIdx && CIdx->getAPIntValue().ult(VecVT.getVectorNumElements())) {
43647 unsigned Idx = CIdx->getZExtValue();
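           // If we don't demand the inserted element, the insert is a no-op
           // and we can use the base vector directly.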
43648 if (!OriginalDemandedElts[Idx])
43649 return TLO.CombineTo(Op, Vec);
43650
43651 KnownBits KnownVec;
43652 APInt DemandedVecElts(OriginalDemandedElts);
43653 DemandedVecElts.clearBit(Idx);
43654 if (SimplifyDemandedBits(Vec, OriginalDemandedBits, DemandedVecElts,
43655 KnownVec, TLO, Depth + 1))
43656 return true;
43657
43658 KnownBits KnownScl;
43659 unsigned NumSclBits = Scl.getScalarValueSizeInBits();
43660 APInt DemandedSclBits = OriginalDemandedBits.zext(NumSclBits);
43661 if (SimplifyDemandedBits(Scl, DemandedSclBits, KnownScl, TLO, Depth + 1))
43662 return true;
43663
43664 KnownScl = KnownScl.trunc(VecVT.getScalarSizeInBits());
43665 Known = KnownBits::commonBits(KnownVec, KnownScl);
43666 return false;
43667 }
43668 break;
43669 }
43670 case X86ISD::PACKSS:
43671 // PACKSS saturates to MIN/MAX integer values. So if we just want the
43672    // sign bit then we can just ask for the source operands' sign bit.
43673 // TODO - add known bits handling.
43674 if (OriginalDemandedBits.isSignMask()) {
43675 APInt DemandedLHS, DemandedRHS;
43676 getPackDemandedElts(VT, OriginalDemandedElts, DemandedLHS, DemandedRHS);
43677
43678 KnownBits KnownLHS, KnownRHS;
43679 APInt SignMask = APInt::getSignMask(BitWidth * 2);
43680 if (SimplifyDemandedBits(Op.getOperand(0), SignMask, DemandedLHS,
43681 KnownLHS, TLO, Depth + 1))
43682 return true;
43683 if (SimplifyDemandedBits(Op.getOperand(1), SignMask, DemandedRHS,
43684 KnownRHS, TLO, Depth + 1))
43685 return true;
43686
43687 // Attempt to avoid multi-use ops if we don't need anything from them.
43688 SDValue DemandedOp0 = SimplifyMultipleUseDemandedBits(
43689 Op.getOperand(0), SignMask, DemandedLHS, TLO.DAG, Depth + 1);
43690 SDValue DemandedOp1 = SimplifyMultipleUseDemandedBits(
43691 Op.getOperand(1), SignMask, DemandedRHS, TLO.DAG, Depth + 1);
43692 if (DemandedOp0 || DemandedOp1) {
43693 SDValue Op0 = DemandedOp0 ? DemandedOp0 : Op.getOperand(0);
43694 SDValue Op1 = DemandedOp1 ? DemandedOp1 : Op.getOperand(1);
43695 return TLO.CombineTo(Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, Op0, Op1));
43696 }
43697 }
43698 // TODO - add general PACKSS/PACKUS SimplifyDemandedBits support.
43699 break;
43700 case X86ISD::VBROADCAST: {
43701 SDValue Src = Op.getOperand(0);
43702 MVT SrcVT = Src.getSimpleValueType();
43703 APInt DemandedElts = APInt::getOneBitSet(
43704 SrcVT.isVector() ? SrcVT.getVectorNumElements() : 1, 0);
43705 if (SimplifyDemandedBits(Src, OriginalDemandedBits, DemandedElts, Known,
43706 TLO, Depth + 1))
43707 return true;
43708 // If we don't need the upper bits, attempt to narrow the broadcast source.
43709 // Don't attempt this on AVX512 as it might affect broadcast folding.
43710 // TODO: Should we attempt this for i32/i16 splats? They tend to be slower.
43711 if ((BitWidth == 64) && SrcVT.isScalarInteger() && !Subtarget.hasAVX512() &&
43712 OriginalDemandedBits.countl_zero() >= (BitWidth / 2) &&
43713 Src->hasOneUse()) {
43714 MVT NewSrcVT = MVT::getIntegerVT(BitWidth / 2);
43715 SDValue NewSrc =
43716 TLO.DAG.getNode(ISD::TRUNCATE, SDLoc(Src), NewSrcVT, Src);
43717 MVT NewVT = MVT::getVectorVT(NewSrcVT, VT.getVectorNumElements() * 2);
43718 SDValue NewBcst =
43719 TLO.DAG.getNode(X86ISD::VBROADCAST, SDLoc(Op), NewVT, NewSrc);
43720 return TLO.CombineTo(Op, TLO.DAG.getBitcast(VT, NewBcst));
43721 }
43722 break;
43723 }
43724 case X86ISD::PCMPGT:
43725 // icmp sgt(0, R) == ashr(R, BitWidth-1).
43726 // iff we only need the sign bit then we can use R directly.
43727 if (OriginalDemandedBits.isSignMask() &&
43728 ISD::isBuildVectorAllZeros(Op.getOperand(0).getNode()))
43729 return TLO.CombineTo(Op, Op.getOperand(1));
43730 break;
43731 case X86ISD::MOVMSK: {
43732 SDValue Src = Op.getOperand(0);
43733 MVT SrcVT = Src.getSimpleValueType();
43734 unsigned SrcBits = SrcVT.getScalarSizeInBits();
43735 unsigned NumElts = SrcVT.getVectorNumElements();
43736
43737 // If we don't need the sign bits at all just return zero.
43738 if (OriginalDemandedBits.countr_zero() >= NumElts)
43739 return TLO.CombineTo(Op, TLO.DAG.getConstant(0, SDLoc(Op), VT));
43740
43741 // See if we only demand bits from the lower 128-bit vector.
43742 if (SrcVT.is256BitVector() &&
43743 OriginalDemandedBits.getActiveBits() <= (NumElts / 2)) {
43744 SDValue NewSrc = extract128BitVector(Src, 0, TLO.DAG, SDLoc(Src));
43745 return TLO.CombineTo(Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, NewSrc));
43746 }
43747
43748 // Only demand the vector elements of the sign bits we need.
43749 APInt KnownUndef, KnownZero;
43750 APInt DemandedElts = OriginalDemandedBits.zextOrTrunc(NumElts);
43751 if (SimplifyDemandedVectorElts(Src, DemandedElts, KnownUndef, KnownZero,
43752 TLO, Depth + 1))
43753 return true;
43754
43755 Known.Zero = KnownZero.zext(BitWidth);
43756 Known.Zero.setHighBits(BitWidth - NumElts);
43757
43758 // MOVMSK only uses the MSB from each vector element.
43759 KnownBits KnownSrc;
43760 APInt DemandedSrcBits = APInt::getSignMask(SrcBits);
43761 if (SimplifyDemandedBits(Src, DemandedSrcBits, DemandedElts, KnownSrc, TLO,
43762 Depth + 1))
43763 return true;
43764
43765 if (KnownSrc.One[SrcBits - 1])
43766 Known.One.setLowBits(NumElts);
43767 else if (KnownSrc.Zero[SrcBits - 1])
43768 Known.Zero.setLowBits(NumElts);
43769
43770    // Attempt to avoid multi-use ops if we don't need anything from it.
43771 if (SDValue NewSrc = SimplifyMultipleUseDemandedBits(
43772 Src, DemandedSrcBits, DemandedElts, TLO.DAG, Depth + 1))
43773 return TLO.CombineTo(Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, NewSrc));
43774 return false;
43775 }
43776 case X86ISD::TESTP: {
43777 SDValue Op0 = Op.getOperand(0);
43778 SDValue Op1 = Op.getOperand(1);
43779 MVT OpVT = Op0.getSimpleValueType();
43780    assert((OpVT.getVectorElementType() == MVT::f32 ||
43781            OpVT.getVectorElementType() == MVT::f64) &&
43782           "Illegal vector type for X86ISD::TESTP");
43783
43784 // TESTPS/TESTPD only demands the sign bits of ALL the elements.
43785 KnownBits KnownSrc;
43786 APInt SignMask = APInt::getSignMask(OpVT.getScalarSizeInBits());
43787 bool AssumeSingleUse = (Op0 == Op1) && Op->isOnlyUserOf(Op0.getNode());
43788 return SimplifyDemandedBits(Op0, SignMask, KnownSrc, TLO, Depth + 1,
43789 AssumeSingleUse) ||
43790 SimplifyDemandedBits(Op1, SignMask, KnownSrc, TLO, Depth + 1,
43791 AssumeSingleUse);
43792 }
43793 case X86ISD::BEXTR:
43794 case X86ISD::BEXTRI: {
43795 SDValue Op0 = Op.getOperand(0);
43796 SDValue Op1 = Op.getOperand(1);
43797
43798 // Only bottom 16-bits of the control bits are required.
43799 if (auto *Cst1 = dyn_cast<ConstantSDNode>(Op1)) {
43800 // NOTE: SimplifyDemandedBits won't do this for constants.
43801 uint64_t Val1 = Cst1->getZExtValue();
43802 uint64_t MaskedVal1 = Val1 & 0xFFFF;
43803 if (Opc == X86ISD::BEXTR && MaskedVal1 != Val1) {
43804 SDLoc DL(Op);
43805 return TLO.CombineTo(
43806 Op, TLO.DAG.getNode(X86ISD::BEXTR, DL, VT, Op0,
43807 TLO.DAG.getConstant(MaskedVal1, DL, VT)));
43808 }
43809
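           // BEXTR control encoding: bits [7:0] hold the starting bit index
           // and bits [15:8] hold the length of the extracted bit field.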
43810 unsigned Shift = Cst1->getAPIntValue().extractBitsAsZExtValue(8, 0);
43811 unsigned Length = Cst1->getAPIntValue().extractBitsAsZExtValue(8, 8);
43812
43813 // If the length is 0, the result is 0.
43814 if (Length == 0) {
43815 Known.setAllZero();
43816 return false;
43817 }
43818
43819 if ((Shift + Length) <= BitWidth) {
43820 APInt DemandedMask = APInt::getBitsSet(BitWidth, Shift, Shift + Length);
43821 if (SimplifyDemandedBits(Op0, DemandedMask, Known, TLO, Depth + 1))
43822 return true;
43823
43824 Known = Known.extractBits(Length, Shift);
43825 Known = Known.zextOrTrunc(BitWidth);
43826 return false;
43827 }
43828 } else {
43829      assert(Opc == X86ISD::BEXTR && "Unexpected opcode!");
43830 KnownBits Known1;
43831 APInt DemandedMask(APInt::getLowBitsSet(BitWidth, 16));
43832 if (SimplifyDemandedBits(Op1, DemandedMask, Known1, TLO, Depth + 1))
43833 return true;
43834
43835 // If the length is 0, replace with 0.
43836 KnownBits LengthBits = Known1.extractBits(8, 8);
43837 if (LengthBits.isZero())
43838 return TLO.CombineTo(Op, TLO.DAG.getConstant(0, SDLoc(Op), VT));
43839 }
43840
43841 break;
43842 }
43843 case X86ISD::PDEP: {
43844 SDValue Op0 = Op.getOperand(0);
43845 SDValue Op1 = Op.getOperand(1);
43846
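         // PDEP deposits the low bits of Op0 at the bit positions set in the
         // mask (Op1); all other result bits are zero.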
43847 unsigned DemandedBitsLZ = OriginalDemandedBits.countl_zero();
43848 APInt LoMask = APInt::getLowBitsSet(BitWidth, BitWidth - DemandedBitsLZ);
43849
43850 // If the demanded bits has leading zeroes, we don't demand those from the
43851 // mask.
43852 if (SimplifyDemandedBits(Op1, LoMask, Known, TLO, Depth + 1))
43853 return true;
43854
43855 // The number of possible 1s in the mask determines the number of LSBs of
43856 // operand 0 used. Undemanded bits from the mask don't matter so filter
43857 // them before counting.
43858 KnownBits Known2;
43859 uint64_t Count = (~Known.Zero & LoMask).popcount();
43860 APInt DemandedMask(APInt::getLowBitsSet(BitWidth, Count));
43861 if (SimplifyDemandedBits(Op0, DemandedMask, Known2, TLO, Depth + 1))
43862 return true;
43863
43864 // Zeroes are retained from the mask, but not ones.
43865 Known.One.clearAllBits();
43866 // The result will have at least as many trailing zeros as the non-mask
43867 // operand since bits can only map to the same or higher bit position.
43868 Known.Zero.setLowBits(Known2.countMinTrailingZeros());
43869 return false;
43870 }
43871 }
43872
43873 return TargetLowering::SimplifyDemandedBitsForTargetNode(
43874 Op, OriginalDemandedBits, OriginalDemandedElts, Known, TLO, Depth);
43875}
43876
43877SDValue X86TargetLowering::SimplifyMultipleUseDemandedBitsForTargetNode(
43878 SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts,
43879 SelectionDAG &DAG, unsigned Depth) const {
43880 int NumElts = DemandedElts.getBitWidth();
43881 unsigned Opc = Op.getOpcode();
43882 EVT VT = Op.getValueType();
43883
43884 switch (Opc) {
43885 case X86ISD::PINSRB:
43886 case X86ISD::PINSRW: {
43887 // If we don't demand the inserted element, return the base vector.
43888 SDValue Vec = Op.getOperand(0);
43889 auto *CIdx = dyn_cast<ConstantSDNode>(Op.getOperand(2));
43890 MVT VecVT = Vec.getSimpleValueType();
43891 if (CIdx && CIdx->getAPIntValue().ult(VecVT.getVectorNumElements()) &&
43892 !DemandedElts[CIdx->getZExtValue()])
43893 return Vec;
43894 break;
43895 }
43896 case X86ISD::VSHLI: {
43897 // If we are only demanding sign bits then we can use the shift source
43898 // directly.
43899 SDValue Op0 = Op.getOperand(0);
43900 unsigned ShAmt = Op.getConstantOperandVal(1);
43901 unsigned BitWidth = DemandedBits.getBitWidth();
43902 unsigned NumSignBits = DAG.ComputeNumSignBits(Op0, DemandedElts, Depth + 1);
43903 unsigned UpperDemandedBits = BitWidth - DemandedBits.countr_zero();
43904 if (NumSignBits > ShAmt && (NumSignBits - ShAmt) >= UpperDemandedBits)
43905 return Op0;
43906 break;
43907 }
43908 case X86ISD::VSRAI:
43909 // iff we only need the sign bit then we can use the source directly.
43910 // TODO: generalize where we only demand extended signbits.
43911 if (DemandedBits.isSignMask())
43912 return Op.getOperand(0);
43913 break;
43914 case X86ISD::PCMPGT:
43915 // icmp sgt(0, R) == ashr(R, BitWidth-1).
43916 // iff we only need the sign bit then we can use R directly.
43917 if (DemandedBits.isSignMask() &&
43918 ISD::isBuildVectorAllZeros(Op.getOperand(0).getNode()))
43919 return Op.getOperand(1);
43920 break;
43921 case X86ISD::ANDNP: {
43922 // ANDNP = (~LHS & RHS);
43923 SDValue LHS = Op.getOperand(0);
43924 SDValue RHS = Op.getOperand(1);
43925
43926 KnownBits LHSKnown = DAG.computeKnownBits(LHS, DemandedElts, Depth + 1);
43927 KnownBits RHSKnown = DAG.computeKnownBits(RHS, DemandedElts, Depth + 1);
43928
43929 // If all of the demanded bits are known 0 on LHS and known 0 on RHS, then
43930 // the (inverted) LHS bits cannot contribute to the result of the 'andn' in
43931 // this context, so return RHS.
43932 if (DemandedBits.isSubsetOf(RHSKnown.Zero | LHSKnown.Zero))
43933 return RHS;
43934 break;
43935 }
43936 }
43937
43938 APInt ShuffleUndef, ShuffleZero;
43939 SmallVector<int, 16> ShuffleMask;
43940 SmallVector<SDValue, 2> ShuffleOps;
43941 if (getTargetShuffleInputs(Op, DemandedElts, ShuffleOps, ShuffleMask,
43942 ShuffleUndef, ShuffleZero, DAG, Depth, false)) {
43943 // If all the demanded elts are from one operand and are inline,
43944 // then we can use the operand directly.
43945 int NumOps = ShuffleOps.size();
43946 if (ShuffleMask.size() == (unsigned)NumElts &&
43947 llvm::all_of(ShuffleOps, [VT](SDValue V) {
43948 return VT.getSizeInBits() == V.getValueSizeInBits();
43949 })) {
43950
43951 if (DemandedElts.isSubsetOf(ShuffleUndef))
43952 return DAG.getUNDEF(VT);
43953 if (DemandedElts.isSubsetOf(ShuffleUndef | ShuffleZero))
43954 return getZeroVector(VT.getSimpleVT(), Subtarget, DAG, SDLoc(Op));
43955
43956 // Bitmask that indicates which ops have only been accessed 'inline'.
43957 APInt IdentityOp = APInt::getAllOnes(NumOps);
43958 for (int i = 0; i != NumElts; ++i) {
43959 int M = ShuffleMask[i];
43960 if (!DemandedElts[i] || ShuffleUndef[i])
43961 continue;
43962 int OpIdx = M / NumElts;
43963 int EltIdx = M % NumElts;
43964 if (M < 0 || EltIdx != i) {
43965 IdentityOp.clearAllBits();
43966 break;
43967 }
43968 IdentityOp &= APInt::getOneBitSet(NumOps, OpIdx);
43969 if (IdentityOp == 0)
43970 break;
43971 }
43972      assert((IdentityOp == 0 || IdentityOp.popcount() == 1) &&
43973             "Multiple identity shuffles detected");
43974
43975 if (IdentityOp != 0)
43976 return DAG.getBitcast(VT, ShuffleOps[IdentityOp.countr_zero()]);
43977 }
43978 }
43979
43980 return TargetLowering::SimplifyMultipleUseDemandedBitsForTargetNode(
43981 Op, DemandedBits, DemandedElts, DAG, Depth);
43982}
43983
43984bool X86TargetLowering::isGuaranteedNotToBeUndefOrPoisonForTargetNode(
43985 SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG,
43986 bool PoisonOnly, unsigned Depth) const {
43987 unsigned EltsBits = Op.getScalarValueSizeInBits();
43988 unsigned NumElts = DemandedElts.getBitWidth();
43989
43990 // TODO: Add more target shuffles.
43991 switch (Op.getOpcode()) {
43992 case X86ISD::PSHUFD:
43993 case X86ISD::VPERMILPI: {
43994 SmallVector<int, 8> Mask;
43995 DecodePSHUFMask(NumElts, EltsBits, Op.getConstantOperandVal(1), Mask);
43996
43997 APInt DemandedSrcElts = APInt::getZero(NumElts);
43998 for (unsigned I = 0; I != NumElts; ++I)
43999 if (DemandedElts[I])
44000 DemandedSrcElts.setBit(Mask[I]);
44001
44002 return DAG.isGuaranteedNotToBeUndefOrPoison(
44003 Op.getOperand(0), DemandedSrcElts, PoisonOnly, Depth + 1);
44004 }
44005 }
44006 return TargetLowering::isGuaranteedNotToBeUndefOrPoisonForTargetNode(
44007 Op, DemandedElts, DAG, PoisonOnly, Depth);
44008}
44009
44010bool X86TargetLowering::canCreateUndefOrPoisonForTargetNode(
44011 SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG,
44012 bool PoisonOnly, bool ConsiderFlags, unsigned Depth) const {
44013
44014 // TODO: Add more target shuffles.
44015 switch (Op.getOpcode()) {
44016 case X86ISD::PSHUFD:
44017 case X86ISD::VPERMILPI:
44018 return false;
44019 }
44020 return TargetLowering::canCreateUndefOrPoisonForTargetNode(
44021 Op, DemandedElts, DAG, PoisonOnly, ConsiderFlags, Depth);
44022}
44023
44024bool X86TargetLowering::isSplatValueForTargetNode(SDValue Op,
44025 const APInt &DemandedElts,
44026 APInt &UndefElts,
44027 const SelectionDAG &DAG,
44028 unsigned Depth) const {
44029 unsigned NumElts = DemandedElts.getBitWidth();
44030 unsigned Opc = Op.getOpcode();
44031
44032 switch (Opc) {
44033 case X86ISD::VBROADCAST:
44034 case X86ISD::VBROADCAST_LOAD:
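         // Broadcasts replicate a single element to every lane, so the result
         // is always a splat with no undef elements.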
44035 UndefElts = APInt::getZero(NumElts);
44036 return true;
44037 }
44038
44039 return TargetLowering::isSplatValueForTargetNode(Op, DemandedElts, UndefElts,
44040 DAG, Depth);
44041}
44042
44043// Helper to peek through bitops/trunc/setcc to determine size of source vector.
44044// Allows combineBitcastvxi1 to determine what size vector generated a <X x i1>.
44045static bool checkBitcastSrcVectorSize(SDValue Src, unsigned Size,
44046 bool AllowTruncate) {
44047 switch (Src.getOpcode()) {
44048 case ISD::TRUNCATE:
44049 if (!AllowTruncate)
44050 return false;
44051 [[fallthrough]];
44052 case ISD::SETCC:
44053 return Src.getOperand(0).getValueSizeInBits() == Size;
44054 case ISD::AND:
44055 case ISD::XOR:
44056 case ISD::OR:
44057 return checkBitcastSrcVectorSize(Src.getOperand(0), Size, AllowTruncate) &&
44058 checkBitcastSrcVectorSize(Src.getOperand(1), Size, AllowTruncate);
44059 case ISD::SELECT:
44060 case ISD::VSELECT:
44061 return Src.getOperand(0).getScalarValueSizeInBits() == 1 &&
44062 checkBitcastSrcVectorSize(Src.getOperand(1), Size, AllowTruncate) &&
44063 checkBitcastSrcVectorSize(Src.getOperand(2), Size, AllowTruncate);
44064 case ISD::BUILD_VECTOR:
44065 return ISD::isBuildVectorAllZeros(Src.getNode()) ||
44066 ISD::isBuildVectorAllOnes(Src.getNode());
44067 }
44068 return false;
44069}
44070
44071// Helper to flip between AND/OR/XOR opcodes and their X86ISD FP equivalents.
44072static unsigned getAltBitOpcode(unsigned Opcode) {
44073 switch(Opcode) {
44074 case ISD::AND: return X86ISD::FAND;
44075 case ISD::OR: return X86ISD::FOR;
44076 case ISD::XOR: return X86ISD::FXOR;
44077 case X86ISD::ANDNP: return X86ISD::FANDN;
44078 }
44079  llvm_unreachable("Unknown bitwise opcode");
44080}
44081
44082// Helper to adjust v4i32 MOVMSK expansion to work with SSE1-only targets.
44083static SDValue adjustBitcastSrcVectorSSE1(SelectionDAG &DAG, SDValue Src,
44084 const SDLoc &DL) {
44085 EVT SrcVT = Src.getValueType();
44086 if (SrcVT != MVT::v4i1)
44087 return SDValue();
44088
44089 switch (Src.getOpcode()) {
44090 case ISD::SETCC:
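         // (setcc X, 0, setlt) tests only the sign bit of each i32 lane, and
         // the f32 view keeps those bits in place, so MOVMSKPS can read them
         // directly without a real conversion.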
44091 if (Src.getOperand(0).getValueType() == MVT::v4i32 &&
44092 ISD::isBuildVectorAllZeros(Src.getOperand(1).getNode()) &&
44093 cast<CondCodeSDNode>(Src.getOperand(2))->get() == ISD::SETLT) {
44094 SDValue Op0 = Src.getOperand(0);
44095 if (ISD::isNormalLoad(Op0.getNode()))
44096 return DAG.getBitcast(MVT::v4f32, Op0);
44097 if (Op0.getOpcode() == ISD::BITCAST &&
44098 Op0.getOperand(0).getValueType() == MVT::v4f32)
44099 return Op0.getOperand(0);
44100 }
44101 break;
44102 case ISD::AND:
44103 case ISD::XOR:
44104 case ISD::OR: {
44105 SDValue Op0 = adjustBitcastSrcVectorSSE1(DAG, Src.getOperand(0), DL);
44106 SDValue Op1 = adjustBitcastSrcVectorSSE1(DAG, Src.getOperand(1), DL);
44107 if (Op0 && Op1)
44108 return DAG.getNode(getAltBitOpcode(Src.getOpcode()), DL, MVT::v4f32, Op0,
44109 Op1);
44110 break;
44111 }
44112 }
44113 return SDValue();
44114}
44115
44116// Helper to push sign extension of vXi1 SETCC result through bitops.
44117static SDValue signExtendBitcastSrcVector(SelectionDAG &DAG, EVT SExtVT,
44118 SDValue Src, const SDLoc &DL) {
44119 switch (Src.getOpcode()) {
44120 case ISD::SETCC:
44121 case ISD::TRUNCATE:
44122 case ISD::BUILD_VECTOR:
44123 return DAG.getNode(ISD::SIGN_EXTEND, DL, SExtVT, Src);
44124 case ISD::AND:
44125 case ISD::XOR:
44126 case ISD::OR:
44127 return DAG.getNode(
44128 Src.getOpcode(), DL, SExtVT,
44129 signExtendBitcastSrcVector(DAG, SExtVT, Src.getOperand(0), DL),
44130 signExtendBitcastSrcVector(DAG, SExtVT, Src.getOperand(1), DL));
44131 case ISD::SELECT:
44132 case ISD::VSELECT:
44133 return DAG.getSelect(
44134 DL, SExtVT, Src.getOperand(0),
44135 signExtendBitcastSrcVector(DAG, SExtVT, Src.getOperand(1), DL),
44136 signExtendBitcastSrcVector(DAG, SExtVT, Src.getOperand(2), DL));
44137 }
44138  llvm_unreachable("Unexpected node type for vXi1 sign extension");
44139}
44140
44141// Try to match patterns such as
44142// (i16 bitcast (v16i1 x))
44143// ->
44144// (i16 movmsk (16i8 sext (v16i1 x)))
44145// before the illegal vector is scalarized on subtargets that don't have legal
44146// vxi1 types.
44147static SDValue combineBitcastvxi1(SelectionDAG &DAG, EVT VT, SDValue Src,
44148 const SDLoc &DL,
44149 const X86Subtarget &Subtarget) {
44150 EVT SrcVT = Src.getValueType();
44151 if (!SrcVT.isSimple() || SrcVT.getScalarType() != MVT::i1)
44152 return SDValue();
44153
44154 // Recognize the IR pattern for the movmsk intrinsic under SSE1 before type
44155 // legalization destroys the v4i32 type.
44156 if (Subtarget.hasSSE1() && !Subtarget.hasSSE2()) {
44157 if (SDValue V = adjustBitcastSrcVectorSSE1(DAG, Src, DL)) {
44158 V = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32,
44159 DAG.getBitcast(MVT::v4f32, V));
44160 return DAG.getZExtOrTrunc(V, DL, VT);
44161 }
44162 }
44163
44164 // If the input is a truncate from v16i8 or v32i8 go ahead and use a
44165 // movmskb even with avx512. This will be better than truncating to vXi1 and
44166 // using a kmov. This can especially help KNL if the input is a v16i8/v32i8
44167 // vpcmpeqb/vpcmpgtb.
44168 bool PreferMovMsk = Src.getOpcode() == ISD::TRUNCATE && Src.hasOneUse() &&
44169 (Src.getOperand(0).getValueType() == MVT::v16i8 ||
44170 Src.getOperand(0).getValueType() == MVT::v32i8 ||
44171 Src.getOperand(0).getValueType() == MVT::v64i8);
44172
44173 // Prefer movmsk for AVX512 for (bitcast (setlt X, 0)) which can be handled
44174 // directly with vpmovmskb/vmovmskps/vmovmskpd.
44175 if (Src.getOpcode() == ISD::SETCC && Src.hasOneUse() &&
44176 cast<CondCodeSDNode>(Src.getOperand(2))->get() == ISD::SETLT &&
44177 ISD::isBuildVectorAllZeros(Src.getOperand(1).getNode())) {
44178 EVT CmpVT = Src.getOperand(0).getValueType();
44179 EVT EltVT = CmpVT.getVectorElementType();
44180 if (CmpVT.getSizeInBits() <= 256 &&
44181 (EltVT == MVT::i8 || EltVT == MVT::i32 || EltVT == MVT::i64))
44182 PreferMovMsk = true;
44183 }
44184
44185 // With AVX512 vxi1 types are legal and we prefer using k-regs.
44186 // MOVMSK is supported in SSE2 or later.
44187 if (!Subtarget.hasSSE2() || (Subtarget.hasAVX512() && !PreferMovMsk))
44188 return SDValue();
44189
44190 // If the upper ops of a concatenation are undef, then try to bitcast the
44191 // lower op and extend.
44192 SmallVector<SDValue, 4> SubSrcOps;
44193 if (collectConcatOps(Src.getNode(), SubSrcOps, DAG) &&
44194 SubSrcOps.size() >= 2) {
44195 SDValue LowerOp = SubSrcOps[0];
44196 ArrayRef<SDValue> UpperOps(std::next(SubSrcOps.begin()), SubSrcOps.end());
44197 if (LowerOp.getOpcode() == ISD::SETCC &&
44198 all_of(UpperOps, [](SDValue Op) { return Op.isUndef(); })) {
44199 EVT SubVT = VT.getIntegerVT(
44200 *DAG.getContext(), LowerOp.getValueType().getVectorMinNumElements());
44201 if (SDValue V = combineBitcastvxi1(DAG, SubVT, LowerOp, DL, Subtarget)) {
44202 EVT IntVT = VT.getIntegerVT(*DAG.getContext(), VT.getSizeInBits());
44203 return DAG.getBitcast(VT, DAG.getNode(ISD::ANY_EXTEND, DL, IntVT, V));
44204 }
44205 }
44206 }
44207
44208 // There are MOVMSK flavors for types v16i8, v32i8, v4f32, v8f32, v4f64 and
44209 // v8f64. So all legal 128-bit and 256-bit vectors are covered except for
44210 // v8i16 and v16i16.
44211 // For these two cases, we can shuffle the upper element bytes to a
44212 // consecutive sequence at the start of the vector and treat the results as
44213 // v16i8 or v32i8, and for v16i8 this is the preferable solution. However,
44214 // for v16i16 this is not the case, because the shuffle is expensive, so we
44215 // avoid sign-extending to this type entirely.
44216 // For example, t0 := (v8i16 sext(v8i1 x)) needs to be shuffled as:
44217 // (v16i8 shuffle <0,2,4,6,8,10,12,14,u,u,...,u> (v16i8 bitcast t0), undef)
44218 MVT SExtVT;
44219 bool PropagateSExt = false;
44220 switch (SrcVT.getSimpleVT().SimpleTy) {
44221 default:
44222 return SDValue();
44223 case MVT::v2i1:
44224 SExtVT = MVT::v2i64;
44225 break;
44226 case MVT::v4i1:
44227 SExtVT = MVT::v4i32;
44228 // For cases such as (i4 bitcast (v4i1 setcc v4i64 v1, v2))
44229 // sign-extend to a 256-bit operation to avoid truncation.
44230 if (Subtarget.hasAVX() &&
44231 checkBitcastSrcVectorSize(Src, 256, Subtarget.hasAVX2())) {
44232 SExtVT = MVT::v4i64;
44233 PropagateSExt = true;
44234 }
44235 break;
44236 case MVT::v8i1:
44237 SExtVT = MVT::v8i16;
44238 // For cases such as (i8 bitcast (v8i1 setcc v8i32 v1, v2)),
44239 // sign-extend to a 256-bit operation to match the compare.
44240 // If the setcc operand is 128-bit, prefer sign-extending to 128-bit over
44241 // 256-bit because the shuffle is cheaper than sign extending the result of
44242 // the compare.
44243 if (Subtarget.hasAVX() && (checkBitcastSrcVectorSize(Src, 256, true) ||
44244 checkBitcastSrcVectorSize(Src, 512, true))) {
44245 SExtVT = MVT::v8i32;
44246 PropagateSExt = true;
44247 }
44248 break;
44249 case MVT::v16i1:
44250 SExtVT = MVT::v16i8;
44251 // For the case (i16 bitcast (v16i1 setcc v16i16 v1, v2)),
44252 // it is not profitable to sign-extend to 256-bit because this will
44253 // require an extra cross-lane shuffle which is more expensive than
44254 // truncating the result of the compare to 128-bits.
44255 break;
44256 case MVT::v32i1:
44257 SExtVT = MVT::v32i8;
44258 break;
44259 case MVT::v64i1:
44260    // If we have AVX512F but not AVX512BW, and the input is a truncate from
44261    // v64i8 (checked earlier), then split the input and make two pmovmskbs.
44262 if (Subtarget.hasAVX512()) {
44263 if (Subtarget.hasBWI())
44264 return SDValue();
44265 SExtVT = MVT::v64i8;
44266 break;
44267 }
44268 // Split if this is a <64 x i8> comparison result.
44269 if (checkBitcastSrcVectorSize(Src, 512, false)) {
44270 SExtVT = MVT::v64i8;
44271 break;
44272 }
44273 return SDValue();
44274 };
44275
44276 SDValue V = PropagateSExt ? signExtendBitcastSrcVector(DAG, SExtVT, Src, DL)
44277 : DAG.getNode(ISD::SIGN_EXTEND, DL, SExtVT, Src);
44278
44279 if (SExtVT == MVT::v16i8 || SExtVT == MVT::v32i8 || SExtVT == MVT::v64i8) {
44280 V = getPMOVMSKB(DL, V, DAG, Subtarget);
44281 } else {
44282 if (SExtVT == MVT::v8i16)
44283 V = DAG.getNode(X86ISD::PACKSS, DL, MVT::v16i8, V,
44284 DAG.getUNDEF(MVT::v8i16));
44285 V = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, V);
44286 }
44287
44288 EVT IntVT =
44289 EVT::getIntegerVT(*DAG.getContext(), SrcVT.getVectorNumElements());
44290 V = DAG.getZExtOrTrunc(V, DL, IntVT);
44291 return DAG.getBitcast(VT, V);
44292}
44293
44294// Convert a vXi1 constant build vector to the same width scalar integer.
44295static SDValue combinevXi1ConstantToInteger(SDValue Op, SelectionDAG &DAG) {
44296 EVT SrcVT = Op.getValueType();
44297  assert(SrcVT.getVectorElementType() == MVT::i1 &&
44298         "Expected a vXi1 vector");
44299  assert(ISD::isBuildVectorOfConstantSDNodes(Op.getNode()) &&
44300         "Expected a constant build vector");
44301
44302 APInt Imm(SrcVT.getVectorNumElements(), 0);
44303 for (unsigned Idx = 0, e = Op.getNumOperands(); Idx < e; ++Idx) {
44304 SDValue In = Op.getOperand(Idx);
44305 if (!In.isUndef() && (cast<ConstantSDNode>(In)->getZExtValue() & 0x1))
44306 Imm.setBit(Idx);
44307 }
44308 EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), Imm.getBitWidth());
44309 return DAG.getConstant(Imm, SDLoc(Op), IntVT);
44310}
44311
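     // Try to move a bitcast between a vXi1 mask and a scalar integer through
     // a logic op so the logic stays in one domain (k-registers vs. GPRs).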
44312static SDValue combineCastedMaskArithmetic(SDNode *N, SelectionDAG &DAG,
44313 TargetLowering::DAGCombinerInfo &DCI,
44314 const X86Subtarget &Subtarget) {
44315  assert(N->getOpcode() == ISD::BITCAST && "Expected a bitcast");
44316
44317 if (!DCI.isBeforeLegalizeOps())
44318 return SDValue();
44319
44320 // Only do this if we have k-registers.
44321 if (!Subtarget.hasAVX512())
44322 return SDValue();
44323
44324 EVT DstVT = N->getValueType(0);
44325 SDValue Op = N->getOperand(0);
44326 EVT SrcVT = Op.getValueType();
44327
44328 if (!Op.hasOneUse())
44329 return SDValue();
44330
44331 // Look for logic ops.
44332 if (Op.getOpcode() != ISD::AND &&
44333 Op.getOpcode() != ISD::OR &&
44334 Op.getOpcode() != ISD::XOR)
44335 return SDValue();
44336
44337 // Make sure we have a bitcast between mask registers and a scalar type.
44338 if (!(SrcVT.isVector() && SrcVT.getVectorElementType() == MVT::i1 &&
44339 DstVT.isScalarInteger()) &&
44340 !(DstVT.isVector() && DstVT.getVectorElementType() == MVT::i1 &&
44341 SrcVT.isScalarInteger()))
44342 return SDValue();
44343
44344 SDValue LHS = Op.getOperand(0);
44345 SDValue RHS = Op.getOperand(1);
44346
44347 if (LHS.hasOneUse() && LHS.getOpcode() == ISD::BITCAST &&
44348 LHS.getOperand(0).getValueType() == DstVT)
44349 return DAG.getNode(Op.getOpcode(), SDLoc(N), DstVT, LHS.getOperand(0),
44350 DAG.getBitcast(DstVT, RHS));
44351
44352 if (RHS.hasOneUse() && RHS.getOpcode() == ISD::BITCAST &&
44353 RHS.getOperand(0).getValueType() == DstVT)
44354 return DAG.getNode(Op.getOpcode(), SDLoc(N), DstVT,
44355 DAG.getBitcast(DstVT, LHS), RHS.getOperand(0));
44356
44357 // If the RHS is a vXi1 build vector, this is a good reason to flip too.
44358 // Most of these have to move a constant from the scalar domain anyway.
44359 if (ISD::isBuildVectorOfConstantSDNodes(RHS.getNode())) {
44360 RHS = combinevXi1ConstantToInteger(RHS, DAG);
44361 return DAG.getNode(Op.getOpcode(), SDLoc(N), DstVT,
44362 DAG.getBitcast(DstVT, LHS), RHS);
44363 }
44364
44365 return SDValue();
44366}
44367
44368static SDValue createMMXBuildVector(BuildVectorSDNode *BV, SelectionDAG &DAG,
44369 const X86Subtarget &Subtarget) {
44370 SDLoc DL(BV);
44371 unsigned NumElts = BV->getNumOperands();
44372 SDValue Splat = BV->getSplatValue();
44373
44374 // Build MMX element from integer GPR or SSE float values.
44375 auto CreateMMXElement = [&](SDValue V) {
44376 if (V.isUndef())
44377 return DAG.getUNDEF(MVT::x86mmx);
44378 if (V.getValueType().isFloatingPoint()) {
44379 if (Subtarget.hasSSE1() && !isa<ConstantFPSDNode>(V)) {
44380 V = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v4f32, V);
44381 V = DAG.getBitcast(MVT::v2i64, V);
44382 return DAG.getNode(X86ISD::MOVDQ2Q, DL, MVT::x86mmx, V);
44383 }
44384 V = DAG.getBitcast(MVT::i32, V);
44385 } else {
44386 V = DAG.getAnyExtOrTrunc(V, DL, MVT::i32);
44387 }
44388 return DAG.getNode(X86ISD::MMX_MOVW2D, DL, MVT::x86mmx, V);
44389 };
44390
44391 // Convert build vector ops to MMX data in the bottom elements.
44392 SmallVector<SDValue, 8> Ops;
44393
44394 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
44395
44396 // Broadcast - use (PUNPCKL+)PSHUFW to broadcast single element.
44397 if (Splat) {
44398 if (Splat.isUndef())
44399 return DAG.getUNDEF(MVT::x86mmx);
44400
44401 Splat = CreateMMXElement(Splat);
44402
44403 if (Subtarget.hasSSE1()) {
44404 // Unpack v8i8 to splat i8 elements to lowest 16-bits.
44405 if (NumElts == 8)
44406 Splat = DAG.getNode(
44407 ISD::INTRINSIC_WO_CHAIN, DL, MVT::x86mmx,
44408 DAG.getTargetConstant(Intrinsic::x86_mmx_punpcklbw, DL,
44409 TLI.getPointerTy(DAG.getDataLayout())),
44410 Splat, Splat);
44411
44412 // Use PSHUFW to repeat 16-bit elements.
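           // Shuffle immediate 0x00 splats lane 0 to all four 16-bit lanes;
           // 0x44 selects lanes <0,1,0,1> to repeat a 32-bit element.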
44413 unsigned ShufMask = (NumElts > 2 ? 0 : 0x44);
44414 return DAG.getNode(
44415 ISD::INTRINSIC_WO_CHAIN, DL, MVT::x86mmx,
44416 DAG.getTargetConstant(Intrinsic::x86_sse_pshuf_w, DL,
44417 TLI.getPointerTy(DAG.getDataLayout())),
44418 Splat, DAG.getTargetConstant(ShufMask, DL, MVT::i8));
44419 }
44420 Ops.append(NumElts, Splat);
44421 } else {
44422 for (unsigned i = 0; i != NumElts; ++i)
44423 Ops.push_back(CreateMMXElement(BV->getOperand(i)));
44424 }
44425
44426 // Use tree of PUNPCKLs to build up general MMX vector.
44427 while (Ops.size() > 1) {
44428 unsigned NumOps = Ops.size();
44429 unsigned IntrinOp =
44430 (NumOps == 2 ? Intrinsic::x86_mmx_punpckldq
44431 : (NumOps == 4 ? Intrinsic::x86_mmx_punpcklwd
44432 : Intrinsic::x86_mmx_punpcklbw));
44433 SDValue Intrin = DAG.getTargetConstant(
44434 IntrinOp, DL, TLI.getPointerTy(DAG.getDataLayout()));
44435 for (unsigned i = 0; i != NumOps; i += 2)
44436 Ops[i / 2] = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, MVT::x86mmx, Intrin,
44437 Ops[i], Ops[i + 1]);
44438 Ops.resize(NumOps / 2);
44439 }
44440
44441 return Ops[0];
44442}
44443
44444// Recursive function that attempts to find if a bool vector node was originally
44445// a vector/float/double that got truncated/extended/bitcast to/from a scalar
44446// integer. If so, replace the scalar ops with bool vector equivalents back down
44447// the chain.
44448static SDValue combineBitcastToBoolVector(EVT VT, SDValue V, const SDLoc &DL,
44449 SelectionDAG &DAG,
44450 const X86Subtarget &Subtarget) {
44451 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
44452 unsigned Opc = V.getOpcode();
44453 switch (Opc) {
44454 case ISD::BITCAST: {
44455 // Bitcast from a vector/float/double, we can cheaply bitcast to VT.
44456 SDValue Src = V.getOperand(0);
44457 EVT SrcVT = Src.getValueType();
44458 if (SrcVT.isVector() || SrcVT.isFloatingPoint())
44459 return DAG.getBitcast(VT, Src);
44460 break;
44461 }
44462 case ISD::TRUNCATE: {
44463 // If we find a suitable source, a truncated scalar becomes a subvector.
44464 SDValue Src = V.getOperand(0);
44465 EVT NewSrcVT =
44466 EVT::getVectorVT(*DAG.getContext(), MVT::i1, Src.getValueSizeInBits());
44467 if (TLI.isTypeLegal(NewSrcVT))
44468 if (SDValue N0 =
44469 combineBitcastToBoolVector(NewSrcVT, Src, DL, DAG, Subtarget))
44470 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, N0,
44471 DAG.getIntPtrConstant(0, DL));
44472 break;
44473 }
44474 case ISD::ANY_EXTEND:
44475 case ISD::ZERO_EXTEND: {
44476 // If we find a suitable source, an extended scalar becomes a subvector.
44477 SDValue Src = V.getOperand(0);
44478 EVT NewSrcVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
44479 Src.getScalarValueSizeInBits());
44480 if (TLI.isTypeLegal(NewSrcVT))
44481 if (SDValue N0 =
44482 combineBitcastToBoolVector(NewSrcVT, Src, DL, DAG, Subtarget))
44483 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT,
44484 Opc == ISD::ANY_EXTEND ? DAG.getUNDEF(VT)
44485 : DAG.getConstant(0, DL, VT),
44486 N0, DAG.getIntPtrConstant(0, DL));
44487 break;
44488 }
44489 case ISD::OR: {
44490 // If we find suitable sources, we can just move an OR to the vector domain.
44491 SDValue Src0 = V.getOperand(0);
44492 SDValue Src1 = V.getOperand(1);
44493 if (SDValue N0 = combineBitcastToBoolVector(VT, Src0, DL, DAG, Subtarget))
44494 if (SDValue N1 = combineBitcastToBoolVector(VT, Src1, DL, DAG, Subtarget))
44495 return DAG.getNode(Opc, DL, VT, N0, N1);
44496 break;
44497 }
44498 case ISD::SHL: {
44499 // If we find a suitable source, a SHL becomes a KSHIFTL.
44500 SDValue Src0 = V.getOperand(0);
44501 if ((VT == MVT::v8i1 && !Subtarget.hasDQI()) ||
44502 ((VT == MVT::v32i1 || VT == MVT::v64i1) && !Subtarget.hasBWI()))
44503 break;
44504
44505 if (auto *Amt = dyn_cast<ConstantSDNode>(V.getOperand(1)))
44506 if (SDValue N0 = combineBitcastToBoolVector(VT, Src0, DL, DAG, Subtarget))
44507 return DAG.getNode(
44508 X86ISD::KSHIFTL, DL, VT, N0,
44509 DAG.getTargetConstant(Amt->getZExtValue(), DL, MVT::i8));
44510 break;
44511 }
44512 }
44513 return SDValue();
44514}
44515
44516static SDValue combineBitcast(SDNode *N, SelectionDAG &DAG,
44517 TargetLowering::DAGCombinerInfo &DCI,
44518 const X86Subtarget &Subtarget) {
44519 SDValue N0 = N->getOperand(0);
44520 EVT VT = N->getValueType(0);
44521 EVT SrcVT = N0.getValueType();
44522 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
44523
44524 // Try to match patterns such as
44525 // (i16 bitcast (v16i1 x))
44526 // ->
44527 // (i16 movmsk (16i8 sext (v16i1 x)))
44528 // before the setcc result is scalarized on subtargets that don't have legal
44529 // vxi1 types.
44530 if (DCI.isBeforeLegalize()) {
44531 SDLoc dl(N);
44532 if (SDValue V = combineBitcastvxi1(DAG, VT, N0, dl, Subtarget))
44533 return V;
44534
44535 // If this is a bitcast between a MVT::v4i1/v2i1 and an illegal integer
44536 // type, widen both sides to avoid a trip through memory.
44537 if ((VT == MVT::v4i1 || VT == MVT::v2i1) && SrcVT.isScalarInteger() &&
44538 Subtarget.hasAVX512()) {
44539 N0 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i8, N0);
44540 N0 = DAG.getBitcast(MVT::v8i1, N0);
44541 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, N0,
44542 DAG.getIntPtrConstant(0, dl));
44543 }
44544
44545 // If this is a bitcast between a MVT::v4i1/v2i1 and an illegal integer
44546 // type, widen both sides to avoid a trip through memory.
44547 if ((SrcVT == MVT::v4i1 || SrcVT == MVT::v2i1) && VT.isScalarInteger() &&
44548 Subtarget.hasAVX512()) {
44549 // Use zeros for the widening if we already have some zeroes. This can
44550 // allow SimplifyDemandedBits to remove scalar ANDs that may be down
44551 // stream of this.
44552 // FIXME: It might make sense to detect a concat_vectors with a mix of
44553 // zeroes and undef and turn it into insert_subvector for i1 vectors as
44554 // a separate combine. What we can't do is canonicalize the operands of
44555 // such a concat or we'll get into a loop with SimplifyDemandedBits.
44556 if (N0.getOpcode() == ISD::CONCAT_VECTORS) {
44557 SDValue LastOp = N0.getOperand(N0.getNumOperands() - 1);
44558 if (ISD::isBuildVectorAllZeros(LastOp.getNode())) {
44559 SrcVT = LastOp.getValueType();
44560 unsigned NumConcats = 8 / SrcVT.getVectorNumElements();
44561 SmallVector<SDValue, 4> Ops(N0->op_begin(), N0->op_end());
44562 Ops.resize(NumConcats, DAG.getConstant(0, dl, SrcVT));
44563 N0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i1, Ops);
44564 N0 = DAG.getBitcast(MVT::i8, N0);
44565 return DAG.getNode(ISD::TRUNCATE, dl, VT, N0);
44566 }
44567 }
44568
44569 unsigned NumConcats = 8 / SrcVT.getVectorNumElements();
44570 SmallVector<SDValue, 4> Ops(NumConcats, DAG.getUNDEF(SrcVT));
44571 Ops[0] = N0;
44572 N0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i1, Ops);
44573 N0 = DAG.getBitcast(MVT::i8, N0);
44574 return DAG.getNode(ISD::TRUNCATE, dl, VT, N0);
44575 }
44576 } else {
44577 // If we're bitcasting from iX to vXi1, see if the integer originally
44578 // began as a vXi1 and whether we can remove the bitcast entirely.
44579 if (VT.isVector() && VT.getScalarType() == MVT::i1 &&
44580 SrcVT.isScalarInteger() && TLI.isTypeLegal(VT)) {
44581 if (SDValue V =
44582 combineBitcastToBoolVector(VT, N0, SDLoc(N), DAG, Subtarget))
44583 return V;
44584 }
44585 }
44586
44587 // Look for (i8 (bitcast (v8i1 (extract_subvector (v16i1 X), 0)))) and
44588 // replace with (i8 (trunc (i16 (bitcast (v16i1 X))))). This can occur
44589 // due to insert_subvector legalization on KNL. By promoting the copy to i16
44590 // we can help with known bits propagation from the vXi1 domain to the
44591 // scalar domain.
44592 if (VT == MVT::i8 && SrcVT == MVT::v8i1 && Subtarget.hasAVX512() &&
44593 !Subtarget.hasDQI() && N0.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
44594 N0.getOperand(0).getValueType() == MVT::v16i1 &&
44595 isNullConstant(N0.getOperand(1)))
44596 return DAG.getNode(ISD::TRUNCATE, SDLoc(N), VT,
44597 DAG.getBitcast(MVT::i16, N0.getOperand(0)));
44598
44599 // Canonicalize (bitcast (vbroadcast_load)) so that the output of the bitcast
44600 // and the vbroadcast_load are both integer or both fp. In some cases this
44601 // will remove the bitcast entirely.
44602 if (N0.getOpcode() == X86ISD::VBROADCAST_LOAD && N0.hasOneUse() &&
44603 VT.isFloatingPoint() != SrcVT.isFloatingPoint() && VT.isVector()) {
44604 auto *BCast = cast<MemIntrinsicSDNode>(N0);
44605 unsigned SrcVTSize = SrcVT.getScalarSizeInBits();
44606 unsigned MemSize = BCast->getMemoryVT().getScalarSizeInBits();
44607    // Don't swap i8/i16 since we don't have fp types of that size.
44608 if (MemSize >= 32) {
44609 MVT MemVT = VT.isFloatingPoint() ? MVT::getFloatingPointVT(MemSize)
44610 : MVT::getIntegerVT(MemSize);
44611 MVT LoadVT = VT.isFloatingPoint() ? MVT::getFloatingPointVT(SrcVTSize)
44612 : MVT::getIntegerVT(SrcVTSize);
44613 LoadVT = MVT::getVectorVT(LoadVT, SrcVT.getVectorNumElements());
44614
44615 SDVTList Tys = DAG.getVTList(LoadVT, MVT::Other);
44616 SDValue Ops[] = { BCast->getChain(), BCast->getBasePtr() };
44617 SDValue ResNode =
44618 DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, SDLoc(N), Tys, Ops,
44619 MemVT, BCast->getMemOperand());
44620 DAG.ReplaceAllUsesOfValueWith(SDValue(BCast, 1), ResNode.getValue(1));
44621 return DAG.getBitcast(VT, ResNode);
44622 }
44623 }
44624
44625 // Since MMX types are special and don't usually play with other vector types,
44626 // it's better to handle them early to be sure we emit efficient code by
44627 // avoiding store-load conversions.
44628 if (VT == MVT::x86mmx) {
44629 // Detect MMX constant vectors.
44630 APInt UndefElts;
44631 SmallVector<APInt, 1> EltBits;
44632 if (getTargetConstantBitsFromNode(N0, 64, UndefElts, EltBits)) {
44633 SDLoc DL(N0);
44634 // Handle zero-extension of i32 with MOVD.
44635 if (EltBits[0].countl_zero() >= 32)
44636 return DAG.getNode(X86ISD::MMX_MOVW2D, DL, VT,
44637 DAG.getConstant(EltBits[0].trunc(32), DL, MVT::i32));
44638 // Else, bitcast to a double.
44639 // TODO - investigate supporting sext 32-bit immediates on x86_64.
44640 APFloat F64(APFloat::IEEEdouble(), EltBits[0]);
44641 return DAG.getBitcast(VT, DAG.getConstantFP(F64, DL, MVT::f64));
44642 }
44643
44644 // Detect bitcasts to x86mmx low word.
44645 if (N0.getOpcode() == ISD::BUILD_VECTOR &&
44646 (SrcVT == MVT::v2i32 || SrcVT == MVT::v4i16 || SrcVT == MVT::v8i8) &&
44647 N0.getOperand(0).getValueType() == SrcVT.getScalarType()) {
44648 bool LowUndef = true, AllUndefOrZero = true;
44649 for (unsigned i = 1, e = SrcVT.getVectorNumElements(); i != e; ++i) {
44650 SDValue Op = N0.getOperand(i);
44651 LowUndef &= Op.isUndef() || (i >= e/2);
44652 AllUndefOrZero &= (Op.isUndef() || isNullConstant(Op));
44653 }
44654 if (AllUndefOrZero) {
44655 SDValue N00 = N0.getOperand(0);
44656 SDLoc dl(N00);
44657 N00 = LowUndef ? DAG.getAnyExtOrTrunc(N00, dl, MVT::i32)
44658 : DAG.getZExtOrTrunc(N00, dl, MVT::i32);
44659 return DAG.getNode(X86ISD::MMX_MOVW2D, dl, VT, N00);
44660 }
44661 }
44662
44663 // Detect bitcasts of 64-bit build vectors and convert to a
44664 // MMX UNPCK/PSHUFW which takes MMX type inputs with the value in the
44665 // lowest element.
44666 if (N0.getOpcode() == ISD::BUILD_VECTOR &&
44667 (SrcVT == MVT::v2f32 || SrcVT == MVT::v2i32 || SrcVT == MVT::v4i16 ||
44668 SrcVT == MVT::v8i8))
44669 return createMMXBuildVector(cast<BuildVectorSDNode>(N0), DAG, Subtarget);
44670
44671 // Detect bitcasts between element or subvector extraction to x86mmx.
44672 if ((N0.getOpcode() == ISD::EXTRACT_VECTOR_ELT ||
44673 N0.getOpcode() == ISD::EXTRACT_SUBVECTOR) &&
44674 isNullConstant(N0.getOperand(1))) {
44675 SDValue N00 = N0.getOperand(0);
44676 if (N00.getValueType().is128BitVector())
44677 return DAG.getNode(X86ISD::MOVDQ2Q, SDLoc(N00), VT,
44678 DAG.getBitcast(MVT::v2i64, N00));
44679 }
44680
44681 // Detect bitcasts from FP_TO_SINT to x86mmx.
44682 if (SrcVT == MVT::v2i32 && N0.getOpcode() == ISD::FP_TO_SINT) {
44683 SDLoc DL(N0);
44684 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4i32, N0,
44685 DAG.getUNDEF(MVT::v2i32));
44686 return DAG.getNode(X86ISD::MOVDQ2Q, DL, VT,
44687 DAG.getBitcast(MVT::v2i64, Res));
44688 }
44689 }
44690
44691 // Try to remove a bitcast of constant vXi1 vector. We have to legalize
44692 // most of these to scalar anyway.
44693 if (Subtarget.hasAVX512() && VT.isScalarInteger() &&
44694 SrcVT.isVector() && SrcVT.getVectorElementType() == MVT::i1 &&
44695 ISD::isBuildVectorOfConstantSDNodes(N0.getNode())) {
44696 return combinevXi1ConstantToInteger(N0, DAG);
44697 }
44698
44699 if (Subtarget.hasAVX512() && SrcVT.isScalarInteger() &&
44700 VT.isVector() && VT.getVectorElementType() == MVT::i1 &&
44701 isa<ConstantSDNode>(N0)) {
44702 auto *C = cast<ConstantSDNode>(N0);
44703 if (C->isAllOnes())
44704 return DAG.getConstant(1, SDLoc(N0), VT);
44705 if (C->isZero())
44706 return DAG.getConstant(0, SDLoc(N0), VT);
44707 }
44708
44709 // Look for MOVMSK that is maybe truncated and then bitcasted to vXi1.
44710 // Turn it into a sign bit compare that produces a k-register. This avoids
44711 // a trip through a GPR.
44712 if (Subtarget.hasAVX512() && SrcVT.isScalarInteger() &&
44713 VT.isVector() && VT.getVectorElementType() == MVT::i1 &&
44714 isPowerOf2_32(VT.getVectorNumElements())) {
44715 unsigned NumElts = VT.getVectorNumElements();
44716 SDValue Src = N0;
44717
44718 // Peek through truncate.
44719 if (N0.getOpcode() == ISD::TRUNCATE && N0.hasOneUse())
44720 Src = N0.getOperand(0);
44721
44722 if (Src.getOpcode() == X86ISD::MOVMSK && Src.hasOneUse()) {
44723 SDValue MovmskIn = Src.getOperand(0);
44724 MVT MovmskVT = MovmskIn.getSimpleValueType();
44725 unsigned MovMskElts = MovmskVT.getVectorNumElements();
44726
44727 // We allow extra bits of the movmsk to be used since they are known zero.
44728 // We can't convert a VPMOVMSKB without avx512bw.
44729 if (MovMskElts <= NumElts &&
44730 (Subtarget.hasBWI() || MovmskVT.getVectorElementType() != MVT::i8)) {
44731 EVT IntVT = EVT(MovmskVT).changeVectorElementTypeToInteger();
44732 MovmskIn = DAG.getBitcast(IntVT, MovmskIn);
44733 SDLoc dl(N);
44734 MVT CmpVT = MVT::getVectorVT(MVT::i1, MovMskElts);
44735 SDValue Cmp = DAG.getSetCC(dl, CmpVT, MovmskIn,
44736 DAG.getConstant(0, dl, IntVT), ISD::SETLT);
44737 if (EVT(CmpVT) == VT)
44738 return Cmp;
44739
44740 // Pad with zeroes up to original VT to replace the zeroes that were
44741 // being used from the MOVMSK.
44742 unsigned NumConcats = NumElts / MovMskElts;
44743 SmallVector<SDValue, 4> Ops(NumConcats, DAG.getConstant(0, dl, CmpVT));
44744 Ops[0] = Cmp;
44745 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Ops);
44746 }
44747 }
44748 }
44749
44750 // Try to remove bitcasts from input and output of mask arithmetic to
44751 // remove GPR<->K-register crossings.
44752 if (SDValue V = combineCastedMaskArithmetic(N, DAG, DCI, Subtarget))
44753 return V;
44754
44755 // Convert a bitcasted integer logic operation that has one bitcasted
44756 // floating-point operand into a floating-point logic operation. This may
44757 // create a load of a constant, but that is cheaper than materializing the
44758 // constant in an integer register and transferring it to an SSE register or
44759 // transferring the SSE operand to integer register and back.
44760 unsigned FPOpcode;
44761 switch (N0.getOpcode()) {
44762 case ISD::AND: FPOpcode = X86ISD::FAND; break;
44763 case ISD::OR: FPOpcode = X86ISD::FOR; break;
44764 case ISD::XOR: FPOpcode = X86ISD::FXOR; break;
44765 default: return SDValue();
44766 }
44767
44768 // Check if we have a bitcast from another integer type as well.
44769 if (!((Subtarget.hasSSE1() && VT == MVT::f32) ||
44770 (Subtarget.hasSSE2() && VT == MVT::f64) ||
44771 (Subtarget.hasFP16() && VT == MVT::f16) ||
44772 (Subtarget.hasSSE2() && VT.isInteger() && VT.isVector() &&
44773 TLI.isTypeLegal(VT))))
44774 return SDValue();
44775
44776 SDValue LogicOp0 = N0.getOperand(0);
44777 SDValue LogicOp1 = N0.getOperand(1);
44778 SDLoc DL0(N0);
44779
44780 // bitcast(logic(bitcast(X), Y)) --> logic'(X, bitcast(Y))
44781 if (N0.hasOneUse() && LogicOp0.getOpcode() == ISD::BITCAST &&
44782 LogicOp0.hasOneUse() && LogicOp0.getOperand(0).hasOneUse() &&
44783 LogicOp0.getOperand(0).getValueType() == VT &&
44784 !isa<ConstantSDNode>(LogicOp0.getOperand(0))) {
44785 SDValue CastedOp1 = DAG.getBitcast(VT, LogicOp1);
44786 unsigned Opcode = VT.isFloatingPoint() ? FPOpcode : N0.getOpcode();
44787 return DAG.getNode(Opcode, DL0, VT, LogicOp0.getOperand(0), CastedOp1);
44788 }
44789 // bitcast(logic(X, bitcast(Y))) --> logic'(bitcast(X), Y)
44790 if (N0.hasOneUse() && LogicOp1.getOpcode() == ISD::BITCAST &&
44791 LogicOp1.hasOneUse() && LogicOp1.getOperand(0).hasOneUse() &&
44792 LogicOp1.getOperand(0).getValueType() == VT &&
44793 !isa<ConstantSDNode>(LogicOp1.getOperand(0))) {
44794 SDValue CastedOp0 = DAG.getBitcast(VT, LogicOp0);
44795 unsigned Opcode = VT.isFloatingPoint() ? FPOpcode : N0.getOpcode();
44796 return DAG.getNode(Opcode, DL0, VT, LogicOp1.getOperand(0), CastedOp0);
44797 }
44798
44799 return SDValue();
44800}
44801
44802// (mul (zext a), (sext b))
44803static bool detectExtMul(SelectionDAG &DAG, const SDValue &Mul, SDValue &Op0,
44804 SDValue &Op1) {
44805 Op0 = Mul.getOperand(0);
44806 Op1 = Mul.getOperand(1);
44807
44808 // Operand 1 should be the sign-extended operand.
44809 if (Op0.getOpcode() == ISD::SIGN_EXTEND)
44810 std::swap(Op0, Op1);
44811
44812 auto IsFreeTruncation = [](SDValue &Op) -> bool {
44813 if ((Op.getOpcode() == ISD::ZERO_EXTEND ||
44814 Op.getOpcode() == ISD::SIGN_EXTEND) &&
44815 Op.getOperand(0).getScalarValueSizeInBits() <= 8)
44816 return true;
44817
44818 auto *BV = dyn_cast<BuildVectorSDNode>(Op);
44819 return (BV && BV->isConstant());
44820 };
44821
44822 // (dpbusd (zext a), (sext b)). Since the first operand must be an unsigned
44823 // value, we check that Op0 is a zero-extended value. Op1 must be a signed
44824 // value, so we just check its significant bits.
44825 if ((IsFreeTruncation(Op0) &&
44826 DAG.computeKnownBits(Op0).countMaxActiveBits() <= 8) &&
44827 (IsFreeTruncation(Op1) && DAG.ComputeMaxSignificantBits(Op1) <= 8))
44828 return true;
44829
44830 return false;
44831}
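
// Illustrative sketch (hypothetical helper, not part of this file): the two
// checks above amount to requiring that one multiply operand fits in an
// unsigned 8-bit range and the other in a signed 8-bit range, mirroring
// VPDPBUSD's unsigned-byte-times-signed-byte multiply.
static bool sketchFitsVPDPBUSDOperands(unsigned UnsignedVal, int SignedVal) {
  bool FitsU8 = UnsignedVal <= 255;                    // countMaxActiveBits() <= 8
  bool FitsS8 = SignedVal >= -128 && SignedVal <= 127; // ComputeMaxSignificantBits() <= 8
  return FitsU8 && FitsS8;
}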
44832
44833// Given an ABS node, detect the following pattern:
44834// (ABS (SUB (ZERO_EXTEND a), (ZERO_EXTEND b))).
44835// This is useful as it is the input into a SAD pattern.
44836static bool detectZextAbsDiff(const SDValue &Abs, SDValue &Op0, SDValue &Op1) {
44837 SDValue AbsOp1 = Abs->getOperand(0);
44838 if (AbsOp1.getOpcode() != ISD::SUB)
44839 return false;
44840
44841 Op0 = AbsOp1.getOperand(0);
44842 Op1 = AbsOp1.getOperand(1);
44843
44844 // Check if the operands of the sub are zero-extended from vectors of i8.
44845 if (Op0.getOpcode() != ISD::ZERO_EXTEND ||
44846 Op0.getOperand(0).getValueType().getVectorElementType() != MVT::i8 ||
44847 Op1.getOpcode() != ISD::ZERO_EXTEND ||
44848 Op1.getOperand(0).getValueType().getVectorElementType() != MVT::i8)
44849 return false;
44850
44851 return true;
44852}
44853
44854static SDValue createVPDPBUSD(SelectionDAG &DAG, SDValue LHS, SDValue RHS,
44855 unsigned &LogBias, const SDLoc &DL,
44856 const X86Subtarget &Subtarget) {
44857 // Extend or truncate to MVT::i8 first.
44858 MVT Vi8VT =
44859 MVT::getVectorVT(MVT::i8, LHS.getValueType().getVectorElementCount());
44860 LHS = DAG.getZExtOrTrunc(LHS, DL, Vi8VT);
44861 RHS = DAG.getSExtOrTrunc(RHS, DL, Vi8VT);
44862
44863 // VPDPBUSD(<16 x i32> C, <16 x i8> A, <16 x i8> B). For each dst element:
44864 // C[0] = C[0] + A[0]*B[0] + A[1]*B[1] + A[2]*B[2] + A[3]*B[3].
44865 // The src A, B element type is i8, but the dst C element type is i32.
44866 // When we count the reduction stages we use the src vector type vXi8,
44867 // so we need a log-bias of 2 to avoid counting 2 extra stages.
44868 LogBias = 2;
44869
44870 unsigned RegSize = std::max(128u, (unsigned)Vi8VT.getSizeInBits());
44871 if (Subtarget.hasVNNI() && !Subtarget.hasVLX())
44872 RegSize = std::max(512u, RegSize);
44873
44874 // "Zero-extend" the i8 vectors. This is not a per-element zext; rather, we
44875 // fill in the missing vector elements with 0.
44876 unsigned NumConcat = RegSize / Vi8VT.getSizeInBits();
44877 SmallVector<SDValue, 16> Ops(NumConcat, DAG.getConstant(0, DL, Vi8VT));
44878 Ops[0] = LHS;
44879 MVT ExtendedVT = MVT::getVectorVT(MVT::i8, RegSize / 8);
44880 SDValue DpOp0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, ExtendedVT, Ops);
44881 Ops[0] = RHS;
44882 SDValue DpOp1 = DAG.getNode(ISD::CONCAT_VECTORS, DL, ExtendedVT, Ops);
44883
44884 // Actually build the DotProduct, split as 256/512 bits for
44885 // AVXVNNI/AVX512VNNI.
44886 auto DpBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
44887 ArrayRef<SDValue> Ops) {
44888 MVT VT = MVT::getVectorVT(MVT::i32, Ops[0].getValueSizeInBits() / 32);
44889 return DAG.getNode(X86ISD::VPDPBUSD, DL, VT, Ops);
44890 };
44891 MVT DpVT = MVT::getVectorVT(MVT::i32, RegSize / 32);
44892 SDValue Zero = DAG.getConstant(0, DL, DpVT);
44893
44894 return SplitOpsAndApply(DAG, Subtarget, DL, DpVT, {Zero, DpOp0, DpOp1},
44895 DpBuilder, false);
44896}
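
// Reference model (illustration only, ignoring the saturating VPDPBUSDS
// variant; the helper name is hypothetical): each i32 lane of VPDPBUSD
// accumulates four u8 x s8 products, so a byte dot product already has
// log2(4) = 2 reduction stages folded into the instruction, which is what
// LogBias = 2 accounts for above.
static int sketchVPDPBUSDLane(int Acc, const unsigned char A[4],
                              const signed char B[4]) {
  for (int i = 0; i != 4; ++i)
    Acc += (int)A[i] * (int)B[i]; // u8 * s8 products summed into the i32 lane.
  return Acc;
}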
44897
44898// Given two zexts of <k x i8> to <k x i32>, create a PSADBW of the inputs
44899// to these zexts.
44900static SDValue createPSADBW(SelectionDAG &DAG, const SDValue &Zext0,
44901 const SDValue &Zext1, const SDLoc &DL,
44902 const X86Subtarget &Subtarget) {
44903 // Find the appropriate width for the PSADBW.
44904 EVT InVT = Zext0.getOperand(0).getValueType();
44905 unsigned RegSize = std::max(128u, (unsigned)InVT.getSizeInBits());
44906
44907 // "Zero-extend" the i8 vectors. This is not a per-element zext; rather, we
44908 // fill in the missing vector elements with 0.
44909 unsigned NumConcat = RegSize / InVT.getSizeInBits();
44910 SmallVector<SDValue, 16> Ops(NumConcat, DAG.getConstant(0, DL, InVT));
44911 Ops[0] = Zext0.getOperand(0);
44912 MVT ExtendedVT = MVT::getVectorVT(MVT::i8, RegSize / 8);
44913 SDValue SadOp0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, ExtendedVT, Ops);
44914 Ops[0] = Zext1.getOperand(0);
44915 SDValue SadOp1 = DAG.getNode(ISD::CONCAT_VECTORS, DL, ExtendedVT, Ops);
44916
44917 // Actually build the SAD, split as 128/256/512 bits for SSE/AVX2/AVX512BW.
44918 auto PSADBWBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
44919 ArrayRef<SDValue> Ops) {
44920 MVT VT = MVT::getVectorVT(MVT::i64, Ops[0].getValueSizeInBits() / 64);
44921 return DAG.getNode(X86ISD::PSADBW, DL, VT, Ops);
44922 };
44923 MVT SadVT = MVT::getVectorVT(MVT::i64, RegSize / 64);
44924 return SplitOpsAndApply(DAG, Subtarget, DL, SadVT, { SadOp0, SadOp1 },
44925 PSADBWBuilder);
44926}
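
// Reference model (illustration only; the helper name is hypothetical): per
// 64-bit lane, PSADBW sums the absolute differences of eight unsigned bytes
// and zero-extends the 16-bit result into the lane, which is why a single
// PSADBW collapses eight byte elements of the reduction at once.
static unsigned long long sketchPSADBWLane(const unsigned char A[8],
                                           const unsigned char B[8]) {
  unsigned long long Sum = 0;
  for (int i = 0; i != 8; ++i)
    Sum += A[i] > B[i] ? A[i] - B[i] : B[i] - A[i]; // |A[i] - B[i]|
  return Sum; // Low 16 bits of the 64-bit lane; upper bits are zero.
}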
44927
44928// Attempt to replace a min/max v8i16/v16i8 horizontal reduction with
44929// PHMINPOSUW.
44930static SDValue combineMinMaxReduction(SDNode *Extract, SelectionDAG &DAG,
44931 const X86Subtarget &Subtarget) {
44932 // Bail without SSE41.
44933 if (!Subtarget.hasSSE41())
44934 return SDValue();
44935
44936 EVT ExtractVT = Extract->getValueType(0);
44937 if (ExtractVT != MVT::i16 && ExtractVT != MVT::i8)
44938 return SDValue();
44939
44940 // Check for SMAX/SMIN/UMAX/UMIN horizontal reduction patterns.
44941 ISD::NodeType BinOp;
44942 SDValue Src = DAG.matchBinOpReduction(
44943 Extract, BinOp, {ISD::SMAX, ISD::SMIN, ISD::UMAX, ISD::UMIN}, true);
44944 if (!Src)
44945 return SDValue();
44946
44947 EVT SrcVT = Src.getValueType();
44948 EVT SrcSVT = SrcVT.getScalarType();
44949 if (SrcSVT != ExtractVT || (SrcVT.getSizeInBits() % 128) != 0)
44950 return SDValue();
44951
44952 SDLoc DL(Extract);
44953 SDValue MinPos = Src;
44954
44955 // First, reduce the source down to 128-bit, applying BinOp to lo/hi.
44956 while (SrcVT.getSizeInBits() > 128) {
44957 SDValue Lo, Hi;
44958 std::tie(Lo, Hi) = splitVector(MinPos, DAG, DL);
44959 SrcVT = Lo.getValueType();
44960 MinPos = DAG.getNode(BinOp, DL, SrcVT, Lo, Hi);
44961 }
44962   assert(((SrcVT == MVT::v8i16 && ExtractVT == MVT::i16) ||
44963           (SrcVT == MVT::v16i8 && ExtractVT == MVT::i8)) &&
44964          "Unexpected value type");
44965
44966 // PHMINPOSUW applies to UMIN(v8i16); for SMIN/SMAX/UMAX we must apply a mask
44967 // to flip the value accordingly.
44968 SDValue Mask;
44969 unsigned MaskEltsBits = ExtractVT.getSizeInBits();
44970 if (BinOp == ISD::SMAX)
44971 Mask = DAG.getConstant(APInt::getSignedMaxValue(MaskEltsBits), DL, SrcVT);
44972 else if (BinOp == ISD::SMIN)
44973 Mask = DAG.getConstant(APInt::getSignedMinValue(MaskEltsBits), DL, SrcVT);
44974 else if (BinOp == ISD::UMAX)
44975 Mask = DAG.getAllOnesConstant(DL, SrcVT);
44976
44977 if (Mask)
44978 MinPos = DAG.getNode(ISD::XOR, DL, SrcVT, Mask, MinPos);
44979
44980 // For v16i8 cases we need to perform UMIN on pairs of byte elements,
44981 // shuffling each upper element down and inserting zeros. This means that the
44982 // v16i8 UMIN will leave each upper element as zero, performing the
44983 // zero-extension ready for the PHMINPOS.
44984 if (ExtractVT == MVT::i8) {
44985 SDValue Upper = DAG.getVectorShuffle(
44986 SrcVT, DL, MinPos, DAG.getConstant(0, DL, MVT::v16i8),
44987 {1, 16, 3, 16, 5, 16, 7, 16, 9, 16, 11, 16, 13, 16, 15, 16});
44988 MinPos = DAG.getNode(ISD::UMIN, DL, SrcVT, MinPos, Upper);
44989 }
44990
44991 // Perform the PHMINPOS on a v8i16 vector.
44992 MinPos = DAG.getBitcast(MVT::v8i16, MinPos);
44993 MinPos = DAG.getNode(X86ISD::PHMINPOS, DL, MVT::v8i16, MinPos);
44994 MinPos = DAG.getBitcast(SrcVT, MinPos);
44995
44996 if (Mask)
44997 MinPos = DAG.getNode(ISD::XOR, DL, SrcVT, Mask, MinPos);
44998
44999 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ExtractVT, MinPos,
45000 DAG.getIntPtrConstant(0, DL));
45001}
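
// Illustration (hypothetical helper, shown for the i16 case): the XOR masks
// above map each reduction onto PHMINPOSUW's unsigned-min domain.
// X ^ 0x7FFF reverses signed order (SMAX -> UMIN), X ^ 0x8000 converts signed
// order to unsigned order (SMIN -> UMIN), and X ^ 0xFFFF reverses unsigned
// order (UMAX -> UMIN); XOR-ing with the same mask afterwards recovers the
// original element value.
static unsigned short sketchToUMinDomain(unsigned short X, unsigned short Mask) {
  return (unsigned short)(X ^ Mask); // Mask is 0x7FFF, 0x8000 or 0xFFFF.
}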
45002
45003// Attempt to replace an all_of/any_of/parity style horizontal reduction with a MOVMSK.
45004static SDValue combinePredicateReduction(SDNode *Extract, SelectionDAG &DAG,
45005 const X86Subtarget &Subtarget) {
45006 // Bail without SSE2.
45007 if (!Subtarget.hasSSE2())
45008 return SDValue();
45009
45010 EVT ExtractVT = Extract->getValueType(0);
45011 unsigned BitWidth = ExtractVT.getSizeInBits();
45012 if (ExtractVT != MVT::i64 && ExtractVT != MVT::i32 && ExtractVT != MVT::i16 &&
45013 ExtractVT != MVT::i8 && ExtractVT != MVT::i1)
45014 return SDValue();
45015
45016 // Check for OR(any_of)/AND(all_of)/XOR(parity) horizontal reduction patterns.
45017 ISD::NodeType BinOp;
45018 SDValue Match = DAG.matchBinOpReduction(Extract, BinOp, {ISD::OR, ISD::AND});
45019 if (!Match && ExtractVT == MVT::i1)
45020 Match = DAG.matchBinOpReduction(Extract, BinOp, {ISD::XOR});
45021 if (!Match)
45022 return SDValue();
45023
45024 // EXTRACT_VECTOR_ELT can require implicit extension of the vector element
45025 // which we can't support here for now.
45026 if (Match.getScalarValueSizeInBits() != BitWidth)
45027 return SDValue();
45028
45029 SDValue Movmsk;
45030 SDLoc DL(Extract);
45031 EVT MatchVT = Match.getValueType();
45032 unsigned NumElts = MatchVT.getVectorNumElements();
45033 unsigned MaxElts = Subtarget.hasInt256() ? 32 : 16;
45034 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
45035 LLVMContext &Ctx = *DAG.getContext();
45036
45037 if (ExtractVT == MVT::i1) {
45038 // Special case for (pre-legalization) vXi1 reductions.
45039 if (NumElts > 64 || !isPowerOf2_32(NumElts))
45040 return SDValue();
45041 if (Match.getOpcode() == ISD::SETCC) {
45042 ISD::CondCode CC = cast<CondCodeSDNode>(Match.getOperand(2))->get();
45043 if ((BinOp == ISD::AND && CC == ISD::CondCode::SETEQ) ||
45044 (BinOp == ISD::OR && CC == ISD::CondCode::SETNE)) {
45045 // For all_of(setcc(x,y,eq)) - use (iX)x == (iX)y.
45046 // For any_of(setcc(x,y,ne)) - use (iX)x != (iX)y.
45047 X86::CondCode X86CC;
45048 SDValue LHS = DAG.getFreeze(Match.getOperand(0));
45049 SDValue RHS = DAG.getFreeze(Match.getOperand(1));
45050 APInt Mask = APInt::getAllOnes(LHS.getScalarValueSizeInBits());
45051 if (SDValue V = LowerVectorAllEqual(DL, LHS, RHS, CC, Mask, Subtarget,
45052 DAG, X86CC))
45053 return DAG.getNode(ISD::TRUNCATE, DL, ExtractVT,
45054 getSETCC(X86CC, V, DL, DAG));
45055 }
45056 }
45057 if (TLI.isTypeLegal(MatchVT)) {
45058 // If this is a legal AVX512 predicate type then we can just bitcast.
45059 EVT MovmskVT = EVT::getIntegerVT(Ctx, NumElts);
45060 Movmsk = DAG.getBitcast(MovmskVT, Match);
45061 } else {
45062 // Use combineBitcastvxi1 to create the MOVMSK.
45063 while (NumElts > MaxElts) {
45064 SDValue Lo, Hi;
45065 std::tie(Lo, Hi) = DAG.SplitVector(Match, DL);
45066 Match = DAG.getNode(BinOp, DL, Lo.getValueType(), Lo, Hi);
45067 NumElts /= 2;
45068 }
45069 EVT MovmskVT = EVT::getIntegerVT(Ctx, NumElts);
45070 Movmsk = combineBitcastvxi1(DAG, MovmskVT, Match, DL, Subtarget);
45071 }
45072 if (!Movmsk)
45073 return SDValue();
45074 Movmsk = DAG.getZExtOrTrunc(Movmsk, DL, NumElts > 32 ? MVT::i64 : MVT::i32);
45075 } else {
45076 // FIXME: Better handling of k-registers or 512-bit vectors?
45077 unsigned MatchSizeInBits = Match.getValueSizeInBits();
45078 if (!(MatchSizeInBits == 128 ||
45079 (MatchSizeInBits == 256 && Subtarget.hasAVX())))
45080 return SDValue();
45081
45082 // Make sure this isn't a vector of 1 element. The perf win from using
45083 // MOVMSK diminishes with fewer elements in the reduction, but it is
45084 // generally better to get the comparison over to the GPRs as soon as
45085 // possible to reduce the number of vector ops.
45086 if (Match.getValueType().getVectorNumElements() < 2)
45087 return SDValue();
45088
45089 // Check that we are extracting a reduction of all sign bits.
45090 if (DAG.ComputeNumSignBits(Match) != BitWidth)
45091 return SDValue();
45092
45093 if (MatchSizeInBits == 256 && BitWidth < 32 && !Subtarget.hasInt256()) {
45094 SDValue Lo, Hi;
45095 std::tie(Lo, Hi) = DAG.SplitVector(Match, DL);
45096 Match = DAG.getNode(BinOp, DL, Lo.getValueType(), Lo, Hi);
45097 MatchSizeInBits = Match.getValueSizeInBits();
45098 }
45099
45100 // For 32/64 bit comparisons use MOVMSKPS/MOVMSKPD, else PMOVMSKB.
45101 MVT MaskSrcVT;
45102 if (64 == BitWidth || 32 == BitWidth)
45103 MaskSrcVT = MVT::getVectorVT(MVT::getFloatingPointVT(BitWidth),
45104 MatchSizeInBits / BitWidth);
45105 else
45106 MaskSrcVT = MVT::getVectorVT(MVT::i8, MatchSizeInBits / 8);
45107
45108 SDValue BitcastLogicOp = DAG.getBitcast(MaskSrcVT, Match);
45109 Movmsk = getPMOVMSKB(DL, BitcastLogicOp, DAG, Subtarget);
45110 NumElts = MaskSrcVT.getVectorNumElements();
45111 }
45112   assert((NumElts <= 32 || NumElts == 64) &&
45113          "Not expecting more than 64 elements");
45114
45115 MVT CmpVT = NumElts == 64 ? MVT::i64 : MVT::i32;
45116 if (BinOp == ISD::XOR) {
45117 // parity -> (PARITY(MOVMSK X))
45118 SDValue Result = DAG.getNode(ISD::PARITY, DL, CmpVT, Movmsk);
45119 return DAG.getZExtOrTrunc(Result, DL, ExtractVT);
45120 }
45121
45122 SDValue CmpC;
45123 ISD::CondCode CondCode;
45124 if (BinOp == ISD::OR) {
45125 // any_of -> MOVMSK != 0
45126 CmpC = DAG.getConstant(0, DL, CmpVT);
45127 CondCode = ISD::CondCode::SETNE;
45128 } else {
45129 // all_of -> MOVMSK == ((1 << NumElts) - 1)
45130 CmpC = DAG.getConstant(APInt::getLowBitsSet(CmpVT.getSizeInBits(), NumElts),
45131 DL, CmpVT);
45132 CondCode = ISD::CondCode::SETEQ;
45133 }
45134
45135 // The setcc produces an i8 of 0/1, so extend that to the result width and
45136 // negate to get the final 0/-1 mask value.
45137 EVT SetccVT = TLI.getSetCCResultType(DAG.getDataLayout(), Ctx, CmpVT);
45138 SDValue Setcc = DAG.getSetCC(DL, SetccVT, Movmsk, CmpC, CondCode);
45139 SDValue Zext = DAG.getZExtOrTrunc(Setcc, DL, ExtractVT);
45140 SDValue Zero = DAG.getConstant(0, DL, ExtractVT);
45141 return DAG.getNode(ISD::SUB, DL, ExtractVT, Zero, Zext);
45142}
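
// Reference model (illustration only; helper names are hypothetical): once the
// per-element compare results sit in a MOVMSK-style bitmask, the reductions
// handled above become plain scalar tests.
static bool sketchAnyOf(unsigned long long Movmsk) { return Movmsk != 0; }
static bool sketchAllOf(unsigned long long Movmsk, unsigned NumElts) {
  // all_of -> MOVMSK == ((1 << NumElts) - 1), assuming NumElts <= 64.
  unsigned long long Full = NumElts >= 64 ? ~0ULL : ((1ULL << NumElts) - 1);
  return Movmsk == Full;
}
static bool sketchParity(unsigned long long Movmsk) {
  bool Odd = false;
  for (; Movmsk; Movmsk &= Movmsk - 1) // Clear one set bit per iteration.
    Odd = !Odd;
  return Odd; // parity -> PARITY(MOVMSK X)
}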
45143
45144static SDValue combineVPDPBUSDPattern(SDNode *Extract, SelectionDAG &DAG,
45145 const X86Subtarget &Subtarget) {
45146 if (!Subtarget.hasVNNI() && !Subtarget.hasAVXVNNI())
45147 return SDValue();
45148
45149 EVT ExtractVT = Extract->getValueType(0);
45150 // Verify the type we're extracting is i32, as the output element type of
45151 // vpdpbusd is i32.
45152 if (ExtractVT != MVT::i32)
45153 return SDValue();
45154
45155 EVT VT = Extract->getOperand(0).getValueType();
45156 if (!isPowerOf2_32(VT.getVectorNumElements()))
45157 return SDValue();
45158
45159 // Match shuffle + add pyramid.
45160 ISD::NodeType BinOp;
45161 SDValue Root = DAG.matchBinOpReduction(Extract, BinOp, {ISD::ADD});
45162
45163 // We can't combine to vpdpbusd for zext, because each of the 4 multiplies
45164 // done by vpdpbusd computes a signed 16-bit product that will be sign-extended
45165 // before being added into the accumulator.
45166 // TODO:
45167 // We also need to verify that the multiply has at least 2x the number of bits
45168 // of the input. We shouldn't match
45169 // (sign_extend (mul (vXi9 (zext (vXi8 X))), (vXi9 (zext (vXi8 Y)))).
45170 // if (Root && (Root.getOpcode() == ISD::SIGN_EXTEND))
45171 // Root = Root.getOperand(0);
45172
45173 // If there was a match, we want Root to be a mul.
45174 if (!Root || Root.getOpcode() != ISD::MUL)
45175 return SDValue();
45176
45177 // Check whether we have an extend and mul pattern
45178 SDValue LHS, RHS;
45179 if (!detectExtMul(DAG, Root, LHS, RHS))
45180 return SDValue();
45181
45182 // Create the dot product instruction.
45183 SDLoc DL(Extract);
45184 unsigned StageBias;
45185 SDValue DP = createVPDPBUSD(DAG, LHS, RHS, StageBias, DL, Subtarget);
45186
45187 // If the original vector was wider than 4 elements, sum over the results
45188 // in the DP vector.
45189 unsigned Stages = Log2_32(VT.getVectorNumElements());
45190 EVT DpVT = DP.getValueType();
45191
45192 if (Stages > StageBias) {
45193 unsigned DpElems = DpVT.getVectorNumElements();
45194
45195 for (unsigned i = Stages - StageBias; i > 0; --i) {
45196 SmallVector<int, 16> Mask(DpElems, -1);
45197 for (unsigned j = 0, MaskEnd = 1 << (i - 1); j < MaskEnd; ++j)
45198 Mask[j] = MaskEnd + j;
45199
45200 SDValue Shuffle =
45201 DAG.getVectorShuffle(DpVT, DL, DP, DAG.getUNDEF(DpVT), Mask);
45202 DP = DAG.getNode(ISD::ADD, DL, DpVT, DP, Shuffle);
45203 }
45204 }
45205
45206 // Return the lowest ExtractSizeInBits bits.
45207 EVT ResVT =
45208 EVT::getVectorVT(*DAG.getContext(), ExtractVT,
45209 DpVT.getSizeInBits() / ExtractVT.getSizeInBits());
45210 DP = DAG.getBitcast(ResVT, DP);
45211 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ExtractVT, DP,
45212 Extract->getOperand(1));
45213}
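
// Worked example (illustration only; the helper is hypothetical): VPDPBUSD
// treats its second source as signed bytes, which is why a zext*zext reduction
// cannot be lowered to it. With a = 3 and b = 200 as unsigned bytes the zext
// product is 3 * 200 = 600, but the instruction would see b as -56 and compute
// 3 * -56 = -168.
static int sketchWhyZextZextIsWrongForVPDPBUSD() {
  unsigned A = 3, B = 200;
  int Expected = (int)(A * B);                      // 600
  int SignedB = (B & 0x80) ? (int)B - 256 : (int)B; // -56, as VPDPBUSD reads it.
  int DotProduct = (int)A * SignedB;                // -168
  return Expected - DotProduct;                     // Non-zero: the results differ.
}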
45214
45215static SDValue combineBasicSADPattern(SDNode *Extract, SelectionDAG &DAG,
45216 const X86Subtarget &Subtarget) {
45217 // PSADBW is only supported on SSE2 and up.
45218 if (!Subtarget.hasSSE2())
45219 return SDValue();
45220
45221 EVT ExtractVT = Extract->getValueType(0);
45222 // Verify the type we're extracting is either i32 or i64.
45223 // FIXME: Could support other types, but this is what we have coverage for.
45224 if (ExtractVT != MVT::i32 && ExtractVT != MVT::i64)
45225 return SDValue();
45226
45227 EVT VT = Extract->getOperand(0).getValueType();
45228 if (!isPowerOf2_32(VT.getVectorNumElements()))
45229 return SDValue();
45230
45231 // Match shuffle + add pyramid.
45232 ISD::NodeType BinOp;
45233 SDValue Root = DAG.matchBinOpReduction(Extract, BinOp, {ISD::ADD});
45234
45235 // The operand is expected to be zero-extended from i8
45236 // (verified in detectZextAbsDiff).
45237 // To convert to i64 and above, an additional any/zero/sign
45238 // extend is expected.
45239 // The zero extend from 32 bits has no mathematical effect on the result.
45240 // The sign extend is also effectively a zero extend
45241 // (it extends the sign bit, which is zero).
45242 // So it is correct to skip the sign/zero extend instruction.
45243 if (Root && (Root.getOpcode() == ISD::SIGN_EXTEND ||
45244 Root.getOpcode() == ISD::ZERO_EXTEND ||
45245 Root.getOpcode() == ISD::ANY_EXTEND))
45246 Root = Root.getOperand(0);
45247
45248 // If there was a match, we want Root to be an ABS node that is the root of
45249 // an abs-diff pattern.
45250 if (!Root || Root.getOpcode() != ISD::ABS)
45251 return SDValue();
45252
45253 // Check whether we have an abs-diff pattern feeding into the ABS.
45254 SDValue Zext0, Zext1;
45255 if (!detectZextAbsDiff(Root, Zext0, Zext1))
45256 return SDValue();
45257
45258 // Create the SAD instruction.
45259 SDLoc DL(Extract);
45260 SDValue SAD = createPSADBW(DAG, Zext0, Zext1, DL, Subtarget);
45261
45262 // If the original vector was wider than 8 elements, sum over the results
45263 // in the SAD vector.
45264 unsigned Stages = Log2_32(VT.getVectorNumElements());
45265 EVT SadVT = SAD.getValueType();
45266 if (Stages > 3) {
45267 unsigned SadElems = SadVT.getVectorNumElements();
45268
45269 for(unsigned i = Stages - 3; i > 0; --i) {
45270 SmallVector<int, 16> Mask(SadElems, -1);
45271 for(unsigned j = 0, MaskEnd = 1 << (i - 1); j < MaskEnd; ++j)
45272 Mask[j] = MaskEnd + j;
45273
45274 SDValue Shuffle =
45275 DAG.getVectorShuffle(SadVT, DL, SAD, DAG.getUNDEF(SadVT), Mask);
45276 SAD = DAG.getNode(ISD::ADD, DL, SadVT, SAD, Shuffle);
45277 }
45278 }
45279
45280 unsigned ExtractSizeInBits = ExtractVT.getSizeInBits();
45281 // Return the lowest ExtractSizeInBits bits.
45282 EVT ResVT = EVT::getVectorVT(*DAG.getContext(), ExtractVT,
45283 SadVT.getSizeInBits() / ExtractSizeInBits);
45284 SAD = DAG.getBitcast(ResVT, SAD);
45285 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ExtractVT, SAD,
45286 Extract->getOperand(1));
45287}
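
// Illustration (hypothetical scalar model of the shuffle+add pyramid above,
// assuming a power-of-2 element count): each step adds the upper half of the
// live partial sums onto the lower half, so log2(Elems) steps leave the total
// in element 0. PSADBW itself already covered log2(8) = 3 stages of the byte
// reduction, hence the "Stages - 3" iterations.
static unsigned long long sketchSumPyramid(unsigned long long Partial[],
                                           unsigned Elems) {
  for (unsigned Half = Elems / 2; Half != 0; Half /= 2)
    for (unsigned i = 0; i != Half; ++i)
      Partial[i] += Partial[i + Half]; // "Shuffle the upper half down" + add.
  return Partial[0];
}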
45288
45289// Attempt to peek through a target shuffle and extract the scalar from the
45290// source.
45291static SDValue combineExtractWithShuffle(SDNode *N, SelectionDAG &DAG,
45292 TargetLowering::DAGCombinerInfo &DCI,
45293 const X86Subtarget &Subtarget) {
45294 if (DCI.isBeforeLegalizeOps())
45295 return SDValue();
45296
45297 SDLoc dl(N);
45298 SDValue Src = N->getOperand(0);
45299 SDValue Idx = N->getOperand(1);
45300
45301 EVT VT = N->getValueType(0);
45302 EVT SrcVT = Src.getValueType();
45303 EVT SrcSVT = SrcVT.getVectorElementType();
45304 unsigned SrcEltBits = SrcSVT.getSizeInBits();
45305 unsigned NumSrcElts = SrcVT.getVectorNumElements();
45306
45307 // Don't attempt this for boolean mask vectors or unknown extraction indices.
45308 if (SrcSVT == MVT::i1 || !isa<ConstantSDNode>(Idx))
45309 return SDValue();
45310
45311 const APInt &IdxC = N->getConstantOperandAPInt(1);
45312 if (IdxC.uge(NumSrcElts))
45313 return SDValue();
45314
45315 SDValue SrcBC = peekThroughBitcasts(Src);
45316
45317 // Handle extract(bitcast(broadcast(scalar_value))).
45318 if (X86ISD::VBROADCAST == SrcBC.getOpcode()) {
45319 SDValue SrcOp = SrcBC.getOperand(0);
45320 EVT SrcOpVT = SrcOp.getValueType();
45321 if (SrcOpVT.isScalarInteger() && VT.isInteger() &&
45322 (SrcOpVT.getSizeInBits() % SrcEltBits) == 0) {
45323 unsigned Scale = SrcOpVT.getSizeInBits() / SrcEltBits;
45324 unsigned Offset = IdxC.urem(Scale) * SrcEltBits;
45325 // TODO support non-zero offsets.
45326 if (Offset == 0) {
45327 SrcOp = DAG.getZExtOrTrunc(SrcOp, dl, SrcVT.getScalarType());
45328 SrcOp = DAG.getZExtOrTrunc(SrcOp, dl, VT);
45329 return SrcOp;
45330 }
45331 }
45332 }
45333
45334 // If we're extracting a single element from a broadcast load and there are
45335 // no other users, just create a single load.
45336 if (SrcBC.getOpcode() == X86ISD::VBROADCAST_LOAD && SrcBC.hasOneUse()) {
45337 auto *MemIntr = cast<MemIntrinsicSDNode>(SrcBC);
45338 unsigned SrcBCWidth = SrcBC.getScalarValueSizeInBits();
45339 if (MemIntr->getMemoryVT().getSizeInBits() == SrcBCWidth &&
45340 VT.getSizeInBits() == SrcBCWidth && SrcEltBits == SrcBCWidth) {
45341 SDValue Load = DAG.getLoad(VT, dl, MemIntr->getChain(),
45342 MemIntr->getBasePtr(),
45343 MemIntr->getPointerInfo(),
45344 MemIntr->getOriginalAlign(),
45345 MemIntr->getMemOperand()->getFlags());
45346 DAG.ReplaceAllUsesOfValueWith(SDValue(MemIntr, 1), Load.getValue(1));
45347 return Load;
45348 }
45349 }
45350
45351 // Handle extract(bitcast(scalar_to_vector(scalar_value))) for integers.
45352 // TODO: Move to DAGCombine?
45353 if (SrcBC.getOpcode() == ISD::SCALAR_TO_VECTOR && VT.isInteger() &&
45354 SrcBC.getValueType().isInteger() &&
45355 (SrcBC.getScalarValueSizeInBits() % SrcEltBits) == 0 &&
45356 SrcBC.getScalarValueSizeInBits() ==
45357 SrcBC.getOperand(0).getValueSizeInBits()) {
45358 unsigned Scale = SrcBC.getScalarValueSizeInBits() / SrcEltBits;
45359 if (IdxC.ult(Scale)) {
45360 unsigned Offset = IdxC.getZExtValue() * SrcVT.getScalarSizeInBits();
45361 SDValue Scl = SrcBC.getOperand(0);
45362 EVT SclVT = Scl.getValueType();
45363 if (Offset) {
45364 Scl = DAG.getNode(ISD::SRL, dl, SclVT, Scl,
45365 DAG.getShiftAmountConstant(Offset, SclVT, dl));
45366 }
45367 Scl = DAG.getZExtOrTrunc(Scl, dl, SrcVT.getScalarType());
45368 Scl = DAG.getZExtOrTrunc(Scl, dl, VT);
45369 return Scl;
45370 }
45371 }
45372
45373 // Handle extract(truncate(x)) for the 0th index.
45374 // TODO: Treat this as a faux shuffle?
45375 // TODO: When can we use this for general indices?
45376 if (ISD::TRUNCATE == Src.getOpcode() && IdxC == 0 &&
45377 (SrcVT.getSizeInBits() % 128) == 0) {
45378 Src = extract128BitVector(Src.getOperand(0), 0, DAG, dl);
45379 MVT ExtractVT = MVT::getVectorVT(SrcSVT.getSimpleVT(), 128 / SrcEltBits);
45380 return DAG.getNode(N->getOpcode(), dl, VT, DAG.getBitcast(ExtractVT, Src),
45381 Idx);
45382 }
45383
45384 // We can only legally extract other elements from 128-bit vectors and in
45385 // certain circumstances, depending on SSE-level.
45386 // TODO: Investigate float/double extraction if it will be just stored.
45387 auto GetLegalExtract = [&Subtarget, &DAG, &dl](SDValue Vec, EVT VecVT,
45388 unsigned Idx) {
45389 EVT VecSVT = VecVT.getScalarType();
45390 if ((VecVT.is256BitVector() || VecVT.is512BitVector()) &&
45391 (VecSVT == MVT::i8 || VecSVT == MVT::i16 || VecSVT == MVT::i32 ||
45392 VecSVT == MVT::i64)) {
45393 unsigned EltSizeInBits = VecSVT.getSizeInBits();
45394 unsigned NumEltsPerLane = 128 / EltSizeInBits;
45395 unsigned LaneOffset = (Idx & ~(NumEltsPerLane - 1)) * EltSizeInBits;
45396 unsigned LaneIdx = LaneOffset / Vec.getScalarValueSizeInBits();
45397 VecVT = EVT::getVectorVT(*DAG.getContext(), VecSVT, NumEltsPerLane);
45398 Vec = extract128BitVector(Vec, LaneIdx, DAG, dl);
45399 Idx &= (NumEltsPerLane - 1);
45400 }
45401 if ((VecVT == MVT::v4i32 || VecVT == MVT::v2i64) &&
45402 ((Idx == 0 && Subtarget.hasSSE2()) || Subtarget.hasSSE41())) {
45403 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VecVT.getScalarType(),
45404 DAG.getBitcast(VecVT, Vec),
45405 DAG.getIntPtrConstant(Idx, dl));
45406 }
45407 if ((VecVT == MVT::v8i16 && Subtarget.hasSSE2()) ||
45408 (VecVT == MVT::v16i8 && Subtarget.hasSSE41())) {
45409 unsigned OpCode = (VecVT == MVT::v8i16 ? X86ISD::PEXTRW : X86ISD::PEXTRB);
45410 return DAG.getNode(OpCode, dl, MVT::i32, DAG.getBitcast(VecVT, Vec),
45411 DAG.getTargetConstant(Idx, dl, MVT::i8));
45412 }
45413 return SDValue();
45414 };
45415
45416 // Resolve the target shuffle inputs and mask.
45417 SmallVector<int, 16> Mask;
45418 SmallVector<SDValue, 2> Ops;
45419 if (!getTargetShuffleInputs(SrcBC, Ops, Mask, DAG))
45420 return SDValue();
45421
45422 // Shuffle inputs must be the same size as the result.
45423 if (llvm::any_of(Ops, [SrcVT](SDValue Op) {
45424 return SrcVT.getSizeInBits() != Op.getValueSizeInBits();
45425 }))
45426 return SDValue();
45427
45428 // Attempt to narrow/widen the shuffle mask to the correct size.
45429 if (Mask.size() != NumSrcElts) {
45430 if ((NumSrcElts % Mask.size()) == 0) {
45431 SmallVector<int, 16> ScaledMask;
45432 int Scale = NumSrcElts / Mask.size();
45433 narrowShuffleMaskElts(Scale, Mask, ScaledMask);
45434 Mask = std::move(ScaledMask);
45435 } else if ((Mask.size() % NumSrcElts) == 0) {
45436 // Simplify Mask based on demanded element.
45437 int ExtractIdx = (int)IdxC.getZExtValue();
45438 int Scale = Mask.size() / NumSrcElts;
45439 int Lo = Scale * ExtractIdx;
45440 int Hi = Scale * (ExtractIdx + 1);
45441 for (int i = 0, e = (int)Mask.size(); i != e; ++i)
45442 if (i < Lo || Hi <= i)
45443 Mask[i] = SM_SentinelUndef;
45444
45445 SmallVector<int, 16> WidenedMask;
45446 while (Mask.size() > NumSrcElts &&
45447 canWidenShuffleElements(Mask, WidenedMask))
45448 Mask = std::move(WidenedMask);
45449 }
45450 }
45451
45452 // If narrowing/widening failed, see if we can extract+zero-extend.
45453 int ExtractIdx;
45454 EVT ExtractVT;
45455 if (Mask.size() == NumSrcElts) {
45456 ExtractIdx = Mask[IdxC.getZExtValue()];
45457 ExtractVT = SrcVT;
45458 } else {
45459 unsigned Scale = Mask.size() / NumSrcElts;
45460 if ((Mask.size() % NumSrcElts) != 0 || SrcVT.isFloatingPoint())
45461 return SDValue();
45462 unsigned ScaledIdx = Scale * IdxC.getZExtValue();
45463 if (!isUndefOrZeroInRange(Mask, ScaledIdx + 1, Scale - 1))
45464 return SDValue();
45465 ExtractIdx = Mask[ScaledIdx];
45466 EVT ExtractSVT = EVT::getIntegerVT(*DAG.getContext(), SrcEltBits / Scale);
45467 ExtractVT = EVT::getVectorVT(*DAG.getContext(), ExtractSVT, Mask.size());
45468     assert(SrcVT.getSizeInBits() == ExtractVT.getSizeInBits() &&
45469            "Failed to widen vector type");
45470 }
45471
45472 // If the shuffle source element is undef/zero then we can just accept it.
45473 if (ExtractIdx == SM_SentinelUndef)
45474 return DAG.getUNDEF(VT);
45475
45476 if (ExtractIdx == SM_SentinelZero)
45477 return VT.isFloatingPoint() ? DAG.getConstantFP(0.0, dl, VT)
45478 : DAG.getConstant(0, dl, VT);
45479
45480 SDValue SrcOp = Ops[ExtractIdx / Mask.size()];
45481 ExtractIdx = ExtractIdx % Mask.size();
45482 if (SDValue V = GetLegalExtract(SrcOp, ExtractVT, ExtractIdx))
45483 return DAG.getZExtOrTrunc(V, dl, VT);
45484
45485 return SDValue();
45486}
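
// Illustration (hypothetical scalar model): extracting a known element through
// a resolved shuffle is a table lookup -- the mask entry selects which shuffle
// input and which element within it to read, matching the
// Ops[ExtractIdx / Mask.size()] / ExtractIdx % Mask.size() step above.
// Sentinel (negative) entries are handled by the caller.
static int sketchExtractThroughShuffle(const int Mask[], unsigned MaskSize,
                                       unsigned Idx, unsigned &OpIndex) {
  int M = Mask[Idx];
  if (M < 0)
    return -1;                      // Undef/zero sentinel.
  OpIndex = (unsigned)M / MaskSize; // Which shuffle input to read from.
  return M % (int)MaskSize;         // Element index within that input.
}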
45487
45488/// Extracting a scalar FP value from vector element 0 is free, so extract each
45489/// operand first, then perform the math as a scalar op.
45490static SDValue scalarizeExtEltFP(SDNode *ExtElt, SelectionDAG &DAG,
45491 const X86Subtarget &Subtarget) {
45492   assert(ExtElt->getOpcode() == ISD::EXTRACT_VECTOR_ELT && "Expected extract");
45493 SDValue Vec = ExtElt->getOperand(0);
45494 SDValue Index = ExtElt->getOperand(1);
45495 EVT VT = ExtElt->getValueType(0);
45496 EVT VecVT = Vec.getValueType();
45497
45498 // TODO: If this is a unary/expensive/expand op, allow extraction from a
45499 // non-zero element because the shuffle+scalar op will be cheaper?
45500 if (!Vec.hasOneUse() || !isNullConstant(Index) || VecVT.getScalarType() != VT)
45501 return SDValue();
45502
45503 // Vector FP compares don't fit the pattern of FP math ops (propagate, not
45504 // extract, the condition code), so deal with those as a special-case.
45505 if (Vec.getOpcode() == ISD::SETCC && VT == MVT::i1) {
45506 EVT OpVT = Vec.getOperand(0).getValueType().getScalarType();
45507 if (OpVT != MVT::f32 && OpVT != MVT::f64)
45508 return SDValue();
45509
45510 // extract (setcc X, Y, CC), 0 --> setcc (extract X, 0), (extract Y, 0), CC
45511 SDLoc DL(ExtElt);
45512 SDValue Ext0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, OpVT,
45513 Vec.getOperand(0), Index);
45514 SDValue Ext1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, OpVT,
45515 Vec.getOperand(1), Index);
45516 return DAG.getNode(Vec.getOpcode(), DL, VT, Ext0, Ext1, Vec.getOperand(2));
45517 }
45518
45519 if (!(VT == MVT::f16 && Subtarget.hasFP16()) && VT != MVT::f32 &&
45520 VT != MVT::f64)
45521 return SDValue();
45522
45523 // Vector FP selects don't fit the pattern of FP math ops (because the
45524 // condition has a different type and we have to change the opcode), so deal
45525 // with those here.
45526 // FIXME: This is restricted to pre type legalization by ensuring the setcc
45527 // has i1 elements. If we loosen this we need to convert vector bool to a
45528 // scalar bool.
45529 if (Vec.getOpcode() == ISD::VSELECT &&
45530 Vec.getOperand(0).getOpcode() == ISD::SETCC &&
45531 Vec.getOperand(0).getValueType().getScalarType() == MVT::i1 &&
45532 Vec.getOperand(0).getOperand(0).getValueType() == VecVT) {
45533 // ext (sel Cond, X, Y), 0 --> sel (ext Cond, 0), (ext X, 0), (ext Y, 0)
45534 SDLoc DL(ExtElt);
45535 SDValue Ext0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL,
45536 Vec.getOperand(0).getValueType().getScalarType(),
45537 Vec.getOperand(0), Index);
45538 SDValue Ext1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT,
45539 Vec.getOperand(1), Index);
45540 SDValue Ext2 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT,
45541 Vec.getOperand(2), Index);
45542 return DAG.getNode(ISD::SELECT, DL, VT, Ext0, Ext1, Ext2);
45543 }
45544
45545 // TODO: This switch could include FNEG and the x86-specific FP logic ops
45546 // (FAND, FANDN, FOR, FXOR). But that may require enhancements to avoid
45547 // missed load folding and fma+fneg combining.
45548 switch (Vec.getOpcode()) {
45549 case ISD::FMA: // Begin 3 operands
45550 case ISD::FMAD:
45551 case ISD::FADD: // Begin 2 operands
45552 case ISD::FSUB:
45553 case ISD::FMUL:
45554 case ISD::FDIV:
45555 case ISD::FREM:
45556 case ISD::FCOPYSIGN:
45557 case ISD::FMINNUM:
45558 case ISD::FMAXNUM:
45559 case ISD::FMINNUM_IEEE:
45560 case ISD::FMAXNUM_IEEE:
45561 case ISD::FMAXIMUM:
45562 case ISD::FMINIMUM:
45563 case X86ISD::FMAX:
45564 case X86ISD::FMIN:
45565 case ISD::FABS: // Begin 1 operand
45566 case ISD::FSQRT:
45567 case ISD::FRINT:
45568 case ISD::FCEIL:
45569 case ISD::FTRUNC:
45570 case ISD::FNEARBYINT:
45571 case ISD::FROUND:
45572 case ISD::FFLOOR:
45573 case X86ISD::FRCP:
45574 case X86ISD::FRSQRT: {
45575 // extract (fp X, Y, ...), 0 --> fp (extract X, 0), (extract Y, 0), ...
45576 SDLoc DL(ExtElt);
45577 SmallVector<SDValue, 4> ExtOps;
45578 for (SDValue Op : Vec->ops())
45579 ExtOps.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Op, Index));
45580 return DAG.getNode(Vec.getOpcode(), DL, VT, ExtOps);
45581 }
45582 default:
45583 return SDValue();
45584 }
45585   llvm_unreachable("All opcodes should return within switch");
45586}
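
// Illustration (hypothetical): the rewrite above relies on the identity
// extractelt(fop(X, Y), 0) == fop(extractelt(X, 0), extractelt(Y, 0)) for
// element-wise FP ops -- lane 0 of the vector op depends only on lane 0 of
// each operand, shown here for an FADD.
static double sketchScalarizedLane0(const double X[], const double Y[]) {
  return X[0] + Y[0]; // Equals element 0 of the vector FADD of X and Y.
}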
45587
45588/// Try to convert a vector reduction sequence composed of binops and shuffles
45589/// into horizontal ops.
45590static SDValue combineArithReduction(SDNode *ExtElt, SelectionDAG &DAG,
45591 const X86Subtarget &Subtarget) {
45592   assert(ExtElt->getOpcode() == ISD::EXTRACT_VECTOR_ELT && "Unexpected caller");
45593
45594 // We need at least SSE2 to do anything here.
45595 if (!Subtarget.hasSSE2())
45596 return SDValue();
45597
45598 ISD::NodeType Opc;
45599 SDValue Rdx = DAG.matchBinOpReduction(ExtElt, Opc,
45600 {ISD::ADD, ISD::MUL, ISD::FADD}, true);
45601 if (!Rdx)
45602 return SDValue();
45603
45604 SDValue Index = ExtElt->getOperand(1);
45605   assert(isNullConstant(Index) &&
45606          "Reduction doesn't end in an extract from index 0");
45607
45608 EVT VT = ExtElt->getValueType(0);
45609 EVT VecVT = Rdx.getValueType();
45610 if (VecVT.getScalarType() != VT)
45611 return SDValue();
45612
45613 SDLoc DL(ExtElt);
45614 unsigned NumElts = VecVT.getVectorNumElements();
45615 unsigned EltSizeInBits = VecVT.getScalarSizeInBits();
45616
45617 // Extend v4i8/v8i8 vector to v16i8, with undef upper 64-bits.
45618 auto WidenToV16I8 = [&](SDValue V, bool ZeroExtend) {
45619 if (V.getValueType() == MVT::v4i8) {
45620 if (ZeroExtend && Subtarget.hasSSE41()) {
45621 V = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, MVT::v4i32,
45622 DAG.getConstant(0, DL, MVT::v4i32),
45623 DAG.getBitcast(MVT::i32, V),
45624 DAG.getIntPtrConstant(0, DL));
45625 return DAG.getBitcast(MVT::v16i8, V);
45626 }
45627 V = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v8i8, V,
45628 ZeroExtend ? DAG.getConstant(0, DL, MVT::v4i8)
45629 : DAG.getUNDEF(MVT::v4i8));
45630 }
45631 return DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v16i8, V,
45632 DAG.getUNDEF(MVT::v8i8));
45633 };
45634
45635 // vXi8 mul reduction - promote to vXi16 mul reduction.
45636 if (Opc == ISD::MUL) {
45637 if (VT != MVT::i8 || NumElts < 4 || !isPowerOf2_32(NumElts))
45638 return SDValue();
45639 if (VecVT.getSizeInBits() >= 128) {
45640 EVT WideVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16, NumElts / 2);
45641 SDValue Lo = getUnpackl(DAG, DL, VecVT, Rdx, DAG.getUNDEF(VecVT));
45642 SDValue Hi = getUnpackh(DAG, DL, VecVT, Rdx, DAG.getUNDEF(VecVT));
45643 Lo = DAG.getBitcast(WideVT, Lo);
45644 Hi = DAG.getBitcast(WideVT, Hi);
45645 Rdx = DAG.getNode(Opc, DL, WideVT, Lo, Hi);
45646 while (Rdx.getValueSizeInBits() > 128) {
45647 std::tie(Lo, Hi) = splitVector(Rdx, DAG, DL);
45648 Rdx = DAG.getNode(Opc, DL, Lo.getValueType(), Lo, Hi);
45649 }
45650 } else {
45651 Rdx = WidenToV16I8(Rdx, false);
45652 Rdx = getUnpackl(DAG, DL, MVT::v16i8, Rdx, DAG.getUNDEF(MVT::v16i8));
45653 Rdx = DAG.getBitcast(MVT::v8i16, Rdx);
45654 }
45655 if (NumElts >= 8)
45656 Rdx = DAG.getNode(Opc, DL, MVT::v8i16, Rdx,
45657 DAG.getVectorShuffle(MVT::v8i16, DL, Rdx, Rdx,
45658 {4, 5, 6, 7, -1, -1, -1, -1}));
45659 Rdx = DAG.getNode(Opc, DL, MVT::v8i16, Rdx,
45660 DAG.getVectorShuffle(MVT::v8i16, DL, Rdx, Rdx,
45661 {2, 3, -1, -1, -1, -1, -1, -1}));
45662 Rdx = DAG.getNode(Opc, DL, MVT::v8i16, Rdx,
45663 DAG.getVectorShuffle(MVT::v8i16, DL, Rdx, Rdx,
45664 {1, -1, -1, -1, -1, -1, -1, -1}));
45665 Rdx = DAG.getBitcast(MVT::v16i8, Rdx);
45666 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Rdx, Index);
45667 }
45668
45669 // vXi8 add reduction - sub-128-bit vector.
45670 if (VecVT == MVT::v4i8 || VecVT == MVT::v8i8) {
45671 Rdx = WidenToV16I8(Rdx, true);
45672 Rdx = DAG.getNode(X86ISD::PSADBW, DL, MVT::v2i64, Rdx,
45673 DAG.getConstant(0, DL, MVT::v16i8));
45674 Rdx = DAG.getBitcast(MVT::v16i8, Rdx);
45675 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Rdx, Index);
45676 }
45677
45678 // Must be a >=128-bit vector with pow2 elements.
45679 if ((VecVT.getSizeInBits() % 128) != 0 || !isPowerOf2_32(NumElts))
45680 return SDValue();
45681
45682 // vXi8 add reduction - sum lo/hi halves then use PSADBW.
45683 if (VT == MVT::i8) {
45684 while (Rdx.getValueSizeInBits() > 128) {
45685 SDValue Lo, Hi;
45686 std::tie(Lo, Hi) = splitVector(Rdx, DAG, DL);
45687 VecVT = Lo.getValueType();
45688 Rdx = DAG.getNode(ISD::ADD, DL, VecVT, Lo, Hi);
45689 }
45690     assert(VecVT == MVT::v16i8 && "v16i8 reduction expected");
45691
45692 SDValue Hi = DAG.getVectorShuffle(
45693 MVT::v16i8, DL, Rdx, Rdx,
45694 {8, 9, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1});
45695 Rdx = DAG.getNode(ISD::ADD, DL, MVT::v16i8, Rdx, Hi);
45696 Rdx = DAG.getNode(X86ISD::PSADBW, DL, MVT::v2i64, Rdx,
45697 getZeroVector(MVT::v16i8, Subtarget, DAG, DL));
45698 Rdx = DAG.getBitcast(MVT::v16i8, Rdx);
45699 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Rdx, Index);
45700 }
45701
45702 // See if we can use vXi8 PSADBW add reduction for larger zext types.
45703 // If the source vector values are 0-255, then we can use PSADBW to
45704 // sum+zext v8i8 subvectors to vXi64, then perform the reduction.
45705 // TODO: See if it's worth avoiding vXi16/i32 truncations?
45706 if (Opc == ISD::ADD && NumElts >= 4 && EltSizeInBits >= 16 &&
45707 DAG.computeKnownBits(Rdx).getMaxValue().ule(255) &&
45708 (EltSizeInBits == 16 || Rdx.getOpcode() == ISD::ZERO_EXTEND ||
45709 Subtarget.hasAVX512())) {
45710 EVT ByteVT = VecVT.changeVectorElementType(MVT::i8);
45711 Rdx = DAG.getNode(ISD::TRUNCATE, DL, ByteVT, Rdx);
45712 if (ByteVT.getSizeInBits() < 128)
45713 Rdx = WidenToV16I8(Rdx, true);
45714
45715 // Build the PSADBW, split as 128/256/512 bits for SSE/AVX2/AVX512BW.
45716 auto PSADBWBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
45717 ArrayRef<SDValue> Ops) {
45718 MVT VT = MVT::getVectorVT(MVT::i64, Ops[0].getValueSizeInBits() / 64);
45719 SDValue Zero = DAG.getConstant(0, DL, Ops[0].getValueType());
45720 return DAG.getNode(X86ISD::PSADBW, DL, VT, Ops[0], Zero);
45721 };
45722 MVT SadVT = MVT::getVectorVT(MVT::i64, Rdx.getValueSizeInBits() / 64);
45723 Rdx = SplitOpsAndApply(DAG, Subtarget, DL, SadVT, {Rdx}, PSADBWBuilder);
45724
45725 // TODO: We could truncate to vXi16/vXi32 before performing the reduction.
45726 while (Rdx.getValueSizeInBits() > 128) {
45727 SDValue Lo, Hi;
45728 std::tie(Lo, Hi) = splitVector(Rdx, DAG, DL);
45729 VecVT = Lo.getValueType();
45730 Rdx = DAG.getNode(ISD::ADD, DL, VecVT, Lo, Hi);
45731 }
45732     assert(Rdx.getValueType() == MVT::v2i64 && "v2i64 reduction expected");
45733
45734 if (NumElts > 8) {
45735 SDValue RdxHi = DAG.getVectorShuffle(MVT::v2i64, DL, Rdx, Rdx, {1, -1});
45736 Rdx = DAG.getNode(ISD::ADD, DL, MVT::v2i64, Rdx, RdxHi);
45737 }
45738
45739 VecVT = MVT::getVectorVT(VT.getSimpleVT(), 128 / VT.getSizeInBits());
45740 Rdx = DAG.getBitcast(VecVT, Rdx);
45741 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Rdx, Index);
45742 }
45743
45744 // Only use (F)HADD opcodes if they aren't microcoded or if it minimizes codesize.
45745 if (!shouldUseHorizontalOp(true, DAG, Subtarget))
45746 return SDValue();
45747
45748 unsigned HorizOpcode = Opc == ISD::ADD ? X86ISD::HADD : X86ISD::FHADD;
45749
45750 // 256-bit horizontal instructions operate on 128-bit chunks rather than
45751 // across the whole vector, so we need an extract + hop preliminary stage.
45752 // This is the only step where the operands of the hop are not the same value.
45753 // TODO: We could extend this to handle 512-bit or even longer vectors.
45754 if (((VecVT == MVT::v16i16 || VecVT == MVT::v8i32) && Subtarget.hasSSSE3()) ||
45755 ((VecVT == MVT::v8f32 || VecVT == MVT::v4f64) && Subtarget.hasSSE3())) {
45756 unsigned NumElts = VecVT.getVectorNumElements();
45757 SDValue Hi = extract128BitVector(Rdx, NumElts / 2, DAG, DL);
45758 SDValue Lo = extract128BitVector(Rdx, 0, DAG, DL);
45759 Rdx = DAG.getNode(HorizOpcode, DL, Lo.getValueType(), Hi, Lo);
45760 VecVT = Rdx.getValueType();
45761 }
45762 if (!((VecVT == MVT::v8i16 || VecVT == MVT::v4i32) && Subtarget.hasSSSE3()) &&
45763 !((VecVT == MVT::v4f32 || VecVT == MVT::v2f64) && Subtarget.hasSSE3()))
45764 return SDValue();
45765
45766 // extract (add (shuf X), X), 0 --> extract (hadd X, X), 0
45767 unsigned ReductionSteps = Log2_32(VecVT.getVectorNumElements());
45768 for (unsigned i = 0; i != ReductionSteps; ++i)
45769 Rdx = DAG.getNode(HorizOpcode, DL, VecVT, Rdx, Rdx);
45770
45771 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Rdx, Index);
45772}
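
// Reference model (illustration only; the helper is hypothetical): one
// 128-bit HADD step with both operands equal replaces each adjacent pair with
// its sum and duplicates the lower half into the upper half, so applying it
// log2(N) times leaves the full reduction in element 0, as in the loop above.
static void sketchHaddSelf(int X[], unsigned N) {
  for (unsigned i = 0; i != N / 2; ++i)
    X[i] = X[2 * i] + X[2 * i + 1]; // Pairwise sums into the lower half.
  for (unsigned i = N / 2; i != N; ++i)
    X[i] = X[i - N / 2];            // HADD(X, X) repeats them in the upper half.
}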
45773
45774/// Detect vector gather/scatter index generation and convert it from being a
45775/// bunch of shuffles and extracts into a somewhat faster sequence.
45776/// For i686, the best sequence is apparently storing the value and loading
45777/// scalars back, while for x64 we should use 64-bit extracts and shifts.
45778static SDValue combineExtractVectorElt(SDNode *N, SelectionDAG &DAG,
45779 TargetLowering::DAGCombinerInfo &DCI,
45780 const X86Subtarget &Subtarget) {
45781 if (SDValue NewOp = combineExtractWithShuffle(N, DAG, DCI, Subtarget))
45782 return NewOp;
45783
45784 SDValue InputVector = N->getOperand(0);
45785 SDValue EltIdx = N->getOperand(1);
45786 auto *CIdx = dyn_cast<ConstantSDNode>(EltIdx);
45787
45788 EVT SrcVT = InputVector.getValueType();
45789 EVT VT = N->getValueType(0);
45790 SDLoc dl(InputVector);
45791 bool IsPextr = N->getOpcode() != ISD::EXTRACT_VECTOR_ELT;
45792 unsigned NumSrcElts = SrcVT.getVectorNumElements();
45793 unsigned NumEltBits = VT.getScalarSizeInBits();
45794 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
45795
45796 if (CIdx && CIdx->getAPIntValue().uge(NumSrcElts))
45797 return IsPextr ? DAG.getConstant(0, dl, VT) : DAG.getUNDEF(VT);
45798
45799 // Integer Constant Folding.
45800 if (CIdx && VT.isInteger()) {
45801 APInt UndefVecElts;
45802 SmallVector<APInt, 16> EltBits;
45803 unsigned VecEltBitWidth = SrcVT.getScalarSizeInBits();
45804 if (getTargetConstantBitsFromNode(InputVector, VecEltBitWidth, UndefVecElts,
45805 EltBits, true, false)) {
45806 uint64_t Idx = CIdx->getZExtValue();
45807 if (UndefVecElts[Idx])
45808 return IsPextr ? DAG.getConstant(0, dl, VT) : DAG.getUNDEF(VT);
45809 return DAG.getConstant(EltBits[Idx].zext(NumEltBits), dl, VT);
45810 }
45811
45812 // Convert extract_element(bitcast(<X x i1>)) -> bitcast(extract_subvector()).
45813 // Improves lowering of bool masks on Rust, which splits them into a byte array.
45814 if (InputVector.getOpcode() == ISD::BITCAST && (NumEltBits % 8) == 0) {
45815 SDValue Src = peekThroughBitcasts(InputVector);
45816 if (Src.getValueType().getScalarType() == MVT::i1 &&
45817 TLI.isTypeLegal(Src.getValueType())) {
45818 MVT SubVT = MVT::getVectorVT(MVT::i1, NumEltBits);
45819 SDValue Sub = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, SubVT, Src,
45820 DAG.getIntPtrConstant(CIdx->getZExtValue() * NumEltBits, dl));
45821 return DAG.getBitcast(VT, Sub);
45822 }
45823 }
45824 }
45825
45826 if (IsPextr) {
45827 if (TLI.SimplifyDemandedBits(SDValue(N, 0), APInt::getAllOnes(NumEltBits),
45828 DCI))
45829 return SDValue(N, 0);
45830
45831 // PEXTR*(PINSR*(v, s, c), c) -> s (with implicit zext handling).
45832 if ((InputVector.getOpcode() == X86ISD::PINSRB ||
45833 InputVector.getOpcode() == X86ISD::PINSRW) &&
45834 InputVector.getOperand(2) == EltIdx) {
45835       assert(SrcVT == InputVector.getOperand(0).getValueType() &&
45836              "Vector type mismatch");
45837 SDValue Scl = InputVector.getOperand(1);
45838 Scl = DAG.getNode(ISD::TRUNCATE, dl, SrcVT.getScalarType(), Scl);
45839 return DAG.getZExtOrTrunc(Scl, dl, VT);
45840 }
45841
45842 // TODO - Remove this once we can handle the implicit zero-extension of
45843 // X86ISD::PEXTRW/X86ISD::PEXTRB in combinePredicateReduction and
45844 // combineBasicSADPattern.
45845 return SDValue();
45846 }
45847
45848 // Detect mmx extraction of all bits as an i64. It works better as a bitcast.
45849 if (VT == MVT::i64 && SrcVT == MVT::v1i64 &&
45850 InputVector.getOpcode() == ISD::BITCAST &&
45851 InputVector.getOperand(0).getValueType() == MVT::x86mmx &&
45852 isNullConstant(EltIdx) && InputVector.hasOneUse())
45853 return DAG.getBitcast(VT, InputVector);
45854
45855 // Detect mmx to i32 conversion through a v2i32 elt extract.
45856 if (VT == MVT::i32 && SrcVT == MVT::v2i32 &&
45857 InputVector.getOpcode() == ISD::BITCAST &&
45858 InputVector.getOperand(0).getValueType() == MVT::x86mmx &&
45859 isNullConstant(EltIdx) && InputVector.hasOneUse())
45860 return DAG.getNode(X86ISD::MMX_MOVD2W, dl, MVT::i32,
45861 InputVector.getOperand(0));
45862
45863 // Check whether this extract is the root of a sum of absolute differences
45864 // pattern. This has to be done here because we really want it to happen
45865 // pre-legalization.
45866 if (SDValue SAD = combineBasicSADPattern(N, DAG, Subtarget))
45867 return SAD;
45868
45869 if (SDValue VPDPBUSD = combineVPDPBUSDPattern(N, DAG, Subtarget))
45870 return VPDPBUSD;
45871
45872 // Attempt to replace an all_of/any_of horizontal reduction with a MOVMSK.
45873 if (SDValue Cmp = combinePredicateReduction(N, DAG, Subtarget))
45874 return Cmp;
45875
45876 // Attempt to replace min/max v8i16/v16i8 reductions with PHMINPOSUW.
45877 if (SDValue MinMax = combineMinMaxReduction(N, DAG, Subtarget))
45878 return MinMax;
45879
45880 // Attempt to optimize ADD/FADD/MUL reductions with HADD, promotion etc..
45881 if (SDValue V = combineArithReduction(N, DAG, Subtarget))
45882 return V;
45883
45884 if (SDValue V = scalarizeExtEltFP(N, DAG, Subtarget))
45885 return V;
45886
45887 // Attempt to extract a i1 element by using MOVMSK to extract the signbits
45888 // and then testing the relevant element.
45889 //
45890 // Note that we only combine extracts on the *same* result number, i.e.
45891 // t0 = merge_values a0, a1, a2, a3
45892 // i1 = extract_vector_elt t0, Constant:i64<2>
45893 // i1 = extract_vector_elt t0, Constant:i64<3>
45894 // but not
45895 // i1 = extract_vector_elt t0:1, Constant:i64<2>
45896 // since the latter would need its own MOVMSK.
45897 if (SrcVT.getScalarType() == MVT::i1) {
45898 bool IsVar = !CIdx;
45899 SmallVector<SDNode *, 16> BoolExtracts;
45900 unsigned ResNo = InputVector.getResNo();
45901 auto IsBoolExtract = [&BoolExtracts, &ResNo, &IsVar](SDNode *Use) {
45902 if (Use->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
45903 Use->getOperand(0).getResNo() == ResNo &&
45904 Use->getValueType(0) == MVT::i1) {
45905 BoolExtracts.push_back(Use);
45906 IsVar |= !isa<ConstantSDNode>(Use->getOperand(1));
45907 return true;
45908 }
45909 return false;
45910 };
45911 // TODO: Can we drop the oneuse check for constant extracts?
45912 if (all_of(InputVector->uses(), IsBoolExtract) &&
45913 (IsVar || BoolExtracts.size() > 1)) {
45914 EVT BCVT = EVT::getIntegerVT(*DAG.getContext(), NumSrcElts);
45915 if (SDValue BC =
45916 combineBitcastvxi1(DAG, BCVT, InputVector, dl, Subtarget)) {
45917 for (SDNode *Use : BoolExtracts) {
45918 // extractelement vXi1 X, MaskIdx --> ((movmsk X) & Mask) == Mask
45919 // Mask = 1 << MaskIdx
45920 SDValue MaskIdx = DAG.getZExtOrTrunc(Use->getOperand(1), dl, MVT::i8);
45921 SDValue MaskBit = DAG.getConstant(1, dl, BCVT);
45922 SDValue Mask = DAG.getNode(ISD::SHL, dl, BCVT, MaskBit, MaskIdx);
45923 SDValue Res = DAG.getNode(ISD::AND, dl, BCVT, BC, Mask);
45924 Res = DAG.getSetCC(dl, MVT::i1, Res, Mask, ISD::SETEQ);
45925 DCI.CombineTo(Use, Res);
45926 }
45927 return SDValue(N, 0);
45928 }
45929 }
45930 }
45931
45932 // If this extract is from a loaded vector value and will be used as an
45933 // integer, that requires a potentially expensive XMM -> GPR transfer.
45934 // Additionally, if we can convert to a scalar integer load, that will likely
45935 // be folded into a subsequent integer op.
45936 // Note: Unlike the related fold for this in DAGCombiner, this is not limited
45937 // to a single use of the loaded vector. For the reasons above, we
45938 // expect this to be profitable even if it creates an extra load.
45939 bool LikelyUsedAsVector = any_of(N->uses(), [](SDNode *Use) {
45940 return Use->getOpcode() == ISD::STORE ||
45941 Use->getOpcode() == ISD::INSERT_VECTOR_ELT ||
45942 Use->getOpcode() == ISD::SCALAR_TO_VECTOR;
45943 });
45944 auto *LoadVec = dyn_cast<LoadSDNode>(InputVector);
45945 if (LoadVec && CIdx && ISD::isNormalLoad(LoadVec) && VT.isInteger() &&
45946 SrcVT.getVectorElementType() == VT && DCI.isAfterLegalizeDAG() &&
45947 !LikelyUsedAsVector && LoadVec->isSimple()) {
45948 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
45949 SDValue NewPtr =
45950 TLI.getVectorElementPointer(DAG, LoadVec->getBasePtr(), SrcVT, EltIdx);
45951 unsigned PtrOff = VT.getSizeInBits() * CIdx->getZExtValue() / 8;
45952 MachinePointerInfo MPI = LoadVec->getPointerInfo().getWithOffset(PtrOff);
45953 Align Alignment = commonAlignment(LoadVec->getAlign(), PtrOff);
45954 SDValue Load =
45955 DAG.getLoad(VT, dl, LoadVec->getChain(), NewPtr, MPI, Alignment,
45956 LoadVec->getMemOperand()->getFlags(), LoadVec->getAAInfo());
45957 DAG.makeEquivalentMemoryOrdering(LoadVec, Load);
45958 return Load;
45959 }
45960
45961 return SDValue();
45962}
45963
45964// Convert (vXiY *ext(vXi1 bitcast(iX))) to extend_in_reg(broadcast(iX)).
45965// This is more or less the reverse of combineBitcastvxi1.
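// Illustrative expansion (types assumed, not part of the original comment):
// for (v8i16 sign_extend (v8i1 bitcast (i8 X))) this builds roughly
//   bcast = broadcast X into all 8 lanes
//   bits  = and bcast, <1,2,4,8,16,32,64,128>
//   vec   = sext (setcc eq bits, <1,2,4,8,16,32,64,128>)
// followed by a logical shift right for the ZERO_EXTEND/ANY_EXTEND cases.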
45966static SDValue combineToExtendBoolVectorInReg(
45967 unsigned Opcode, const SDLoc &DL, EVT VT, SDValue N0, SelectionDAG &DAG,
45968 TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget) {
45969 if (Opcode != ISD::SIGN_EXTEND && Opcode != ISD::ZERO_EXTEND &&
45970 Opcode != ISD::ANY_EXTEND)
45971 return SDValue();
45972 if (!DCI.isBeforeLegalizeOps())
45973 return SDValue();
45974 if (!Subtarget.hasSSE2() || Subtarget.hasAVX512())
45975 return SDValue();
45976
45977 EVT SVT = VT.getScalarType();
45978 EVT InSVT = N0.getValueType().getScalarType();
45979 unsigned EltSizeInBits = SVT.getSizeInBits();
45980
45981 // Input type must be extending a bool vector (bit-casted from a scalar
45982 // integer) to legal integer types.
45983 if (!VT.isVector())
45984 return SDValue();
45985 if (SVT != MVT::i64 && SVT != MVT::i32 && SVT != MVT::i16 && SVT != MVT::i8)
45986 return SDValue();
45987 if (InSVT != MVT::i1 || N0.getOpcode() != ISD::BITCAST)
45988 return SDValue();
45989
45990 SDValue N00 = N0.getOperand(0);
45991 EVT SclVT = N00.getValueType();
45992 if (!SclVT.isScalarInteger())
45993 return SDValue();
45994
45995 SDValue Vec;
45996 SmallVector<int> ShuffleMask;
45997 unsigned NumElts = VT.getVectorNumElements();
45998   assert(NumElts == SclVT.getSizeInBits() && "Unexpected bool vector size");
45999
46000 // Broadcast the scalar integer to the vector elements.
46001 if (NumElts > EltSizeInBits) {
46002 // If the scalar integer is greater than the vector element size, then we
46003 // must split it down into sub-sections for broadcasting. For example:
46004 // i16 -> v16i8 (i16 -> v8i16 -> v16i8) with 2 sub-sections.
46005 // i32 -> v32i8 (i32 -> v8i32 -> v32i8) with 4 sub-sections.
46006     assert((NumElts % EltSizeInBits) == 0 && "Unexpected integer scale");
46007 unsigned Scale = NumElts / EltSizeInBits;
46008 EVT BroadcastVT = EVT::getVectorVT(*DAG.getContext(), SclVT, EltSizeInBits);
46009 Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, BroadcastVT, N00);
46010 Vec = DAG.getBitcast(VT, Vec);
46011
46012 for (unsigned i = 0; i != Scale; ++i)
46013 ShuffleMask.append(EltSizeInBits, i);
46014 Vec = DAG.getVectorShuffle(VT, DL, Vec, Vec, ShuffleMask);
46015 } else if (Subtarget.hasAVX2() && NumElts < EltSizeInBits &&
46016 (SclVT == MVT::i8 || SclVT == MVT::i16 || SclVT == MVT::i32)) {
46017 // If we have register broadcast instructions, use the scalar size as the
46018 // element type for the shuffle. Then cast to the wider element type. The
46019 // widened bits won't be used, and this might allow the use of a broadcast
46020 // load.
46021     assert((EltSizeInBits % NumElts) == 0 && "Unexpected integer scale");
46022 unsigned Scale = EltSizeInBits / NumElts;
46023 EVT BroadcastVT =
46024 EVT::getVectorVT(*DAG.getContext(), SclVT, NumElts * Scale);
46025 Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, BroadcastVT, N00);
46026 ShuffleMask.append(NumElts * Scale, 0);
46027 Vec = DAG.getVectorShuffle(BroadcastVT, DL, Vec, Vec, ShuffleMask);
46028 Vec = DAG.getBitcast(VT, Vec);
46029 } else {
46030 // For smaller scalar integers, we can simply any-extend it to the vector
46031 // element size (we don't care about the upper bits) and broadcast it to all
46032 // elements.
46033 SDValue Scl = DAG.getAnyExtOrTrunc(N00, DL, SVT);
46034 Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VT, Scl);
46035 ShuffleMask.append(NumElts, 0);
46036 Vec = DAG.getVectorShuffle(VT, DL, Vec, Vec, ShuffleMask);
46037 }
46038
46039 // Now, mask the relevant bit in each element.
46040 SmallVector<SDValue, 32> Bits;
46041 for (unsigned i = 0; i != NumElts; ++i) {
46042 int BitIdx = (i % EltSizeInBits);
46043 APInt Bit = APInt::getBitsSet(EltSizeInBits, BitIdx, BitIdx + 1);
46044 Bits.push_back(DAG.getConstant(Bit, DL, SVT));
46045 }
46046 SDValue BitMask = DAG.getBuildVector(VT, DL, Bits);
46047 Vec = DAG.getNode(ISD::AND, DL, VT, Vec, BitMask);
46048
46049 // Compare against the bitmask and extend the result.
46050 EVT CCVT = VT.changeVectorElementType(MVT::i1);
46051 Vec = DAG.getSetCC(DL, CCVT, Vec, BitMask, ISD::SETEQ);
46052 Vec = DAG.getSExtOrTrunc(Vec, DL, VT);
46053
46054 // For SEXT, this is now done, otherwise shift the result down for
46055 // zero-extension.
46056 if (Opcode == ISD::SIGN_EXTEND)
46057 return Vec;
46058 return DAG.getNode(ISD::SRL, DL, VT, Vec,
46059 DAG.getConstant(EltSizeInBits - 1, DL, VT));
46060}
46061
46062/// If a vector select has an operand that is -1 or 0, try to simplify the
46063/// select to a bitwise logic operation.
46064/// TODO: Move to DAGCombiner, possibly using TargetLowering::hasAndNot()?
46065static SDValue
46066combineVSelectWithAllOnesOrZeros(SDNode *N, SelectionDAG &DAG,
46067 TargetLowering::DAGCombinerInfo &DCI,
46068 const X86Subtarget &Subtarget) {
46069 SDValue Cond = N->getOperand(0);
46070 SDValue LHS = N->getOperand(1);
46071 SDValue RHS = N->getOperand(2);
46072 EVT VT = LHS.getValueType();
46073 EVT CondVT = Cond.getValueType();
46074 SDLoc DL(N);
46075 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
46076
46077 if (N->getOpcode() != ISD::VSELECT)
46078 return SDValue();
46079
46080   assert(CondVT.isVector() && "Vector select expects a vector selector!");
46081
46082 // TODO: Use isNullOrNullSplat() to distinguish constants with undefs?
46083 // TODO: Can we assert that both operands are not zeros (because that should
46084 // get simplified at node creation time)?
46085 bool TValIsAllZeros = ISD::isBuildVectorAllZeros(LHS.getNode());
46086 bool FValIsAllZeros = ISD::isBuildVectorAllZeros(RHS.getNode());
46087
46088 // If both inputs are 0/undef, create a complete zero vector.
46089 // FIXME: As noted above this should be handled by DAGCombiner/getNode.
46090 if (TValIsAllZeros && FValIsAllZeros) {
46091 if (VT.isFloatingPoint())
46092 return DAG.getConstantFP(0.0, DL, VT);
46093 return DAG.getConstant(0, DL, VT);
46094 }
46095
46096 // To use the condition operand as a bitwise mask, it must have elements that
46097 // are the same size as the select elements. I.e., the condition operand must
46098 // have already been promoted from the IR select condition type <N x i1>.
46099 // Don't check if the types themselves are equal because that excludes
46100 // vector floating-point selects.
46101 if (CondVT.getScalarSizeInBits() != VT.getScalarSizeInBits())
46102 return SDValue();
46103
46104 // Try to invert the condition if true value is not all 1s and false value is
46105 // not all 0s. Only do this if the condition has one use.
46106 bool TValIsAllOnes = ISD::isBuildVectorAllOnes(LHS.getNode());
46107 if (!TValIsAllOnes && !FValIsAllZeros && Cond.hasOneUse() &&
46108 // Check if the selector will be produced by CMPP*/PCMP*.
46109 Cond.getOpcode() == ISD::SETCC &&
46110 // Check if SETCC has already been promoted.
46111 TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT) ==
46112 CondVT) {
46113 bool FValIsAllOnes = ISD::isBuildVectorAllOnes(RHS.getNode());
46114
46115 if (TValIsAllZeros || FValIsAllOnes) {
46116 SDValue CC = Cond.getOperand(2);
46117 ISD::CondCode NewCC = ISD::getSetCCInverse(
46118 cast<CondCodeSDNode>(CC)->get(), Cond.getOperand(0).getValueType());
46119 Cond = DAG.getSetCC(DL, CondVT, Cond.getOperand(0), Cond.getOperand(1),
46120 NewCC);
46121 std::swap(LHS, RHS);
46122 TValIsAllOnes = FValIsAllOnes;
46123 FValIsAllZeros = TValIsAllZeros;
46124 }
46125 }
46126
46127 // Cond value must be 'sign splat' to be converted to a logical op.
46128 if (DAG.ComputeNumSignBits(Cond) != CondVT.getScalarSizeInBits())
46129 return SDValue();
46130
46131 // vselect Cond, 111..., 000... -> Cond
46132 if (TValIsAllOnes && FValIsAllZeros)
46133 return DAG.getBitcast(VT, Cond);
46134
46135 if (!TLI.isTypeLegal(CondVT))
46136 return SDValue();
46137
46138 // vselect Cond, 111..., X -> or Cond, X
46139 if (TValIsAllOnes) {
46140 SDValue CastRHS = DAG.getBitcast(CondVT, RHS);
46141 SDValue Or = DAG.getNode(ISD::OR, DL, CondVT, Cond, CastRHS);
46142 return DAG.getBitcast(VT, Or);
46143 }
46144
46145 // vselect Cond, X, 000... -> and Cond, X
46146 if (FValIsAllZeros) {
46147 SDValue CastLHS = DAG.getBitcast(CondVT, LHS);
46148 SDValue And = DAG.getNode(ISD::AND, DL, CondVT, Cond, CastLHS);
46149 return DAG.getBitcast(VT, And);
46150 }
46151
46152 // vselect Cond, 000..., X -> andn Cond, X
46153 if (TValIsAllZeros) {
46154 SDValue CastRHS = DAG.getBitcast(CondVT, RHS);
46155 SDValue AndN;
46156 // The canonical form differs for i1 vectors - x86andnp is not used
46157 if (CondVT.getScalarType() == MVT::i1)
46158 AndN = DAG.getNode(ISD::AND, DL, CondVT, DAG.getNOT(DL, Cond, CondVT),
46159 CastRHS);
46160 else
46161 AndN = DAG.getNode(X86ISD::ANDNP, DL, CondVT, Cond, CastRHS);
46162 return DAG.getBitcast(VT, AndN);
46163 }
46164
46165 return SDValue();
46166}
46167
46168/// If both arms of a vector select are concatenated vectors, split the select,
46169/// and concatenate the result to eliminate a wide (256-bit) vector instruction:
46170/// vselect Cond, (concat T0, T1), (concat F0, F1) -->
46171/// concat (vselect (split Cond), T0, F0), (vselect (split Cond), T1, F1)
46172static SDValue narrowVectorSelect(SDNode *N, SelectionDAG &DAG,
46173 const X86Subtarget &Subtarget) {
46174 unsigned Opcode = N->getOpcode();
46175 if (Opcode != X86ISD::BLENDV && Opcode != ISD::VSELECT)
46176 return SDValue();
46177
46178 // TODO: Split 512-bit vectors too?
46179 EVT VT = N->getValueType(0);
46180 if (!VT.is256BitVector())
46181 return SDValue();
46182
46183 // TODO: Split as long as any 2 of the 3 operands are concatenated?
46184 SDValue Cond = N->getOperand(0);
46185 SDValue TVal = N->getOperand(1);
46186 SDValue FVal = N->getOperand(2);
46187 SmallVector<SDValue, 4> CatOpsT, CatOpsF;
46188 if (!TVal.hasOneUse() || !FVal.hasOneUse() ||
46189 !collectConcatOps(TVal.getNode(), CatOpsT, DAG) ||
46190 !collectConcatOps(FVal.getNode(), CatOpsF, DAG))
46191 return SDValue();
46192
46193 auto makeBlend = [Opcode](SelectionDAG &DAG, const SDLoc &DL,
46194 ArrayRef<SDValue> Ops) {
46195 return DAG.getNode(Opcode, DL, Ops[1].getValueType(), Ops);
46196 };
46197 return SplitOpsAndApply(DAG, Subtarget, SDLoc(N), VT, { Cond, TVal, FVal },
46198 makeBlend, /*CheckBWI*/ false);
46199}
46200
46201static SDValue combineSelectOfTwoConstants(SDNode *N, SelectionDAG &DAG) {
46202 SDValue Cond = N->getOperand(0);
46203 SDValue LHS = N->getOperand(1);
46204 SDValue RHS = N->getOperand(2);
46205 SDLoc DL(N);
46206
46207 auto *TrueC = dyn_cast<ConstantSDNode>(LHS);
46208 auto *FalseC = dyn_cast<ConstantSDNode>(RHS);
46209 if (!TrueC || !FalseC)
46210 return SDValue();
46211
46212 // Don't do this for crazy integer types.
46213 EVT VT = N->getValueType(0);
46214 if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
46215 return SDValue();
46216
46217 // We're going to use the condition bit in math or logic ops. We could allow
46218 // this with a wider condition value (post-legalization it becomes an i8),
46219 // but if nothing is creating selects that late, it doesn't matter.
46220 if (Cond.getValueType() != MVT::i1)
46221 return SDValue();
46222
46223 // A power-of-2 multiply is just a shift. LEA also cheaply handles multiply by
46224 // 3, 5, or 9 with i32/i64, so those get transformed too.
46225 // TODO: For constants that overflow or do not differ by power-of-2 or small
46226 // multiplier, convert to 'and' + 'add'.
46227 const APInt &TrueVal = TrueC->getAPIntValue();
46228 const APInt &FalseVal = FalseC->getAPIntValue();
46229
46230 // We have a more efficient lowering for "(X == 0) ? Y : -1" using SBB.
46231 if ((TrueVal.isAllOnes() || FalseVal.isAllOnes()) &&
46232 Cond.getOpcode() == ISD::SETCC && isNullConstant(Cond.getOperand(1))) {
46233 ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
46234 if (CC == ISD::SETEQ || CC == ISD::SETNE)
46235 return SDValue();
46236 }
46237
46238 bool OV;
46239 APInt Diff = TrueVal.ssub_ov(FalseVal, OV);
46240 if (OV)
46241 return SDValue();
46242
46243 APInt AbsDiff = Diff.abs();
46244 if (AbsDiff.isPowerOf2() ||
46245 ((VT == MVT::i32 || VT == MVT::i64) &&
46246 (AbsDiff == 3 || AbsDiff == 5 || AbsDiff == 9))) {
46247
46248 // We need a positive multiplier constant for shift/LEA codegen. The 'not'
46249 // of the condition can usually be folded into a compare predicate, but even
46250 // without that, the sequence should be cheaper than a CMOV alternative.
46251 if (TrueVal.slt(FalseVal)) {
46252 Cond = DAG.getNOT(DL, Cond, MVT::i1);
46253 std::swap(TrueC, FalseC);
46254 }
46255
46256 // select Cond, TC, FC --> (zext(Cond) * (TC - FC)) + FC
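    // Worked example (i32 assumed): select Cond, 7, 4 has AbsDiff = 3, so
    //   R = zext(Cond); R = mul R, 3; R = add R, 4
    // which on i32/i64 can be covered by an LEA rather than a CMOV.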
46257 SDValue R = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, Cond);
46258
46259 // Multiply condition by the difference if non-one.
46260 if (!AbsDiff.isOne())
46261 R = DAG.getNode(ISD::MUL, DL, VT, R, DAG.getConstant(AbsDiff, DL, VT));
46262
46263 // Add the base if non-zero.
46264 if (!FalseC->isZero())
46265 R = DAG.getNode(ISD::ADD, DL, VT, R, SDValue(FalseC, 0));
46266
46267 return R;
46268 }
46269
46270 return SDValue();
46271}
46272
46273/// If this is a *dynamic* select (non-constant condition) and we can match
46274/// this node with one of the variable blend instructions, restructure the
46275/// condition so that blends can use the high (sign) bit of each element.
46276/// This function will also call SimplifyDemandedBits on already created
46277/// BLENDV to perform additional simplifications.
46278static SDValue combineVSelectToBLENDV(SDNode *N, SelectionDAG &DAG,
46279 TargetLowering::DAGCombinerInfo &DCI,
46280 const X86Subtarget &Subtarget) {
46281 SDValue Cond = N->getOperand(0);
46282 if ((N->getOpcode() != ISD::VSELECT &&
46283 N->getOpcode() != X86ISD::BLENDV) ||
46284 ISD::isBuildVectorOfConstantSDNodes(Cond.getNode()))
46285 return SDValue();
46286
46287 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
46288 unsigned BitWidth = Cond.getScalarValueSizeInBits();
46289 EVT VT = N->getValueType(0);
46290
46291 // We can only handle the cases where VSELECT is directly legal on the
46292 // subtarget. We custom lower VSELECT nodes with constant conditions and
46293 // this makes it hard to see whether a dynamic VSELECT will correctly
46294 // lower, so we both check the operation's status and explicitly handle the
46295 // cases where a *dynamic* blend will fail even though a constant-condition
46296 // blend could be custom lowered.
46297 // FIXME: We should find a better way to handle this class of problems.
46298 // Potentially, we should combine constant-condition vselect nodes
46299 // pre-legalization into shuffles and not mark as many types as custom
46300 // lowered.
46301 if (!TLI.isOperationLegalOrCustom(ISD::VSELECT, VT))
46302 return SDValue();
46303 // FIXME: We don't support i16-element blends currently. We could and
46304 // should support them by making *all* the bits in the condition be set
46305 // rather than just the high bit and using an i8-element blend.
46306 if (VT.getVectorElementType() == MVT::i16)
46307 return SDValue();
46308 // Dynamic blending was only available from SSE4.1 onward.
46309 if (VT.is128BitVector() && !Subtarget.hasSSE41())
46310 return SDValue();
46311 // Byte blends are only available in AVX2
46312 if (VT == MVT::v32i8 && !Subtarget.hasAVX2())
46313 return SDValue();
46314 // There are no 512-bit blend instructions that use sign bits.
46315 if (VT.is512BitVector())
46316 return SDValue();
46317
46318 // Don't optimize before the condition has been transformed to a legal type
46319 // and don't ever optimize vector selects that map to AVX512 mask-registers.
46320 if (BitWidth < 8 || BitWidth > 64)
46321 return SDValue();
46322
46323 auto OnlyUsedAsSelectCond = [](SDValue Cond) {
46324 for (SDNode::use_iterator UI = Cond->use_begin(), UE = Cond->use_end();
46325 UI != UE; ++UI)
46326 if ((UI->getOpcode() != ISD::VSELECT &&
46327 UI->getOpcode() != X86ISD::BLENDV) ||
46328 UI.getOperandNo() != 0)
46329 return false;
46330
46331 return true;
46332 };
46333
46334 APInt DemandedBits(APInt::getSignMask(BitWidth));
46335
46336 if (OnlyUsedAsSelectCond(Cond)) {
46337 KnownBits Known;
46338 TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
46339 !DCI.isBeforeLegalizeOps());
46340 if (!TLI.SimplifyDemandedBits(Cond, DemandedBits, Known, TLO, 0, true))
46341 return SDValue();
46342
46343 // If we changed the computation somewhere in the DAG, this change will
46344 // affect all users of Cond. Update all the nodes so that we do not use
46345 // the generic VSELECT anymore. Otherwise, we may perform wrong
46346 // optimizations as we messed with the actual expectation for the vector
46347 // boolean values.
46348 for (SDNode *U : Cond->uses()) {
46349 if (U->getOpcode() == X86ISD::BLENDV)
46350 continue;
46351
46352 SDValue SB = DAG.getNode(X86ISD::BLENDV, SDLoc(U), U->getValueType(0),
46353 Cond, U->getOperand(1), U->getOperand(2));
46354 DAG.ReplaceAllUsesOfValueWith(SDValue(U, 0), SB);
46355 DCI.AddToWorklist(U);
46356 }
46357 DCI.CommitTargetLoweringOpt(TLO);
46358 return SDValue(N, 0);
46359 }
46360
46361 // Otherwise we can still at least try to simplify multiple use bits.
46362 if (SDValue V = TLI.SimplifyMultipleUseDemandedBits(Cond, DemandedBits, DAG))
46363 return DAG.getNode(X86ISD::BLENDV, SDLoc(N), N->getValueType(0), V,
46364 N->getOperand(1), N->getOperand(2));
46365
46366 return SDValue();
46367}
46368
46369// Try to match:
46370// (or (and (M, (sub 0, X)), (pandn M, X)))
46371// which is a special case of:
46372// (select M, (sub 0, X), X)
46373// Per:
46374// http://graphics.stanford.edu/~seander/bithacks.html#ConditionalNegate
46375// We know that, if fNegate is 0 or 1:
46376// (fNegate ? -v : v) == ((v ^ -fNegate) + fNegate)
46377//
46378// Here, we have a mask, M (all 1s or 0), and, similarly, we know that:
46379// ((M & 1) ? -X : X) == ((X ^ -(M & 1)) + (M & 1))
46380// ( M ? -X : X) == ((X ^ M ) + (M & 1))
46381// This lets us transform our vselect to:
46382// (add (xor X, M), (and M, 1))
46383// And further to:
46384// (sub (xor X, M), M)
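// Worked check of the identity with an assumed lane value X = 5:
//   M = all-ones: (X ^ M) - M = (~5) - (-1) = -6 + 1 = -5   (negated)
//   M = 0:        (X ^ 0) - 0 = 5                           (unchanged)
// The values are only illustrative; any X with a 0/all-ones M behaves the same.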
46385static SDValue combineLogicBlendIntoConditionalNegate(
46386 EVT VT, SDValue Mask, SDValue X, SDValue Y, const SDLoc &DL,
46387 SelectionDAG &DAG, const X86Subtarget &Subtarget) {
46388 EVT MaskVT = Mask.getValueType();
46389   assert(MaskVT.isInteger() &&
46390          DAG.ComputeNumSignBits(Mask) == MaskVT.getScalarSizeInBits() &&
46391          "Mask must be zero/all-bits");
46392
46393 if (X.getValueType() != MaskVT || Y.getValueType() != MaskVT)
46394 return SDValue();
46395 if (!DAG.getTargetLoweringInfo().isOperationLegal(ISD::SUB, MaskVT))
46396 return SDValue();
46397
46398 auto IsNegV = [](SDNode *N, SDValue V) {
46399 return N->getOpcode() == ISD::SUB && N->getOperand(1) == V &&
46400 ISD::isBuildVectorAllZeros(N->getOperand(0).getNode());
46401 };
46402
46403 SDValue V;
46404 if (IsNegV(Y.getNode(), X))
46405 V = X;
46406 else if (IsNegV(X.getNode(), Y))
46407 V = Y;
46408 else
46409 return SDValue();
46410
46411 SDValue SubOp1 = DAG.getNode(ISD::XOR, DL, MaskVT, V, Mask);
46412 SDValue SubOp2 = Mask;
46413
46414 // If the negate was on the false side of the select, then
46415 // the operands of the SUB need to be swapped. PR 27251.
46416 // This is because the pattern being matched above is
46417 // (vselect M, (sub 0, X), X) -> (sub (xor X, M), M)
46418 // but if the pattern matched was
46419 // (vselect M, X, (sub 0, X)), which is really the negation of the pattern
46420 // above, -(vselect M, (sub 0, X), X), and therefore the replacement
46421 // pattern also needs to be a negation of the replacement pattern above.
46422 // And -(sub X, Y) is just sub (Y, X), so swapping the operands of the
46423 // sub accomplishes the negation of the replacement pattern.
46424 if (V == Y)
46425 std::swap(SubOp1, SubOp2);
46426
46427 SDValue Res = DAG.getNode(ISD::SUB, DL, MaskVT, SubOp1, SubOp2);
46428 return DAG.getBitcast(VT, Res);
46429}
46430
46431/// Do target-specific dag combines on SELECT and VSELECT nodes.
46432static SDValue combineSelect(SDNode *N, SelectionDAG &DAG,
46433 TargetLowering::DAGCombinerInfo &DCI,
46434 const X86Subtarget &Subtarget) {
46435 SDLoc DL(N);
46436 SDValue Cond = N->getOperand(0);
46437 SDValue LHS = N->getOperand(1);
46438 SDValue RHS = N->getOperand(2);
46439
46440 // Try simplification again because we use this function to optimize
46441 // BLENDV nodes that are not handled by the generic combiner.
46442 if (SDValue V = DAG.simplifySelect(Cond, LHS, RHS))
46443 return V;
46444
46445 EVT VT = LHS.getValueType();
46446 EVT CondVT = Cond.getValueType();
46447 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
46448 bool CondConstantVector = ISD::isBuildVectorOfConstantSDNodes(Cond.getNode());
46449
46450 // Attempt to combine (select M, (sub 0, X), X) -> (sub (xor X, M), M).
46451 // Limit this to cases of non-constant masks that createShuffleMaskFromVSELECT
46452 // can't catch, plus vXi8 cases where we'd likely end up with BLENDV.
46453 if (CondVT.isVector() && CondVT.isInteger() &&
46454 CondVT.getScalarSizeInBits() == VT.getScalarSizeInBits() &&
46455 (!CondConstantVector || CondVT.getScalarType() == MVT::i8) &&
46456 DAG.ComputeNumSignBits(Cond) == CondVT.getScalarSizeInBits())
46457 if (SDValue V = combineLogicBlendIntoConditionalNegate(VT, Cond, RHS, LHS,
46458 DL, DAG, Subtarget))
46459 return V;
46460
46461 // Convert vselects with constant condition into shuffles.
46462 if (CondConstantVector && DCI.isBeforeLegalizeOps() &&
46463 (N->getOpcode() == ISD::VSELECT || N->getOpcode() == X86ISD::BLENDV)) {
46464 SmallVector<int, 64> Mask;
46465 if (createShuffleMaskFromVSELECT(Mask, Cond,
46466 N->getOpcode() == X86ISD::BLENDV))
46467 return DAG.getVectorShuffle(VT, DL, LHS, RHS, Mask);
46468 }
46469
46470 // fold vselect(cond, pshufb(x), pshufb(y)) -> or (pshufb(x), pshufb(y))
46471 // by forcing the unselected elements to zero.
46472 // TODO: Can we handle more shuffles with this?
46473 if (N->getOpcode() == ISD::VSELECT && CondVT.isVector() &&
46474 LHS.getOpcode() == X86ISD::PSHUFB && RHS.getOpcode() == X86ISD::PSHUFB &&
46475 LHS.hasOneUse() && RHS.hasOneUse()) {
46476 MVT SimpleVT = VT.getSimpleVT();
46477 SmallVector<SDValue, 1> LHSOps, RHSOps;
46478 SmallVector<int, 64> LHSMask, RHSMask, CondMask;
46479 if (createShuffleMaskFromVSELECT(CondMask, Cond) &&
46480 getTargetShuffleMask(LHS.getNode(), SimpleVT, true, LHSOps, LHSMask) &&
46481 getTargetShuffleMask(RHS.getNode(), SimpleVT, true, RHSOps, RHSMask)) {
46482 int NumElts = VT.getVectorNumElements();
46483 for (int i = 0; i != NumElts; ++i) {
46484 // getConstVector sets negative shuffle mask values as undef, so ensure
46485 // we hardcode SM_SentinelZero values to zero (0x80).
46486 if (CondMask[i] < NumElts) {
46487 LHSMask[i] = isUndefOrZero(LHSMask[i]) ? 0x80 : LHSMask[i];
46488 RHSMask[i] = 0x80;
46489 } else {
46490 LHSMask[i] = 0x80;
46491 RHSMask[i] = isUndefOrZero(RHSMask[i]) ? 0x80 : RHSMask[i];
46492 }
46493 }
46494 LHS = DAG.getNode(X86ISD::PSHUFB, DL, VT, LHS.getOperand(0),
46495 getConstVector(LHSMask, SimpleVT, DAG, DL, true));
46496 RHS = DAG.getNode(X86ISD::PSHUFB, DL, VT, RHS.getOperand(0),
46497 getConstVector(RHSMask, SimpleVT, DAG, DL, true));
46498 return DAG.getNode(ISD::OR, DL, VT, LHS, RHS);
46499 }
46500 }
46501
46502 // If we have SSE[12] support, try to form min/max nodes. SSE min/max
46503 // instructions match the semantics of the common C idiom x<y?x:y but not
46504 // x<=y?x:y, because of how they handle negative zero (which can be
46505 // ignored in unsafe-math mode).
46506 // We also try to create v2f32 min/max nodes, which we later widen to v4f32.
46507 if (Cond.getOpcode() == ISD::SETCC && VT.isFloatingPoint() &&
46508 VT != MVT::f80 && VT != MVT::f128 && !isSoftFP16(VT, Subtarget) &&
46509 (TLI.isTypeLegal(VT) || VT == MVT::v2f32) &&
46510 (Subtarget.hasSSE2() ||
46511 (Subtarget.hasSSE1() && VT.getScalarType() == MVT::f32))) {
46512 ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
46513
46514 unsigned Opcode = 0;
46515 // Check for x CC y ? x : y.
46516 if (DAG.isEqualTo(LHS, Cond.getOperand(0)) &&
46517 DAG.isEqualTo(RHS, Cond.getOperand(1))) {
46518 switch (CC) {
46519 default: break;
46520 case ISD::SETULT:
46521 // Converting this to a min would handle NaNs incorrectly, and swapping
46522 // the operands would cause it to handle comparisons between positive
46523 // and negative zero incorrectly.
46524 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) {
46525 if (!DAG.getTarget().Options.NoSignedZerosFPMath &&
46526 !(DAG.isKnownNeverZeroFloat(LHS) ||
46527 DAG.isKnownNeverZeroFloat(RHS)))
46528 break;
46529 std::swap(LHS, RHS);
46530 }
46531 Opcode = X86ISD::FMIN;
46532 break;
46533 case ISD::SETOLE:
46534 // Converting this to a min would handle comparisons between positive
46535 // and negative zero incorrectly.
46536 if (!DAG.getTarget().Options.NoSignedZerosFPMath &&
46537 !DAG.isKnownNeverZeroFloat(LHS) && !DAG.isKnownNeverZeroFloat(RHS))
46538 break;
46539 Opcode = X86ISD::FMIN;
46540 break;
46541 case ISD::SETULE:
46542 // Converting this to a min would handle both negative zeros and NaNs
46543 // incorrectly, but we can swap the operands to fix both.
46544 std::swap(LHS, RHS);
46545 [[fallthrough]];
46546 case ISD::SETOLT:
46547 case ISD::SETLT:
46548 case ISD::SETLE:
46549 Opcode = X86ISD::FMIN;
46550 break;
46551
46552 case ISD::SETOGE:
46553 // Converting this to a max would handle comparisons between positive
46554 // and negative zero incorrectly.
46555 if (!DAG.getTarget().Options.NoSignedZerosFPMath &&
46556 !DAG.isKnownNeverZeroFloat(LHS) && !DAG.isKnownNeverZeroFloat(RHS))
46557 break;
46558 Opcode = X86ISD::FMAX;
46559 break;
46560 case ISD::SETUGT:
46561 // Converting this to a max would handle NaNs incorrectly, and swapping
46562 // the operands would cause it to handle comparisons between positive
46563 // and negative zero incorrectly.
46564 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) {
46565 if (!DAG.getTarget().Options.NoSignedZerosFPMath &&
46566 !(DAG.isKnownNeverZeroFloat(LHS) ||
46567 DAG.isKnownNeverZeroFloat(RHS)))
46568 break;
46569 std::swap(LHS, RHS);
46570 }
46571 Opcode = X86ISD::FMAX;
46572 break;
46573 case ISD::SETUGE:
46574 // Converting this to a max would handle both negative zeros and NaNs
46575 // incorrectly, but we can swap the operands to fix both.
46576 std::swap(LHS, RHS);
46577 [[fallthrough]];
46578 case ISD::SETOGT:
46579 case ISD::SETGT:
46580 case ISD::SETGE:
46581 Opcode = X86ISD::FMAX;
46582 break;
46583 }
46584 // Check for x CC y ? y : x -- a min/max with reversed arms.
46585 } else if (DAG.isEqualTo(LHS, Cond.getOperand(1)) &&
46586 DAG.isEqualTo(RHS, Cond.getOperand(0))) {
46587 switch (CC) {
46588 default: break;
46589 case ISD::SETOGE:
46590 // Converting this to a min would handle comparisons between positive
46591 // and negative zero incorrectly, and swapping the operands would
46592 // cause it to handle NaNs incorrectly.
46593 if (!DAG.getTarget().Options.NoSignedZerosFPMath &&
46594 !(DAG.isKnownNeverZeroFloat(LHS) ||
46595 DAG.isKnownNeverZeroFloat(RHS))) {
46596 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
46597 break;
46598 std::swap(LHS, RHS);
46599 }
46600 Opcode = X86ISD::FMIN;
46601 break;
46602 case ISD::SETUGT:
46603 // Converting this to a min would handle NaNs incorrectly.
46604 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
46605 break;
46606 Opcode = X86ISD::FMIN;
46607 break;
46608 case ISD::SETUGE:
46609 // Converting this to a min would handle both negative zeros and NaNs
46610 // incorrectly, but we can swap the operands to fix both.
46611 std::swap(LHS, RHS);
46612 [[fallthrough]];
46613 case ISD::SETOGT:
46614 case ISD::SETGT:
46615 case ISD::SETGE:
46616 Opcode = X86ISD::FMIN;
46617 break;
46618
46619 case ISD::SETULT:
46620 // Converting this to a max would handle NaNs incorrectly.
46621 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
46622 break;
46623 Opcode = X86ISD::FMAX;
46624 break;
46625 case ISD::SETOLE:
46626 // Converting this to a max would handle comparisons between positive
46627 // and negative zero incorrectly, and swapping the operands would
46628 // cause it to handle NaNs incorrectly.
46629 if (!DAG.getTarget().Options.NoSignedZerosFPMath &&
46630 !DAG.isKnownNeverZeroFloat(LHS) &&
46631 !DAG.isKnownNeverZeroFloat(RHS)) {
46632 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
46633 break;
46634 std::swap(LHS, RHS);
46635 }
46636 Opcode = X86ISD::FMAX;
46637 break;
46638 case ISD::SETULE:
46639 // Converting this to a max would handle both negative zeros and NaNs
46640 // incorrectly, but we can swap the operands to fix both.
46641 std::swap(LHS, RHS);
46642 [[fallthrough]];
46643 case ISD::SETOLT:
46644 case ISD::SETLT:
46645 case ISD::SETLE:
46646 Opcode = X86ISD::FMAX;
46647 break;
46648 }
46649 }
46650
46651 if (Opcode)
46652 return DAG.getNode(Opcode, DL, N->getValueType(0), LHS, RHS);
46653 }
46654
46655 // Some mask scalar intrinsics rely on checking if only one bit is set
46656 // and implement it in C code like this:
46657 // A[0] = (U & 1) ? A[0] : W[0];
46658 // This creates some redundant instructions that break pattern matching.
46659 // fold (select (setcc (and (X, 1), 0, seteq), Y, Z)) -> select(and(X, 1),Z,Y)
46660 if (Subtarget.hasAVX512() && N->getOpcode() == ISD::SELECT &&
46661 Cond.getOpcode() == ISD::SETCC && (VT == MVT::f32 || VT == MVT::f64)) {
46662 ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
46663 SDValue AndNode = Cond.getOperand(0);
46664 if (AndNode.getOpcode() == ISD::AND && CC == ISD::SETEQ &&
46665 isNullConstant(Cond.getOperand(1)) &&
46666 isOneConstant(AndNode.getOperand(1))) {
46667 // LHS and RHS are swapped because the setcc outputs 1 when the AND
46668 // result is 0, and vice versa.
46669 AndNode = DAG.getZExtOrTrunc(AndNode, DL, MVT::i8);
46670 return DAG.getNode(ISD::SELECT, DL, VT, AndNode, RHS, LHS);
46671 }
46672 }
46673
46674 // v16i8 (select v16i1, v16i8, v16i8) does not have a proper
46675 // lowering on KNL. In this case we convert it to
46676 // v16i8 (select v16i8, v16i8, v16i8) and use AVX instruction.
46677 // The same situation applies to all vectors of i8 and i16 without BWI.
46678 // Make sure we extend these even before type legalization gets a chance to
46679 // split wide vectors.
46680 // Since SKX these selects have a proper lowering.
46681 if (Subtarget.hasAVX512() && !Subtarget.hasBWI() && CondVT.isVector() &&
46682 CondVT.getVectorElementType() == MVT::i1 &&
46683 (VT.getVectorElementType() == MVT::i8 ||
46684 VT.getVectorElementType() == MVT::i16)) {
46685 Cond = DAG.getNode(ISD::SIGN_EXTEND, DL, VT, Cond);
46686 return DAG.getNode(N->getOpcode(), DL, VT, Cond, LHS, RHS);
46687 }
46688
46689 // AVX512 - Extend select with zero to merge with target shuffle.
46690 // select(mask, extract_subvector(shuffle(x)), zero) -->
46691 // extract_subvector(select(insert_subvector(mask), shuffle(x), zero))
46692 // TODO - support non target shuffles as well.
46693 if (Subtarget.hasAVX512() && CondVT.isVector() &&
46694 CondVT.getVectorElementType() == MVT::i1) {
46695 auto SelectableOp = [&TLI](SDValue Op) {
46696 return Op.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
46697 isTargetShuffle(Op.getOperand(0).getOpcode()) &&
46698 isNullConstant(Op.getOperand(1)) &&
46699 TLI.isTypeLegal(Op.getOperand(0).getValueType()) &&
46700 Op.hasOneUse() && Op.getOperand(0).hasOneUse();
46701 };
46702
46703 bool SelectableLHS = SelectableOp(LHS);
46704 bool SelectableRHS = SelectableOp(RHS);
46705 bool ZeroLHS = ISD::isBuildVectorAllZeros(LHS.getNode());
46706 bool ZeroRHS = ISD::isBuildVectorAllZeros(RHS.getNode());
46707
46708 if ((SelectableLHS && ZeroRHS) || (SelectableRHS && ZeroLHS)) {
46709 EVT SrcVT = SelectableLHS ? LHS.getOperand(0).getValueType()
46710 : RHS.getOperand(0).getValueType();
46711 EVT SrcCondVT = SrcVT.changeVectorElementType(MVT::i1);
46712 LHS = insertSubVector(DAG.getUNDEF(SrcVT), LHS, 0, DAG, DL,
46713 VT.getSizeInBits());
46714 RHS = insertSubVector(DAG.getUNDEF(SrcVT), RHS, 0, DAG, DL,
46715 VT.getSizeInBits());
46716 Cond = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, SrcCondVT,
46717 DAG.getUNDEF(SrcCondVT), Cond,
46718 DAG.getIntPtrConstant(0, DL));
46719 SDValue Res = DAG.getSelect(DL, SrcVT, Cond, LHS, RHS);
46720 return extractSubVector(Res, 0, DAG, DL, VT.getSizeInBits());
46721 }
46722 }
46723
46724 if (SDValue V = combineSelectOfTwoConstants(N, DAG))
46725 return V;
46726
46727 if (N->getOpcode() == ISD::SELECT && Cond.getOpcode() == ISD::SETCC &&
46728 Cond.hasOneUse()) {
46729 EVT CondVT = Cond.getValueType();
46730 SDValue Cond0 = Cond.getOperand(0);
46731 SDValue Cond1 = Cond.getOperand(1);
46732 ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
46733
46734 // Canonicalize min/max:
46735 // (x > 0) ? x : 0 -> (x >= 0) ? x : 0
46736 // (x < -1) ? x : -1 -> (x <= -1) ? x : -1
46737 // This allows use of COND_S / COND_NS (see TranslateX86CC) which eliminates
46738 // the need for an extra compare against zero. e.g.
46739 // (a - b) > 0 ? (a - b) : 0 -> (a - b) >= 0 ? (a - b) : 0
46740 // subl %esi, %edi
46741 // testl %edi, %edi
46742 // movl $0, %eax
46743 // cmovgl %edi, %eax
46744 // =>
46745 // xorl %eax, %eax
46746 // subl %esi, %edi
46747 // cmovsl %eax, %edi
46748 //
46749 // We can also canonicalize
46750 // (x s> 1) ? x : 1 -> (x s>= 1) ? x : 1 -> (x s> 0) ? x : 1
46751 // (x u> 1) ? x : 1 -> (x u>= 1) ? x : 1 -> (x != 0) ? x : 1
46752 // This allows the use of a test instruction for the compare.
46753 if (LHS == Cond0 && RHS == Cond1) {
46754 if ((CC == ISD::SETGT && (isNullConstant(RHS) || isOneConstant(RHS))) ||
46755 (CC == ISD::SETLT && isAllOnesConstant(RHS))) {
46756 ISD::CondCode NewCC = CC == ISD::SETGT ? ISD::SETGE : ISD::SETLE;
46757 Cond = DAG.getSetCC(SDLoc(Cond), CondVT, Cond0, Cond1, NewCC);
46758 return DAG.getSelect(DL, VT, Cond, LHS, RHS);
46759 }
46760 if (CC == ISD::SETUGT && isOneConstant(RHS)) {
46761 ISD::CondCode NewCC = ISD::SETUGE;
46762 Cond = DAG.getSetCC(SDLoc(Cond), CondVT, Cond0, Cond1, NewCC);
46763 return DAG.getSelect(DL, VT, Cond, LHS, RHS);
46764 }
46765 }
46766
46767 // Similar to DAGCombine's select(or(CC0,CC1),X,Y) fold but for legal types.
46768 // fold eq + gt/lt nested selects into ge/le selects
46769 // select (cmpeq Cond0, Cond1), LHS, (select (cmpugt Cond0, Cond1), LHS, Y)
46770 // --> (select (cmpuge Cond0, Cond1), LHS, Y)
46771 // select (cmpslt Cond0, Cond1), LHS, (select (cmpeq Cond0, Cond1), LHS, Y)
46772 // --> (select (cmpsle Cond0, Cond1), LHS, Y)
46773 // .. etc ..
46774 if (RHS.getOpcode() == ISD::SELECT && RHS.getOperand(1) == LHS &&
46775 RHS.getOperand(0).getOpcode() == ISD::SETCC) {
46776 SDValue InnerSetCC = RHS.getOperand(0);
46777 ISD::CondCode InnerCC =
46778 cast<CondCodeSDNode>(InnerSetCC.getOperand(2))->get();
46779 if ((CC == ISD::SETEQ || InnerCC == ISD::SETEQ) &&
46780 Cond0 == InnerSetCC.getOperand(0) &&
46781 Cond1 == InnerSetCC.getOperand(1)) {
46782 ISD::CondCode NewCC;
46783 switch (CC == ISD::SETEQ ? InnerCC : CC) {
46784 case ISD::SETGT: NewCC = ISD::SETGE; break;
46785 case ISD::SETLT: NewCC = ISD::SETLE; break;
46786 case ISD::SETUGT: NewCC = ISD::SETUGE; break;
46787 case ISD::SETULT: NewCC = ISD::SETULE; break;
46788 default: NewCC = ISD::SETCC_INVALID; break;
46789 }
46790 if (NewCC != ISD::SETCC_INVALID) {
46791 Cond = DAG.getSetCC(DL, CondVT, Cond0, Cond1, NewCC);
46792 return DAG.getSelect(DL, VT, Cond, LHS, RHS.getOperand(2));
46793 }
46794 }
46795 }
46796 }
46797
46798 // Check if the first operand is all zeros and Cond type is vXi1.
46799 // If this an avx512 target we can improve the use of zero masking by
46800 // swapping the operands and inverting the condition.
46801 if (N->getOpcode() == ISD::VSELECT && Cond.hasOneUse() &&
46802 Subtarget.hasAVX512() && CondVT.getVectorElementType() == MVT::i1 &&
46803 ISD::isBuildVectorAllZeros(LHS.getNode()) &&
46804 !ISD::isBuildVectorAllZeros(RHS.getNode())) {
46805 // Invert the cond to not(cond) : xor(op,allones)=not(op)
46806 SDValue CondNew = DAG.getNOT(DL, Cond, CondVT);
46807 // Vselect cond, op1, op2 = Vselect not(cond), op2, op1
46808 return DAG.getSelect(DL, VT, CondNew, RHS, LHS);
46809 }
46810
46811 // Attempt to convert a (vXi1 bitcast(iX Cond)) selection mask before it might
46812 // get split by legalization.
46813 if (N->getOpcode() == ISD::VSELECT && Cond.getOpcode() == ISD::BITCAST &&
46814 CondVT.getVectorElementType() == MVT::i1 &&
46815 TLI.isTypeLegal(VT.getScalarType())) {
46816 EVT ExtCondVT = VT.changeVectorElementTypeToInteger();
46817 if (SDValue ExtCond = combineToExtendBoolVectorInReg(
46818 ISD::SIGN_EXTEND, DL, ExtCondVT, Cond, DAG, DCI, Subtarget)) {
46819 ExtCond = DAG.getNode(ISD::TRUNCATE, DL, CondVT, ExtCond);
46820 return DAG.getSelect(DL, VT, ExtCond, LHS, RHS);
46821 }
46822 }
46823
46824 // Early exit check
46825 if (!TLI.isTypeLegal(VT) || isSoftFP16(VT, Subtarget))
46826 return SDValue();
46827
46828 if (SDValue V = combineVSelectWithAllOnesOrZeros(N, DAG, DCI, Subtarget))
46829 return V;
46830
46831 if (SDValue V = combineVSelectToBLENDV(N, DAG, DCI, Subtarget))
46832 return V;
46833
46834 if (SDValue V = narrowVectorSelect(N, DAG, Subtarget))
46835 return V;
46836
46837 // select(~Cond, X, Y) -> select(Cond, Y, X)
46838 if (CondVT.getScalarType() != MVT::i1) {
46839 if (SDValue CondNot = IsNOT(Cond, DAG))
46840 return DAG.getNode(N->getOpcode(), DL, VT,
46841 DAG.getBitcast(CondVT, CondNot), RHS, LHS);
46842
46843 // pcmpgt(X, -1) -> pcmpgt(0, X) to help select/blendv just use the
46844 // signbit.
46845 if (Cond.getOpcode() == X86ISD::PCMPGT &&
46846 ISD::isBuildVectorAllOnes(Cond.getOperand(1).getNode()) &&
46847 Cond.hasOneUse()) {
46848 Cond = DAG.getNode(X86ISD::PCMPGT, DL, CondVT,
46849 DAG.getConstant(0, DL, CondVT), Cond.getOperand(0));
46850 return DAG.getNode(N->getOpcode(), DL, VT, Cond, RHS, LHS);
46851 }
46852 }
46853
46854 // Try to optimize vXi1 selects if both operands are either all constants or
46855 // bitcasts from scalar integer type. In that case we can convert the operands
46856 // to integer and use an integer select which will be converted to a CMOV.
46857 // We need to take a little bit of care to avoid creating an i64 type after
46858 // type legalization.
46859 if (N->getOpcode() == ISD::SELECT && VT.isVector() &&
46860 VT.getVectorElementType() == MVT::i1 &&
46861 (DCI.isBeforeLegalize() || (VT != MVT::v64i1 || Subtarget.is64Bit()))) {
46862 EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), VT.getVectorNumElements());
46863 if (DCI.isBeforeLegalize() || TLI.isTypeLegal(IntVT)) {
46864 bool LHSIsConst = ISD::isBuildVectorOfConstantSDNodes(LHS.getNode());
46865 bool RHSIsConst = ISD::isBuildVectorOfConstantSDNodes(RHS.getNode());
46866
46867 if ((LHSIsConst || (LHS.getOpcode() == ISD::BITCAST &&
46868 LHS.getOperand(0).getValueType() == IntVT)) &&
46869 (RHSIsConst || (RHS.getOpcode() == ISD::BITCAST &&
46870 RHS.getOperand(0).getValueType() == IntVT))) {
46871 if (LHSIsConst)
46872 LHS = combinevXi1ConstantToInteger(LHS, DAG);
46873 else
46874 LHS = LHS.getOperand(0);
46875
46876 if (RHSIsConst)
46877 RHS = combinevXi1ConstantToInteger(RHS, DAG);
46878 else
46879 RHS = RHS.getOperand(0);
46880
46881 SDValue Select = DAG.getSelect(DL, IntVT, Cond, LHS, RHS);
46882 return DAG.getBitcast(VT, Select);
46883 }
46884 }
46885 }
46886
46887 // If this is "((X & C) == 0) ? Y : Z" and C is a constant mask vector of
46888 // single bits, then invert the predicate and swap the select operands.
46889 // This can lower using a vector shift bit-hack rather than mask and compare.
46890 if (DCI.isBeforeLegalize() && !Subtarget.hasAVX512() &&
46891 N->getOpcode() == ISD::VSELECT && Cond.getOpcode() == ISD::SETCC &&
46892 Cond.hasOneUse() && CondVT.getVectorElementType() == MVT::i1 &&
46893 Cond.getOperand(0).getOpcode() == ISD::AND &&
46894 isNullOrNullSplat(Cond.getOperand(1)) &&
46895 cast<CondCodeSDNode>(Cond.getOperand(2))->get() == ISD::SETEQ &&
46896 Cond.getOperand(0).getValueType() == VT) {
46897 // The 'and' mask must be composed of power-of-2 constants.
46898 SDValue And = Cond.getOperand(0);
46899 auto *C = isConstOrConstSplat(And.getOperand(1));
46900 if (C && C->getAPIntValue().isPowerOf2()) {
46901 // vselect (X & C == 0), LHS, RHS --> vselect (X & C != 0), RHS, LHS
46902 SDValue NotCond =
46903 DAG.getSetCC(DL, CondVT, And, Cond.getOperand(1), ISD::SETNE);
46904 return DAG.getSelect(DL, VT, NotCond, RHS, LHS);
46905 }
46906
46907 // If we have a non-splat but still powers-of-2 mask, AVX1 can use pmulld
46908 // and AVX2 can use vpsllv{dq}. 8-bit lacks a proper shift or multiply.
46909 // 16-bit lacks a proper blendv.
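    // Illustrative example (v4i32 assumed): with C = <1,2,4,8>, the select
    //   ((X & C) == 0) ? Y : Z
    // becomes
    //   ((X << <31,30,29,28>) s< 0) ? Z : Y
    // so each mask bit lands in its lane's sign bit and can drive BLENDV.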
46910 unsigned EltBitWidth = VT.getScalarSizeInBits();
46911 bool CanShiftBlend =
46912 TLI.isTypeLegal(VT) && ((Subtarget.hasAVX() && EltBitWidth == 32) ||
46913 (Subtarget.hasAVX2() && EltBitWidth == 64) ||
46914 (Subtarget.hasXOP()));
46915 if (CanShiftBlend &&
46916 ISD::matchUnaryPredicate(And.getOperand(1), [](ConstantSDNode *C) {
46917 return C->getAPIntValue().isPowerOf2();
46918 })) {
46919 // Create a left-shift constant to get the mask bits over to the sign-bit.
46920 SDValue Mask = And.getOperand(1);
46921 SmallVector<int, 32> ShlVals;
46922 for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; ++i) {
46923 auto *MaskVal = cast<ConstantSDNode>(Mask.getOperand(i));
46924 ShlVals.push_back(EltBitWidth - 1 -
46925 MaskVal->getAPIntValue().exactLogBase2());
46926 }
46927 // vsel ((X & C) == 0), LHS, RHS --> vsel ((shl X, C') < 0), RHS, LHS
46928 SDValue ShlAmt = getConstVector(ShlVals, VT.getSimpleVT(), DAG, DL);
46929 SDValue Shl = DAG.getNode(ISD::SHL, DL, VT, And.getOperand(0), ShlAmt);
46930 SDValue NewCond =
46931 DAG.getSetCC(DL, CondVT, Shl, Cond.getOperand(1), ISD::SETLT);
46932 return DAG.getSelect(DL, VT, NewCond, RHS, LHS);
46933 }
46934 }
46935
46936 return SDValue();
46937}
46938
46939/// Combine:
46940/// (brcond/cmov/setcc .., (cmp (atomic_load_add x, 1), 0), COND_S)
46941/// to:
46942/// (brcond/cmov/setcc .., (LADD x, 1), COND_LE)
46943/// i.e., reusing the EFLAGS produced by the LOCKed instruction.
46944/// Note that this is only legal for some op/cc combinations.
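/// Illustrative source-level motivation (assumed C code, not from this file):
///   if (__atomic_fetch_add(&x, 1, __ATOMIC_SEQ_CST) < 0) ...   // tests COND_S
/// can reuse the EFLAGS of `lock add $1, x` with COND_LE instead, because
/// old < 0 is equivalent to old + 1 <= 0, i.e. the updated value being <= 0.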
46945static SDValue combineSetCCAtomicArith(SDValue Cmp, X86::CondCode &CC,
46946 SelectionDAG &DAG,
46947 const X86Subtarget &Subtarget) {
46948 // This combine only operates on CMP-like nodes.
46949 if (!(Cmp.getOpcode() == X86ISD::CMP ||
46950 (Cmp.getOpcode() == X86ISD::SUB && !Cmp->hasAnyUseOfValue(0))))
46951 return SDValue();
46952
46953 // Can't replace the cmp if it has more uses than the one we're looking at.
46954 // FIXME: We would like to be able to handle this, but would need to make sure
46955 // all uses were updated.
46956 if (!Cmp.hasOneUse())
46957 return SDValue();
46958
46959 // This only applies to variations of the common case:
46960 // (icmp slt x, 0) -> (icmp sle (add x, 1), 0)
46961 // (icmp sge x, 0) -> (icmp sgt (add x, 1), 0)
46962 // (icmp sle x, 0) -> (icmp slt (sub x, 1), 0)
46963 // (icmp sgt x, 0) -> (icmp sge (sub x, 1), 0)
46964 // Using the proper condcodes (see below), overflow is checked for.
46965
46966 // FIXME: We can generalize both constraints:
46967 // - XOR/OR/AND (if they were made to survive AtomicExpand)
46968 // - LHS != 1
46969 // if the result is compared.
46970
46971 SDValue CmpLHS = Cmp.getOperand(0);
46972 SDValue CmpRHS = Cmp.getOperand(1);
46973 EVT CmpVT = CmpLHS.getValueType();
46974
46975 if (!CmpLHS.hasOneUse())
46976 return SDValue();
46977
46978 unsigned Opc = CmpLHS.getOpcode();
46979 if (Opc != ISD::ATOMIC_LOAD_ADD && Opc != ISD::ATOMIC_LOAD_SUB)
46980 return SDValue();
46981
46982 SDValue OpRHS = CmpLHS.getOperand(2);
46983 auto *OpRHSC = dyn_cast<ConstantSDNode>(OpRHS);
46984 if (!OpRHSC)
46985 return SDValue();
46986
46987 APInt Addend = OpRHSC->getAPIntValue();
46988 if (Opc == ISD::ATOMIC_LOAD_SUB)
46989 Addend = -Addend;
46990
46991 auto *CmpRHSC = dyn_cast<ConstantSDNode>(CmpRHS);
46992 if (!CmpRHSC)
46993 return SDValue();
46994
46995 APInt Comparison = CmpRHSC->getAPIntValue();
46996 APInt NegAddend = -Addend;
46997
46998 // See if we can adjust the CC to make the comparison match the negated
46999 // addend.
47000 if (Comparison != NegAddend) {
47001 APInt IncComparison = Comparison + 1;
47002 if (IncComparison == NegAddend) {
47003 if (CC == X86::COND_A && !Comparison.isMaxValue()) {
47004 Comparison = IncComparison;
47005 CC = X86::COND_AE;
47006 } else if (CC == X86::COND_LE && !Comparison.isMaxSignedValue()) {
47007 Comparison = IncComparison;
47008 CC = X86::COND_L;
47009 }
47010 }
47011 APInt DecComparison = Comparison - 1;
47012 if (DecComparison == NegAddend) {
47013 if (CC == X86::COND_AE && !Comparison.isMinValue()) {
47014 Comparison = DecComparison;
47015 CC = X86::COND_A;
47016 } else if (CC == X86::COND_L && !Comparison.isMinSignedValue()) {
47017 Comparison = DecComparison;
47018 CC = X86::COND_LE;
47019 }
47020 }
47021 }
47022
47023 // If the addend is the negation of the comparison value, then we can do
47024 // a full comparison by emitting the atomic arithmetic as a locked sub.
47025 if (Comparison == NegAddend) {
47026 // The CC is fine, but we need to rewrite the LHS of the comparison as an
47027 // atomic sub.
47028 auto *AN = cast<AtomicSDNode>(CmpLHS.getNode());
47029 auto AtomicSub = DAG.getAtomic(
47030 ISD::ATOMIC_LOAD_SUB, SDLoc(CmpLHS), CmpVT,
47031 /*Chain*/ CmpLHS.getOperand(0), /*LHS*/ CmpLHS.getOperand(1),
47032 /*RHS*/ DAG.getConstant(NegAddend, SDLoc(CmpRHS), CmpVT),
47033 AN->getMemOperand());
47034 auto LockOp = lowerAtomicArithWithLOCK(AtomicSub, DAG, Subtarget);
47035 DAG.ReplaceAllUsesOfValueWith(CmpLHS.getValue(0), DAG.getUNDEF(CmpVT));
47036 DAG.ReplaceAllUsesOfValueWith(CmpLHS.getValue(1), LockOp.getValue(1));
47037 return LockOp;
47038 }
47039
47040 // We can handle comparisons with zero in a number of cases by manipulating
47041 // the CC used.
47042 if (!Comparison.isZero())
47043 return SDValue();
47044
47045 if (CC == X86::COND_S && Addend == 1)
47046 CC = X86::COND_LE;
47047 else if (CC == X86::COND_NS && Addend == 1)
47048 CC = X86::COND_G;
47049 else if (CC == X86::COND_G && Addend == -1)
47050 CC = X86::COND_GE;
47051 else if (CC == X86::COND_LE && Addend == -1)
47052 CC = X86::COND_L;
47053 else
47054 return SDValue();
47055
47056 SDValue LockOp = lowerAtomicArithWithLOCK(CmpLHS, DAG, Subtarget);
47057 DAG.ReplaceAllUsesOfValueWith(CmpLHS.getValue(0), DAG.getUNDEF(CmpVT));
47058 DAG.ReplaceAllUsesOfValueWith(CmpLHS.getValue(1), LockOp.getValue(1));
47059 return LockOp;
47060}
47061
47062// Check whether a boolean test is testing a boolean value generated by
47063// X86ISD::SETCC. If so, return the operand of that SETCC and proper condition
47064// code.
47065//
47066// Simplify the following patterns:
47067// (Op (CMP (SETCC Cond EFLAGS) 1) EQ) or
47068// (Op (CMP (SETCC Cond EFLAGS) 0) NEQ)
47069// to (Op EFLAGS Cond)
47070//
47071// (Op (CMP (SETCC Cond EFLAGS) 0) EQ) or
47072// (Op (CMP (SETCC Cond EFLAGS) 1) NEQ)
47073// to (Op EFLAGS !Cond)
47074//
47075// where Op could be BRCOND or CMOV.
47076//
47077static SDValue checkBoolTestSetCCCombine(SDValue Cmp, X86::CondCode &CC) {
47078 // This combine only operates on CMP-like nodes.
47079 if (!(Cmp.getOpcode() == X86ISD::CMP ||
47080 (Cmp.getOpcode() == X86ISD::SUB && !Cmp->hasAnyUseOfValue(0))))
47081 return SDValue();
47082
47083 // Quit if not used as a boolean value.
47084 if (CC != X86::COND_E && CC != X86::COND_NE)
47085 return SDValue();
47086
47087 // Check CMP operands. One of them should be 0 or 1 and the other should be
47088 // a SetCC or extended from it.
47089 SDValue Op1 = Cmp.getOperand(0);
47090 SDValue Op2 = Cmp.getOperand(1);
47091
47092 SDValue SetCC;
47093 const ConstantSDNode* C = nullptr;
47094 bool needOppositeCond = (CC == X86::COND_E);
47095 bool checkAgainstTrue = false; // Is it a comparison against 1?
47096
47097 if ((C = dyn_cast<ConstantSDNode>(Op1)))
47098 SetCC = Op2;
47099 else if ((C = dyn_cast<ConstantSDNode>(Op2)))
47100 SetCC = Op1;
47101 else // Quit if neither operand is a constant.
47102 return SDValue();
47103
47104 if (C->getZExtValue() == 1) {
47105 needOppositeCond = !needOppositeCond;
47106 checkAgainstTrue = true;
47107 } else if (C->getZExtValue() != 0)
47108 // Quit if the constant is neither 0 nor 1.
47109 return SDValue();
47110
47111 bool truncatedToBoolWithAnd = false;
47112 // Skip (zext $x), (trunc $x), or (and $x, 1) node.
47113 while (SetCC.getOpcode() == ISD::ZERO_EXTEND ||
47114 SetCC.getOpcode() == ISD::TRUNCATE ||
47115 SetCC.getOpcode() == ISD::AND) {
47116 if (SetCC.getOpcode() == ISD::AND) {
47117 int OpIdx = -1;
47118 if (isOneConstant(SetCC.getOperand(0)))
47119 OpIdx = 1;
47120 if (isOneConstant(SetCC.getOperand(1)))
47121 OpIdx = 0;
47122 if (OpIdx < 0)
47123 break;
47124 SetCC = SetCC.getOperand(OpIdx);
47125 truncatedToBoolWithAnd = true;
47126 } else
47127 SetCC = SetCC.getOperand(0);
47128 }
47129
47130 switch (SetCC.getOpcode()) {
47131 case X86ISD::SETCC_CARRY:
47132 // Since SETCC_CARRY gives output based on R = CF ? ~0 : 0, it's unsafe to
47133 // simplify it if the result of SETCC_CARRY is not canonicalized to 0 or 1,
47134 // i.e. it's a comparison against true but the result of SETCC_CARRY is not
47135 // truncated to i1 using 'and'.
47136 if (checkAgainstTrue && !truncatedToBoolWithAnd)
47137 break;
47138     assert(X86::CondCode(SetCC.getConstantOperandVal(0)) == X86::COND_B &&
47139            "Invalid use of SETCC_CARRY!");
47140 [[fallthrough]];
47141 case X86ISD::SETCC:
47142 // Set the condition code or opposite one if necessary.
47143 CC = X86::CondCode(SetCC.getConstantOperandVal(0));
47144 if (needOppositeCond)
47145 CC = X86::GetOppositeBranchCondition(CC);
47146 return SetCC.getOperand(1);
47147 case X86ISD::CMOV: {
47148 // Check whether false/true value has canonical one, i.e. 0 or 1.
47149 ConstantSDNode *FVal = dyn_cast<ConstantSDNode>(SetCC.getOperand(0));
47150 ConstantSDNode *TVal = dyn_cast<ConstantSDNode>(SetCC.getOperand(1));
47151 // Quit if true value is not a constant.
47152 if (!TVal)
47153 return SDValue();
47154 // Quit if false value is not a constant.
47155 if (!FVal) {
47156 SDValue Op = SetCC.getOperand(0);
47157 // Skip 'zext' or 'trunc' node.
47158 if (Op.getOpcode() == ISD::ZERO_EXTEND ||
47159 Op.getOpcode() == ISD::TRUNCATE)
47160 Op = Op.getOperand(0);
47161 // A special case for rdrand/rdseed, where 0 is set when the false
47162 // condition is found.
47163 if ((Op.getOpcode() != X86ISD::RDRAND &&
47164 Op.getOpcode() != X86ISD::RDSEED) || Op.getResNo() != 0)
47165 return SDValue();
47166 }
47167 // Quit if false value is not the constant 0 or 1.
47168 bool FValIsFalse = true;
47169 if (FVal && FVal->getZExtValue() != 0) {
47170 if (FVal->getZExtValue() != 1)
47171 return SDValue();
47172 // If FVal is 1, opposite cond is needed.
47173 needOppositeCond = !needOppositeCond;
47174 FValIsFalse = false;
47175 }
47176 // Quit if TVal is not the constant opposite of FVal.
47177 if (FValIsFalse && TVal->getZExtValue() != 1)
47178 return SDValue();
47179 if (!FValIsFalse && TVal->getZExtValue() != 0)
47180 return SDValue();
47181 CC = X86::CondCode(SetCC.getConstantOperandVal(2));
47182 if (needOppositeCond)
47183 CC = X86::GetOppositeBranchCondition(CC);
47184 return SetCC.getOperand(3);
47185 }
47186 }
47187
47188 return SDValue();
47189}
47190
47191/// Check whether Cond is an AND/OR of SETCCs off of the same EFLAGS.
47192/// Match:
47193/// (X86or (X86setcc) (X86setcc))
47194/// (X86cmp (and (X86setcc) (X86setcc)), 0)
47195static bool checkBoolTestAndOrSetCCCombine(SDValue Cond, X86::CondCode &CC0,
47196 X86::CondCode &CC1, SDValue &Flags,
47197 bool &isAnd) {
47198 if (Cond->getOpcode() == X86ISD::CMP) {
47199 if (!isNullConstant(Cond->getOperand(1)))
47200 return false;
47201
47202 Cond = Cond->getOperand(0);
47203 }
47204
47205 isAnd = false;
47206
47207 SDValue SetCC0, SetCC1;
47208 switch (Cond->getOpcode()) {
47209 default: return false;
47210 case ISD::AND:
47211 case X86ISD::AND:
47212 isAnd = true;
47213 [[fallthrough]];
47214 case ISD::OR:
47215 case X86ISD::OR:
47216 SetCC0 = Cond->getOperand(0);
47217 SetCC1 = Cond->getOperand(1);
47218 break;
47219 };
47220
47221 // Make sure we have SETCC nodes, using the same flags value.
47222 if (SetCC0.getOpcode() != X86ISD::SETCC ||
47223 SetCC1.getOpcode() != X86ISD::SETCC ||
47224 SetCC0->getOperand(1) != SetCC1->getOperand(1))
47225 return false;
47226
47227 CC0 = (X86::CondCode)SetCC0->getConstantOperandVal(0);
47228 CC1 = (X86::CondCode)SetCC1->getConstantOperandVal(0);
47229 Flags = SetCC0->getOperand(1);
47230 return true;
47231}
47232
47233 // When legalizing carry, we create carries via add X, -1.
47234// If that comes from an actual carry, via setcc, we use the
47235// carry directly.
47236static SDValue combineCarryThroughADD(SDValue EFLAGS, SelectionDAG &DAG) {
47237 if (EFLAGS.getOpcode() == X86ISD::ADD) {
47238 if (isAllOnesConstant(EFLAGS.getOperand(1))) {
47239 bool FoundAndLSB = false;
47240 SDValue Carry = EFLAGS.getOperand(0);
47241 while (Carry.getOpcode() == ISD::TRUNCATE ||
47242 Carry.getOpcode() == ISD::ZERO_EXTEND ||
47243 (Carry.getOpcode() == ISD::AND &&
47244 isOneConstant(Carry.getOperand(1)))) {
47245 FoundAndLSB |= Carry.getOpcode() == ISD::AND;
47246 Carry = Carry.getOperand(0);
47247 }
47248 if (Carry.getOpcode() == X86ISD::SETCC ||
47249 Carry.getOpcode() == X86ISD::SETCC_CARRY) {
47250 // TODO: Merge this code with equivalent in combineAddOrSubToADCOrSBB?
47251 uint64_t CarryCC = Carry.getConstantOperandVal(0);
47252 SDValue CarryOp1 = Carry.getOperand(1);
47253 if (CarryCC == X86::COND_B)
47254 return CarryOp1;
47255 if (CarryCC == X86::COND_A) {
47256 // Try to convert COND_A into COND_B in an attempt to facilitate
47257 // materializing "setb reg".
47258 //
47259 // Do not flip "e > c", where "c" is a constant, because the CMP
47260 // instruction cannot take an immediate as its first operand.
47261 //
47262 if (CarryOp1.getOpcode() == X86ISD::SUB &&
47263 CarryOp1.getNode()->hasOneUse() &&
47264 CarryOp1.getValueType().isInteger() &&
47265 !isa<ConstantSDNode>(CarryOp1.getOperand(1))) {
47266 SDValue SubCommute =
47267 DAG.getNode(X86ISD::SUB, SDLoc(CarryOp1), CarryOp1->getVTList(),
47268 CarryOp1.getOperand(1), CarryOp1.getOperand(0));
47269 return SDValue(SubCommute.getNode(), CarryOp1.getResNo());
47270 }
47271 }
47272 // If this is a check of the z flag of an add with 1, switch to the
47273 // C flag.
47274 if (CarryCC == X86::COND_E &&
47275 CarryOp1.getOpcode() == X86ISD::ADD &&
47276 isOneConstant(CarryOp1.getOperand(1)))
47277 return CarryOp1;
47278 } else if (FoundAndLSB) {
47279 SDLoc DL(Carry);
47280 SDValue BitNo = DAG.getConstant(0, DL, Carry.getValueType());
47281 if (Carry.getOpcode() == ISD::SRL) {
47282 BitNo = Carry.getOperand(1);
47283 Carry = Carry.getOperand(0);
47284 }
47285 return getBT(Carry, BitNo, DL, DAG);
47286 }
47287 }
47288 }
47289
47290 return SDValue();
47291}
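A minimal standalone sketch (editorial illustration, not part of X86ISelLowering.cpp) of the identity combineCarryThroughADD relies on: ADD(X, -1) sets CF exactly when X != 0, so when X is itself a 0/1 carry bit the produced CF equals X. The helper name is hypothetical.

#include <cstdint>

// Scalar stand-in for the CF produced by "add X, -1": the unsigned addition
// wraps exactly when X != 0.
constexpr bool carryOfAddAllOnes(uint32_t X) {
  return static_cast<uint64_t>(X) + 0xFFFFFFFFull > 0xFFFFFFFFull;
}

// For a 0/1 carry bit, the CF of the ADD is the carry bit itself, which is
// why the combine can return the original carry source directly.
static_assert(!carryOfAddAllOnes(0) && carryOfAddAllOnes(1),
              "CF(add X, -1) == X for X in {0, 1}");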
47292
47293/// If we are inverting a PTEST/TESTP operand, attempt to adjust the CC
47294/// to avoid the inversion.
47295static SDValue combinePTESTCC(SDValue EFLAGS, X86::CondCode &CC,
47296 SelectionDAG &DAG,
47297 const X86Subtarget &Subtarget) {
47298 // TODO: Handle X86ISD::KTEST/X86ISD::KORTEST.
47299 if (EFLAGS.getOpcode() != X86ISD::PTEST &&
47300 EFLAGS.getOpcode() != X86ISD::TESTP)
47301 return SDValue();
47302
47303 // PTEST/TESTP sets EFLAGS as:
47304 // TESTZ: ZF = (Op0 & Op1) == 0
47305 // TESTC: CF = (~Op0 & Op1) == 0
47306 // TESTNZC: ZF == 0 && CF == 0
47307 EVT VT = EFLAGS.getValueType();
47308 SDValue Op0 = EFLAGS.getOperand(0);
47309 SDValue Op1 = EFLAGS.getOperand(1);
47310 EVT OpVT = Op0.getValueType();
47311
47312 // TEST*(~X,Y) == TEST*(X,Y)
47313 if (SDValue NotOp0 = IsNOT(Op0, DAG)) {
47314 X86::CondCode InvCC;
47315 switch (CC) {
47316 case X86::COND_B:
47317 // testc -> testz.
47318 InvCC = X86::COND_E;
47319 break;
47320 case X86::COND_AE:
47321 // !testc -> !testz.
47322 InvCC = X86::COND_NE;
47323 break;
47324 case X86::COND_E:
47325 // testz -> testc.
47326 InvCC = X86::COND_B;
47327 break;
47328 case X86::COND_NE:
47329 // !testz -> !testc.
47330 InvCC = X86::COND_AE;
47331 break;
47332 case X86::COND_A:
47333 case X86::COND_BE:
47334 // testnzc -> testnzc (no change).
47335 InvCC = CC;
47336 break;
47337 default:
47338 InvCC = X86::COND_INVALID;
47339 break;
47340 }
47341
47342 if (InvCC != X86::COND_INVALID) {
47343 CC = InvCC;
47344 return DAG.getNode(EFLAGS.getOpcode(), SDLoc(EFLAGS), VT,
47345 DAG.getBitcast(OpVT, NotOp0), Op1);
47346 }
47347 }
47348
47349 if (CC == X86::COND_B || CC == X86::COND_AE) {
47350 // TESTC(X,~X) == TESTC(X,-1)
47351 if (SDValue NotOp1 = IsNOT(Op1, DAG)) {
47352 if (peekThroughBitcasts(NotOp1) == peekThroughBitcasts(Op0)) {
47353 SDLoc DL(EFLAGS);
47354 return DAG.getNode(EFLAGS.getOpcode(), DL, VT,
47355 DAG.getBitcast(OpVT, NotOp1),
47356 DAG.getAllOnesConstant(DL, OpVT));
47357 }
47358 }
47359 }
47360
47361 if (CC == X86::COND_E || CC == X86::COND_NE) {
47362 // TESTZ(X,~Y) == TESTC(Y,X)
47363 if (SDValue NotOp1 = IsNOT(Op1, DAG)) {
47364 CC = (CC == X86::COND_E ? X86::COND_B : X86::COND_AE);
47365 return DAG.getNode(EFLAGS.getOpcode(), SDLoc(EFLAGS), VT,
47366 DAG.getBitcast(OpVT, NotOp1), Op0);
47367 }
47368
47369 if (Op0 == Op1) {
47370 SDValue BC = peekThroughBitcasts(Op0);
47371 EVT BCVT = BC.getValueType();
47372
47373 // TESTZ(AND(X,Y),AND(X,Y)) == TESTZ(X,Y)
47374 if (BC.getOpcode() == ISD::AND || BC.getOpcode() == X86ISD::FAND) {
47375 return DAG.getNode(EFLAGS.getOpcode(), SDLoc(EFLAGS), VT,
47376 DAG.getBitcast(OpVT, BC.getOperand(0)),
47377 DAG.getBitcast(OpVT, BC.getOperand(1)));
47378 }
47379
47380 // TESTZ(AND(~X,Y),AND(~X,Y)) == TESTC(X,Y)
47381 if (BC.getOpcode() == X86ISD::ANDNP || BC.getOpcode() == X86ISD::FANDN) {
47382 CC = (CC == X86::COND_E ? X86::COND_B : X86::COND_AE);
47383 return DAG.getNode(EFLAGS.getOpcode(), SDLoc(EFLAGS), VT,
47384 DAG.getBitcast(OpVT, BC.getOperand(0)),
47385 DAG.getBitcast(OpVT, BC.getOperand(1)));
47386 }
47387
47388 // If every element is an all-sign value, see if we can use TESTP/MOVMSK
47389 // to more efficiently extract the sign bits and compare that.
47390 // TODO: Handle TESTC with comparison inversion.
47391 // TODO: Can we remove SimplifyMultipleUseDemandedBits and rely on
47392 // TESTP/MOVMSK combines to make sure it's never worse than PTEST?
47393 if (BCVT.isVector() && DAG.getTargetLoweringInfo().isTypeLegal(BCVT)) {
47394 unsigned EltBits = BCVT.getScalarSizeInBits();
47395 if (DAG.ComputeNumSignBits(BC) == EltBits) {
47396 assert(VT == MVT::i32 && "Expected i32 EFLAGS comparison result");
47397 APInt SignMask = APInt::getSignMask(EltBits);
47398 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
47399 if (SDValue Res =
47400 TLI.SimplifyMultipleUseDemandedBits(BC, SignMask, DAG)) {
47401 // For vXi16 cases we need to use pmovmskb and extract every other
47402 // sign bit.
47403 SDLoc DL(EFLAGS);
47404 if ((EltBits == 32 || EltBits == 64) && Subtarget.hasAVX()) {
47405 MVT FloatSVT = MVT::getFloatingPointVT(EltBits);
47406 MVT FloatVT =
47407 MVT::getVectorVT(FloatSVT, OpVT.getSizeInBits() / EltBits);
47408 Res = DAG.getBitcast(FloatVT, Res);
47409 return DAG.getNode(X86ISD::TESTP, SDLoc(EFLAGS), VT, Res, Res);
47410 } else if (EltBits == 16) {
47411 MVT MovmskVT = BCVT.is128BitVector() ? MVT::v16i8 : MVT::v32i8;
47412 Res = DAG.getBitcast(MovmskVT, Res);
47413 Res = getPMOVMSKB(DL, Res, DAG, Subtarget);
47414 Res = DAG.getNode(ISD::AND, DL, MVT::i32, Res,
47415 DAG.getConstant(0xAAAAAAAA, DL, MVT::i32));
47416 } else {
47417 Res = getPMOVMSKB(DL, Res, DAG, Subtarget);
47418 }
47419 return DAG.getNode(X86ISD::CMP, DL, MVT::i32, Res,
47420 DAG.getConstant(0, DL, MVT::i32));
47421 }
47422 }
47423 }
47424 }
47425
47426 // TESTZ(-1,X) == TESTZ(X,X)
47427 if (ISD::isBuildVectorAllOnes(Op0.getNode()))
47428 return DAG.getNode(EFLAGS.getOpcode(), SDLoc(EFLAGS), VT, Op1, Op1);
47429
47430 // TESTZ(X,-1) == TESTZ(X,X)
47431 if (ISD::isBuildVectorAllOnes(Op1.getNode()))
47432 return DAG.getNode(EFLAGS.getOpcode(), SDLoc(EFLAGS), VT, Op0, Op0);
47433
47434 // TESTZ(OR(LO(X),HI(X)),OR(LO(Y),HI(Y))) -> TESTZ(X,Y)
47435 // TODO: Add COND_NE handling?
47436 if (CC == X86::COND_E && OpVT.is128BitVector() && Subtarget.hasAVX()) {
47437 SDValue Src0 = peekThroughBitcasts(Op0);
47438 SDValue Src1 = peekThroughBitcasts(Op1);
47439 if (Src0.getOpcode() == ISD::OR && Src1.getOpcode() == ISD::OR) {
47440 Src0 = getSplitVectorSrc(peekThroughBitcasts(Src0.getOperand(0)),
47441 peekThroughBitcasts(Src0.getOperand(1)), true);
47442 Src1 = getSplitVectorSrc(peekThroughBitcasts(Src1.getOperand(0)),
47443 peekThroughBitcasts(Src1.getOperand(1)), true);
47444 if (Src0 && Src1)
47445 return DAG.getNode(EFLAGS.getOpcode(), SDLoc(EFLAGS), VT,
47446 DAG.getBitcast(MVT::v4i64, Src0),
47447 DAG.getBitcast(MVT::v4i64, Src1));
47448 }
47449 }
47450 }
47451
47452 return SDValue();
47453}
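A scalar sketch (editorial illustration, not part of the original file) of the CC adjustment performed by combinePTESTCC above, using the TESTZ/TESTC flag definitions quoted in its comment; the helper names are hypothetical.

#include <cstdint>

// PTEST/TESTP flag model on a scalar mask word:
//   TESTZ: ZF = (Op0 & Op1) == 0
//   TESTC: CF = (~Op0 & Op1) == 0
constexpr bool testZ(uint32_t Op0, uint32_t Op1) { return (Op0 & Op1) == 0; }
constexpr bool testC(uint32_t Op0, uint32_t Op1) { return (~Op0 & Op1) == 0; }

// TEST*(~X, Y): checking CF of (~X, Y) is the same as checking ZF of (X, Y)
// and vice versa, which is the COND_B <-> COND_E swap performed above.
static_assert(testC(~0x0F00u, 0x00FFu) == testZ(0x0F00u, 0x00FFu), "disjoint masks");
static_assert(testC(~0x0F0Fu, 0x00FFu) == testZ(0x0F0Fu, 0x00FFu), "overlapping masks");
static_assert(testZ(~0x0F0Fu, 0x00FFu) == testC(0x0F0Fu, 0x00FFu), "inverse direction");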
47454
47455// Attempt to simplify the MOVMSK input based on the comparison type.
47456static SDValue combineSetCCMOVMSK(SDValue EFLAGS, X86::CondCode &CC,
47457 SelectionDAG &DAG,
47458 const X86Subtarget &Subtarget) {
47459 // Handle eq/ne against zero (any_of).
47460 // Handle eq/ne against -1 (all_of).
47461 if (!(CC == X86::COND_E || CC == X86::COND_NE))
47462 return SDValue();
47463 if (EFLAGS.getValueType() != MVT::i32)
47464 return SDValue();
47465 unsigned CmpOpcode = EFLAGS.getOpcode();
47466 if (CmpOpcode != X86ISD::CMP && CmpOpcode != X86ISD::SUB)
47467 return SDValue();
47468 auto *CmpConstant = dyn_cast<ConstantSDNode>(EFLAGS.getOperand(1));
47469 if (!CmpConstant)
47470 return SDValue();
47471 const APInt &CmpVal = CmpConstant->getAPIntValue();
47472
47473 SDValue CmpOp = EFLAGS.getOperand(0);
47474 unsigned CmpBits = CmpOp.getValueSizeInBits();
47475 assert(CmpBits == CmpVal.getBitWidth() && "Value size mismatch");
47476
47477 // Peek through any truncate.
47478 if (CmpOp.getOpcode() == ISD::TRUNCATE)
47479 CmpOp = CmpOp.getOperand(0);
47480
47481 // Bail if we don't find a MOVMSK.
47482 if (CmpOp.getOpcode() != X86ISD::MOVMSK)
47483 return SDValue();
47484
47485 SDValue Vec = CmpOp.getOperand(0);
47486 MVT VecVT = Vec.getSimpleValueType();
47487 assert((VecVT.is128BitVector() || VecVT.is256BitVector()) &&
47488        "Unexpected MOVMSK operand");
47489 unsigned NumElts = VecVT.getVectorNumElements();
47490 unsigned NumEltBits = VecVT.getScalarSizeInBits();
47491
47492 bool IsAnyOf = CmpOpcode == X86ISD::CMP && CmpVal.isZero();
47493 bool IsAllOf = (CmpOpcode == X86ISD::SUB || CmpOpcode == X86ISD::CMP) &&
47494 NumElts <= CmpBits && CmpVal.isMask(NumElts);
47495 if (!IsAnyOf && !IsAllOf)
47496 return SDValue();
47497
47498 // TODO: Check more combining cases.
47499 // Here we check the CMP's use count to decide whether to combine or not.
47500 // Currently only the "MOVMSK(CONCAT(..))" and "MOVMSK(PCMPEQ(..))"
47501 // combines below are restricted by this one-use constraint.
47502 bool IsOneUse = CmpOp.getNode()->hasOneUse();
47503
47504 // See if we can peek through to a vector with a wider element type, if the
47505 // signbits extend down to all the sub-elements as well.
47506 // Calling MOVMSK with the wider type, avoiding the bitcast, helps expose
47507 // potential SimplifyDemandedBits/Elts cases.
47508 // If we looked through a truncate that discards bits, we can't do this
47509 // transform.
47510 // FIXME: We could do this transform for truncates that discarded bits by
47511 // inserting an AND mask between the new MOVMSK and the CMP.
47512 if (Vec.getOpcode() == ISD::BITCAST && NumElts <= CmpBits) {
47513 SDValue BC = peekThroughBitcasts(Vec);
47514 MVT BCVT = BC.getSimpleValueType();
47515 unsigned BCNumElts = BCVT.getVectorNumElements();
47516 unsigned BCNumEltBits = BCVT.getScalarSizeInBits();
47517 if ((BCNumEltBits == 32 || BCNumEltBits == 64) &&
47518 BCNumEltBits > NumEltBits &&
47519 DAG.ComputeNumSignBits(BC) > (BCNumEltBits - NumEltBits)) {
47520 SDLoc DL(EFLAGS);
47521 APInt CmpMask = APInt::getLowBitsSet(32, IsAnyOf ? 0 : BCNumElts);
47522 return DAG.getNode(X86ISD::CMP, DL, MVT::i32,
47523 DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, BC),
47524 DAG.getConstant(CmpMask, DL, MVT::i32));
47525 }
47526 }
47527
47528 // MOVMSK(CONCAT(X,Y)) == 0 -> MOVMSK(OR(X,Y)).
47529 // MOVMSK(CONCAT(X,Y)) != 0 -> MOVMSK(OR(X,Y)).
47530 // MOVMSK(CONCAT(X,Y)) == -1 -> MOVMSK(AND(X,Y)).
47531 // MOVMSK(CONCAT(X,Y)) != -1 -> MOVMSK(AND(X,Y)).
47532 if (VecVT.is256BitVector() && NumElts <= CmpBits && IsOneUse) {
47533 SmallVector<SDValue> Ops;
47534 if (collectConcatOps(peekThroughBitcasts(Vec).getNode(), Ops, DAG) &&
47535 Ops.size() == 2) {
47536 SDLoc DL(EFLAGS);
47537 EVT SubVT = Ops[0].getValueType().changeTypeToInteger();
47538 APInt CmpMask = APInt::getLowBitsSet(32, IsAnyOf ? 0 : NumElts / 2);
47539 SDValue V = DAG.getNode(IsAnyOf ? ISD::OR : ISD::AND, DL, SubVT,
47540 DAG.getBitcast(SubVT, Ops[0]),
47541 DAG.getBitcast(SubVT, Ops[1]));
47542 V = DAG.getBitcast(VecVT.getHalfNumVectorElementsVT(), V);
47543 return DAG.getNode(X86ISD::CMP, DL, MVT::i32,
47544 DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, V),
47545 DAG.getConstant(CmpMask, DL, MVT::i32));
47546 }
47547 }
47548
47549 // MOVMSK(PCMPEQ(X,0)) == -1 -> PTESTZ(X,X).
47550 // MOVMSK(PCMPEQ(X,0)) != -1 -> !PTESTZ(X,X).
47551 // MOVMSK(PCMPEQ(X,Y)) == -1 -> PTESTZ(XOR(X,Y),XOR(X,Y)).
47552 // MOVMSK(PCMPEQ(X,Y)) != -1 -> !PTESTZ(XOR(X,Y),XOR(X,Y)).
47553 if (IsAllOf && Subtarget.hasSSE41() && IsOneUse) {
47554 MVT TestVT = VecVT.is128BitVector() ? MVT::v2i64 : MVT::v4i64;
47555 SDValue BC = peekThroughBitcasts(Vec);
47556 // Ensure MOVMSK was testing every signbit of BC.
47557 if (BC.getValueType().getVectorNumElements() <= NumElts) {
47558 if (BC.getOpcode() == X86ISD::PCMPEQ) {
47559 SDValue V = DAG.getNode(ISD::XOR, SDLoc(BC), BC.getValueType(),
47560 BC.getOperand(0), BC.getOperand(1));
47561 V = DAG.getBitcast(TestVT, V);
47562 return DAG.getNode(X86ISD::PTEST, SDLoc(EFLAGS), MVT::i32, V, V);
47563 }
47564 // Check for 256-bit split vector cases.
47565 if (BC.getOpcode() == ISD::AND &&
47566 BC.getOperand(0).getOpcode() == X86ISD::PCMPEQ &&
47567 BC.getOperand(1).getOpcode() == X86ISD::PCMPEQ) {
47568 SDValue LHS = BC.getOperand(0);
47569 SDValue RHS = BC.getOperand(1);
47570 LHS = DAG.getNode(ISD::XOR, SDLoc(LHS), LHS.getValueType(),
47571 LHS.getOperand(0), LHS.getOperand(1));
47572 RHS = DAG.getNode(ISD::XOR, SDLoc(RHS), RHS.getValueType(),
47573 RHS.getOperand(0), RHS.getOperand(1));
47574 LHS = DAG.getBitcast(TestVT, LHS);
47575 RHS = DAG.getBitcast(TestVT, RHS);
47576 SDValue V = DAG.getNode(ISD::OR, SDLoc(EFLAGS), TestVT, LHS, RHS);
47577 return DAG.getNode(X86ISD::PTEST, SDLoc(EFLAGS), MVT::i32, V, V);
47578 }
47579 }
47580 }
47581
47582 // See if we can avoid a PACKSS by calling MOVMSK on the sources.
47583 // For vXi16 cases we can use a v2Xi8 PMOVMSKB. We must mask out
47584 // sign bits prior to the comparison with zero unless we know that
47585 // the vXi16 splats the sign bit down to the lower i8 half.
47586 // TODO: Handle all_of patterns.
47587 if (Vec.getOpcode() == X86ISD::PACKSS && VecVT == MVT::v16i8) {
47588 SDValue VecOp0 = Vec.getOperand(0);
47589 SDValue VecOp1 = Vec.getOperand(1);
47590 bool SignExt0 = DAG.ComputeNumSignBits(VecOp0) > 8;
47591 bool SignExt1 = DAG.ComputeNumSignBits(VecOp1) > 8;
47592 // PMOVMSKB(PACKSSBW(X, undef)) -> PMOVMSKB(BITCAST_v16i8(X)) & 0xAAAA.
47593 if (IsAnyOf && CmpBits == 8 && VecOp1.isUndef()) {
47594 SDLoc DL(EFLAGS);
47595 SDValue Result = DAG.getBitcast(MVT::v16i8, VecOp0);
47596 Result = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Result);
47597 Result = DAG.getZExtOrTrunc(Result, DL, MVT::i16);
47598 if (!SignExt0) {
47599 Result = DAG.getNode(ISD::AND, DL, MVT::i16, Result,
47600 DAG.getConstant(0xAAAA, DL, MVT::i16));
47601 }
47602 return DAG.getNode(X86ISD::CMP, DL, MVT::i32, Result,
47603 DAG.getConstant(0, DL, MVT::i16));
47604 }
47605 // PMOVMSKB(PACKSSBW(LO(X), HI(X)))
47606 // -> PMOVMSKB(BITCAST_v32i8(X)) & 0xAAAAAAAA.
47607 if (CmpBits >= 16 && Subtarget.hasInt256() &&
47608 (IsAnyOf || (SignExt0 && SignExt1))) {
47609 if (SDValue Src = getSplitVectorSrc(VecOp0, VecOp1, true)) {
47610 SDLoc DL(EFLAGS);
47611 SDValue Result = peekThroughBitcasts(Src);
47612 if (IsAllOf && Result.getOpcode() == X86ISD::PCMPEQ &&
47613 Result.getValueType().getVectorNumElements() <= NumElts) {
47614 SDValue V = DAG.getNode(ISD::XOR, DL, Result.getValueType(),
47615 Result.getOperand(0), Result.getOperand(1));
47616 V = DAG.getBitcast(MVT::v4i64, V);
47617 return DAG.getNode(X86ISD::PTEST, SDLoc(EFLAGS), MVT::i32, V, V);
47618 }
47619 Result = DAG.getBitcast(MVT::v32i8, Result);
47620 Result = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Result);
47621 unsigned CmpMask = IsAnyOf ? 0 : 0xFFFFFFFF;
47622 if (!SignExt0 || !SignExt1) {
47623 assert(IsAnyOf &&
47624        "Only perform v16i16 signmasks for any_of patterns");
47625 Result = DAG.getNode(ISD::AND, DL, MVT::i32, Result,
47626 DAG.getConstant(0xAAAAAAAA, DL, MVT::i32));
47627 }
47628 return DAG.getNode(X86ISD::CMP, DL, MVT::i32, Result,
47629 DAG.getConstant(CmpMask, DL, MVT::i32));
47630 }
47631 }
47632 }
47633
47634 // MOVMSK(SHUFFLE(X,u)) -> MOVMSK(X) iff every element is referenced.
47635 SmallVector<int, 32> ShuffleMask;
47636 SmallVector<SDValue, 2> ShuffleInputs;
47637 if (NumElts <= CmpBits &&
47638 getTargetShuffleInputs(peekThroughBitcasts(Vec), ShuffleInputs,
47639 ShuffleMask, DAG) &&
47640 ShuffleInputs.size() == 1 && !isAnyZeroOrUndef(ShuffleMask) &&
47641 ShuffleInputs[0].getValueSizeInBits() == VecVT.getSizeInBits()) {
47642 unsigned NumShuffleElts = ShuffleMask.size();
47643 APInt DemandedElts = APInt::getZero(NumShuffleElts);
47644 for (int M : ShuffleMask) {
47645 assert(0 <= M && M < (int)NumShuffleElts && "Bad unary shuffle index");
47646 DemandedElts.setBit(M);
47647 }
47648 if (DemandedElts.isAllOnes()) {
47649 SDLoc DL(EFLAGS);
47650 SDValue Result = DAG.getBitcast(VecVT, ShuffleInputs[0]);
47651 Result = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Result);
47652 Result =
47653 DAG.getZExtOrTrunc(Result, DL, EFLAGS.getOperand(0).getValueType());
47654 return DAG.getNode(X86ISD::CMP, DL, MVT::i32, Result,
47655 EFLAGS.getOperand(1));
47656 }
47657 }
47658
47659 // MOVMSKPS(V) !=/== 0 -> TESTPS(V,V)
47660 // MOVMSKPD(V) !=/== 0 -> TESTPD(V,V)
47661 // iff every element is referenced.
47662 if (NumElts <= CmpBits && IsAnyOf && Subtarget.hasAVX() && IsOneUse &&
47663 (NumEltBits == 32 || NumEltBits == 64)) {
47664 MVT FloatSVT = MVT::getFloatingPointVT(NumEltBits);
47665 MVT FloatVT = MVT::getVectorVT(FloatSVT, NumElts);
47666 SDValue V = DAG.getBitcast(FloatVT, Vec);
47667 return DAG.getNode(X86ISD::TESTP, SDLoc(EFLAGS), MVT::i32, V, V);
47668 }
47669
47670 return SDValue();
47671}
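An editorial sketch (not part of the original file) of the any_of/all_of CONCAT folds in combineSetCCMOVMSK above, modelling MOVMSK results as scalar mask words; Half and the helper are hypothetical, with a 4-lane half width chosen for illustration.

#include <cstdint>

// Model MOVMSK(CONCAT(X, Y)) as the low-half mask ORed with the high-half
// mask shifted up.
constexpr unsigned Half = 4;
constexpr uint32_t HalfMask = (1u << Half) - 1;
constexpr uint32_t FullMask = (1u << (2 * Half)) - 1;
constexpr uint32_t concatMask(uint32_t Lo, uint32_t Hi) {
  return (Lo & HalfMask) | ((Hi & HalfMask) << Half);
}

// any_of: MOVMSK(CONCAT(X,Y)) == 0 iff MOVMSK(OR(X,Y)) == 0.
static_assert((concatMask(0b0000, 0b0000) == 0) == ((0b0000 | 0b0000) == 0), "");
static_assert((concatMask(0b0010, 0b0000) == 0) == ((0b0010 | 0b0000) == 0), "");
// all_of: MOVMSK(CONCAT(X,Y)) == -1 iff MOVMSK(AND(X,Y)) == -1.
static_assert((concatMask(0b1111, 0b1111) == FullMask) ==
                  ((0b1111 & 0b1111) == HalfMask), "");
static_assert((concatMask(0b1111, 0b1101) == FullMask) ==
                  ((0b1111 & 0b1101) == HalfMask), "");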
47672
47673/// Optimize an EFLAGS definition used according to the condition code \p CC
47674/// into a simpler EFLAGS value, potentially returning a new \p CC and replacing
47675/// uses of chain values.
47676static SDValue combineSetCCEFLAGS(SDValue EFLAGS, X86::CondCode &CC,
47677 SelectionDAG &DAG,
47678 const X86Subtarget &Subtarget) {
47679 if (CC == X86::COND_B)
47680 if (SDValue Flags = combineCarryThroughADD(EFLAGS, DAG))
47681 return Flags;
47682
47683 if (SDValue R = checkBoolTestSetCCCombine(EFLAGS, CC))
47684 return R;
47685
47686 if (SDValue R = combinePTESTCC(EFLAGS, CC, DAG, Subtarget))
47687 return R;
47688
47689 if (SDValue R = combineSetCCMOVMSK(EFLAGS, CC, DAG, Subtarget))
47690 return R;
47691
47692 return combineSetCCAtomicArith(EFLAGS, CC, DAG, Subtarget);
47693}
47694
47695/// Optimize X86ISD::CMOV [LHS, RHS, CONDCODE (e.g. X86::COND_NE), CONDVAL]
47696static SDValue combineCMov(SDNode *N, SelectionDAG &DAG,
47697 TargetLowering::DAGCombinerInfo &DCI,
47698 const X86Subtarget &Subtarget) {
47699 SDLoc DL(N);
47700
47701 SDValue FalseOp = N->getOperand(0);
47702 SDValue TrueOp = N->getOperand(1);
47703 X86::CondCode CC = (X86::CondCode)N->getConstantOperandVal(2);
47704 SDValue Cond = N->getOperand(3);
47705
47706 // cmov X, X, ?, ? --> X
47707 if (TrueOp == FalseOp)
47708 return TrueOp;
47709
47710 // Try to simplify the EFLAGS and condition code operands.
47711 // We can't always do this as FCMOV only supports a subset of X86 cond.
47712 if (SDValue Flags = combineSetCCEFLAGS(Cond, CC, DAG, Subtarget)) {
47713 if (!(FalseOp.getValueType() == MVT::f80 ||
47714 (FalseOp.getValueType() == MVT::f64 && !Subtarget.hasSSE2()) ||
47715 (FalseOp.getValueType() == MVT::f32 && !Subtarget.hasSSE1())) ||
47716 !Subtarget.canUseCMOV() || hasFPCMov(CC)) {
47717 SDValue Ops[] = {FalseOp, TrueOp, DAG.getTargetConstant(CC, DL, MVT::i8),
47718 Flags};
47719 return DAG.getNode(X86ISD::CMOV, DL, N->getValueType(0), Ops);
47720 }
47721 }
47722
47723 // If this is a select between two integer constants, try to do some
47724 // optimizations. Note that the operands are ordered the opposite of SELECT
47725 // operands.
47726 if (ConstantSDNode *TrueC = dyn_cast<ConstantSDNode>(TrueOp)) {
47727 if (ConstantSDNode *FalseC = dyn_cast<ConstantSDNode>(FalseOp)) {
47728 // Canonicalize the TrueC/FalseC values so that TrueC (the true value) is
47729 // larger than FalseC (the false value).
47730 if (TrueC->getAPIntValue().ult(FalseC->getAPIntValue())) {
47731 CC = X86::GetOppositeBranchCondition(CC);
47732 std::swap(TrueC, FalseC);
47733 std::swap(TrueOp, FalseOp);
47734 }
47735
47736 // Optimize C ? 8 : 0 -> zext(setcc(C)) << 3. Likewise for any pow2/0.
47737 // This is efficient for any integer data type (including i8/i16) and
47738 // shift amount.
47739 if (FalseC->getAPIntValue() == 0 && TrueC->getAPIntValue().isPowerOf2()) {
47740 Cond = getSETCC(CC, Cond, DL, DAG);
47741
47742 // Zero extend the condition if needed.
47743 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, TrueC->getValueType(0), Cond);
47744
47745 unsigned ShAmt = TrueC->getAPIntValue().logBase2();
47746 Cond = DAG.getNode(ISD::SHL, DL, Cond.getValueType(), Cond,
47747 DAG.getConstant(ShAmt, DL, MVT::i8));
47748 return Cond;
47749 }
47750
47751 // Optimize Cond ? cst+1 : cst -> zext(setcc(C)) + cst. This is efficient
47752 // for any integer data type, including i8/i16.
47753 if (FalseC->getAPIntValue()+1 == TrueC->getAPIntValue()) {
47754 Cond = getSETCC(CC, Cond, DL, DAG);
47755
47756 // Zero extend the condition if needed.
47757 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL,
47758 FalseC->getValueType(0), Cond);
47759 Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
47760 SDValue(FalseC, 0));
47761 return Cond;
47762 }
47763
47764 // Optimize cases that will turn into an LEA instruction. This requires
47765 // an i32 or i64 and an efficient multiplier (1, 2, 3, 4, 5, 8, 9).
47766 if (N->getValueType(0) == MVT::i32 || N->getValueType(0) == MVT::i64) {
47767 APInt Diff = TrueC->getAPIntValue() - FalseC->getAPIntValue();
47768 assert(Diff.getBitWidth() == N->getValueType(0).getSizeInBits() &&
47769        "Implicit constant truncation");
47770
47771 bool isFastMultiplier = false;
47772 if (Diff.ult(10)) {
47773 switch (Diff.getZExtValue()) {
47774 default: break;
47775 case 1: // result = add base, cond
47776 case 2: // result = lea base( , cond*2)
47777 case 3: // result = lea base(cond, cond*2)
47778 case 4: // result = lea base( , cond*4)
47779 case 5: // result = lea base(cond, cond*4)
47780 case 8: // result = lea base( , cond*8)
47781 case 9: // result = lea base(cond, cond*8)
47782 isFastMultiplier = true;
47783 break;
47784 }
47785 }
47786
47787 if (isFastMultiplier) {
47788 Cond = getSETCC(CC, Cond, DL ,DAG);
47789 // Zero extend the condition if needed.
47790 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, FalseC->getValueType(0),
47791 Cond);
47792 // Scale the condition by the difference.
47793 if (Diff != 1)
47794 Cond = DAG.getNode(ISD::MUL, DL, Cond.getValueType(), Cond,
47795 DAG.getConstant(Diff, DL, Cond.getValueType()));
47796
47797 // Add the base if non-zero.
47798 if (FalseC->getAPIntValue() != 0)
47799 Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
47800 SDValue(FalseC, 0));
47801 return Cond;
47802 }
47803 }
47804 }
47805 }
47806
47807 // Handle these cases:
47808 // (select (x != c), e, c) -> (select (x != c), e, x),
47809 // (select (x == c), c, e) -> (select (x == c), x, e)
47810 // where c is an integer constant, and the "select" is the combination
47811 // of CMOV and CMP.
47812 //
47813 // The rationale for this change is that a conditional move from a constant
47814 // needs two instructions, whereas a conditional move from a register needs
47815 // only one.
47816 //
47817 // CAVEAT: By replacing a constant with a symbolic value, it may obscure
47818 // some instruction-combining opportunities. This opt needs to be
47819 // postponed as late as possible.
47820 //
47821 if (!DCI.isBeforeLegalize() && !DCI.isBeforeLegalizeOps()) {
47822 // the DCI.xxxx conditions are provided to postpone the optimization as
47823 // late as possible.
47824
47825 ConstantSDNode *CmpAgainst = nullptr;
47826 if ((Cond.getOpcode() == X86ISD::CMP || Cond.getOpcode() == X86ISD::SUB) &&
47827 (CmpAgainst = dyn_cast<ConstantSDNode>(Cond.getOperand(1))) &&
47828 !isa<ConstantSDNode>(Cond.getOperand(0))) {
47829
47830 if (CC == X86::COND_NE &&
47831 CmpAgainst == dyn_cast<ConstantSDNode>(FalseOp)) {
47832 CC = X86::GetOppositeBranchCondition(CC);
47833 std::swap(TrueOp, FalseOp);
47834 }
47835
47836 if (CC == X86::COND_E &&
47837 CmpAgainst == dyn_cast<ConstantSDNode>(TrueOp)) {
47838 SDValue Ops[] = {FalseOp, Cond.getOperand(0),
47839 DAG.getTargetConstant(CC, DL, MVT::i8), Cond};
47840 return DAG.getNode(X86ISD::CMOV, DL, N->getValueType(0), Ops);
47841 }
47842 }
47843 }
47844
47845 // Transform:
47846 //
47847 // (cmov 1 T (uge T 2))
47848 //
47849 // to:
47850 //
47851 // (adc T 0 (sub T 1))
47852 if (CC == X86::COND_AE && isOneConstant(FalseOp) &&
47853 Cond.getOpcode() == X86ISD::SUB && Cond->hasOneUse()) {
47854 SDValue Cond0 = Cond.getOperand(0);
47855 if (Cond0.getOpcode() == ISD::TRUNCATE)
47856 Cond0 = Cond0.getOperand(0);
47857 auto *Sub1C = dyn_cast<ConstantSDNode>(Cond.getOperand(1));
47858 if (Cond0 == TrueOp && Sub1C && Sub1C->getZExtValue() == 2) {
47859 EVT CondVT = Cond->getValueType(0);
47860 EVT OuterVT = N->getValueType(0);
47861 // Subtract 1 and generate a carry.
47862 SDValue NewSub =
47863 DAG.getNode(X86ISD::SUB, DL, Cond->getVTList(), Cond.getOperand(0),
47864 DAG.getConstant(1, DL, CondVT));
47865 SDValue EFLAGS(NewSub.getNode(), 1);
47866 return DAG.getNode(X86ISD::ADC, DL, DAG.getVTList(OuterVT, MVT::i32),
47867 TrueOp, DAG.getConstant(0, DL, OuterVT), EFLAGS);
47868 }
47869 }
47870
47871 // Fold and/or of setcc's to double CMOV:
47872 // (CMOV F, T, ((cc1 | cc2) != 0)) -> (CMOV (CMOV F, T, cc1), T, cc2)
47873 // (CMOV F, T, ((cc1 & cc2) != 0)) -> (CMOV (CMOV T, F, !cc1), F, !cc2)
47874 //
47875 // This combine lets us generate:
47876 // cmovcc1 (jcc1 if we don't have CMOV)
47877 // cmovcc2 (same)
47878 // instead of:
47879 // setcc1
47880 // setcc2
47881 // and/or
47882 // cmovne (jne if we don't have CMOV)
47883 // When we can't use the CMOV instruction, it might increase branch
47884 // mispredicts.
47885 // When we can use CMOV, or when there is no mispredict, this improves
47886 // throughput and reduces register pressure.
47887 //
47888 if (CC == X86::COND_NE) {
47889 SDValue Flags;
47890 X86::CondCode CC0, CC1;
47891 bool isAndSetCC;
47892 if (checkBoolTestAndOrSetCCCombine(Cond, CC0, CC1, Flags, isAndSetCC)) {
47893 if (isAndSetCC) {
47894 std::swap(FalseOp, TrueOp);
47895 CC0 = X86::GetOppositeBranchCondition(CC0);
47896 CC1 = X86::GetOppositeBranchCondition(CC1);
47897 }
47898
47899 SDValue LOps[] = {FalseOp, TrueOp,
47900 DAG.getTargetConstant(CC0, DL, MVT::i8), Flags};
47901 SDValue LCMOV = DAG.getNode(X86ISD::CMOV, DL, N->getValueType(0), LOps);
47902 SDValue Ops[] = {LCMOV, TrueOp, DAG.getTargetConstant(CC1, DL, MVT::i8),
47903 Flags};
47904 SDValue CMOV = DAG.getNode(X86ISD::CMOV, DL, N->getValueType(0), Ops);
47905 return CMOV;
47906 }
47907 }
47908
47909 // Fold (CMOV C1, (ADD (CTTZ X), C2), (X != 0)) ->
47910 // (ADD (CMOV C1-C2, (CTTZ X), (X != 0)), C2)
47911 // Or (CMOV (ADD (CTTZ X), C2), C1, (X == 0)) ->
47912 // (ADD (CMOV (CTTZ X), C1-C2, (X == 0)), C2)
47913 if ((CC == X86::COND_NE || CC == X86::COND_E) &&
47914 Cond.getOpcode() == X86ISD::CMP && isNullConstant(Cond.getOperand(1))) {
47915 SDValue Add = TrueOp;
47916 SDValue Const = FalseOp;
47917 // Canonicalize the condition code for easier matching and output.
47918 if (CC == X86::COND_E)
47919 std::swap(Add, Const);
47920
47921 // We might have replaced the constant in the cmov with the LHS of the
47922 // compare. If so change it to the RHS of the compare.
47923 if (Const == Cond.getOperand(0))
47924 Const = Cond.getOperand(1);
47925
47926 // Ok, now make sure that Add is (add (cttz X), C2) and Const is a constant.
47927 if (isa<ConstantSDNode>(Const) && Add.getOpcode() == ISD::ADD &&
47928 Add.hasOneUse() && isa<ConstantSDNode>(Add.getOperand(1)) &&
47929 (Add.getOperand(0).getOpcode() == ISD::CTTZ_ZERO_UNDEF ||
47930 Add.getOperand(0).getOpcode() == ISD::CTTZ) &&
47931 Add.getOperand(0).getOperand(0) == Cond.getOperand(0)) {
47932 EVT VT = N->getValueType(0);
47933 // This should constant fold.
47934 SDValue Diff = DAG.getNode(ISD::SUB, DL, VT, Const, Add.getOperand(1));
47935 SDValue CMov =
47936 DAG.getNode(X86ISD::CMOV, DL, VT, Diff, Add.getOperand(0),
47937 DAG.getTargetConstant(X86::COND_NE, DL, MVT::i8), Cond);
47938 return DAG.getNode(ISD::ADD, DL, VT, CMov, Add.getOperand(1));
47939 }
47940 }
47941
47942 return SDValue();
47943}
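An editorial sketch (not part of the original file) of the constant-select arithmetic used by combineCMov above: once TrueC is canonicalized to be the larger constant, the select becomes base-plus-scaled-condition, which maps onto SETCC plus SHL/LEA. The helper name is hypothetical.

#include <cstdint>

// CMOV(FalseC, TrueC, Cond) as arithmetic on the zero-extended condition.
constexpr uint64_t cmovConst(uint64_t FalseC, uint64_t TrueC, bool Cond) {
  return FalseC + (Cond ? 1u : 0u) * (TrueC - FalseC);
}

static_assert(cmovConst(0, 8, true) == 8 && cmovConst(0, 8, false) == 0,
              "C ? 8 : 0  ->  zext(setcc(C)) << 3");
static_assert(cmovConst(5, 6, true) == 6 && cmovConst(5, 6, false) == 5,
              "Cond ? cst+1 : cst  ->  zext(setcc(Cond)) + cst");
static_assert(cmovConst(7, 7 + 5, true) == 12,
              "a diff of 5 maps onto lea base(cond, cond*4)");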
47944
47945/// Different mul shrinking modes.
47946enum class ShrinkMode { MULS8, MULU8, MULS16, MULU16 };
47947
47948static bool canReduceVMulWidth(SDNode *N, SelectionDAG &DAG, ShrinkMode &Mode) {
47949 EVT VT = N->getOperand(0).getValueType();
47950 if (VT.getScalarSizeInBits() != 32)
47951 return false;
47952
47953 assert(N->getNumOperands() == 2 && "NumOperands of Mul are 2");
47954 unsigned SignBits[2] = {1, 1};
47955 bool IsPositive[2] = {false, false};
47956 for (unsigned i = 0; i < 2; i++) {
47957 SDValue Opd = N->getOperand(i);
47958
47959 SignBits[i] = DAG.ComputeNumSignBits(Opd);
47960 IsPositive[i] = DAG.SignBitIsZero(Opd);
47961 }
47962
47963 bool AllPositive = IsPositive[0] && IsPositive[1];
47964 unsigned MinSignBits = std::min(SignBits[0], SignBits[1]);
47965 // When ranges are from -128 ~ 127, use MULS8 mode.
47966 if (MinSignBits >= 25)
47967 Mode = ShrinkMode::MULS8;
47968 // When ranges are from 0 ~ 255, use MULU8 mode.
47969 else if (AllPositive && MinSignBits >= 24)
47970 Mode = ShrinkMode::MULU8;
47971 // When ranges are from -32768 ~ 32767, use MULS16 mode.
47972 else if (MinSignBits >= 17)
47973 Mode = ShrinkMode::MULS16;
47974 // When ranges are from 0 ~ 65535, use MULU16 mode.
47975 else if (AllPositive && MinSignBits >= 16)
47976 Mode = ShrinkMode::MULU16;
47977 else
47978 return false;
47979 return true;
47980}
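An editorial sketch (not part of the original file) illustrating the sign-bit thresholds used by canReduceVMulWidth above: an i32 with at least 25 sign bits fits in i8, and with at least 17 sign bits fits in i16. numSignBits is a hypothetical scalar model of ComputeNumSignBits.

#include <cstdint>

// Count the leading bits that equal the sign bit, including the sign bit.
constexpr unsigned numSignBits(int32_t V) {
  uint32_t U = static_cast<uint32_t>(V);
  uint32_t SignBit = U >> 31;
  unsigned N = 0;
  for (int Bit = 31; Bit >= 0 && ((U >> Bit) & 1u) == SignBit; --Bit)
    ++N;
  return N;
}

static_assert(numSignBits(127) == 25 && numSignBits(-128) == 25,
              ">= 25 sign bits <=> value fits in i8 (MULS8)");
static_assert(numSignBits(32767) == 17 && numSignBits(-32768) == 17,
              ">= 17 sign bits <=> value fits in i16 (MULS16)");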
47981
47982/// When the operands of vector mul are extended from smaller size values,
47983 /// like i8 and i16, the type of mul may be shrunk to generate more
47984/// efficient code. Two typical patterns are handled:
47985/// Pattern1:
47986/// %2 = sext/zext <N x i8> %1 to <N x i32>
47987/// %4 = sext/zext <N x i8> %3 to <N x i32>
47988 /// or %4 = build_vector <N x i32> %C1, ..., %CN (%C1..%CN are constants)
47989/// %5 = mul <N x i32> %2, %4
47990///
47991/// Pattern2:
47992/// %2 = zext/sext <N x i16> %1 to <N x i32>
47993/// %4 = zext/sext <N x i16> %3 to <N x i32>
47994/// or %4 = build_vector <N x i32> %C1, ..., %CN (%C1..%CN are constants)
47995/// %5 = mul <N x i32> %2, %4
47996///
47997/// There are four mul shrinking modes:
47998/// If %2 == sext32(trunc8(%2)), i.e., the scalar value range of %2 is
47999 /// -128 to 127, and the scalar value range of %4 is also -128 to 127,
48000/// generate pmullw+sext32 for it (MULS8 mode).
48001/// If %2 == zext32(trunc8(%2)), i.e., the scalar value range of %2 is
48002/// 0 to 255, and the scalar value range of %4 is also 0 to 255,
48003/// generate pmullw+zext32 for it (MULU8 mode).
48004/// If %2 == sext32(trunc16(%2)), i.e., the scalar value range of %2 is
48005/// -32768 to 32767, and the scalar value range of %4 is also -32768 to 32767,
48006/// generate pmullw+pmulhw for it (MULS16 mode).
48007/// If %2 == zext32(trunc16(%2)), i.e., the scalar value range of %2 is
48008/// 0 to 65535, and the scalar value range of %4 is also 0 to 65535,
48009/// generate pmullw+pmulhuw for it (MULU16 mode).
48010static SDValue reduceVMULWidth(SDNode *N, SelectionDAG &DAG,
48011 const X86Subtarget &Subtarget) {
48012 // Check for legality
48013 // pmullw/pmulhw are not supported by SSE.
48014 if (!Subtarget.hasSSE2())
48015 return SDValue();
48016
48017 // Check for profitability
48018 // pmulld is supported since SSE41. It is better to use pmulld
48019 // instead of pmullw+pmulhw, except for subtargets where pmulld is slower than
48020 // the expansion.
48021 bool OptForMinSize = DAG.getMachineFunction().getFunction().hasMinSize();
48022 if (Subtarget.hasSSE41() && (OptForMinSize || !Subtarget.isPMULLDSlow()))
48023 return SDValue();
48024
48025 ShrinkMode Mode;
48026 if (!canReduceVMulWidth(N, DAG, Mode))
48027 return SDValue();
48028
48029 SDLoc DL(N);
48030 SDValue N0 = N->getOperand(0);
48031 SDValue N1 = N->getOperand(1);
48032 EVT VT = N->getOperand(0).getValueType();
48033 unsigned NumElts = VT.getVectorNumElements();
48034 if ((NumElts % 2) != 0)
48035 return SDValue();
48036
48037 EVT ReducedVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16, NumElts);
48038
48039 // Shrink the operands of mul.
48040 SDValue NewN0 = DAG.getNode(ISD::TRUNCATE, DL, ReducedVT, N0);
48041 SDValue NewN1 = DAG.getNode(ISD::TRUNCATE, DL, ReducedVT, N1);
48042
48043 // Generate the lower part of mul: pmullw. For MULU8/MULS8, only the
48044 // lower part is needed.
48045 SDValue MulLo = DAG.getNode(ISD::MUL, DL, ReducedVT, NewN0, NewN1);
48046 if (Mode == ShrinkMode::MULU8 || Mode == ShrinkMode::MULS8)
48047 return DAG.getNode((Mode == ShrinkMode::MULU8) ? ISD::ZERO_EXTEND
48048 : ISD::SIGN_EXTEND,
48049 DL, VT, MulLo);
48050
48051 EVT ResVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32, NumElts / 2);
48052 // Generate the higher part of mul: pmulhw/pmulhuw. For MULU16/MULS16,
48053 // the higher part is also needed.
48054 SDValue MulHi =
48055 DAG.getNode(Mode == ShrinkMode::MULS16 ? ISD::MULHS : ISD::MULHU, DL,
48056 ReducedVT, NewN0, NewN1);
48057
48058 // Repack the lower part and higher part result of mul into a wider
48059 // result.
48060 // Generate shuffle functioning as punpcklwd.
48061 SmallVector<int, 16> ShuffleMask(NumElts);
48062 for (unsigned i = 0, e = NumElts / 2; i < e; i++) {
48063 ShuffleMask[2 * i] = i;
48064 ShuffleMask[2 * i + 1] = i + NumElts;
48065 }
48066 SDValue ResLo =
48067 DAG.getVectorShuffle(ReducedVT, DL, MulLo, MulHi, ShuffleMask);
48068 ResLo = DAG.getBitcast(ResVT, ResLo);
48069 // Generate shuffle functioning as punpckhwd.
48070 for (unsigned i = 0, e = NumElts / 2; i < e; i++) {
48071 ShuffleMask[2 * i] = i + NumElts / 2;
48072 ShuffleMask[2 * i + 1] = i + NumElts * 3 / 2;
48073 }
48074 SDValue ResHi =
48075 DAG.getVectorShuffle(ReducedVT, DL, MulLo, MulHi, ShuffleMask);
48076 ResHi = DAG.getBitcast(ResVT, ResHi);
48077 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, ResLo, ResHi);
48078}
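An editorial sketch (not part of the original file) of the MULU16 repack performed by reduceVMULWidth above: pmullw produces the low 16 bits of each product, pmulhuw the high 16 bits, and interleaving the two lanes (punpcklwd/punpckhwd) rebuilds the full 32-bit product. The helper names are hypothetical scalar models.

#include <cstdint>

constexpr uint16_t pmullwLane(uint16_t A, uint16_t B) {   // low 16 bits
  return static_cast<uint16_t>(static_cast<uint32_t>(A) * B);
}
constexpr uint16_t pmulhuwLane(uint16_t A, uint16_t B) {  // high 16 bits
  return static_cast<uint16_t>((static_cast<uint32_t>(A) * B) >> 16);
}
constexpr uint32_t repack(uint16_t A, uint16_t B) {       // interleaved lane pair
  return pmullwLane(A, B) | (static_cast<uint32_t>(pmulhuwLane(A, B)) << 16);
}

static_assert(repack(0xFFFF, 0xFFFF) == 0xFFFFu * 0xFFFFu, "");
static_assert(repack(1234, 5678) == 1234u * 5678u, "");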
48079
48080static SDValue combineMulSpecial(uint64_t MulAmt, SDNode *N, SelectionDAG &DAG,
48081 EVT VT, const SDLoc &DL) {
48082
48083 auto combineMulShlAddOrSub = [&](int Mult, int Shift, bool isAdd) {
48084 SDValue Result = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0),
48085 DAG.getConstant(Mult, DL, VT));
48086 Result = DAG.getNode(ISD::SHL, DL, VT, Result,
48087 DAG.getConstant(Shift, DL, MVT::i8));
48088 Result = DAG.getNode(isAdd ? ISD::ADD : ISD::SUB, DL, VT, Result,
48089 N->getOperand(0));
48090 return Result;
48091 };
48092
48093 auto combineMulMulAddOrSub = [&](int Mul1, int Mul2, bool isAdd) {
48094 SDValue Result = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0),
48095 DAG.getConstant(Mul1, DL, VT));
48096 Result = DAG.getNode(X86ISD::MUL_IMM, DL, VT, Result,
48097 DAG.getConstant(Mul2, DL, VT));
48098 Result = DAG.getNode(isAdd ? ISD::ADD : ISD::SUB, DL, VT, Result,
48099 N->getOperand(0));
48100 return Result;
48101 };
48102
48103 switch (MulAmt) {
48104 default:
48105 break;
48106 case 11:
48107 // mul x, 11 => add ((shl (mul x, 5), 1), x)
48108 return combineMulShlAddOrSub(5, 1, /*isAdd*/ true);
48109 case 21:
48110 // mul x, 21 => add ((shl (mul x, 5), 2), x)
48111 return combineMulShlAddOrSub(5, 2, /*isAdd*/ true);
48112 case 41:
48113 // mul x, 41 => add ((shl (mul x, 5), 3), x)
48114 return combineMulShlAddOrSub(5, 3, /*isAdd*/ true);
48115 case 22:
48116 // mul x, 22 => add (add ((shl (mul x, 5), 2), x), x)
48117 return DAG.getNode(ISD::ADD, DL, VT, N->getOperand(0),
48118 combineMulShlAddOrSub(5, 2, /*isAdd*/ true));
48119 case 19:
48120 // mul x, 19 => add ((shl (mul x, 9), 1), x)
48121 return combineMulShlAddOrSub(9, 1, /*isAdd*/ true);
48122 case 37:
48123 // mul x, 37 => add ((shl (mul x, 9), 2), x)
48124 return combineMulShlAddOrSub(9, 2, /*isAdd*/ true);
48125 case 73:
48126 // mul x, 73 => add ((shl (mul x, 9), 3), x)
48127 return combineMulShlAddOrSub(9, 3, /*isAdd*/ true);
48128 case 13:
48129 // mul x, 13 => add ((shl (mul x, 3), 2), x)
48130 return combineMulShlAddOrSub(3, 2, /*isAdd*/ true);
48131 case 23:
48132 // mul x, 23 => sub ((shl (mul x, 3), 3), x)
48133 return combineMulShlAddOrSub(3, 3, /*isAdd*/ false);
48134 case 26:
48135 // mul x, 26 => add ((mul (mul x, 5), 5), x)
48136 return combineMulMulAddOrSub(5, 5, /*isAdd*/ true);
48137 case 28:
48138 // mul x, 28 => add ((mul (mul x, 9), 3), x)
48139 return combineMulMulAddOrSub(9, 3, /*isAdd*/ true);
48140 case 29:
48141 // mul x, 29 => add (add ((mul (mul x, 9), 3), x), x)
48142 return DAG.getNode(ISD::ADD, DL, VT, N->getOperand(0),
48143 combineMulMulAddOrSub(9, 3, /*isAdd*/ true));
48144 }
48145
48146 // Another trick. If this is a power of 2 + 2/4/8, we can use a shift followed
48147 // by a single LEA.
48148 // First check if this is a sum of two powers of 2 because that's easy. Then
48149 // count how many zero bits there are up to the first set bit.
48150 // TODO: We can do this even without LEA at a cost of two shifts and an add.
48151 if (isPowerOf2_64(MulAmt & (MulAmt - 1))) {
48152 unsigned ScaleShift = llvm::countr_zero(MulAmt);
48153 if (ScaleShift >= 1 && ScaleShift < 4) {
48154 unsigned ShiftAmt = Log2_64((MulAmt & (MulAmt - 1)));
48155 SDValue Shift1 = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
48156 DAG.getConstant(ShiftAmt, DL, MVT::i8));
48157 SDValue Shift2 = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
48158 DAG.getConstant(ScaleShift, DL, MVT::i8));
48159 return DAG.getNode(ISD::ADD, DL, VT, Shift1, Shift2);
48160 }
48161 }
48162
48163 return SDValue();
48164}
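An editorial sketch (not part of the original file) double-checking a few of the decompositions in combineMulSpecial above as plain integer identities; the MUL_IMM by 3/5/9 and the shifts correspond to LEA scale factors.

#include <cstdint>

constexpr uint64_t mulShlAdd(uint64_t X, uint64_t Mult, unsigned Shift) {
  return ((X * Mult) << Shift) + X;   // add ((shl (mul x, Mult), Shift), x)
}
constexpr uint64_t mulShlSub(uint64_t X, uint64_t Mult, unsigned Shift) {
  return ((X * Mult) << Shift) - X;   // sub ((shl (mul x, Mult), Shift), x)
}

static_assert(mulShlAdd(7, 5, 1) == 7 * 11, "mul x, 11");
static_assert(mulShlAdd(7, 9, 2) == 7 * 37, "mul x, 37");
static_assert(mulShlSub(7, 3, 3) == 7 * 23, "mul x, 23");
static_assert(7 + 7 * 9 * 3 == 7 * 28, "mul x, 28 => add ((mul (mul x, 9), 3), x)");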
48165
48166 // If the upper 17 bits of either element are zero and the upper bits of the
48167 // other element are all zero/sign bits, then we can use PMADDWD, which is
48168 // always at least as quick as PMULLD, except on KNL.
48169static SDValue combineMulToPMADDWD(SDNode *N, SelectionDAG &DAG,
48170 const X86Subtarget &Subtarget) {
48171 if (!Subtarget.hasSSE2())
48172 return SDValue();
48173
48174 if (Subtarget.isPMADDWDSlow())
48175 return SDValue();
48176
48177 EVT VT = N->getValueType(0);
48178
48179 // Only support vXi32 vectors.
48180 if (!VT.isVector() || VT.getVectorElementType() != MVT::i32)
48181 return SDValue();
48182
48183 // Make sure the type is legal or can split/widen to a legal type.
48184 // With AVX512 but without BWI, we would need to split v32i16.
48185 unsigned NumElts = VT.getVectorNumElements();
48186 if (NumElts == 1 || !isPowerOf2_32(NumElts))
48187 return SDValue();
48188
48189 // With AVX512 but without BWI, we would need to split v32i16.
48190 if (32 <= (2 * NumElts) && Subtarget.hasAVX512() && !Subtarget.hasBWI())
48191 return SDValue();
48192
48193 SDValue N0 = N->getOperand(0);
48194 SDValue N1 = N->getOperand(1);
48195
48196 // If we are zero/sign extending two steps without SSE4.1, it's better to
48197 // reduce the vmul width instead.
48198 if (!Subtarget.hasSSE41() &&
48199 (((N0.getOpcode() == ISD::ZERO_EXTEND &&
48200 N0.getOperand(0).getScalarValueSizeInBits() <= 8) &&
48201 (N1.getOpcode() == ISD::ZERO_EXTEND &&
48202 N1.getOperand(0).getScalarValueSizeInBits() <= 8)) ||
48203 ((N0.getOpcode() == ISD::SIGN_EXTEND &&
48204 N0.getOperand(0).getScalarValueSizeInBits() <= 8) &&
48205 (N1.getOpcode() == ISD::SIGN_EXTEND &&
48206 N1.getOperand(0).getScalarValueSizeInBits() <= 8))))
48207 return SDValue();
48208
48209 // If we are sign extending a wide vector without SSE4.1, it's better to reduce
48210 // the vmul width instead.
48211 if (!Subtarget.hasSSE41() &&
48212 (N0.getOpcode() == ISD::SIGN_EXTEND &&
48213 N0.getOperand(0).getValueSizeInBits() > 128) &&
48214 (N1.getOpcode() == ISD::SIGN_EXTEND &&
48215 N1.getOperand(0).getValueSizeInBits() > 128))
48216 return SDValue();
48217
48218 // Sign bits must extend down to the lowest i16.
48219 if (DAG.ComputeMaxSignificantBits(N1) > 16 ||
48220 DAG.ComputeMaxSignificantBits(N0) > 16)
48221 return SDValue();
48222
48223 // At least one of the elements must be zero in the upper 17 bits, or can be
48224 // safely made zero without altering the final result.
48225 auto GetZeroableOp = [&](SDValue Op) {
48226 APInt Mask17 = APInt::getHighBitsSet(32, 17);
48227 if (DAG.MaskedValueIsZero(Op, Mask17))
48228 return Op;
48229 // Mask off upper 16-bits of sign-extended constants.
48230 if (ISD::isBuildVectorOfConstantSDNodes(Op.getNode()))
48231 return DAG.getNode(ISD::AND, SDLoc(N), VT, Op,
48232 DAG.getConstant(0xFFFF, SDLoc(N), VT));
48233 if (Op.getOpcode() == ISD::SIGN_EXTEND && N->isOnlyUserOf(Op.getNode())) {
48234 SDValue Src = Op.getOperand(0);
48235 // Convert sext(vXi16) to zext(vXi16).
48236 if (Src.getScalarValueSizeInBits() == 16 && VT.getSizeInBits() <= 128)
48237 return DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), VT, Src);
48238 // Convert sext(vXi8) to zext(vXi16 sext(vXi8)) on pre-SSE41 targets
48239 // which will expand the extension.
48240 if (Src.getScalarValueSizeInBits() < 16 && !Subtarget.hasSSE41()) {
48241 EVT ExtVT = VT.changeVectorElementType(MVT::i16);
48242 Src = DAG.getNode(ISD::SIGN_EXTEND, SDLoc(N), ExtVT, Src);
48243 return DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), VT, Src);
48244 }
48245 }
48246 // Convert SIGN_EXTEND_VECTOR_INREG to ZERO_EXTEND_VECTOR_INREG.
48247 if (Op.getOpcode() == ISD::SIGN_EXTEND_VECTOR_INREG &&
48248 N->isOnlyUserOf(Op.getNode())) {
48249 SDValue Src = Op.getOperand(0);
48250 if (Src.getScalarValueSizeInBits() == 16)
48251 return DAG.getNode(ISD::ZERO_EXTEND_VECTOR_INREG, SDLoc(N), VT, Src);
48252 }
48253 // Convert VSRAI(Op, 16) to VSRLI(Op, 16).
48254 if (Op.getOpcode() == X86ISD::VSRAI && Op.getConstantOperandVal(1) == 16 &&
48255 N->isOnlyUserOf(Op.getNode())) {
48256 return DAG.getNode(X86ISD::VSRLI, SDLoc(N), VT, Op.getOperand(0),
48257 Op.getOperand(1));
48258 }
48259 return SDValue();
48260 };
48261 SDValue ZeroN0 = GetZeroableOp(N0);
48262 SDValue ZeroN1 = GetZeroableOp(N1);
48263 if (!ZeroN0 && !ZeroN1)
48264 return SDValue();
48265 N0 = ZeroN0 ? ZeroN0 : N0;
48266 N1 = ZeroN1 ? ZeroN1 : N1;
48267
48268 // Use SplitOpsAndApply to handle AVX splitting.
48269 auto PMADDWDBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
48270 ArrayRef<SDValue> Ops) {
48271 MVT ResVT = MVT::getVectorVT(MVT::i32, Ops[0].getValueSizeInBits() / 32);
48272 MVT OpVT = MVT::getVectorVT(MVT::i16, Ops[0].getValueSizeInBits() / 16);
48273 return DAG.getNode(X86ISD::VPMADDWD, DL, ResVT,
48274 DAG.getBitcast(OpVT, Ops[0]),
48275 DAG.getBitcast(OpVT, Ops[1]));
48276 };
48277 return SplitOpsAndApply(DAG, Subtarget, SDLoc(N), VT, {N0, N1},
48278 PMADDWDBuilder);
48279}
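An editorial sketch (not part of the original file) of one PMADDWD lane, modelled on scalars: the i32 lane is split into its two i16 halves and the adjacent signed products are summed. Under the zeroable/sign-bit conditions checked by combineMulToPMADDWD above, this equals the truncated i32 multiply. sext16 and pmaddwdLane are hypothetical.

#include <cstdint>

// Portable sign extension of the low 16 bits to i32.
constexpr int32_t sext16(uint32_t V) {
  return (V & 0xFFFF) < 0x8000 ? static_cast<int32_t>(V & 0xFFFF)
                               : static_cast<int32_t>(V & 0xFFFF) - 0x10000;
}
// One PMADDWD lane: a[0]*b[0] + a[1]*b[1] over the i16 halves of an i32 lane.
constexpr int32_t pmaddwdLane(uint32_t A, uint32_t B) {
  return sext16(A) * sext16(B) + sext16(A >> 16) * sext16(B >> 16);
}

// A = 1000 has its upper 17 bits clear; B = 0xFFFFFFF0 sign-extends i16 -16,
// so the PMADDWD lane equals the (truncated) i32 product 1000 * -16.
static_assert(pmaddwdLane(1000, 0xFFFFFFF0u) == 1000 * -16,
              "PMADDWD lane matches the i32 multiply when the conditions hold");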
48280
48281static SDValue combineMulToPMULDQ(SDNode *N, SelectionDAG &DAG,
48282 const X86Subtarget &Subtarget) {
48283 if (!Subtarget.hasSSE2())
48284 return SDValue();
48285
48286 EVT VT = N->getValueType(0);
48287
48288 // Only support vXi64 vectors.
48289 if (!VT.isVector() || VT.getVectorElementType() != MVT::i64 ||
48290 VT.getVectorNumElements() < 2 ||
48291 !isPowerOf2_32(VT.getVectorNumElements()))
48292 return SDValue();
48293
48294 SDValue N0 = N->getOperand(0);
48295 SDValue N1 = N->getOperand(1);
48296
48297 // PMULDQ returns the 64-bit result of the signed multiplication of the lower
48298 // 32 bits. We can lower with this if the sign bits stretch that far.
48299 if (Subtarget.hasSSE41() && DAG.ComputeNumSignBits(N0) > 32 &&
48300 DAG.ComputeNumSignBits(N1) > 32) {
48301 auto PMULDQBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
48302 ArrayRef<SDValue> Ops) {
48303 return DAG.getNode(X86ISD::PMULDQ, DL, Ops[0].getValueType(), Ops);
48304 };
48305 return SplitOpsAndApply(DAG, Subtarget, SDLoc(N), VT, { N0, N1 },
48306 PMULDQBuilder, /*CheckBWI*/false);
48307 }
48308
48309 // If the upper bits are zero we can use a single pmuludq.
48310 APInt Mask = APInt::getHighBitsSet(64, 32);
48311 if (DAG.MaskedValueIsZero(N0, Mask) && DAG.MaskedValueIsZero(N1, Mask)) {
48312 auto PMULUDQBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
48313 ArrayRef<SDValue> Ops) {
48314 return DAG.getNode(X86ISD::PMULUDQ, DL, Ops[0].getValueType(), Ops);
48315 };
48316 return SplitOpsAndApply(DAG, Subtarget, SDLoc(N), VT, { N0, N1 },
48317 PMULUDQBuilder, /*CheckBWI*/false);
48318 }
48319
48320 return SDValue();
48321}
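An editorial sketch (not part of the original file) of the PMULUDQ fold in combineMulToPMULDQ above: PMULUDQ multiplies only the zero-extended low 32 bits of each i64 lane, so when the upper halves are known zero it already computes the full i64 product. The helper name is hypothetical.

#include <cstdint>

constexpr uint64_t pmuludqLane(uint64_t A, uint64_t B) {
  return (A & 0xFFFFFFFFull) * (B & 0xFFFFFFFFull);  // low 32 x low 32 -> 64
}

static_assert(pmuludqLane(0x12345678ull, 0x9ABCDEF0ull) ==
                  0x12345678ull * 0x9ABCDEF0ull,
              "i64 multiply with zero upper halves is a single PMULUDQ");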
48322
48323static SDValue combineMul(SDNode *N, SelectionDAG &DAG,
48324 TargetLowering::DAGCombinerInfo &DCI,
48325 const X86Subtarget &Subtarget) {
48326 EVT VT = N->getValueType(0);
48327
48328 if (SDValue V = combineMulToPMADDWD(N, DAG, Subtarget))
48329 return V;
48330
48331 if (SDValue V = combineMulToPMULDQ(N, DAG, Subtarget))
48332 return V;
48333
48334 if (DCI.isBeforeLegalize() && VT.isVector())
48335 return reduceVMULWidth(N, DAG, Subtarget);
48336
48337 // Optimize a single multiply with constant into two operations in order to
48338 // implement it with two cheaper instructions, e.g. LEA + SHL, LEA + LEA.
48339 if (!MulConstantOptimization)
48340 return SDValue();
48341
48342 // An imul is usually smaller than the alternative sequence.
48343 if (DAG.getMachineFunction().getFunction().hasMinSize())
48344 return SDValue();
48345
48346 if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
48347 return SDValue();
48348
48349 if (VT != MVT::i64 && VT != MVT::i32)
48350 return SDValue();
48351
48352 ConstantSDNode *C = dyn_cast<ConstantSDNode>(N->getOperand(1));
48353 if (!C)
48354 return SDValue();
48355 if (isPowerOf2_64(C->getZExtValue()))
48356 return SDValue();
48357
48358 int64_t SignMulAmt = C->getSExtValue();
48359 assert(SignMulAmt != INT64_MIN && "Int min should have been handled!");
48360 uint64_t AbsMulAmt = SignMulAmt < 0 ? -SignMulAmt : SignMulAmt;
48361
48362 SDLoc DL(N);
48363 if (AbsMulAmt == 3 || AbsMulAmt == 5 || AbsMulAmt == 9) {
48364 SDValue NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0),
48365 DAG.getConstant(AbsMulAmt, DL, VT));
48366 if (SignMulAmt < 0)
48367 NewMul = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT),
48368 NewMul);
48369
48370 return NewMul;
48371 }
48372
48373 uint64_t MulAmt1 = 0;
48374 uint64_t MulAmt2 = 0;
48375 if ((AbsMulAmt % 9) == 0) {
48376 MulAmt1 = 9;
48377 MulAmt2 = AbsMulAmt / 9;
48378 } else if ((AbsMulAmt % 5) == 0) {
48379 MulAmt1 = 5;
48380 MulAmt2 = AbsMulAmt / 5;
48381 } else if ((AbsMulAmt % 3) == 0) {
48382 MulAmt1 = 3;
48383 MulAmt2 = AbsMulAmt / 3;
48384 }
48385
48386 SDValue NewMul;
48387 // For negative multiply amounts, only allow MulAmt2 to be a power of 2.
48388 if (MulAmt2 &&
48389 (isPowerOf2_64(MulAmt2) ||
48390 (SignMulAmt >= 0 && (MulAmt2 == 3 || MulAmt2 == 5 || MulAmt2 == 9)))) {
48391
48392 if (isPowerOf2_64(MulAmt2) &&
48393 !(SignMulAmt >= 0 && N->hasOneUse() &&
48394 N->use_begin()->getOpcode() == ISD::ADD))
48395 // If the second multiplier is pow2, issue it first. We want the multiply by
48396 // 3, 5, or 9 to be folded into the addressing mode unless the lone use
48397 // is an add. Only do this for positive multiply amounts since the
48398 // negate would prevent it from being used as an address mode anyway.
48399 std::swap(MulAmt1, MulAmt2);
48400
48401 if (isPowerOf2_64(MulAmt1))
48402 NewMul = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
48403 DAG.getConstant(Log2_64(MulAmt1), DL, MVT::i8));
48404 else
48405 NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0),
48406 DAG.getConstant(MulAmt1, DL, VT));
48407
48408 if (isPowerOf2_64(MulAmt2))
48409 NewMul = DAG.getNode(ISD::SHL, DL, VT, NewMul,
48410 DAG.getConstant(Log2_64(MulAmt2), DL, MVT::i8));
48411 else
48412 NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, NewMul,
48413 DAG.getConstant(MulAmt2, DL, VT));
48414
48415 // Negate the result.
48416 if (SignMulAmt < 0)
48417 NewMul = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT),
48418 NewMul);
48419 } else if (!Subtarget.slowLEA())
48420 NewMul = combineMulSpecial(C->getZExtValue(), N, DAG, VT, DL);
48421
48422 if (!NewMul) {
48423 assert(C->getZExtValue() != 0 &&
48424        C->getZExtValue() != (VT == MVT::i64 ? UINT64_MAX : UINT32_MAX) &&
48425        "Both cases that could cause potential overflows should have "
48426        "already been handled.");
48427 if (isPowerOf2_64(AbsMulAmt - 1)) {
48428 // (mul x, 2^N + 1) => (add (shl x, N), x)
48429 NewMul = DAG.getNode(
48430 ISD::ADD, DL, VT, N->getOperand(0),
48431 DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
48432 DAG.getConstant(Log2_64(AbsMulAmt - 1), DL,
48433 MVT::i8)));
48434 // To negate, subtract the number from zero
48435 if (SignMulAmt < 0)
48436 NewMul = DAG.getNode(ISD::SUB, DL, VT,
48437 DAG.getConstant(0, DL, VT), NewMul);
48438 } else if (isPowerOf2_64(AbsMulAmt + 1)) {
48439 // (mul x, 2^N - 1) => (sub (shl x, N), x)
48440 NewMul = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
48441 DAG.getConstant(Log2_64(AbsMulAmt + 1),
48442 DL, MVT::i8));
48443 // To negate, reverse the operands of the subtract.
48444 if (SignMulAmt < 0)
48445 NewMul = DAG.getNode(ISD::SUB, DL, VT, N->getOperand(0), NewMul);
48446 else
48447 NewMul = DAG.getNode(ISD::SUB, DL, VT, NewMul, N->getOperand(0));
48448 } else if (SignMulAmt >= 0 && isPowerOf2_64(AbsMulAmt - 2)) {
48449 // (mul x, 2^N + 2) => (add (shl x, N), (add x, x))
48450 NewMul = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
48451 DAG.getConstant(Log2_64(AbsMulAmt - 2),
48452 DL, MVT::i8));
48453 NewMul = DAG.getNode(
48454 ISD::ADD, DL, VT, NewMul,
48455 DAG.getNode(ISD::ADD, DL, VT, N->getOperand(0), N->getOperand(0)));
48456 } else if (SignMulAmt >= 0 && isPowerOf2_64(AbsMulAmt + 2)) {
48457 // (mul x, 2^N - 2) => (sub (shl x, N), (add x, x))
48458 NewMul = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
48459 DAG.getConstant(Log2_64(AbsMulAmt + 2),
48460 DL, MVT::i8));
48461 NewMul = DAG.getNode(
48462 ISD::SUB, DL, VT, NewMul,
48463 DAG.getNode(ISD::ADD, DL, VT, N->getOperand(0), N->getOperand(0)));
48464 }
48465 }
48466
48467 return NewMul;
48468}
48469
48470// Try to form a MULHU or MULHS node by looking for
48471// (srl (mul ext, ext), 16)
48472// TODO: This is X86 specific because we want to be able to handle wide types
48473// before type legalization. But we can only do it if the vector will be
48474// legalized via widening/splitting. Type legalization can't handle promotion
48475// of a MULHU/MULHS. There isn't a way to convey this to the generic DAG
48476// combiner.
48477static SDValue combineShiftToPMULH(SDNode *N, SelectionDAG &DAG,
48478 const X86Subtarget &Subtarget) {
48479 assert((N->getOpcode() == ISD::SRL || N->getOpcode() == ISD::SRA) &&
48480        "SRL or SRA node is required here!");
48481 SDLoc DL(N);
48482
48483 if (!Subtarget.hasSSE2())
48484 return SDValue();
48485
48486 // The operation feeding into the shift must be a multiply.
48487 SDValue ShiftOperand = N->getOperand(0);
48488 if (ShiftOperand.getOpcode() != ISD::MUL || !ShiftOperand.hasOneUse())
48489 return SDValue();
48490
48491 // Input type should be at least vXi32.
48492 EVT VT = N->getValueType(0);
48493 if (!VT.isVector() || VT.getVectorElementType().getSizeInBits() < 32)
48494 return SDValue();
48495
48496 // Need a shift by 16.
48497 APInt ShiftAmt;
48498 if (!ISD::isConstantSplatVector(N->getOperand(1).getNode(), ShiftAmt) ||
48499 ShiftAmt != 16)
48500 return SDValue();
48501
48502 SDValue LHS = ShiftOperand.getOperand(0);
48503 SDValue RHS = ShiftOperand.getOperand(1);
48504
48505 unsigned ExtOpc = LHS.getOpcode();
48506 if ((ExtOpc != ISD::SIGN_EXTEND && ExtOpc != ISD::ZERO_EXTEND) ||
48507 RHS.getOpcode() != ExtOpc)
48508 return SDValue();
48509
48510 // Peek through the extends.
48511 LHS = LHS.getOperand(0);
48512 RHS = RHS.getOperand(0);
48513
48514 // Ensure the input types match.
48515 EVT MulVT = LHS.getValueType();
48516 if (MulVT.getVectorElementType() != MVT::i16 || RHS.getValueType() != MulVT)
48517 return SDValue();
48518
48519 unsigned Opc = ExtOpc == ISD::SIGN_EXTEND ? ISD::MULHS : ISD::MULHU;
48520 SDValue Mulh = DAG.getNode(Opc, DL, MulVT, LHS, RHS);
48521
48522 ExtOpc = N->getOpcode() == ISD::SRA ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
48523 return DAG.getNode(ExtOpc, DL, VT, Mulh);
48524}
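
Illustrative only (not LLVM code): the fold above is justified by the scalar fact that the top 16 bits of a widened i16 x i16 product are exactly what MULHS/MULHU return per element. mulhs16/mulhu16 below are hypothetical helpers; arithmetic right shift of negative values is assumed to behave as on x86.

  #include <cassert>
  #include <cstdint>

  // Hypothetical scalar models of the per-element MULHS/MULHU results.
  static int16_t mulhs16(int16_t a, int16_t b) {
    return static_cast<int16_t>((int32_t(a) * int32_t(b)) >> 16);
  }
  static uint16_t mulhu16(uint16_t a, uint16_t b) {
    return static_cast<uint16_t>((uint32_t(a) * uint32_t(b)) >> 16);
  }

  int main() {
    const int16_t Samples[] = {-32768, -1234, -1, 0, 1, 1234, 32767};
    for (int16_t a : Samples)
      for (int16_t b : Samples) {
        // sra (mul (sext a), (sext b)), 16 keeps only the signed high half.
        assert(((int32_t(a) * int32_t(b)) >> 16) == int32_t(mulhs16(a, b)));
        // srl (mul (zext a), (zext b)), 16 keeps only the unsigned high half.
        uint32_t Wide = uint32_t(uint16_t(a)) * uint32_t(uint16_t(b));
        assert((Wide >> 16) == uint32_t(mulhu16(a, b)));
      }
    return 0;
  }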
48525
48526static SDValue combineShiftLeft(SDNode *N, SelectionDAG &DAG) {
48527 SDValue N0 = N->getOperand(0);
48528 SDValue N1 = N->getOperand(1);
48529 ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1);
48530 EVT VT = N0.getValueType();
48531
48532 // fold (shl (and (setcc_c), c1), c2) -> (and setcc_c, (c1 << c2))
48533 // since the result of setcc_c is all zeros or all ones.
48534 if (VT.isInteger() && !VT.isVector() &&
48535 N1C && N0.getOpcode() == ISD::AND &&
48536 N0.getOperand(1).getOpcode() == ISD::Constant) {
48537 SDValue N00 = N0.getOperand(0);
48538 APInt Mask = N0.getConstantOperandAPInt(1);
48539 Mask <<= N1C->getAPIntValue();
48540 bool MaskOK = false;
48541 // We can handle cases concerning bit-widening nodes containing setcc_c if
48542 // we carefully interrogate the mask to make sure the transform is
48543 // semantics-preserving.
48544 // The transform is not safe if the result of C1 << C2 exceeds the bitwidth
48545 // of the underlying setcc_c operation if the setcc_c was zero extended.
48546 // Consider the following example:
48547 // zext(setcc_c) -> i32 0x0000FFFF
48548 // c1 -> i32 0x0000FFFF
48549 // c2 -> i32 0x00000001
48550 // (shl (and (setcc_c), c1), c2) -> i32 0x0001FFFE
48551 // (and setcc_c, (c1 << c2)) -> i32 0x0000FFFE
48552 if (N00.getOpcode() == X86ISD::SETCC_CARRY) {
48553 MaskOK = true;
48554 } else if (N00.getOpcode() == ISD::SIGN_EXTEND &&
48555 N00.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY) {
48556 MaskOK = true;
48557 } else if ((N00.getOpcode() == ISD::ZERO_EXTEND ||
48558 N00.getOpcode() == ISD::ANY_EXTEND) &&
48559 N00.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY) {
48560 MaskOK = Mask.isIntN(N00.getOperand(0).getValueSizeInBits());
48561 }
48562 if (MaskOK && Mask != 0) {
48563 SDLoc DL(N);
48564 return DAG.getNode(ISD::AND, DL, VT, N00, DAG.getConstant(Mask, DL, VT));
48565 }
48566 }
48567
48568 return SDValue();
48569}
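
Illustrative only: the fold above is valid because setcc_c materializes either all-zeros or all-ones, in which case masking before or after the shift gives the same result (the MaskOK logic guards the zero-extended narrow case called out in the comment).

  #include <cassert>
  #include <cstdint>

  int main() {
    const uint32_t SetccValues[] = {0u, ~0u};  // setcc_c is all-zeros or all-ones
    const uint32_t C1 = 0x0000FFFFu;
    const unsigned C2 = 4;
    for (uint32_t X : SetccValues)
      // (shl (and X, C1), C2) == (and X, C1 << C2) for X in {0, ~0}.
      assert(((X & C1) << C2) == (X & (C1 << C2)));
    return 0;
  }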
48570
48571static SDValue combineShiftRightArithmetic(SDNode *N, SelectionDAG &DAG,
48572 const X86Subtarget &Subtarget) {
48573 SDValue N0 = N->getOperand(0);
48574 SDValue N1 = N->getOperand(1);
48575 EVT VT = N0.getValueType();
48576 unsigned Size = VT.getSizeInBits();
48577
48578 if (SDValue V = combineShiftToPMULH(N, DAG, Subtarget))
48579 return V;
48580
48581 // fold (ashr (shl, a, [56,48,32,24,16]), SarConst)
48582 // into (shl, (sext (a), [56,48,32,24,16] - SarConst)) or
48583 // into (lshr, (sext (a), SarConst - [56,48,32,24,16]))
48584 // depending on sign of (SarConst - [56,48,32,24,16])
48585
48586 // sexts in X86 are MOVs. The MOVs have the same code size
48587 // as the above SHIFTs (only a SHIFT by 1 has smaller code size).
48588 // However, the MOVs have 2 advantages over a SHIFT:
48589 // 1. MOVs can write to a register that differs from the source.
48590 // 2. MOVs accept memory operands.
48591
48592 if (VT.isVector() || N1.getOpcode() != ISD::Constant ||
48593 N0.getOpcode() != ISD::SHL || !N0.hasOneUse() ||
48594 N0.getOperand(1).getOpcode() != ISD::Constant)
48595 return SDValue();
48596
48597 SDValue N00 = N0.getOperand(0);
48598 SDValue N01 = N0.getOperand(1);
48599 APInt ShlConst = (cast<ConstantSDNode>(N01))->getAPIntValue();
48600 APInt SarConst = (cast<ConstantSDNode>(N1))->getAPIntValue();
48601 EVT CVT = N1.getValueType();
48602
48603 if (SarConst.isNegative())
48604 return SDValue();
48605
48606 for (MVT SVT : { MVT::i8, MVT::i16, MVT::i32 }) {
48607 unsigned ShiftSize = SVT.getSizeInBits();
48608 // Skip types without a corresponding sext/zext and
48609 // ShlConst values that are not one of [56,48,32,24,16].
48610 if (ShiftSize >= Size || ShlConst != Size - ShiftSize)
48611 continue;
48612 SDLoc DL(N);
48613 SDValue NN =
48614 DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, N00, DAG.getValueType(SVT));
48615 SarConst = SarConst - (Size - ShiftSize);
48616 if (SarConst == 0)
48617 return NN;
48618 if (SarConst.isNegative())
48619 return DAG.getNode(ISD::SHL, DL, VT, NN,
48620 DAG.getConstant(-SarConst, DL, CVT));
48621 return DAG.getNode(ISD::SRA, DL, VT, NN,
48622 DAG.getConstant(SarConst, DL, CVT));
48623 }
48624 return SDValue();
48625}
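
Illustrative only, assuming two's-complement conversions and arithmetic right shifts on signed integers (as on x86): the rewrite above treats (sra (shl x, 56), 58) as a sign-extension of the low byte followed by a shift by the difference of the two amounts.

  #include <cassert>
  #include <cstdint>

  int main() {
    const int64_t Samples[] = {0, 1, 0x7f, 0x80, 0xff, 0x1234, -1, -0x80};
    for (int64_t x : Samples) {
      // shl by 56 keeps only the low byte; sra by 58 sign-extends that byte
      // and then shifts it right by 58 - 56 = 2.
      int64_t ViaShifts = int64_t(uint64_t(x) << 56) >> 58;
      int64_t ViaSext = int64_t(int8_t(uint8_t(x))) >> 2;
      assert(ViaShifts == ViaSext);
    }
    return 0;
  }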
48626
48627static SDValue combineShiftRightLogical(SDNode *N, SelectionDAG &DAG,
48628 TargetLowering::DAGCombinerInfo &DCI,
48629 const X86Subtarget &Subtarget) {
48630 SDValue N0 = N->getOperand(0);
48631 SDValue N1 = N->getOperand(1);
48632 EVT VT = N0.getValueType();
48633
48634 if (SDValue V = combineShiftToPMULH(N, DAG, Subtarget))
48635 return V;
48636
48637 // Only do this on the last DAG combine as it can interfere with other
48638 // combines.
48639 if (!DCI.isAfterLegalizeDAG())
48640 return SDValue();
48641
48642 // Try to improve a sequence of srl (and X, C1), C2 by inverting the order.
48643 // TODO: This is a generic DAG combine that became an x86-only combine to
48644 // avoid shortcomings in other folds such as bswap, bit-test ('bt'), and
48645 // and-not ('andn').
48646 if (N0.getOpcode() != ISD::AND || !N0.hasOneUse())
48647 return SDValue();
48648
48649 auto *ShiftC = dyn_cast<ConstantSDNode>(N1);
48650 auto *AndC = dyn_cast<ConstantSDNode>(N0.getOperand(1));
48651 if (!ShiftC || !AndC)
48652 return SDValue();
48653
48654 // If we can shrink the constant mask below 8-bits or 32-bits, then this
48655 // transform should reduce code size. It may also enable secondary transforms
48656 // from improved known-bits analysis or instruction selection.
48657 APInt MaskVal = AndC->getAPIntValue();
48658
48659 // If this can be matched by a zero extend, don't optimize.
48660 if (MaskVal.isMask()) {
48661 unsigned TO = MaskVal.countr_one();
48662 if (TO >= 8 && isPowerOf2_32(TO))
48663 return SDValue();
48664 }
48665
48666 APInt NewMaskVal = MaskVal.lshr(ShiftC->getAPIntValue());
48667 unsigned OldMaskSize = MaskVal.getSignificantBits();
48668 unsigned NewMaskSize = NewMaskVal.getSignificantBits();
48669 if ((OldMaskSize > 8 && NewMaskSize <= 8) ||
48670 (OldMaskSize > 32 && NewMaskSize <= 32)) {
48671 // srl (and X, AndC), ShiftC --> and (srl X, ShiftC), (AndC >> ShiftC)
48672 SDLoc DL(N);
48673 SDValue NewMask = DAG.getConstant(NewMaskVal, DL, VT);
48674 SDValue NewShift = DAG.getNode(ISD::SRL, DL, VT, N0.getOperand(0), N1);
48675 return DAG.getNode(ISD::AND, DL, VT, NewShift, NewMask);
48676 }
48677 return SDValue();
48678}
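
Illustrative only: the reordering above rests on the scalar identity below; with Mask = 0xFF00 and Shift = 8 the shifted mask fits in 8 bits, which is the code-size win described in the comment. Sample values are arbitrary.

  #include <cassert>
  #include <cstdint>

  int main() {
    const uint64_t Mask = 0xFF00;  // 16 significant bits before the transform
    const unsigned Shift = 8;      // Mask >> Shift fits in 8 bits afterwards
    for (uint64_t x = 0; x < (1u << 20); x += 977)
      // srl (and X, AndC), ShiftC == and (srl X, ShiftC), (AndC >> ShiftC)
      assert(((x & Mask) >> Shift) == ((x >> Shift) & (Mask >> Shift)));
    return 0;
  }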
48679
48680static SDValue combineHorizOpWithShuffle(SDNode *N, SelectionDAG &DAG,
48681 const X86Subtarget &Subtarget) {
48682 unsigned Opcode = N->getOpcode();
48683 assert(isHorizOp(Opcode) && "Unexpected hadd/hsub/pack opcode");
48684
48685 SDLoc DL(N);
48686 EVT VT = N->getValueType(0);
48687 SDValue N0 = N->getOperand(0);
48688 SDValue N1 = N->getOperand(1);
48689 EVT SrcVT = N0.getValueType();
48690
48691 SDValue BC0 =
48692 N->isOnlyUserOf(N0.getNode()) ? peekThroughOneUseBitcasts(N0) : N0;
48693 SDValue BC1 =
48694 N->isOnlyUserOf(N1.getNode()) ? peekThroughOneUseBitcasts(N1) : N1;
48695
48696 // Attempt to fold HOP(LOSUBVECTOR(SHUFFLE(X)),HISUBVECTOR(SHUFFLE(X)))
48697 // to SHUFFLE(HOP(LOSUBVECTOR(X),HISUBVECTOR(X))), this is mainly for
48698 // truncation trees that help us avoid lane crossing shuffles.
48699 // TODO: There's a lot more we can do for PACK/HADD style shuffle combines.
48700 // TODO: We don't handle vXf64 shuffles yet.
48701 if (VT.is128BitVector() && SrcVT.getScalarSizeInBits() <= 32) {
48702 if (SDValue BCSrc = getSplitVectorSrc(BC0, BC1, false)) {
48703 SmallVector<SDValue> ShuffleOps;
48704 SmallVector<int> ShuffleMask, ScaledMask;
48705 SDValue Vec = peekThroughBitcasts(BCSrc);
48706 if (getTargetShuffleInputs(Vec, ShuffleOps, ShuffleMask, DAG)) {
48707 resolveTargetShuffleInputsAndMask(ShuffleOps, ShuffleMask);
48708 // To keep the HOP LHS/RHS coherency, we must be able to scale the unary
48709 // shuffle to a v4X64 width - we can probably relax this in the future.
48710 if (!isAnyZero(ShuffleMask) && ShuffleOps.size() == 1 &&
48711 ShuffleOps[0].getValueType().is256BitVector() &&
48712 scaleShuffleElements(ShuffleMask, 4, ScaledMask)) {
48713 SDValue Lo, Hi;
48714 MVT ShufVT = VT.isFloatingPoint() ? MVT::v4f32 : MVT::v4i32;
48715 std::tie(Lo, Hi) = DAG.SplitVector(ShuffleOps[0], DL);
48716 Lo = DAG.getBitcast(SrcVT, Lo);
48717 Hi = DAG.getBitcast(SrcVT, Hi);
48718 SDValue Res = DAG.getNode(Opcode, DL, VT, Lo, Hi);
48719 Res = DAG.getBitcast(ShufVT, Res);
48720 Res = DAG.getVectorShuffle(ShufVT, DL, Res, Res, ScaledMask);
48721 return DAG.getBitcast(VT, Res);
48722 }
48723 }
48724 }
48725 }
48726
48727 // Attempt to fold HOP(SHUFFLE(X,Y),SHUFFLE(Z,W)) -> SHUFFLE(HOP()).
48728 if (VT.is128BitVector() && SrcVT.getScalarSizeInBits() <= 32) {
48729 // If either/both ops are a shuffle that can scale to v2x64,
48730 // then see if we can perform this as a v4x32 post shuffle.
48731 SmallVector<SDValue> Ops0, Ops1;
48732 SmallVector<int> Mask0, Mask1, ScaledMask0, ScaledMask1;
48733 bool IsShuf0 =
48734 getTargetShuffleInputs(BC0, Ops0, Mask0, DAG) && !isAnyZero(Mask0) &&
48735 scaleShuffleElements(Mask0, 2, ScaledMask0) &&
48736 all_of(Ops0, [](SDValue Op) { return Op.getValueSizeInBits() == 128; });
48737 bool IsShuf1 =
48738 getTargetShuffleInputs(BC1, Ops1, Mask1, DAG) && !isAnyZero(Mask1) &&
48739 scaleShuffleElements(Mask1, 2, ScaledMask1) &&
48740 all_of(Ops1, [](SDValue Op) { return Op.getValueSizeInBits() == 128; });
48741 if (IsShuf0 || IsShuf1) {
48742 if (!IsShuf0) {
48743 Ops0.assign({BC0});
48744 ScaledMask0.assign({0, 1});
48745 }
48746 if (!IsShuf1) {
48747 Ops1.assign({BC1});
48748 ScaledMask1.assign({0, 1});
48749 }
48750
48751 SDValue LHS, RHS;
48752 int PostShuffle[4] = {-1, -1, -1, -1};
48753 auto FindShuffleOpAndIdx = [&](int M, int &Idx, ArrayRef<SDValue> Ops) {
48754 if (M < 0)
48755 return true;
48756 Idx = M % 2;
48757 SDValue Src = Ops[M / 2];
48758 if (!LHS || LHS == Src) {
48759 LHS = Src;
48760 return true;
48761 }
48762 if (!RHS || RHS == Src) {
48763 Idx += 2;
48764 RHS = Src;
48765 return true;
48766 }
48767 return false;
48768 };
48769 if (FindShuffleOpAndIdx(ScaledMask0[0], PostShuffle[0], Ops0) &&
48770 FindShuffleOpAndIdx(ScaledMask0[1], PostShuffle[1], Ops0) &&
48771 FindShuffleOpAndIdx(ScaledMask1[0], PostShuffle[2], Ops1) &&
48772 FindShuffleOpAndIdx(ScaledMask1[1], PostShuffle[3], Ops1)) {
48773 LHS = DAG.getBitcast(SrcVT, LHS);
48774 RHS = DAG.getBitcast(SrcVT, RHS ? RHS : LHS);
48775 MVT ShufVT = VT.isFloatingPoint() ? MVT::v4f32 : MVT::v4i32;
48776 SDValue Res = DAG.getNode(Opcode, DL, VT, LHS, RHS);
48777 Res = DAG.getBitcast(ShufVT, Res);
48778 Res = DAG.getVectorShuffle(ShufVT, DL, Res, Res, PostShuffle);
48779 return DAG.getBitcast(VT, Res);
48780 }
48781 }
48782 }
48783
48784 // Attempt to fold HOP(SHUFFLE(X,Y),SHUFFLE(X,Y)) -> SHUFFLE(HOP(X,Y)).
48785 if (VT.is256BitVector() && Subtarget.hasInt256()) {
48786 SmallVector<int> Mask0, Mask1;
48787 SmallVector<SDValue> Ops0, Ops1;
48788 SmallVector<int, 2> ScaledMask0, ScaledMask1;
48789 if (getTargetShuffleInputs(BC0, Ops0, Mask0, DAG) && !isAnyZero(Mask0) &&
48790 getTargetShuffleInputs(BC1, Ops1, Mask1, DAG) && !isAnyZero(Mask1) &&
48791 !Ops0.empty() && !Ops1.empty() &&
48792 all_of(Ops0,
48793 [](SDValue Op) { return Op.getValueType().is256BitVector(); }) &&
48794 all_of(Ops1,
48795 [](SDValue Op) { return Op.getValueType().is256BitVector(); }) &&
48796 scaleShuffleElements(Mask0, 2, ScaledMask0) &&
48797 scaleShuffleElements(Mask1, 2, ScaledMask1)) {
48798 SDValue Op00 = peekThroughBitcasts(Ops0.front());
48799 SDValue Op10 = peekThroughBitcasts(Ops1.front());
48800 SDValue Op01 = peekThroughBitcasts(Ops0.back());
48801 SDValue Op11 = peekThroughBitcasts(Ops1.back());
48802 if ((Op00 == Op11) && (Op01 == Op10)) {
48803 std::swap(Op10, Op11);
48804 ShuffleVectorSDNode::commuteMask(ScaledMask1);
48805 }
48806 if ((Op00 == Op10) && (Op01 == Op11)) {
48807 const int Map[4] = {0, 2, 1, 3};
48808 SmallVector<int, 4> ShuffleMask(
48809 {Map[ScaledMask0[0]], Map[ScaledMask1[0]], Map[ScaledMask0[1]],
48810 Map[ScaledMask1[1]]});
48811 MVT ShufVT = VT.isFloatingPoint() ? MVT::v4f64 : MVT::v4i64;
48812 SDValue Res = DAG.getNode(Opcode, DL, VT, DAG.getBitcast(SrcVT, Op00),
48813 DAG.getBitcast(SrcVT, Op01));
48814 Res = DAG.getBitcast(ShufVT, Res);
48815 Res = DAG.getVectorShuffle(ShufVT, DL, Res, Res, ShuffleMask);
48816 return DAG.getBitcast(VT, Res);
48817 }
48818 }
48819 }
48820
48821 return SDValue();
48822}
48823
48824static SDValue combineVectorPack(SDNode *N, SelectionDAG &DAG,
48825 TargetLowering::DAGCombinerInfo &DCI,
48826 const X86Subtarget &Subtarget) {
48827 unsigned Opcode = N->getOpcode();
48828 assert((X86ISD::PACKSS == Opcode || X86ISD::PACKUS == Opcode) &&
48829        "Unexpected pack opcode");
48830
48831 EVT VT = N->getValueType(0);
48832 SDValue N0 = N->getOperand(0);
48833 SDValue N1 = N->getOperand(1);
48834 unsigned NumDstElts = VT.getVectorNumElements();
48835 unsigned DstBitsPerElt = VT.getScalarSizeInBits();
48836 unsigned SrcBitsPerElt = 2 * DstBitsPerElt;
48837 assert(N0.getScalarValueSizeInBits() == SrcBitsPerElt &&
48838        N1.getScalarValueSizeInBits() == SrcBitsPerElt &&
48839        "Unexpected PACKSS/PACKUS input type");
48840
48841 bool IsSigned = (X86ISD::PACKSS == Opcode);
48842
48843 // Constant Folding.
48844 APInt UndefElts0, UndefElts1;
48845 SmallVector<APInt, 32> EltBits0, EltBits1;
48846 if ((N0.isUndef() || N->isOnlyUserOf(N0.getNode())) &&
48847 (N1.isUndef() || N->isOnlyUserOf(N1.getNode())) &&
48848 getTargetConstantBitsFromNode(N0, SrcBitsPerElt, UndefElts0, EltBits0) &&
48849 getTargetConstantBitsFromNode(N1, SrcBitsPerElt, UndefElts1, EltBits1)) {
48850 unsigned NumLanes = VT.getSizeInBits() / 128;
48851 unsigned NumSrcElts = NumDstElts / 2;
48852 unsigned NumDstEltsPerLane = NumDstElts / NumLanes;
48853 unsigned NumSrcEltsPerLane = NumSrcElts / NumLanes;
48854
48855 APInt Undefs(NumDstElts, 0);
48856 SmallVector<APInt, 32> Bits(NumDstElts, APInt::getZero(DstBitsPerElt));
48857 for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
48858 for (unsigned Elt = 0; Elt != NumDstEltsPerLane; ++Elt) {
48859 unsigned SrcIdx = Lane * NumSrcEltsPerLane + Elt % NumSrcEltsPerLane;
48860 auto &UndefElts = (Elt >= NumSrcEltsPerLane ? UndefElts1 : UndefElts0);
48861 auto &EltBits = (Elt >= NumSrcEltsPerLane ? EltBits1 : EltBits0);
48862
48863 if (UndefElts[SrcIdx]) {
48864 Undefs.setBit(Lane * NumDstEltsPerLane + Elt);
48865 continue;
48866 }
48867
48868 APInt &Val = EltBits[SrcIdx];
48869 if (IsSigned) {
48870 // PACKSS: Truncate signed value with signed saturation.
48871 // Source values less than dst minint are saturated to minint.
48872 // Source values greater than dst maxint are saturated to maxint.
48873 if (Val.isSignedIntN(DstBitsPerElt))
48874 Val = Val.trunc(DstBitsPerElt);
48875 else if (Val.isNegative())
48876 Val = APInt::getSignedMinValue(DstBitsPerElt);
48877 else
48878 Val = APInt::getSignedMaxValue(DstBitsPerElt);
48879 } else {
48880 // PACKUS: Truncate signed value with unsigned saturation.
48881 // Source values less than zero are saturated to zero.
48882 // Source values greater than dst maxuint are saturated to maxuint.
48883 if (Val.isIntN(DstBitsPerElt))
48884 Val = Val.trunc(DstBitsPerElt);
48885 else if (Val.isNegative())
48886 Val = APInt::getZero(DstBitsPerElt);
48887 else
48888 Val = APInt::getAllOnes(DstBitsPerElt);
48889 }
48890 Bits[Lane * NumDstEltsPerLane + Elt] = Val;
48891 }
48892 }
48893
48894 return getConstVector(Bits, Undefs, VT.getSimpleVT(), DAG, SDLoc(N));
48895 }
48896
48897 // Try to fold PACK(SHUFFLE(),SHUFFLE()) -> SHUFFLE(PACK()).
48898 if (SDValue V = combineHorizOpWithShuffle(N, DAG, Subtarget))
48899 return V;
48900
48901 // Try to combine a PACKUSWB/PACKSSWB-implemented truncate with a regular
48902 // truncate to create a larger truncate.
48903 if (Subtarget.hasAVX512() &&
48904 N0.getOpcode() == ISD::TRUNCATE && N1.isUndef() && VT == MVT::v16i8 &&
48905 N0.getOperand(0).getValueType() == MVT::v8i32) {
48906 if ((IsSigned && DAG.ComputeNumSignBits(N0) > 8) ||
48907 (!IsSigned &&
48908 DAG.MaskedValueIsZero(N0, APInt::getHighBitsSet(16, 8)))) {
48909 if (Subtarget.hasVLX())
48910 return DAG.getNode(X86ISD::VTRUNC, SDLoc(N), VT, N0.getOperand(0));
48911
48912 // Widen input to v16i32 so we can truncate that.
48913 SDLoc dl(N);
48914 SDValue Concat = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v16i32,
48915 N0.getOperand(0), DAG.getUNDEF(MVT::v8i32));
48916 return DAG.getNode(ISD::TRUNCATE, SDLoc(N), VT, Concat);
48917 }
48918 }
48919
48920 // Try to fold PACK(EXTEND(X),EXTEND(Y)) -> CONCAT(X,Y) subvectors.
48921 if (VT.is128BitVector()) {
48922 unsigned ExtOpc = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
48923 SDValue Src0, Src1;
48924 if (N0.getOpcode() == ExtOpc &&
48925 N0.getOperand(0).getValueType().is64BitVector() &&
48926 N0.getOperand(0).getScalarValueSizeInBits() == DstBitsPerElt) {
48927 Src0 = N0.getOperand(0);
48928 }
48929 if (N1.getOpcode() == ExtOpc &&
48930 N1.getOperand(0).getValueType().is64BitVector() &&
48931 N1.getOperand(0).getScalarValueSizeInBits() == DstBitsPerElt) {
48932 Src1 = N1.getOperand(0);
48933 }
48934 if ((Src0 || N0.isUndef()) && (Src1 || N1.isUndef())) {
48935 assert((Src0 || Src1) && "Found PACK(UNDEF,UNDEF)");
48936 Src0 = Src0 ? Src0 : DAG.getUNDEF(Src1.getValueType());
48937 Src1 = Src1 ? Src1 : DAG.getUNDEF(Src0.getValueType());
48938 return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(N), VT, Src0, Src1);
48939 }
48940
48941 // Try again with pack(*_extend_vector_inreg, undef).
48942 unsigned VecInRegOpc = IsSigned ? ISD::SIGN_EXTEND_VECTOR_INREG
48943 : ISD::ZERO_EXTEND_VECTOR_INREG;
48944 if (N0.getOpcode() == VecInRegOpc && N1.isUndef() &&
48945 N0.getOperand(0).getScalarValueSizeInBits() < DstBitsPerElt)
48946 return getEXTEND_VECTOR_INREG(ExtOpc, SDLoc(N), VT, N0.getOperand(0),
48947 DAG);
48948 }
48949
48950 // Attempt to combine as shuffle.
48951 SDValue Op(N, 0);
48952 if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
48953 return Res;
48954
48955 return SDValue();
48956}
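
Illustrative only: the constant-folding loop above implements the per-element PACKSS/PACKUS saturation rules; packss/packus below are hypothetical scalar helpers mirroring that behaviour for an i16 -> i8 pack.

  #include <cassert>
  #include <cstdint>

  // Hypothetical scalar models of PACKSSWB / PACKUSWB on one i16 element.
  static int8_t packss(int16_t v) {
    if (v > 127) return 127;    // above dst maxint: saturate to maxint
    if (v < -128) return -128;  // below dst minint: saturate to minint
    return static_cast<int8_t>(v);
  }
  static uint8_t packus(int16_t v) {
    if (v > 255) return 255;    // above dst maxuint: saturate to maxuint
    if (v < 0) return 0;        // negative: saturate to zero
    return static_cast<uint8_t>(v);
  }

  int main() {
    assert(packss(42) == 42 && packss(300) == 127 && packss(-300) == -128);
    assert(packus(42) == 42 && packus(300) == 255 && packus(-5) == 0);
    return 0;
  }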
48957
48958static SDValue combineVectorHADDSUB(SDNode *N, SelectionDAG &DAG,
48959 TargetLowering::DAGCombinerInfo &DCI,
48960 const X86Subtarget &Subtarget) {
48961 assert((X86ISD::HADD == N->getOpcode() || X86ISD::FHADD == N->getOpcode() ||
48962         X86ISD::HSUB == N->getOpcode() || X86ISD::FHSUB == N->getOpcode()) &&
48963        "Unexpected horizontal add/sub opcode");
48964
48965 if (!shouldUseHorizontalOp(true, DAG, Subtarget)) {
48966 MVT VT = N->getSimpleValueType(0);
48967 SDValue LHS = N->getOperand(0);
48968 SDValue RHS = N->getOperand(1);
48969
48970 // HOP(HOP'(X,X),HOP'(Y,Y)) -> HOP(PERMUTE(HOP'(X,Y)),PERMUTE(HOP'(X,Y)).
48971 if (LHS != RHS && LHS.getOpcode() == N->getOpcode() &&
48972 LHS.getOpcode() == RHS.getOpcode() &&
48973 LHS.getValueType() == RHS.getValueType() &&
48974 N->isOnlyUserOf(LHS.getNode()) && N->isOnlyUserOf(RHS.getNode())) {
48975 SDValue LHS0 = LHS.getOperand(0);
48976 SDValue LHS1 = LHS.getOperand(1);
48977 SDValue RHS0 = RHS.getOperand(0);
48978 SDValue RHS1 = RHS.getOperand(1);
48979 if ((LHS0 == LHS1 || LHS0.isUndef() || LHS1.isUndef()) &&
48980 (RHS0 == RHS1 || RHS0.isUndef() || RHS1.isUndef())) {
48981 SDLoc DL(N);
48982 SDValue Res = DAG.getNode(LHS.getOpcode(), DL, LHS.getValueType(),
48983 LHS0.isUndef() ? LHS1 : LHS0,
48984 RHS0.isUndef() ? RHS1 : RHS0);
48985 MVT ShufVT = MVT::getVectorVT(MVT::i32, VT.getSizeInBits() / 32);
48986 Res = DAG.getBitcast(ShufVT, Res);
48987 SDValue NewLHS =
48988 DAG.getNode(X86ISD::PSHUFD, DL, ShufVT, Res,
48989 getV4X86ShuffleImm8ForMask({0, 1, 0, 1}, DL, DAG));
48990 SDValue NewRHS =
48991 DAG.getNode(X86ISD::PSHUFD, DL, ShufVT, Res,
48992 getV4X86ShuffleImm8ForMask({2, 3, 2, 3}, DL, DAG));
48993 return DAG.getNode(N->getOpcode(), DL, VT, DAG.getBitcast(VT, NewLHS),
48994 DAG.getBitcast(VT, NewRHS));
48995 }
48996 }
48997 }
48998
48999 // Try to fold HOP(SHUFFLE(),SHUFFLE()) -> SHUFFLE(HOP()).
49000 if (SDValue V = combineHorizOpWithShuffle(N, DAG, Subtarget))
49001 return V;
49002
49003 return SDValue();
49004}
49005
49006static SDValue combineVectorShiftVar(SDNode *N, SelectionDAG &DAG,
49007 TargetLowering::DAGCombinerInfo &DCI,
49008 const X86Subtarget &Subtarget) {
49009 assert((X86ISD::VSHL == N->getOpcode() || X86ISD::VSRA == N->getOpcode() ||
49010         X86ISD::VSRL == N->getOpcode()) &&
49011        "Unexpected shift opcode");
49012 EVT VT = N->getValueType(0);
49013 SDValue N0 = N->getOperand(0);
49014 SDValue N1 = N->getOperand(1);
49015
49016 // Shift zero -> zero.
49017 if (ISD::isBuildVectorAllZeros(N0.getNode()))
49018 return DAG.getConstant(0, SDLoc(N), VT);
49019
49020 // Detect constant shift amounts.
49021 APInt UndefElts;
49022 SmallVector<APInt, 32> EltBits;
49023 if (getTargetConstantBitsFromNode(N1, 64, UndefElts, EltBits, true, false)) {
49024 unsigned X86Opc = getTargetVShiftUniformOpcode(N->getOpcode(), false);
49025 return getTargetVShiftByConstNode(X86Opc, SDLoc(N), VT.getSimpleVT(), N0,
49026 EltBits[0].getZExtValue(), DAG);
49027 }
49028
49029 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
49030 APInt DemandedElts = APInt::getAllOnes(VT.getVectorNumElements());
49031 if (TLI.SimplifyDemandedVectorElts(SDValue(N, 0), DemandedElts, DCI))
49032 return SDValue(N, 0);
49033
49034 return SDValue();
49035}
49036
49037static SDValue combineVectorShiftImm(SDNode *N, SelectionDAG &DAG,
49038 TargetLowering::DAGCombinerInfo &DCI,
49039 const X86Subtarget &Subtarget) {
49040 unsigned Opcode = N->getOpcode();
49041 assert((X86ISD::VSHLI == Opcode || X86ISD::VSRAI == Opcode ||
49042         X86ISD::VSRLI == Opcode) &&
49043        "Unexpected shift opcode");
49044 bool LogicalShift = X86ISD::VSHLI == Opcode || X86ISD::VSRLI == Opcode;
49045 EVT VT = N->getValueType(0);
49046 SDValue N0 = N->getOperand(0);
49047 SDValue N1 = N->getOperand(1);
49048 unsigned NumBitsPerElt = VT.getScalarSizeInBits();
49049 assert(VT == N0.getValueType() && (NumBitsPerElt % 8) == 0 &&
49050        "Unexpected value type");
49051 assert(N1.getValueType() == MVT::i8 && "Unexpected shift amount type");
49052
49053 // (shift undef, X) -> 0
49054 if (N0.isUndef())
49055 return DAG.getConstant(0, SDLoc(N), VT);
49056
49057 // Out of range logical bit shifts are guaranteed to be zero.
49058 // Out of range arithmetic bit shifts splat the sign bit.
49059 unsigned ShiftVal = N->getConstantOperandVal(1);
49060 if (ShiftVal >= NumBitsPerElt) {
49061 if (LogicalShift)
49062 return DAG.getConstant(0, SDLoc(N), VT);
49063 ShiftVal = NumBitsPerElt - 1;
49064 }
49065
49066 // (shift X, 0) -> X
49067 if (!ShiftVal)
49068 return N0;
49069
49070 // (shift 0, C) -> 0
49071 if (ISD::isBuildVectorAllZeros(N0.getNode()))
49072 // N0 is all zeros or undef. We guarantee that the bits shifted into the
49073 // result are all zeros, not undef.
49074 return DAG.getConstant(0, SDLoc(N), VT);
49075
49076 // (VSRAI -1, C) -> -1
49077 if (!LogicalShift && ISD::isBuildVectorAllOnes(N0.getNode()))
49078 // N0 is all ones or undef. We guarantee that the bits shifted into the
49079 // result are all ones, not undef.
49080 return DAG.getConstant(-1, SDLoc(N), VT);
49081
49082 auto MergeShifts = [&](SDValue X, uint64_t Amt0, uint64_t Amt1) {
49083 unsigned NewShiftVal = Amt0 + Amt1;
49084 if (NewShiftVal >= NumBitsPerElt) {
49085 // Out of range logical bit shifts are guaranteed to be zero.
49086 // Out of range arithmetic bit shifts splat the sign bit.
49087 if (LogicalShift)
49088 return DAG.getConstant(0, SDLoc(N), VT);
49089 NewShiftVal = NumBitsPerElt - 1;
49090 }
49091 return DAG.getNode(Opcode, SDLoc(N), VT, N0.getOperand(0),
49092 DAG.getTargetConstant(NewShiftVal, SDLoc(N), MVT::i8));
49093 };
49094
49095 // (shift (shift X, C2), C1) -> (shift X, (C1 + C2))
49096 if (Opcode == N0.getOpcode())
49097 return MergeShifts(N0.getOperand(0), ShiftVal, N0.getConstantOperandVal(1));
49098
49099 // (shl (add X, X), C) -> (shl X, (C + 1))
49100 if (Opcode == X86ISD::VSHLI && N0.getOpcode() == ISD::ADD &&
49101 N0.getOperand(0) == N0.getOperand(1))
49102 return MergeShifts(N0.getOperand(0), ShiftVal, 1);
49103
49104 // We can decode 'whole byte' logical bit shifts as shuffles.
49105 if (LogicalShift && (ShiftVal % 8) == 0) {
49106 SDValue Op(N, 0);
49107 if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
49108 return Res;
49109 }
49110
49111 auto TryConstantFold = [&](SDValue V) {
49112 APInt UndefElts;
49113 SmallVector<APInt, 32> EltBits;
49114 if (!getTargetConstantBitsFromNode(V, NumBitsPerElt, UndefElts, EltBits))
49115 return SDValue();
49116   assert(EltBits.size() == VT.getVectorNumElements() &&
49117          "Unexpected shift value type");
49118 // Undef elements need to fold to 0. It's possible SimplifyDemandedBits
49119 // created an undef input due to no input bits being demanded, but the user
49120 // still expects 0 in the other bits.
49121 for (unsigned i = 0, e = EltBits.size(); i != e; ++i) {
49122 APInt &Elt = EltBits[i];
49123 if (UndefElts[i])
49124 Elt = 0;
49125 else if (X86ISD::VSHLI == Opcode)
49126 Elt <<= ShiftVal;
49127 else if (X86ISD::VSRAI == Opcode)
49128 Elt.ashrInPlace(ShiftVal);
49129 else
49130 Elt.lshrInPlace(ShiftVal);
49131 }
49132 // Reset undef elements since they were zeroed above.
49133 UndefElts = 0;
49134 return getConstVector(EltBits, UndefElts, VT.getSimpleVT(), DAG, SDLoc(N));
49135 };
49136
49137 // Constant Folding.
49138 if (N->isOnlyUserOf(N0.getNode())) {
49139 if (SDValue C = TryConstantFold(N0))
49140 return C;
49141
49142 // Fold (shift (logic X, C2), C1) -> (logic (shift X, C1), (shift C2, C1))
49143 // Don't break NOT patterns.
49144 SDValue BC = peekThroughOneUseBitcasts(N0);
49145 if (ISD::isBitwiseLogicOp(BC.getOpcode()) &&
49146 BC->isOnlyUserOf(BC.getOperand(1).getNode()) &&
49147 !ISD::isBuildVectorAllOnes(BC.getOperand(1).getNode())) {
49148 if (SDValue RHS = TryConstantFold(BC.getOperand(1))) {
49149 SDLoc DL(N);
49150 SDValue LHS = DAG.getNode(Opcode, DL, VT,
49151 DAG.getBitcast(VT, BC.getOperand(0)), N1);
49152 return DAG.getNode(BC.getOpcode(), DL, VT, LHS, RHS);
49153 }
49154 }
49155 }
49156
49157 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
49158 if (TLI.SimplifyDemandedBits(SDValue(N, 0), APInt::getAllOnes(NumBitsPerElt),
49159 DCI))
49160 return SDValue(N, 0);
49161
49162 return SDValue();
49163}
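
Illustrative only: scalar versions of the shift-merging identities used above; the last check models the clamp-to-zero behaviour of out-of-range logical shifts.

  #include <cassert>
  #include <cstdint>

  int main() {
    for (uint32_t x = 0; x < 100000; x += 773) {
      assert(((x >> 3) >> 2) == (x >> 5));  // (shift (shift X, C2), C1) -> (shift X, C1 + C2)
      assert(((x << 3) << 2) == (x << 5));
      assert(((x + x) << 4) == (x << 5));   // (shl (add X, X), C) -> (shl X, C + 1)
      assert(((x >> 20) >> 20) == 0u);      // merged logical amount >= width: zero
    }
    return 0;
  }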
49164
49165static SDValue combineVectorInsert(SDNode *N, SelectionDAG &DAG,
49166 TargetLowering::DAGCombinerInfo &DCI,
49167 const X86Subtarget &Subtarget) {
49168 EVT VT = N->getValueType(0);
49169 unsigned Opcode = N->getOpcode();
49170 assert(((Opcode == X86ISD::PINSRB && VT == MVT::v16i8) ||
49171         (Opcode == X86ISD::PINSRW && VT == MVT::v8i16) ||
49172         Opcode == ISD::INSERT_VECTOR_ELT) &&
49173        "Unexpected vector insertion");
49174
49175 SDValue Vec = N->getOperand(0);
49176 SDValue Scl = N->getOperand(1);
49177 SDValue Idx = N->getOperand(2);
49178
49179 // Fold insert_vector_elt(undef, elt, 0) --> scalar_to_vector(elt).
49180 if (Opcode == ISD::INSERT_VECTOR_ELT && Vec.isUndef() && isNullConstant(Idx))
49181 return DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(N), VT, Scl);
49182
49183 if (Opcode == X86ISD::PINSRB || Opcode == X86ISD::PINSRW) {
49184 unsigned NumBitsPerElt = VT.getScalarSizeInBits();
49185 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
49186 if (TLI.SimplifyDemandedBits(SDValue(N, 0),
49187 APInt::getAllOnes(NumBitsPerElt), DCI))
49188 return SDValue(N, 0);
49189 }
49190
49191 // Attempt to combine insertion patterns to a shuffle.
49192 if (VT.isSimple() && DCI.isAfterLegalizeDAG()) {
49193 SDValue Op(N, 0);
49194 if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
49195 return Res;
49196 }
49197
49198 return SDValue();
49199}
49200
49201/// Recognize the distinctive (AND (setcc ...) (setcc ..)) where both setccs
49202/// reference the same FP CMP, and rewrite for CMPEQSS and friends. Likewise for
49203/// OR -> CMPNEQSS.
49204static SDValue combineCompareEqual(SDNode *N, SelectionDAG &DAG,
49205 TargetLowering::DAGCombinerInfo &DCI,
49206 const X86Subtarget &Subtarget) {
49207 unsigned opcode;
49208
49209 // SSE1 supports CMP{eq|ne}SS, and SSE2 added CMP{eq|ne}SD, but
49210 // we're requiring SSE2 for both.
49211 if (Subtarget.hasSSE2() && isAndOrOfSetCCs(SDValue(N, 0U), opcode)) {
49212 SDValue N0 = N->getOperand(0);
49213 SDValue N1 = N->getOperand(1);
49214 SDValue CMP0 = N0.getOperand(1);
49215 SDValue CMP1 = N1.getOperand(1);
49216 SDLoc DL(N);
49217
49218 // The SETCCs should both refer to the same CMP.
49219 if (CMP0.getOpcode() != X86ISD::FCMP || CMP0 != CMP1)
49220 return SDValue();
49221
49222 SDValue CMP00 = CMP0->getOperand(0);
49223 SDValue CMP01 = CMP0->getOperand(1);
49224 EVT VT = CMP00.getValueType();
49225
49226 if (VT == MVT::f32 || VT == MVT::f64 ||
49227 (VT == MVT::f16 && Subtarget.hasFP16())) {
49228 bool ExpectingFlags = false;
49229 // Check for any users that want flags:
49230 for (const SDNode *U : N->uses()) {
49231 if (ExpectingFlags)
49232 break;
49233
49234 switch (U->getOpcode()) {
49235 default:
49236 case ISD::BR_CC:
49237 case ISD::BRCOND:
49238 case ISD::SELECT:
49239 ExpectingFlags = true;
49240 break;
49241 case ISD::CopyToReg:
49242 case ISD::SIGN_EXTEND:
49243 case ISD::ZERO_EXTEND:
49244 case ISD::ANY_EXTEND:
49245 break;
49246 }
49247 }
49248
49249 if (!ExpectingFlags) {
49250 enum X86::CondCode cc0 = (enum X86::CondCode)N0.getConstantOperandVal(0);
49251 enum X86::CondCode cc1 = (enum X86::CondCode)N1.getConstantOperandVal(0);
49252
49253 if (cc1 == X86::COND_E || cc1 == X86::COND_NE) {
49254 X86::CondCode tmp = cc0;
49255 cc0 = cc1;
49256 cc1 = tmp;
49257 }
49258
49259 if ((cc0 == X86::COND_E && cc1 == X86::COND_NP) ||
49260 (cc0 == X86::COND_NE && cc1 == X86::COND_P)) {
49261 // FIXME: need symbolic constants for these magic numbers.
49262 // See X86ATTInstPrinter.cpp:printSSECC().
49263 unsigned x86cc = (cc0 == X86::COND_E) ? 0 : 4;
49264 if (Subtarget.hasAVX512()) {
49265 SDValue FSetCC =
49266 DAG.getNode(X86ISD::FSETCCM, DL, MVT::v1i1, CMP00, CMP01,
49267 DAG.getTargetConstant(x86cc, DL, MVT::i8));
49268 // Need to fill with zeros to ensure the bitcast will produce zeroes
49269 // for the upper bits. An EXTRACT_ELEMENT here wouldn't guarantee that.
49270 SDValue Ins = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, MVT::v16i1,
49271 DAG.getConstant(0, DL, MVT::v16i1),
49272 FSetCC, DAG.getIntPtrConstant(0, DL));
49273 return DAG.getZExtOrTrunc(DAG.getBitcast(MVT::i16, Ins), DL,
49274 N->getSimpleValueType(0));
49275 }
49276 SDValue OnesOrZeroesF =
49277 DAG.getNode(X86ISD::FSETCC, DL, CMP00.getValueType(), CMP00,
49278 CMP01, DAG.getTargetConstant(x86cc, DL, MVT::i8));
49279
49280 bool is64BitFP = (CMP00.getValueType() == MVT::f64);
49281 MVT IntVT = is64BitFP ? MVT::i64 : MVT::i32;
49282
49283 if (is64BitFP && !Subtarget.is64Bit()) {
49284 // On a 32-bit target, we cannot bitcast the 64-bit float to a
49285 // 64-bit integer, since that's not a legal type. Since
49286 // OnesOrZeroesF is all ones or all zeroes, we don't need all the
49287 // bits, but can do this little dance to extract the lowest 32 bits
49288 // and work with those going forward.
49289 SDValue Vector64 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64,
49290 OnesOrZeroesF);
49291 SDValue Vector32 = DAG.getBitcast(MVT::v4f32, Vector64);
49292 OnesOrZeroesF = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32,
49293 Vector32, DAG.getIntPtrConstant(0, DL));
49294 IntVT = MVT::i32;
49295 }
49296
49297 SDValue OnesOrZeroesI = DAG.getBitcast(IntVT, OnesOrZeroesF);
49298 SDValue ANDed = DAG.getNode(ISD::AND, DL, IntVT, OnesOrZeroesI,
49299 DAG.getConstant(1, DL, IntVT));
49300 SDValue OneBitOfTruth = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8,
49301 ANDed);
49302 return OneBitOfTruth;
49303 }
49304 }
49305 }
49306 }
49307 return SDValue();
49308}
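
Illustrative only: the FSETCC result is an all-ones/all-zeros value, so ANDing with 1 and truncating reproduces the boolean the original SETCC pair computed; fsetcc_eq below is a hypothetical scalar stand-in, and the NaN case shows why the COND_E/COND_NP pairing maps to the ordered-equal predicate.

  #include <cassert>
  #include <cmath>
  #include <cstdint>

  // Hypothetical scalar model of FSETCC with the ordered-equal predicate:
  // all-ones when equal, all-zeros otherwise (including unordered/NaN).
  static uint32_t fsetcc_eq(float a, float b) { return a == b ? ~0u : 0u; }

  int main() {
    assert((fsetcc_eq(1.0f, 1.0f) & 1u) == 1u);
    assert((fsetcc_eq(1.0f, 2.0f) & 1u) == 0u);
    float NaN = std::nanf("");
    assert((fsetcc_eq(NaN, NaN) & 1u) == 0u);  // unordered compares as not-equal
    return 0;
  }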
49309
49310/// Try to fold: (and (xor X, -1), Y) -> (andnp X, Y).
49311static SDValue combineAndNotIntoANDNP(SDNode *N, SelectionDAG &DAG) {
49312 assert(N->getOpcode() == ISD::AND && "Unexpected opcode combine into ANDNP");
49313
49314 MVT VT = N->getSimpleValueType(0);
49315 if (!VT.is128BitVector() && !VT.is256BitVector() && !VT.is512BitVector())
49316 return SDValue();
49317
49318 SDValue X, Y;
49319 SDValue N0 = N->getOperand(0);
49320 SDValue N1 = N->getOperand(1);
49321
49322 if (SDValue Not = IsNOT(N0, DAG)) {
49323 X = Not;
49324 Y = N1;
49325 } else if (SDValue Not = IsNOT(N1, DAG)) {
49326 X = Not;
49327 Y = N0;
49328 } else
49329 return SDValue();
49330
49331 X = DAG.getBitcast(VT, X);
49332 Y = DAG.getBitcast(VT, Y);
49333 return DAG.getNode(X86ISD::ANDNP, SDLoc(N), VT, X, Y);
49334}
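
Illustrative only: ANDNP computes ~X & Y per element, which is exactly the (and (xor X, -1), Y) pattern matched above; andnp below is a hypothetical scalar helper.

  #include <cassert>
  #include <cstdint>

  static uint32_t andnp(uint32_t x, uint32_t y) { return ~x & y; }  // scalar ANDNP

  int main() {
    const uint32_t Samples[] = {0u, 0x0F0F0F0Fu, 0x12345678u, 0xFFFFFFFFu};
    for (uint32_t x : Samples)
      for (uint32_t y : Samples)
        // (and (xor X, -1), Y) == andnp(X, Y)
        assert(((x ^ 0xFFFFFFFFu) & y) == andnp(x, y));
    return 0;
  }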
49335
49336/// Try to fold:
49337/// and (vector_shuffle<Z,...,Z>
49338/// (insert_vector_elt undef, (xor X, -1), Z), undef), Y
49339/// ->
49340/// andnp (vector_shuffle<Z,...,Z>
49341/// (insert_vector_elt undef, X, Z), undef), Y
49342static SDValue combineAndShuffleNot(SDNode *N, SelectionDAG &DAG,
49343 const X86Subtarget &Subtarget) {
49344 assert(N->getOpcode() == ISD::AND && "Unexpected opcode combine into ANDNP");
49345
49346 EVT VT = N->getValueType(0);
49347 // Do not split 256- and 512-bit vectors with SSE2, as they overwrite the
49348 // original value and require extra moves.
49349 if (!((VT.is128BitVector() && Subtarget.hasSSE2()) ||
49350 ((VT.is256BitVector() || VT.is512BitVector()) && Subtarget.hasAVX())))
49351 return SDValue();
49352
49353 auto GetNot = [&DAG](SDValue V) {
49354 auto *SVN = dyn_cast<ShuffleVectorSDNode>(peekThroughOneUseBitcasts(V));
49355 // TODO: SVN->hasOneUse() is a strong condition. It can be relaxed if all
49356 // end-users are ISD::AND including cases
49357 // (and(extract_vector_element(SVN), Y)).
49358 if (!SVN || !SVN->hasOneUse() || !SVN->isSplat() ||
49359 !SVN->getOperand(1).isUndef()) {
49360 return SDValue();
49361 }
49362 SDValue IVEN = SVN->getOperand(0);
49363 if (IVEN.getOpcode() != ISD::INSERT_VECTOR_ELT ||
49364 !IVEN.getOperand(0).isUndef() || !IVEN.hasOneUse())
49365 return SDValue();
49366 if (!isa<ConstantSDNode>(IVEN.getOperand(2)) ||
49367 IVEN.getConstantOperandAPInt(2) != SVN->getSplatIndex())
49368 return SDValue();
49369 SDValue Src = IVEN.getOperand(1);
49370 if (SDValue Not = IsNOT(Src, DAG)) {
49371 SDValue NotSrc = DAG.getBitcast(Src.getValueType(), Not);
49372 SDValue NotIVEN =
49373 DAG.getNode(ISD::INSERT_VECTOR_ELT, SDLoc(IVEN), IVEN.getValueType(),
49374 IVEN.getOperand(0), NotSrc, IVEN.getOperand(2));
49375 return DAG.getVectorShuffle(SVN->getValueType(0), SDLoc(SVN), NotIVEN,
49376 SVN->getOperand(1), SVN->getMask());
49377 }
49378 return SDValue();
49379 };
49380
49381 SDValue X, Y;
49382 SDValue N0 = N->getOperand(0);
49383 SDValue N1 = N->getOperand(1);
49384
49385 if (SDValue Not = GetNot(N0)) {
49386 X = Not;
49387 Y = N1;
49388 } else if (SDValue Not = GetNot(N1)) {
49389 X = Not;
49390 Y = N0;
49391 } else
49392 return SDValue();
49393
49394 X = DAG.getBitcast(VT, X);
49395 Y = DAG.getBitcast(VT, Y);
49396 SDLoc DL(N);
49397 // We do not split for SSE at all, but we need to split vectors for AVX1 and
49398 // AVX2.
49399 if (!Subtarget.useAVX512Regs() && VT.is512BitVector()) {
49400 SDValue LoX, HiX;
49401 std::tie(LoX, HiX) = splitVector(X, DAG, DL);
49402 SDValue LoY, HiY;
49403 std::tie(LoY, HiY) = splitVector(Y, DAG, DL);
49404 EVT SplitVT = LoX.getValueType();
49405 SDValue LoV = DAG.getNode(X86ISD::ANDNP, DL, SplitVT, {LoX, LoY});
49406 SDValue HiV = DAG.getNode(X86ISD::ANDNP, DL, SplitVT, {HiX, HiY});
49407 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, {LoV, HiV});
49408 }
49409 return DAG.getNode(X86ISD::ANDNP, DL, VT, {X, Y});
49410}
49411
49412// Try to widen AND, OR and XOR nodes to VT in order to remove casts around
49413// logical operations, like in the example below.
49414// or (and (truncate x, truncate y)),
49415// (xor (truncate z, build_vector (constants)))
49416// Given a target type \p VT, we generate
49417// or (and x, y), (xor z, zext(build_vector (constants)))
49418 // given that x, y and z are of type \p VT. We can do so if each operand is
49419 // either a truncate from VT, a vector of constants (second operand only), or
49420 // something that can itself be recursively promoted.
49421static SDValue PromoteMaskArithmetic(SDNode *N, EVT VT, SelectionDAG &DAG,
49422 unsigned Depth) {
49423 // Limit recursion to avoid excessive compile times.
49424 if (Depth >= SelectionDAG::MaxRecursionDepth)
49425 return SDValue();
49426
49427 if (N->getOpcode() != ISD::XOR && N->getOpcode() != ISD::AND &&
49428 N->getOpcode() != ISD::OR)
49429 return SDValue();
49430
49431 SDValue N0 = N->getOperand(0);
49432 SDValue N1 = N->getOperand(1);
49433 SDLoc DL(N);
49434
49435 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
49436 if (!TLI.isOperationLegalOrPromote(N->getOpcode(), VT))
49437 return SDValue();
49438
49439 if (SDValue NN0 = PromoteMaskArithmetic(N0.getNode(), VT, DAG, Depth + 1))
49440 N0 = NN0;
49441 else {
49442 // The Left side has to be a trunc.
49443 if (N0.getOpcode() != ISD::TRUNCATE)
49444 return SDValue();
49445
49446 // The type of the truncated inputs.
49447 if (N0.getOperand(0).getValueType() != VT)
49448 return SDValue();
49449
49450 N0 = N0.getOperand(0);
49451 }
49452
49453 if (SDValue NN1 = PromoteMaskArithmetic(N1.getNode(), VT, DAG, Depth + 1))
49454 N1 = NN1;
49455 else {
49456 // The right side has to be a 'trunc' or a constant vector.
49457 bool RHSTrunc = N1.getOpcode() == ISD::TRUNCATE &&
49458 N1.getOperand(0).getValueType() == VT;
49459 if (!RHSTrunc && !ISD::isBuildVectorOfConstantSDNodes(N1.getNode()))
49460 return SDValue();
49461
49462 if (RHSTrunc)
49463 N1 = N1.getOperand(0);
49464 else
49465 N1 = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, N1);
49466 }
49467
49468 return DAG.getNode(N->getOpcode(), DL, VT, N0, N1);
49469}
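
Illustrative only: the promotion above is sound because AND/OR/XOR commute with truncation, so the logic can run at the wide type and be truncated once.

  #include <cassert>
  #include <cstdint>

  int main() {
    const uint64_t Samples[] = {0ull, 0x123456789ABCDEF0ull, ~0ull};
    for (uint64_t x : Samples)
      for (uint64_t y : Samples) {
        // trunc(x) BITOP trunc(y) == trunc(x BITOP y)
        assert((uint32_t(x) & uint32_t(y)) == uint32_t(x & y));
        assert((uint32_t(x) | uint32_t(y)) == uint32_t(x | y));
        assert((uint32_t(x) ^ uint32_t(y)) == uint32_t(x ^ y));
      }
    return 0;
  }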
49470
49471// On AVX/AVX2 the type v8i1 is legalized to v8i16, which is an XMM sized
49472// register. In most cases we actually compare or select YMM-sized registers
49473// and mixing the two types creates horrible code. This method optimizes
49474// some of the transition sequences.
49475// Even with AVX-512 this is still useful for removing casts around logical
49476// operations on vXi1 mask types.
49477static SDValue PromoteMaskArithmetic(SDNode *N, SelectionDAG &DAG,
49478 const X86Subtarget &Subtarget) {
49479 EVT VT = N->getValueType(0);
49480 assert(VT.isVector() && "Expected vector type");
49481
49482 SDLoc DL(N);
49483 assert((N->getOpcode() == ISD::ANY_EXTEND ||
49484         N->getOpcode() == ISD::ZERO_EXTEND ||
49485         N->getOpcode() == ISD::SIGN_EXTEND) && "Invalid Node");
49486
49487 SDValue Narrow = N->getOperand(0);
49488 EVT NarrowVT = Narrow.getValueType();
49489
49490 // Generate the wide operation.
49491 SDValue Op = PromoteMaskArithmetic(Narrow.getNode(), VT, DAG, 0);
49492 if (!Op)
49493 return SDValue();
49494 switch (N->getOpcode()) {
49495 default: llvm_unreachable("Unexpected opcode");
49496 case ISD::ANY_EXTEND:
49497 return Op;
49498 case ISD::ZERO_EXTEND:
49499 return DAG.getZeroExtendInReg(Op, DL, NarrowVT);
49500 case ISD::SIGN_EXTEND:
49501 return DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT,
49502 Op, DAG.getValueType(NarrowVT));
49503 }
49504}
49505
49506static unsigned convertIntLogicToFPLogicOpcode(unsigned Opcode) {
49507 unsigned FPOpcode;
49508 switch (Opcode) {
49509 default: llvm_unreachable("Unexpected input node for FP logic conversion");
49510 case ISD::AND: FPOpcode = X86ISD::FAND; break;
49511 case ISD::OR: FPOpcode = X86ISD::FOR; break;
49512 case ISD::XOR: FPOpcode = X86ISD::FXOR; break;
49513 }
49514 return FPOpcode;
49515}
49516
49517/// If both input operands of a logic op are being cast from floating-point
49518/// types or FP compares, try to convert this into a floating-point logic node
49519/// to avoid unnecessary moves from SSE to integer registers.
49520static SDValue convertIntLogicToFPLogic(SDNode *N, SelectionDAG &DAG,
49521 TargetLowering::DAGCombinerInfo &DCI,
49522 const X86Subtarget &Subtarget) {
49523 EVT VT = N->getValueType(0);
49524 SDValue N0 = N->getOperand(0);
49525 SDValue N1 = N->getOperand(1);
49526 SDLoc DL(N);
49527
49528 if (!((N0.getOpcode() == ISD::BITCAST && N1.getOpcode() == ISD::BITCAST) ||
49529 (N0.getOpcode() == ISD::SETCC && N1.getOpcode() == ISD::SETCC)))
49530 return SDValue();
49531
49532 SDValue N00 = N0.getOperand(0);
49533 SDValue N10 = N1.getOperand(0);
49534 EVT N00Type = N00.getValueType();
49535 EVT N10Type = N10.getValueType();
49536
49537 // Ensure that both types are the same and are legal scalar fp types.
49538 if (N00Type != N10Type || !((Subtarget.hasSSE1() && N00Type == MVT::f32) ||
49539 (Subtarget.hasSSE2() && N00Type == MVT::f64) ||
49540 (Subtarget.hasFP16() && N00Type == MVT::f16)))
49541 return SDValue();
49542
49543 if (N0.getOpcode() == ISD::BITCAST && !DCI.isBeforeLegalizeOps()) {
49544 unsigned FPOpcode = convertIntLogicToFPLogicOpcode(N->getOpcode());
49545 SDValue FPLogic = DAG.getNode(FPOpcode, DL, N00Type, N00, N10);
49546 return DAG.getBitcast(VT, FPLogic);
49547 }
49548
49549 if (VT != MVT::i1 || N0.getOpcode() != ISD::SETCC || !N0.hasOneUse() ||
49550 !N1.hasOneUse())
49551 return SDValue();
49552
49553 ISD::CondCode CC0 = cast<CondCodeSDNode>(N0.getOperand(2))->get();
49554 ISD::CondCode CC1 = cast<CondCodeSDNode>(N1.getOperand(2))->get();
49555
49556 // The vector ISA for FP predicates is incomplete before AVX, so converting
49557 // COMIS* to CMPS* may not be a win before AVX.
49558 if (!Subtarget.hasAVX() &&
49559 !(cheapX86FSETCC_SSE(CC0) && cheapX86FSETCC_SSE(CC1)))
49560 return SDValue();
49561
49562 // Convert scalar FP compares and logic to vector compares (COMIS* to CMPS*)
49563 // and vector logic:
49564 // logic (setcc N00, N01), (setcc N10, N11) -->
49565 // extelt (logic (setcc (s2v N00), (s2v N01)), setcc (s2v N10), (s2v N11))), 0
49566 unsigned NumElts = 128 / N00Type.getSizeInBits();
49567 EVT VecVT = EVT::getVectorVT(*DAG.getContext(), N00Type, NumElts);
49568 EVT BoolVecVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1, NumElts);
49569 SDValue ZeroIndex = DAG.getVectorIdxConstant(0, DL);
49570 SDValue N01 = N0.getOperand(1);
49571 SDValue N11 = N1.getOperand(1);
49572 SDValue Vec00 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, N00);
49573 SDValue Vec01 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, N01);
49574 SDValue Vec10 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, N10);
49575 SDValue Vec11 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, N11);
49576 SDValue Setcc0 = DAG.getSetCC(DL, BoolVecVT, Vec00, Vec01, CC0);
49577 SDValue Setcc1 = DAG.getSetCC(DL, BoolVecVT, Vec10, Vec11, CC1);
49578 SDValue Logic = DAG.getNode(N->getOpcode(), DL, BoolVecVT, Setcc0, Setcc1);
49579 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Logic, ZeroIndex);
49580}
49581
49582// Attempt to fold BITOP(MOVMSK(X),MOVMSK(Y)) -> MOVMSK(BITOP(X,Y))
49583// to reduce XMM->GPR traffic.
49584static SDValue combineBitOpWithMOVMSK(SDNode *N, SelectionDAG &DAG) {
49585 unsigned Opc = N->getOpcode();
49586 assert((Opc == ISD::OR || Opc == ISD::AND || Opc == ISD::XOR) &&
49587        "Unexpected bit opcode");
49588
49589 SDValue N0 = N->getOperand(0);
49590 SDValue N1 = N->getOperand(1);
49591
49592 // Both operands must be single use MOVMSK.
49593 if (N0.getOpcode() != X86ISD::MOVMSK || !N0.hasOneUse() ||
49594 N1.getOpcode() != X86ISD::MOVMSK || !N1.hasOneUse())
49595 return SDValue();
49596
49597 SDValue Vec0 = N0.getOperand(0);
49598 SDValue Vec1 = N1.getOperand(0);
49599 EVT VecVT0 = Vec0.getValueType();
49600 EVT VecVT1 = Vec1.getValueType();
49601
49602 // Both MOVMSK operands must be from vectors of the same size and same element
49603 // size, but it's OK for an fp/int diff.
49604 if (VecVT0.getSizeInBits() != VecVT1.getSizeInBits() ||
49605 VecVT0.getScalarSizeInBits() != VecVT1.getScalarSizeInBits())
49606 return SDValue();
49607
49608 SDLoc DL(N);
49609 unsigned VecOpc =
49610 VecVT0.isFloatingPoint() ? convertIntLogicToFPLogicOpcode(Opc) : Opc;
49611 SDValue Result =
49612 DAG.getNode(VecOpc, DL, VecVT0, Vec0, DAG.getBitcast(VecVT0, Vec1));
49613 return DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Result);
49614}
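
Illustrative aside: a minimal standalone sketch of the source-level shape this MOVMSK fold targets, written with SSE intrinsics. The function names and test values are invented for illustration and are not part of the LLVM source; it assumes an SSE-capable x86-64 host.

    #include <immintrin.h>
    #include <cassert>

    // BITOP(MOVMSK(X),MOVMSK(Y)) and MOVMSK(BITOP(X,Y)) yield the same mask,
    // so the combine keeps the OR in the XMM domain and emits a single MOVMSK.
    int movmsk_then_or(__m128 a, __m128 b) {
      return _mm_movemask_ps(a) | _mm_movemask_ps(b); // two MOVMSKs + GPR OR
    }
    int or_then_movmsk(__m128 a, __m128 b) {
      return _mm_movemask_ps(_mm_or_ps(a, b));        // vector OR + one MOVMSK
    }

    int main() {
      __m128 a = _mm_set_ps(-1.0f, 2.0f, -3.0f, 4.0f);
      __m128 b = _mm_set_ps(5.0f, -6.0f, 7.0f, -8.0f);
      assert(movmsk_then_or(a, b) == or_then_movmsk(a, b));
      return 0;
    }
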
49615
49616// Attempt to fold BITOP(SHIFT(X,Z),SHIFT(Y,Z)) -> SHIFT(BITOP(X,Y),Z).
49617// NOTE: This is a very limited case of what SimplifyUsingDistributiveLaws
49618// handles in InstCombine.
49619static SDValue combineBitOpWithShift(SDNode *N, SelectionDAG &DAG) {
49620 unsigned Opc = N->getOpcode();
49621 assert((Opc == ISD::OR || Opc == ISD::AND || Opc == ISD::XOR) &&
49622 "Unexpected bit opcode");
49623
49624 SDValue N0 = N->getOperand(0);
49625 SDValue N1 = N->getOperand(1);
49626 EVT VT = N->getValueType(0);
49627
49628 // Both operands must be single use.
49629 if (!N0.hasOneUse() || !N1.hasOneUse())
49630 return SDValue();
49631
49632 // Search for matching shifts.
49633 SDValue BC0 = peekThroughOneUseBitcasts(N0);
49634 SDValue BC1 = peekThroughOneUseBitcasts(N1);
49635
49636 unsigned BCOpc = BC0.getOpcode();
49637 EVT BCVT = BC0.getValueType();
49638 if (BCOpc != BC1->getOpcode() || BCVT != BC1.getValueType())
49639 return SDValue();
49640
49641 switch (BCOpc) {
49642 case X86ISD::VSHLI:
49643 case X86ISD::VSRLI:
49644 case X86ISD::VSRAI: {
49645 if (BC0.getOperand(1) != BC1.getOperand(1))
49646 return SDValue();
49647
49648 SDLoc DL(N);
49649 SDValue BitOp =
49650 DAG.getNode(Opc, DL, BCVT, BC0.getOperand(0), BC1.getOperand(0));
49651 SDValue Shift = DAG.getNode(BCOpc, DL, BCVT, BitOp, BC0.getOperand(1));
49652 return DAG.getBitcast(VT, Shift);
49653 }
49654 }
49655
49656 return SDValue();
49657}
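
Illustrative aside: the scalar analogue of this shift/bit-op distributive fold, as a minimal sketch. The helper names are invented for illustration.

    #include <cassert>
    #include <cstdint>

    // (x << s) | (y << s) == (x | y) << s for a common shift amount; the same
    // holds for AND/XOR and for the right shifts handled by the fold above.
    uint32_t bitop_of_shifts(uint32_t x, uint32_t y, unsigned s) {
      return (x << s) | (y << s);
    }
    uint32_t shift_of_bitop(uint32_t x, uint32_t y, unsigned s) {
      return (x | y) << s;
    }

    int main() {
      assert(bitop_of_shifts(0x1234u, 0xABCDu, 5) == shift_of_bitop(0x1234u, 0xABCDu, 5));
      return 0;
    }
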
49658
49659/// If this is a zero/all-bits result that is bitwise-anded with a low bits
49660 /// mask (Mask == 1 for the x86 lowering of a SETCC + ZEXT), replace the 'and'
49661/// with a shift-right to eliminate loading the vector constant mask value.
49662static SDValue combineAndMaskToShift(SDNode *N, SelectionDAG &DAG,
49663 const X86Subtarget &Subtarget) {
49664 SDValue Op0 = peekThroughBitcasts(N->getOperand(0));
49665 SDValue Op1 = peekThroughBitcasts(N->getOperand(1));
49666 EVT VT = Op0.getValueType();
49667 if (VT != Op1.getValueType() || !VT.isSimple() || !VT.isInteger())
49668 return SDValue();
49669
49670 // Try to convert an "is positive" signbit masking operation into arithmetic
49671 // shift and "andn". This saves a materialization of a -1 vector constant.
49672 // The "is negative" variant should be handled more generally because it only
49673 // requires "and" rather than "andn":
49674 // and (pcmpgt X, -1), Y --> pandn (vsrai X, BitWidth - 1), Y
49675 //
49676 // This is limited to the original type to avoid producing even more bitcasts.
49677 // If the bitcasts can't be eliminated, then it is unlikely that this fold
49678 // will be profitable.
49679 if (N->getValueType(0) == VT &&
49680 supportedVectorShiftWithImm(VT.getSimpleVT(), Subtarget, ISD::SRA)) {
49681 SDValue X, Y;
49682 if (Op1.getOpcode() == X86ISD::PCMPGT &&
49683 isAllOnesOrAllOnesSplat(Op1.getOperand(1)) && Op1.hasOneUse()) {
49684 X = Op1.getOperand(0);
49685 Y = Op0;
49686 } else if (Op0.getOpcode() == X86ISD::PCMPGT &&
49687 isAllOnesOrAllOnesSplat(Op0.getOperand(1)) && Op0.hasOneUse()) {
49688 X = Op0.getOperand(0);
49689 Y = Op1;
49690 }
49691 if (X && Y) {
49692 SDLoc DL(N);
49693 SDValue Sra =
49694 getTargetVShiftByConstNode(X86ISD::VSRAI, DL, VT.getSimpleVT(), X,
49695 VT.getScalarSizeInBits() - 1, DAG);
49696 return DAG.getNode(X86ISD::ANDNP, DL, VT, Sra, Y);
49697 }
49698 }
49699
49700 APInt SplatVal;
49701 if (!ISD::isConstantSplatVector(Op1.getNode(), SplatVal) ||
49702 !SplatVal.isMask())
49703 return SDValue();
49704
49705 // Don't prevent creation of ANDN.
49706 if (isBitwiseNot(Op0))
49707 return SDValue();
49708
49709 if (!supportedVectorShiftWithImm(VT.getSimpleVT(), Subtarget, ISD::SRL))
49710 return SDValue();
49711
49712 unsigned EltBitWidth = VT.getScalarSizeInBits();
49713 if (EltBitWidth != DAG.ComputeNumSignBits(Op0))
49714 return SDValue();
49715
49716 SDLoc DL(N);
49717 unsigned ShiftVal = SplatVal.countr_one();
49718 SDValue ShAmt = DAG.getTargetConstant(EltBitWidth - ShiftVal, DL, MVT::i8);
49719 SDValue Shift = DAG.getNode(X86ISD::VSRLI, DL, VT, Op0, ShAmt);
49720 return DAG.getBitcast(N->getValueType(0), Shift);
49721}
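
Illustrative aside: a scalar model of the mask-to-shift idea, assuming the left operand is known to be all-zeros or all-ones (as a compare result would be). The names and the k = 3 mask are illustrative only.

    #include <cassert>
    #include <cstdint>

    // Assume 'v' is known to be all-zeros or all-ones (e.g. a compare result).
    // AND-ing with the low-bits mask 2^k - 1 then equals a logical shift right
    // by (BitWidth - k), so no mask constant has to be materialized.
    uint32_t and_with_mask(int32_t v) { return (uint32_t)v & 0x7u; }      // k = 3
    uint32_t srl_instead(int32_t v)   { return (uint32_t)v >> (32 - 3); }

    int main() {
      assert(and_with_mask(0) == srl_instead(0));
      assert(and_with_mask(-1) == srl_instead(-1));
      return 0;
    }
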
49722
49723// Get the index node from the lowered DAG of a GEP IR instruction with one
49724// indexing dimension.
49725static SDValue getIndexFromUnindexedLoad(LoadSDNode *Ld) {
49726 if (Ld->isIndexed())
49727 return SDValue();
49728
49729 SDValue Base = Ld->getBasePtr();
49730
49731 if (Base.getOpcode() != ISD::ADD)
49732 return SDValue();
49733
49734 SDValue ShiftedIndex = Base.getOperand(0);
49735
49736 if (ShiftedIndex.getOpcode() != ISD::SHL)
49737 return SDValue();
49738
49739 return ShiftedIndex.getOperand(0);
49740
49741}
49742
49743static bool hasBZHI(const X86Subtarget &Subtarget, MVT VT) {
49744 if (Subtarget.hasBMI2() && VT.isScalarInteger()) {
49745 switch (VT.getSizeInBits()) {
49746 default: return false;
49747 case 64: return Subtarget.is64Bit();
49748 case 32: return true;
49749 }
49750 }
49751 return false;
49752}
49753
49754 // This function recognizes cases where the X86 bzhi instruction can replace an
49755 // 'and-load' sequence.
49756 // When loading an integer value from an array of constants defined as
49757 // follows:
49758 //
49759 // int array[SIZE] = {0x0, 0x1, 0x3, 0x7, 0xF ..., 2^(SIZE-1) - 1}
49760 //
49761 // and then applying a bitwise AND on the result with another input, this is
49762 // equivalent to performing bzhi (zero high bits) on the input, using the
49763 // same index as the load.
49764static SDValue combineAndLoadToBZHI(SDNode *Node, SelectionDAG &DAG,
49765 const X86Subtarget &Subtarget) {
49766 MVT VT = Node->getSimpleValueType(0);
49767 SDLoc dl(Node);
49768
49769 // Check if subtarget has BZHI instruction for the node's type
49770 if (!hasBZHI(Subtarget, VT))
49771 return SDValue();
49772
49773 // Try matching the pattern for both operands.
49774 for (unsigned i = 0; i < 2; i++) {
49775 SDValue N = Node->getOperand(i);
49776 LoadSDNode *Ld = dyn_cast<LoadSDNode>(N.getNode());
49777
49778 // Bail out if the operand is not a load instruction.
49779 if (!Ld)
49780 return SDValue();
49781
49782 const Value *MemOp = Ld->getMemOperand()->getValue();
49783
49784 if (!MemOp)
49785 return SDValue();
49786
49787 if (const GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(MemOp)) {
49788 if (GlobalVariable *GV = dyn_cast<GlobalVariable>(GEP->getOperand(0))) {
49789 if (GV->isConstant() && GV->hasDefinitiveInitializer()) {
49790
49791 Constant *Init = GV->getInitializer();
49792 Type *Ty = Init->getType();
49793 if (!isa<ConstantDataArray>(Init) ||
49794 !Ty->getArrayElementType()->isIntegerTy() ||
49795 Ty->getArrayElementType()->getScalarSizeInBits() !=
49796 VT.getSizeInBits() ||
49797 Ty->getArrayNumElements() >
49798 Ty->getArrayElementType()->getScalarSizeInBits())
49799 continue;
49800
49801 // Check if the array's constant elements are suitable to our case.
49802 uint64_t ArrayElementCount = Init->getType()->getArrayNumElements();
49803 bool ConstantsMatch = true;
49804 for (uint64_t j = 0; j < ArrayElementCount; j++) {
49805 auto *Elem = cast<ConstantInt>(Init->getAggregateElement(j));
49806 if (Elem->getZExtValue() != (((uint64_t)1 << j) - 1)) {
49807 ConstantsMatch = false;
49808 break;
49809 }
49810 }
49811 if (!ConstantsMatch)
49812 continue;
49813
49814 // Do the transformation (for a 32-bit type):
49815 // -> (and (load arr[idx]), inp)
49816 // <- (and inp, (srl 0xFFFFFFFF, (sub 32, idx)))
49817 // which will be replaced with one bzhi instruction.
49818 SDValue Inp = (i == 0) ? Node->getOperand(1) : Node->getOperand(0);
49819 SDValue SizeC = DAG.getConstant(VT.getSizeInBits(), dl, MVT::i32);
49820
49821 // Get the Node which indexes into the array.
49822 SDValue Index = getIndexFromUnindexedLoad(Ld);
49823 if (!Index)
49824 return SDValue();
49825 Index = DAG.getZExtOrTrunc(Index, dl, MVT::i32);
49826
49827 SDValue Sub = DAG.getNode(ISD::SUB, dl, MVT::i32, SizeC, Index);
49828 Sub = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Sub);
49829
49830 SDValue AllOnes = DAG.getAllOnesConstant(dl, VT);
49831 SDValue LShr = DAG.getNode(ISD::SRL, dl, VT, AllOnes, Sub);
49832
49833 return DAG.getNode(ISD::AND, dl, VT, Inp, LShr);
49834 }
49835 }
49836 }
49837 }
49838 return SDValue();
49839}
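
Illustrative aside: a minimal sketch of the source pattern this BZHI combine recognizes, using the BMI2 intrinsic. It assumes a BMI2-capable target (e.g. compiled with -mbmi2) and the mask-table shape described above; the table and function names are invented.

    #include <immintrin.h>
    #include <cassert>
    #include <cstdint>

    // Constant table of low-bit masks, table[i] == (1u << i) - 1, which is the
    // initializer shape the combine checks for.
    static const uint32_t table[8] = {0x0, 0x1, 0x3, 0x7, 0xF, 0x1F, 0x3F, 0x7F};

    // 'x & table[idx]' clears the bits of x from position idx upwards; BZHI does
    // the same directly from the index, removing the memory access.
    uint32_t and_load(uint32_t x, unsigned idx)   { return x & table[idx]; }
    uint32_t bzhi_equiv(uint32_t x, unsigned idx) { return _bzhi_u32(x, idx); }

    int main() {
      for (unsigned i = 0; i < 8; ++i)
        assert(and_load(0xDEADBEEFu, i) == bzhi_equiv(0xDEADBEEFu, i));
      return 0;
    }
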
49840
49841// Look for (and (bitcast (vXi1 (concat_vectors (vYi1 setcc), undef,))), C)
49842 // Where C is a mask containing the same number of bits as the setcc and
49843 // where the setcc will zero the upper bits of the k-register for free. We can
49844 // replace the undef in the concat with 0s and remove the AND. This mainly
49845 // helps with v2i1/v4i1 setcc being cast to scalar.
49846static SDValue combineScalarAndWithMaskSetcc(SDNode *N, SelectionDAG &DAG,
49847 const X86Subtarget &Subtarget) {
49848 assert(N->getOpcode() == ISD::AND && "Unexpected opcode!");
49849
49850 EVT VT = N->getValueType(0);
49851
49852 // Make sure this is an AND with constant. We will check the value of the
49853 // constant later.
49854 auto *C1 = dyn_cast<ConstantSDNode>(N->getOperand(1));
49855 if (!C1)
49856 return SDValue();
49857
49858 // This is implied by the ConstantSDNode.
49859 assert(!VT.isVector() && "Expected scalar VT!");
49860
49861 SDValue Src = N->getOperand(0);
49862 if (!Src.hasOneUse())
49863 return SDValue();
49864
49865 // (Optionally) peek through any_extend().
49866 if (Src.getOpcode() == ISD::ANY_EXTEND) {
49867 if (!Src.getOperand(0).hasOneUse())
49868 return SDValue();
49869 Src = Src.getOperand(0);
49870 }
49871
49872 if (Src.getOpcode() != ISD::BITCAST || !Src.getOperand(0).hasOneUse())
49873 return SDValue();
49874
49875 Src = Src.getOperand(0);
49876 EVT SrcVT = Src.getValueType();
49877
49878 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
49879 if (!SrcVT.isVector() || SrcVT.getVectorElementType() != MVT::i1 ||
49880 !TLI.isTypeLegal(SrcVT))
49881 return SDValue();
49882
49883 if (Src.getOpcode() != ISD::CONCAT_VECTORS)
49884 return SDValue();
49885
49886 // We only care about the first subvector of the concat; we expect the
49887 // other subvectors to be ignored due to the AND if we make the change.
49888 SDValue SubVec = Src.getOperand(0);
49889 EVT SubVecVT = SubVec.getValueType();
49890
49891 // The RHS of the AND should be a mask with as many bits as SubVec.
49892 if (!TLI.isTypeLegal(SubVecVT) ||
49893 !C1->getAPIntValue().isMask(SubVecVT.getVectorNumElements()))
49894 return SDValue();
49895
49896 // First subvector should be a setcc with a legal result type or an
49897 // AND containing at least one setcc with a legal result type.
49898 auto IsLegalSetCC = [&](SDValue V) {
49899 if (V.getOpcode() != ISD::SETCC)
49900 return false;
49901 EVT SetccVT = V.getOperand(0).getValueType();
49902 if (!TLI.isTypeLegal(SetccVT) ||
49903 !(Subtarget.hasVLX() || SetccVT.is512BitVector()))
49904 return false;
49905 if (!(Subtarget.hasBWI() || SetccVT.getScalarSizeInBits() >= 32))
49906 return false;
49907 return true;
49908 };
49909 if (!(IsLegalSetCC(SubVec) || (SubVec.getOpcode() == ISD::AND &&
49910 (IsLegalSetCC(SubVec.getOperand(0)) ||
49911 IsLegalSetCC(SubVec.getOperand(1))))))
49912 return SDValue();
49913
49914 // We passed all the checks. Rebuild the concat_vectors with zeroes
49915 // and cast it back to VT.
49916 SDLoc dl(N);
49917 SmallVector<SDValue, 4> Ops(Src.getNumOperands(),
49918 DAG.getConstant(0, dl, SubVecVT));
49919 Ops[0] = SubVec;
49920 SDValue Concat = DAG.getNode(ISD::CONCAT_VECTORS, dl, SrcVT,
49921 Ops);
49922 EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), SrcVT.getSizeInBits());
49923 return DAG.getZExtOrTrunc(DAG.getBitcast(IntVT, Concat), dl, VT);
49924}
49925
49926static SDValue getBMIMatchingOp(unsigned Opc, SelectionDAG &DAG,
49927 SDValue OpMustEq, SDValue Op, unsigned Depth) {
49928 // We don't want to go crazy with the recursion here. This isn't a super
49929 // important optimization.
49930 static constexpr unsigned kMaxDepth = 2;
49931
49932 // Only do this re-ordering if op has one use.
49933 if (!Op.hasOneUse())
49934 return SDValue();
49935
49936 SDLoc DL(Op);
49937 // If we hit another associative op, recurse further.
49938 if (Op.getOpcode() == Opc) {
49939 // Done recursing.
49940 if (Depth++ >= kMaxDepth)
49941 return SDValue();
49942
49943 for (unsigned OpIdx = 0; OpIdx < 2; ++OpIdx)
49944 if (SDValue R =
49945 getBMIMatchingOp(Opc, DAG, OpMustEq, Op.getOperand(OpIdx), Depth))
49946 return DAG.getNode(Op.getOpcode(), DL, Op.getValueType(), R,
49947 Op.getOperand(1 - OpIdx));
49948
49949 } else if (Op.getOpcode() == ISD::SUB) {
49950 if (Opc == ISD::AND) {
49951 // BLSI: (and x, (sub 0, x))
49952 if (isNullConstant(Op.getOperand(0)) && Op.getOperand(1) == OpMustEq)
49953 return DAG.getNode(Opc, DL, Op.getValueType(), OpMustEq, Op);
49954 }
49955 // Opc must be ISD::AND or ISD::XOR
49956 // BLSR: (and x, (sub x, 1))
49957 // BLSMSK: (xor x, (sub x, 1))
49958 if (isOneConstant(Op.getOperand(1)) && Op.getOperand(0) == OpMustEq)
49959 return DAG.getNode(Opc, DL, Op.getValueType(), OpMustEq, Op);
49960
49961 } else if (Op.getOpcode() == ISD::ADD) {
49962 // Opc must be ISD::AND or ISD::XOR
49963 // BLSR: (and x, (add x, -1))
49964 // BLSMSK: (xor x, (add x, -1))
49965 if (isAllOnesConstant(Op.getOperand(1)) && Op.getOperand(0) == OpMustEq)
49966 return DAG.getNode(Opc, DL, Op.getValueType(), OpMustEq, Op);
49967 }
49968 return SDValue();
49969}
49970
49971static SDValue combineBMILogicOp(SDNode *N, SelectionDAG &DAG,
49972 const X86Subtarget &Subtarget) {
49973 EVT VT = N->getValueType(0);
49974 // Make sure this node is a candidate for BMI instructions.
49975 if (!Subtarget.hasBMI() || !VT.isScalarInteger() ||
49976 (VT != MVT::i32 && VT != MVT::i64))
49977 return SDValue();
49978
49979 assert(N->getOpcode() == ISD::AND || N->getOpcode() == ISD::XOR);
49980
49981 // Try and match LHS and RHS.
49982 for (unsigned OpIdx = 0; OpIdx < 2; ++OpIdx)
49983 if (SDValue OpMatch =
49984 getBMIMatchingOp(N->getOpcode(), DAG, N->getOperand(OpIdx),
49985 N->getOperand(1 - OpIdx), 0))
49986 return OpMatch;
49987 return SDValue();
49988}
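
Illustrative aside: the classic BMI idioms matched by getBMIMatchingOp/combineBMILogicOp, written at the source level. This is only a sketch; whether BLSR/BLSMSK/BLSI are actually selected depends on -mbmi and the one-use checks above.

    #include <cassert>
    #include <cstdint>

    uint32_t blsr(uint32_t x)   { return x & (x - 1); }  // clear lowest set bit
    uint32_t blsmsk(uint32_t x) { return x ^ (x - 1); }  // mask up to lowest set bit
    uint32_t blsi(uint32_t x)   { return x & (0u - x); } // isolate lowest set bit

    int main() {
      assert(blsr(0b1011000u)   == 0b1010000u);
      assert(blsmsk(0b1011000u) == 0b0001111u);
      assert(blsi(0b1011000u)   == 0b0001000u);
      return 0;
    }
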
49989
49990static SDValue combineAnd(SDNode *N, SelectionDAG &DAG,
49991 TargetLowering::DAGCombinerInfo &DCI,
49992 const X86Subtarget &Subtarget) {
49993 SDValue N0 = N->getOperand(0);
49994 SDValue N1 = N->getOperand(1);
49995 EVT VT = N->getValueType(0);
49996 SDLoc dl(N);
49997 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
49998
49999 // If this is SSE1 only convert to FAND to avoid scalarization.
50000 if (Subtarget.hasSSE1() && !Subtarget.hasSSE2() && VT == MVT::v4i32) {
50001 return DAG.getBitcast(MVT::v4i32,
50002 DAG.getNode(X86ISD::FAND, dl, MVT::v4f32,
50003 DAG.getBitcast(MVT::v4f32, N0),
50004 DAG.getBitcast(MVT::v4f32, N1)));
50005 }
50006
50007 // Use a 32-bit and+zext if upper bits known zero.
50008 if (VT == MVT::i64 && Subtarget.is64Bit() && !isa<ConstantSDNode>(N1)) {
50009 APInt HiMask = APInt::getHighBitsSet(64, 32);
50010 if (DAG.MaskedValueIsZero(N1, HiMask) ||
50011 DAG.MaskedValueIsZero(N0, HiMask)) {
50012 SDValue LHS = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, N0);
50013 SDValue RHS = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, N1);
50014 return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64,
50015 DAG.getNode(ISD::AND, dl, MVT::i32, LHS, RHS));
50016 }
50017 }
50018
50019 // Match all-of bool scalar reductions into a bitcast/movmsk + cmp.
50020 // TODO: Support multiple SrcOps.
50021 if (VT == MVT::i1) {
50022 SmallVector<SDValue, 2> SrcOps;
50023 SmallVector<APInt, 2> SrcPartials;
50024 if (matchScalarReduction(SDValue(N, 0), ISD::AND, SrcOps, &SrcPartials) &&
50025 SrcOps.size() == 1) {
50026 unsigned NumElts = SrcOps[0].getValueType().getVectorNumElements();
50027 EVT MaskVT = EVT::getIntegerVT(*DAG.getContext(), NumElts);
50028 SDValue Mask = combineBitcastvxi1(DAG, MaskVT, SrcOps[0], dl, Subtarget);
50029 if (!Mask && TLI.isTypeLegal(SrcOps[0].getValueType()))
50030 Mask = DAG.getBitcast(MaskVT, SrcOps[0]);
50031 if (Mask) {
50032 assert(SrcPartials[0].getBitWidth() == NumElts &&
50033 "Unexpected partial reduction mask");
50034 SDValue PartialBits = DAG.getConstant(SrcPartials[0], dl, MaskVT);
50035 Mask = DAG.getNode(ISD::AND, dl, MaskVT, Mask, PartialBits);
50036 return DAG.getSetCC(dl, MVT::i1, Mask, PartialBits, ISD::SETEQ);
50037 }
50038 }
50039 }
50040
50041 if (SDValue V = combineScalarAndWithMaskSetcc(N, DAG, Subtarget))
50042 return V;
50043
50044 if (SDValue R = combineBitOpWithMOVMSK(N, DAG))
50045 return R;
50046
50047 if (SDValue R = combineBitOpWithShift(N, DAG))
50048 return R;
50049
50050 if (SDValue FPLogic = convertIntLogicToFPLogic(N, DAG, DCI, Subtarget))
50051 return FPLogic;
50052
50053 if (SDValue R = combineAndShuffleNot(N, DAG, Subtarget))
50054 return R;
50055
50056 if (DCI.isBeforeLegalizeOps())
50057 return SDValue();
50058
50059 if (SDValue R = combineCompareEqual(N, DAG, DCI, Subtarget))
50060 return R;
50061
50062 if (SDValue R = combineAndNotIntoANDNP(N, DAG))
50063 return R;
50064
50065 if (SDValue ShiftRight = combineAndMaskToShift(N, DAG, Subtarget))
50066 return ShiftRight;
50067
50068 if (SDValue R = combineAndLoadToBZHI(N, DAG, Subtarget))
50069 return R;
50070
50071 // fold (and (mul x, c1), c2) -> (mul x, (and c1, c2))
50072 // iff c2 is an all/no-bits mask, i.e. a select-with-zero mask.
50073 // TODO: Handle PMULDQ/PMULUDQ/VPMADDWD/VPMADDUBSW?
50074 if (VT.isVector() && getTargetConstantFromNode(N1)) {
50075 unsigned Opc0 = N0.getOpcode();
50076 if ((Opc0 == ISD::MUL || Opc0 == ISD::MULHU || Opc0 == ISD::MULHS) &&
50077 getTargetConstantFromNode(N0.getOperand(1)) &&
50078 DAG.ComputeNumSignBits(N1) == VT.getScalarSizeInBits() &&
50079 N0->hasOneUse() && N0.getOperand(1)->hasOneUse()) {
50080 SDValue MaskMul = DAG.getNode(ISD::AND, dl, VT, N0.getOperand(1), N1);
50081 return DAG.getNode(Opc0, dl, VT, N0.getOperand(0), MaskMul);
50082 }
50083 }
50084
50085 // Fold AND(SRL(X,Y),1) -> SETCC(BT(X,Y), COND_B) iff Y is not a constant;
50086 // this avoids a slow variable shift (moving the shift amount to ECX etc.).
50087 if (isOneConstant(N1) && N0->hasOneUse()) {
50088 SDValue Src = N0;
50089 while ((Src.getOpcode() == ISD::ZERO_EXTEND ||
50090 Src.getOpcode() == ISD::TRUNCATE) &&
50091 Src.getOperand(0)->hasOneUse())
50092 Src = Src.getOperand(0);
50093 bool ContainsNOT = false;
50094 X86::CondCode X86CC = X86::COND_B;
50095 // Peek through AND(NOT(SRL(X,Y)),1).
50096 if (isBitwiseNot(Src)) {
50097 Src = Src.getOperand(0);
50098 X86CC = X86::COND_AE;
50099 ContainsNOT = true;
50100 }
50101 if (Src.getOpcode() == ISD::SRL &&
50102 !isa<ConstantSDNode>(Src.getOperand(1))) {
50103 SDValue BitNo = Src.getOperand(1);
50104 Src = Src.getOperand(0);
50105 // Peek through AND(SRL(NOT(X),Y),1).
50106 if (isBitwiseNot(Src)) {
50107 Src = Src.getOperand(0);
50108 X86CC = X86CC == X86::COND_AE ? X86::COND_B : X86::COND_AE;
50109 ContainsNOT = true;
50110 }
50111 // If we have BMI2 then SHRX should be faster for i32/i64 cases.
50112 if (!(Subtarget.hasBMI2() && !ContainsNOT && VT.getSizeInBits() >= 32))
50113 if (SDValue BT = getBT(Src, BitNo, dl, DAG))
50114 return DAG.getZExtOrTrunc(getSETCC(X86CC, BT, dl, DAG), dl, VT);
50115 }
50116 }
50117
50118 if (VT.isVector() && (VT.getScalarSizeInBits() % 8) == 0) {
50119 // Attempt to recursively combine a bitmask AND with shuffles.
50120 SDValue Op(N, 0);
50121 if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
50122 return Res;
50123
50124 // If either operand is a constant mask, then only the elements that aren't
50125 // zero are actually demanded by the other operand.
50126 auto GetDemandedMasks = [&](SDValue Op) {
50127 APInt UndefElts;
50128 SmallVector<APInt> EltBits;
50129 int NumElts = VT.getVectorNumElements();
50130 int EltSizeInBits = VT.getScalarSizeInBits();
50131 APInt DemandedBits = APInt::getAllOnes(EltSizeInBits);
50132 APInt DemandedElts = APInt::getAllOnes(NumElts);
50133 if (getTargetConstantBitsFromNode(Op, EltSizeInBits, UndefElts,
50134 EltBits)) {
50135 DemandedBits.clearAllBits();
50136 DemandedElts.clearAllBits();
50137 for (int I = 0; I != NumElts; ++I) {
50138 if (UndefElts[I]) {
50139 // We can't assume an undef src element gives an undef dst - the
50140 // other src might be zero.
50141 DemandedBits.setAllBits();
50142 DemandedElts.setBit(I);
50143 } else if (!EltBits[I].isZero()) {
50144 DemandedBits |= EltBits[I];
50145 DemandedElts.setBit(I);
50146 }
50147 }
50148 }
50149 return std::make_pair(DemandedBits, DemandedElts);
50150 };
50151 APInt Bits0, Elts0;
50152 APInt Bits1, Elts1;
50153 std::tie(Bits0, Elts0) = GetDemandedMasks(N1);
50154 std::tie(Bits1, Elts1) = GetDemandedMasks(N0);
50155
50156 if (TLI.SimplifyDemandedVectorElts(N0, Elts0, DCI) ||
50157 TLI.SimplifyDemandedVectorElts(N1, Elts1, DCI) ||
50158 TLI.SimplifyDemandedBits(N0, Bits0, Elts0, DCI) ||
50159 TLI.SimplifyDemandedBits(N1, Bits1, Elts1, DCI)) {
50160 if (N->getOpcode() != ISD::DELETED_NODE)
50161 DCI.AddToWorklist(N);
50162 return SDValue(N, 0);
50163 }
50164
50165 SDValue NewN0 = TLI.SimplifyMultipleUseDemandedBits(N0, Bits0, Elts0, DAG);
50166 SDValue NewN1 = TLI.SimplifyMultipleUseDemandedBits(N1, Bits1, Elts1, DAG);
50167 if (NewN0 || NewN1)
50168 return DAG.getNode(ISD::AND, dl, VT, NewN0 ? NewN0 : N0,
50169 NewN1 ? NewN1 : N1);
50170 }
50171
50172 // Attempt to combine a scalar bitmask AND with an extracted shuffle.
50173 if ((VT.getScalarSizeInBits() % 8) == 0 &&
50174 N0.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
50175 isa<ConstantSDNode>(N0.getOperand(1))) {
50176 SDValue BitMask = N1;
50177 SDValue SrcVec = N0.getOperand(0);
50178 EVT SrcVecVT = SrcVec.getValueType();
50179
50180 // Check that the constant bitmask masks whole bytes.
50181 APInt UndefElts;
50182 SmallVector<APInt, 64> EltBits;
50183 if (VT == SrcVecVT.getScalarType() && N0->isOnlyUserOf(SrcVec.getNode()) &&
50184 getTargetConstantBitsFromNode(BitMask, 8, UndefElts, EltBits) &&
50185 llvm::all_of(EltBits, [](const APInt &M) {
50186 return M.isZero() || M.isAllOnes();
50187 })) {
50188 unsigned NumElts = SrcVecVT.getVectorNumElements();
50189 unsigned Scale = SrcVecVT.getScalarSizeInBits() / 8;
50190 unsigned Idx = N0.getConstantOperandVal(1);
50191
50192 // Create a root shuffle mask from the byte mask and the extracted index.
50193 SmallVector<int, 16> ShuffleMask(NumElts * Scale, SM_SentinelUndef);
50194 for (unsigned i = 0; i != Scale; ++i) {
50195 if (UndefElts[i])
50196 continue;
50197 int VecIdx = Scale * Idx + i;
50198 ShuffleMask[VecIdx] = EltBits[i].isZero() ? SM_SentinelZero : VecIdx;
50199 }
50200
50201 if (SDValue Shuffle = combineX86ShufflesRecursively(
50202 {SrcVec}, 0, SrcVec, ShuffleMask, {}, /*Depth*/ 1,
50203 X86::MaxShuffleCombineDepth,
50204 /*HasVarMask*/ false, /*AllowVarCrossLaneMask*/ true,
50205 /*AllowVarPerLaneMask*/ true, DAG, Subtarget))
50206 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Shuffle,
50207 N0.getOperand(1));
50208 }
50209 }
50210
50211 if (SDValue R = combineBMILogicOp(N, DAG, Subtarget))
50212 return R;
50213
50214 return SDValue();
50215}
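
Illustrative aside: the source-level shape of the AND(SRL(X,Y),1) -> BT fold handled above, i.e. testing a single variable bit. A sketch with invented names; whether BT+SETCC is actually emitted depends on the subtarget checks in the code.

    #include <cassert>
    #include <cstdint>

    // Testing one variable bit: the backend can use BT instead of a variable
    // shift whose amount would have to be moved into ECX.
    bool test_bit(uint32_t x, unsigned y) { return (x >> y) & 1u; }

    int main() {
      assert(test_bit(0b1010u, 1) == true);
      assert(test_bit(0b1010u, 2) == false);
      return 0;
    }
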
50216
50217// Canonicalize OR(AND(X,C),AND(Y,~C)) -> OR(AND(X,C),ANDNP(C,Y))
50218static SDValue canonicalizeBitSelect(SDNode *N, SelectionDAG &DAG,
50219 const X86Subtarget &Subtarget) {
50220 assert(N->getOpcode() == ISD::OR && "Unexpected Opcode");
50221
50222 MVT VT = N->getSimpleValueType(0);
50223 unsigned EltSizeInBits = VT.getScalarSizeInBits();
50224 if (!VT.isVector() || (EltSizeInBits % 8) != 0)
50225 return SDValue();
50226
50227 SDValue N0 = peekThroughBitcasts(N->getOperand(0));
50228 SDValue N1 = peekThroughBitcasts(N->getOperand(1));
50229 if (N0.getOpcode() != ISD::AND || N1.getOpcode() != ISD::AND)
50230 return SDValue();
50231
50232 // On XOP we'll lower to PCMOV so accept one use. With AVX512, we can use
50233 // VPTERNLOG. Otherwise only do this if either mask has multiple uses already.
50234 if (!(Subtarget.hasXOP() || useVPTERNLOG(Subtarget, VT) ||
50235 !N0.getOperand(1).hasOneUse() || !N1.getOperand(1).hasOneUse()))
50236 return SDValue();
50237
50238 // Attempt to extract constant byte masks.
50239 APInt UndefElts0, UndefElts1;
50240 SmallVector<APInt, 32> EltBits0, EltBits1;
50241 if (!getTargetConstantBitsFromNode(N0.getOperand(1), 8, UndefElts0, EltBits0,
50242 false, false))
50243 return SDValue();
50244 if (!getTargetConstantBitsFromNode(N1.getOperand(1), 8, UndefElts1, EltBits1,
50245 false, false))
50246 return SDValue();
50247
50248 for (unsigned i = 0, e = EltBits0.size(); i != e; ++i) {
50249 // TODO - add UNDEF elts support.
50250 if (UndefElts0[i] || UndefElts1[i])
50251 return SDValue();
50252 if (EltBits0[i] != ~EltBits1[i])
50253 return SDValue();
50254 }
50255
50256 SDLoc DL(N);
50257
50258 if (useVPTERNLOG(Subtarget, VT)) {
50259 // Emit a VPTERNLOG node directly - 0xCA is the imm code for A?B:C.
50260 // VPTERNLOG is only available for vXi32/vXi64 types.
50261 MVT OpSVT = EltSizeInBits == 32 ? MVT::i32 : MVT::i64;
50262 MVT OpVT =
50263 MVT::getVectorVT(OpSVT, VT.getSizeInBits() / OpSVT.getSizeInBits());
50264 SDValue A = DAG.getBitcast(OpVT, N0.getOperand(1));
50265 SDValue B = DAG.getBitcast(OpVT, N0.getOperand(0));
50266 SDValue C = DAG.getBitcast(OpVT, N1.getOperand(0));
50267 SDValue Imm = DAG.getTargetConstant(0xCA, DL, MVT::i8);
50268 SDValue Res = getAVX512Node(X86ISD::VPTERNLOG, DL, OpVT, {A, B, C, Imm},
50269 DAG, Subtarget);
50270 return DAG.getBitcast(VT, Res);
50271 }
50272
50273 SDValue X = N->getOperand(0);
50274 SDValue Y =
50275 DAG.getNode(X86ISD::ANDNP, DL, VT, DAG.getBitcast(VT, N0.getOperand(1)),
50276 DAG.getBitcast(VT, N1.getOperand(0)));
50277 return DAG.getNode(ISD::OR, DL, VT, X, Y);
50278}
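
Illustrative aside: the VPTERNLOG immediate is an 8-entry truth table indexed by (a<<2)|(b<<1)|c. A small self-contained check that the bit-select function A?B:C really encodes as the 0xCA constant used above; the helper name is invented.

    #include <cassert>
    #include <cstdint>

    // Bit ((a << 2) | (b << 1) | c) of the VPTERNLOG immediate holds f(a,b,c).
    uint8_t ternlogImm(bool (*f)(bool, bool, bool)) {
      uint8_t imm = 0;
      for (int a = 0; a < 2; ++a)
        for (int b = 0; b < 2; ++b)
          for (int c = 0; c < 2; ++c)
            if (f(a, b, c))
              imm |= (uint8_t)(1u << ((a << 2) | (b << 1) | c));
      return imm;
    }

    int main() {
      assert(ternlogImm([](bool a, bool b, bool c) { return a ? b : c; }) == 0xCA);
      return 0;
    }
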
50279
50280// Try to match OR(AND(~MASK,X),AND(MASK,Y)) logic pattern.
50281static bool matchLogicBlend(SDNode *N, SDValue &X, SDValue &Y, SDValue &Mask) {
50282 if (N->getOpcode() != ISD::OR)
50283 return false;
50284
50285 SDValue N0 = N->getOperand(0);
50286 SDValue N1 = N->getOperand(1);
50287
50288 // Canonicalize AND to LHS.
50289 if (N1.getOpcode() == ISD::AND)
50290 std::swap(N0, N1);
50291
50292 // Attempt to match OR(AND(M,Y),ANDNP(M,X)).
50293 if (N0.getOpcode() != ISD::AND || N1.getOpcode() != X86ISD::ANDNP)
50294 return false;
50295
50296 Mask = N1.getOperand(0);
50297 X = N1.getOperand(1);
50298
50299 // Check to see if the mask appeared in both the AND and ANDNP.
50300 if (N0.getOperand(0) == Mask)
50301 Y = N0.getOperand(1);
50302 else if (N0.getOperand(1) == Mask)
50303 Y = N0.getOperand(0);
50304 else
50305 return false;
50306
50307 // TODO: Attempt to match against AND(XOR(-1,M),Y) as well, waiting for
50308 // ANDNP combine allows other combines to happen that prevent matching.
50309 return true;
50310}
50311
50312// Try to fold:
50313// (or (and (m, y), (pandn m, x)))
50314// into:
50315// (vselect m, x, y)
50316// As a special case, try to fold:
50317// (or (and (m, (sub 0, x)), (pandn m, x)))
50318// into:
50319// (sub (xor X, M), M)
50320static SDValue combineLogicBlendIntoPBLENDV(SDNode *N, SelectionDAG &DAG,
50321 const X86Subtarget &Subtarget) {
50322 assert(N->getOpcode() == ISD::OR && "Unexpected Opcode");
50323
50324 EVT VT = N->getValueType(0);
50325 if (!((VT.is128BitVector() && Subtarget.hasSSE2()) ||
50326 (VT.is256BitVector() && Subtarget.hasInt256())))
50327 return SDValue();
50328
50329 SDValue X, Y, Mask;
50330 if (!matchLogicBlend(N, X, Y, Mask))
50331 return SDValue();
50332
50333 // Validate that X, Y, and Mask are bitcasts, and see through them.
50334 Mask = peekThroughBitcasts(Mask);
50335 X = peekThroughBitcasts(X);
50336 Y = peekThroughBitcasts(Y);
50337
50338 EVT MaskVT = Mask.getValueType();
50339 unsigned EltBits = MaskVT.getScalarSizeInBits();
50340
50341 // TODO: Attempt to handle floating point cases as well?
50342 if (!MaskVT.isInteger() || DAG.ComputeNumSignBits(Mask) != EltBits)
50343 return SDValue();
50344
50345 SDLoc DL(N);
50346
50347 // Attempt to combine to conditional negate: (sub (xor X, M), M)
50348 if (SDValue Res = combineLogicBlendIntoConditionalNegate(VT, Mask, X, Y, DL,
50349 DAG, Subtarget))
50350 return Res;
50351
50352 // PBLENDVB is only available on SSE 4.1.
50353 if (!Subtarget.hasSSE41())
50354 return SDValue();
50355
50356 // If we have VPTERNLOG we should prefer that since PBLENDVB is multiple uops.
50357 if (Subtarget.hasVLX())
50358 return SDValue();
50359
50360 MVT BlendVT = VT.is256BitVector() ? MVT::v32i8 : MVT::v16i8;
50361
50362 X = DAG.getBitcast(BlendVT, X);
50363 Y = DAG.getBitcast(BlendVT, Y);
50364 Mask = DAG.getBitcast(BlendVT, Mask);
50365 Mask = DAG.getSelect(DL, BlendVT, Mask, Y, X);
50366 return DAG.getBitcast(VT, Mask);
50367}
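
Illustrative aside: a scalar model of the conditional-negate special case mentioned above, assuming the mask is all-zeros or all-ones per element. The function names are invented for illustration.

    #include <cassert>
    #include <cstdint>

    // With m being all-zeros or all-ones, (m & -x) | (~m & x) selects between
    // -x and x, and (x ^ m) - m computes the same thing without a blend.
    int32_t blend_form(int32_t m, int32_t x)   { return (m & -x) | (~m & x); }
    int32_t xor_sub_form(int32_t m, int32_t x) { return (x ^ m) - m; }

    int main() {
      for (int32_t x : {0, 1, -7, 12345}) {
        assert(blend_form(0, x) == xor_sub_form(0, x));
        assert(blend_form(-1, x) == xor_sub_form(-1, x));
      }
      return 0;
    }
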
50368
50369// Helper function for combineOrCmpEqZeroToCtlzSrl
50370// Transforms:
50371// seteq(cmp x, 0)
50372// into:
50373// srl(ctlz x), log2(bitsize(x))
50374// Input pattern is checked by caller.
50375static SDValue lowerX86CmpEqZeroToCtlzSrl(SDValue Op, SelectionDAG &DAG) {
50376 SDValue Cmp = Op.getOperand(1);
50377 EVT VT = Cmp.getOperand(0).getValueType();
50378 unsigned Log2b = Log2_32(VT.getSizeInBits());
50379 SDLoc dl(Op);
50380 SDValue Clz = DAG.getNode(ISD::CTLZ, dl, VT, Cmp->getOperand(0));
50381 // The result of the shift is true or false, and on X86, the 32-bit
50382 // encoding of shr and lzcnt is more desirable.
50383 SDValue Trunc = DAG.getZExtOrTrunc(Clz, dl, MVT::i32);
50384 SDValue Scc = DAG.getNode(ISD::SRL, dl, MVT::i32, Trunc,
50385 DAG.getConstant(Log2b, dl, MVT::i8));
50386 return Scc;
50387}
50388
50389// Try to transform:
50390// zext(or(setcc(eq, (cmp x, 0)), setcc(eq, (cmp y, 0))))
50391// into:
50392 // srl(or(ctlz(x), ctlz(y)), log2(bitsize(x)))
50393// Will also attempt to match more generic cases, eg:
50394// zext(or(or(setcc(eq, cmp 0), setcc(eq, cmp 0)), setcc(eq, cmp 0)))
50395// Only applies if the target supports the FastLZCNT feature.
50396static SDValue combineOrCmpEqZeroToCtlzSrl(SDNode *N, SelectionDAG &DAG,
50397 TargetLowering::DAGCombinerInfo &DCI,
50398 const X86Subtarget &Subtarget) {
50399 if (DCI.isBeforeLegalize() || !Subtarget.getTargetLowering()->isCtlzFast())
50400 return SDValue();
50401
50402 auto isORCandidate = [](SDValue N) {
50403 return (N->getOpcode() == ISD::OR && N->hasOneUse());
50404 };
50405
50406 // Check the zero extend is extending to 32-bit or more. The code generated by
50407 // srl(ctlz) for 16-bit or less variants of the pattern would require extra
50408 // instructions to clear the upper bits.
50409 if (!N->hasOneUse() || !N->getSimpleValueType(0).bitsGE(MVT::i32) ||
50410 !isORCandidate(N->getOperand(0)))
50411 return SDValue();
50412
50413 // Check the node matches: setcc(eq, cmp 0)
50414 auto isSetCCCandidate = [](SDValue N) {
50415 return N->getOpcode() == X86ISD::SETCC && N->hasOneUse() &&
50416 X86::CondCode(N->getConstantOperandVal(0)) == X86::COND_E &&
50417 N->getOperand(1).getOpcode() == X86ISD::CMP &&
50418 isNullConstant(N->getOperand(1).getOperand(1)) &&
50419 N->getOperand(1).getValueType().bitsGE(MVT::i32);
50420 };
50421
50422 SDNode *OR = N->getOperand(0).getNode();
50423 SDValue LHS = OR->getOperand(0);
50424 SDValue RHS = OR->getOperand(1);
50425
50426 // Save nodes matching or(or, setcc(eq, cmp 0)).
50427 SmallVector<SDNode *, 2> ORNodes;
50428 while (((isORCandidate(LHS) && isSetCCCandidate(RHS)) ||
50429 (isORCandidate(RHS) && isSetCCCandidate(LHS)))) {
50430 ORNodes.push_back(OR);
50431 OR = (LHS->getOpcode() == ISD::OR) ? LHS.getNode() : RHS.getNode();
50432 LHS = OR->getOperand(0);
50433 RHS = OR->getOperand(1);
50434 }
50435
50436 // The last OR node should match or(setcc(eq, cmp 0), setcc(eq, cmp 0)).
50437 if (!(isSetCCCandidate(LHS) && isSetCCCandidate(RHS)) ||
50438 !isORCandidate(SDValue(OR, 0)))
50439 return SDValue();
50440
50441 // We have an or(setcc(eq, cmp 0), setcc(eq, cmp 0)) pattern, try to lower it
50442 // to
50443 // or(srl(ctlz),srl(ctlz)).
50444 // The dag combiner can then fold it into:
50445 // srl(or(ctlz, ctlz)).
50446 SDValue NewLHS = lowerX86CmpEqZeroToCtlzSrl(LHS, DAG);
50447 SDValue Ret, NewRHS;
50448 if (NewLHS && (NewRHS = lowerX86CmpEqZeroToCtlzSrl(RHS, DAG)))
50449 Ret = DAG.getNode(ISD::OR, SDLoc(OR), MVT::i32, NewLHS, NewRHS);
50450
50451 if (!Ret)
50452 return SDValue();
50453
50454 // Try to lower nodes matching the or(or, setcc(eq, cmp 0)) pattern.
50455 while (!ORNodes.empty()) {
50456 OR = ORNodes.pop_back_val();
50457 LHS = OR->getOperand(0);
50458 RHS = OR->getOperand(1);
50459 // Swap rhs with lhs to match or(setcc(eq, cmp, 0), or).
50460 if (RHS->getOpcode() == ISD::OR)
50461 std::swap(LHS, RHS);
50462 NewRHS = lowerX86CmpEqZeroToCtlzSrl(RHS, DAG);
50463 if (!NewRHS)
50464 return SDValue();
50465 Ret = DAG.getNode(ISD::OR, SDLoc(OR), MVT::i32, Ret, NewRHS);
50466 }
50467
50468 return DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), N->getValueType(0), Ret);
50469}
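
Illustrative aside: why the ctlz/srl rewrite works for 32-bit values: lzcnt(x) == 32 exactly when x == 0, so bit 5 of the leading-zero count is the zero test, and OR-ing several counts lets the tests share one shift. A sketch using C++20's std::countl_zero; the real combine additionally requires a FastLZCNT subtarget, and the names below are invented.

    #include <bit>
    #include <cassert>
    #include <cstdint>

    bool either_is_zero_setcc(uint32_t x, uint32_t y) { return x == 0 || y == 0; }
    bool either_is_zero_lzcnt(uint32_t x, uint32_t y) {
      unsigned lx = std::countl_zero(x); // 32 iff x == 0
      unsigned ly = std::countl_zero(y); // 32 iff y == 0
      return ((lx | ly) >> 5) != 0;      // bit 5 set iff either count is 32
    }

    int main() {
      for (uint32_t x : {0u, 1u, 0x80000000u})
        for (uint32_t y : {0u, 7u})
          assert(either_is_zero_setcc(x, y) == either_is_zero_lzcnt(x, y));
      return 0;
    }
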
50470
50471static SDValue foldMaskedMergeImpl(SDValue And0_L, SDValue And0_R,
50472 SDValue And1_L, SDValue And1_R,
50473 const SDLoc &DL, SelectionDAG &DAG) {
50474 if (!isBitwiseNot(And0_L, true) || !And0_L->hasOneUse())
50475 return SDValue();
50476 SDValue NotOp = And0_L->getOperand(0);
50477 if (NotOp == And1_R)
50478 std::swap(And1_R, And1_L);
50479 if (NotOp != And1_L)
50480 return SDValue();
50481
50482 // (~(NotOp) & And0_R) | (NotOp & And1_R)
50483 // --> ((And0_R ^ And1_R) & NotOp) ^ And1_R
50484 EVT VT = And1_L->getValueType(0);
50485 SDValue Freeze_And0_R = DAG.getNode(ISD::FREEZE, SDLoc(), VT, And0_R);
50486 SDValue Xor0 = DAG.getNode(ISD::XOR, DL, VT, And1_R, Freeze_And0_R);
50487 SDValue And = DAG.getNode(ISD::AND, DL, VT, Xor0, NotOp);
50488 SDValue Xor1 = DAG.getNode(ISD::XOR, DL, VT, And, Freeze_And0_R);
50489 return Xor1;
50490}
50491
50492/// Fold "masked merge" expressions like `(m & x) | (~m & y)` into the
50493 /// equivalent `((x ^ y) & m) ^ y` pattern.
50494/// This is typically a better representation for targets without a fused
50495/// "and-not" operation. This function is intended to be called from a
50496/// `TargetLowering::PerformDAGCombine` callback on `ISD::OR` nodes.
50497static SDValue foldMaskedMerge(SDNode *Node, SelectionDAG &DAG) {
50498 // Note that masked-merge variants using XOR or ADD expressions are
50499 // normalized to OR by InstCombine so we only check for OR.
50500 assert(Node->getOpcode() == ISD::OR && "Must be called with ISD::OR node");
50501 SDValue N0 = Node->getOperand(0);
50502 if (N0->getOpcode() != ISD::AND || !N0->hasOneUse())
50503 return SDValue();
50504 SDValue N1 = Node->getOperand(1);
50505 if (N1->getOpcode() != ISD::AND || !N1->hasOneUse())
50506 return SDValue();
50507
50508 SDLoc DL(Node);
50509 SDValue N00 = N0->getOperand(0);
50510 SDValue N01 = N0->getOperand(1);
50511 SDValue N10 = N1->getOperand(0);
50512 SDValue N11 = N1->getOperand(1);
50513 if (SDValue Result = foldMaskedMergeImpl(N00, N01, N10, N11, DL, DAG))
50514 return Result;
50515 if (SDValue Result = foldMaskedMergeImpl(N01, N00, N10, N11, DL, DAG))
50516 return Result;
50517 if (SDValue Result = foldMaskedMergeImpl(N10, N11, N00, N01, DL, DAG))
50518 return Result;
50519 if (SDValue Result = foldMaskedMergeImpl(N11, N10, N00, N01, DL, DAG))
50520 return Result;
50521 return SDValue();
50522}
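
Illustrative aside: a quick check of the masked-merge identity used above, (m & x) | (~m & y) == ((x ^ y) & m) ^ y. The function names are invented.

    #include <cassert>
    #include <cstdint>

    uint32_t merge_andnot(uint32_t m, uint32_t x, uint32_t y) {
      return (m & x) | (~m & y);          // needs an and-not
    }
    uint32_t merge_xor(uint32_t m, uint32_t x, uint32_t y) {
      return ((x ^ y) & m) ^ y;           // plain AND/XOR only
    }

    int main() {
      assert(merge_andnot(0xF0F0F0F0u, 0x12345678u, 0x9ABCDEF0u) ==
             merge_xor(0xF0F0F0F0u, 0x12345678u, 0x9ABCDEF0u));
      return 0;
    }
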
50523
50524/// If this is an add or subtract where one operand is produced by a cmp+setcc,
50525/// then try to convert it to an ADC or SBB. This replaces TEST+SET+{ADD/SUB}
50526/// with CMP+{ADC, SBB}.
50527/// Also try (ADD/SUB)+(AND(SRL,1)) bit extraction pattern with BT+{ADC, SBB}.
50528static SDValue combineAddOrSubToADCOrSBB(bool IsSub, const SDLoc &DL, EVT VT,
50529 SDValue X, SDValue Y,
50530 SelectionDAG &DAG,
50531 bool ZeroSecondOpOnly = false) {
50532 if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
50533 return SDValue();
50534
50535 // Look through a one-use zext.
50536 if (Y.getOpcode() == ISD::ZERO_EXTEND && Y.hasOneUse())
50537 Y = Y.getOperand(0);
50538
50539 X86::CondCode CC;
50540 SDValue EFLAGS;
50541 if (Y.getOpcode() == X86ISD::SETCC && Y.hasOneUse()) {
50542 CC = (X86::CondCode)Y.getConstantOperandVal(0);
50543 EFLAGS = Y.getOperand(1);
50544 } else if (Y.getOpcode() == ISD::AND && isOneConstant(Y.getOperand(1)) &&
50545 Y.hasOneUse()) {
50546 EFLAGS = LowerAndToBT(Y, ISD::SETNE, DL, DAG, CC);
50547 }
50548
50549 if (!EFLAGS)
50550 return SDValue();
50551
50552 // If X is -1 or 0, then we have an opportunity to avoid constants required in
50553 // the general case below.
50554 auto *ConstantX = dyn_cast<ConstantSDNode>(X);
50555 if (ConstantX && !ZeroSecondOpOnly) {
50556 if ((!IsSub && CC == X86::COND_AE && ConstantX->isAllOnes()) ||
50557 (IsSub && CC == X86::COND_B && ConstantX->isZero())) {
50558 // This is a complicated way to get -1 or 0 from the carry flag:
50559 // -1 + SETAE --> -1 + (!CF) --> CF ? -1 : 0 --> SBB %eax, %eax
50560 // 0 - SETB --> 0 - (CF) --> CF ? -1 : 0 --> SBB %eax, %eax
50561 return DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
50562 DAG.getTargetConstant(X86::COND_B, DL, MVT::i8),
50563 EFLAGS);
50564 }
50565
50566 if ((!IsSub && CC == X86::COND_BE && ConstantX->isAllOnes()) ||
50567 (IsSub && CC == X86::COND_A && ConstantX->isZero())) {
50568 if (EFLAGS.getOpcode() == X86ISD::SUB && EFLAGS.hasOneUse() &&
50569 EFLAGS.getValueType().isInteger() &&
50570 !isa<ConstantSDNode>(EFLAGS.getOperand(1))) {
50571 // Swap the operands of a SUB, and we have the same pattern as above.
50572 // -1 + SETBE (SUB A, B) --> -1 + SETAE (SUB B, A) --> SUB + SBB
50573 // 0 - SETA (SUB A, B) --> 0 - SETB (SUB B, A) --> SUB + SBB
50574 SDValue NewSub = DAG.getNode(
50575 X86ISD::SUB, SDLoc(EFLAGS), EFLAGS.getNode()->getVTList(),
50576 EFLAGS.getOperand(1), EFLAGS.getOperand(0));
50577 SDValue NewEFLAGS = SDValue(NewSub.getNode(), EFLAGS.getResNo());
50578 return DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
50579 DAG.getTargetConstant(X86::COND_B, DL, MVT::i8),
50580 NewEFLAGS);
50581 }
50582 }
50583 }
50584
50585 if (CC == X86::COND_B) {
50586 // X + SETB Z --> adc X, 0
50587 // X - SETB Z --> sbb X, 0
50588 return DAG.getNode(IsSub ? X86ISD::SBB : X86ISD::ADC, DL,
50589 DAG.getVTList(VT, MVT::i32), X,
50590 DAG.getConstant(0, DL, VT), EFLAGS);
50591 }
50592
50593 if (ZeroSecondOpOnly)
50594 return SDValue();
50595
50596 if (CC == X86::COND_A) {
50597 // Try to convert COND_A into COND_B in an attempt to facilitate
50598 // materializing "setb reg".
50599 //
50600 // Do not flip "e > c", where "c" is a constant, because the Cmp instruction
50601 // cannot take an immediate as its first operand.
50602 //
50603 if (EFLAGS.getOpcode() == X86ISD::SUB && EFLAGS.getNode()->hasOneUse() &&
50604 EFLAGS.getValueType().isInteger() &&
50605 !isa<ConstantSDNode>(EFLAGS.getOperand(1))) {
50606 SDValue NewSub =
50607 DAG.getNode(X86ISD::SUB, SDLoc(EFLAGS), EFLAGS.getNode()->getVTList(),
50608 EFLAGS.getOperand(1), EFLAGS.getOperand(0));
50609 SDValue NewEFLAGS = NewSub.getValue(EFLAGS.getResNo());
50610 return DAG.getNode(IsSub ? X86ISD::SBB : X86ISD::ADC, DL,
50611 DAG.getVTList(VT, MVT::i32), X,
50612 DAG.getConstant(0, DL, VT), NewEFLAGS);
50613 }
50614 }
50615
50616 if (CC == X86::COND_AE) {
50617 // X + SETAE --> sbb X, -1
50618 // X - SETAE --> adc X, -1
50619 return DAG.getNode(IsSub ? X86ISD::ADC : X86ISD::SBB, DL,
50620 DAG.getVTList(VT, MVT::i32), X,
50621 DAG.getConstant(-1, DL, VT), EFLAGS);
50622 }
50623
50624 if (CC == X86::COND_BE) {
50625 // X + SETBE --> sbb X, -1
50626 // X - SETBE --> adc X, -1
50627 // Try to convert COND_BE into COND_AE in an attempt to facilitate
50628 // materializing "setae reg".
50629 //
50630 // Do not flip "e <= c", where "c" is a constant, because the Cmp instruction
50631 // cannot take an immediate as its first operand.
50632 //
50633 if (EFLAGS.getOpcode() == X86ISD::SUB && EFLAGS.getNode()->hasOneUse() &&
50634 EFLAGS.getValueType().isInteger() &&
50635 !isa<ConstantSDNode>(EFLAGS.getOperand(1))) {
50636 SDValue NewSub =
50637 DAG.getNode(X86ISD::SUB, SDLoc(EFLAGS), EFLAGS.getNode()->getVTList(),
50638 EFLAGS.getOperand(1), EFLAGS.getOperand(0));
50639 SDValue NewEFLAGS = NewSub.getValue(EFLAGS.getResNo());
50640 return DAG.getNode(IsSub ? X86ISD::ADC : X86ISD::SBB, DL,
50641 DAG.getVTList(VT, MVT::i32), X,
50642 DAG.getConstant(-1, DL, VT), NewEFLAGS);
50643 }
50644 }
50645
50646 if (CC != X86::COND_E && CC != X86::COND_NE)
50647 return SDValue();
50648
50649 if (EFLAGS.getOpcode() != X86ISD::CMP || !EFLAGS.hasOneUse() ||
50650 !X86::isZeroNode(EFLAGS.getOperand(1)) ||
50651 !EFLAGS.getOperand(0).getValueType().isInteger())
50652 return SDValue();
50653
50654 SDValue Z = EFLAGS.getOperand(0);
50655 EVT ZVT = Z.getValueType();
50656
50657 // If X is -1 or 0, then we have an opportunity to avoid constants required in
50658 // the general case below.
50659 if (ConstantX) {
50660 // 'neg' sets the carry flag when Z != 0, so create 0 or -1 using 'sbb' with
50661 // fake operands:
50662 // 0 - (Z != 0) --> sbb %eax, %eax, (neg Z)
50663 // -1 + (Z == 0) --> sbb %eax, %eax, (neg Z)
50664 if ((IsSub && CC == X86::COND_NE && ConstantX->isZero()) ||
50665 (!IsSub && CC == X86::COND_E && ConstantX->isAllOnes())) {
50666 SDValue Zero = DAG.getConstant(0, DL, ZVT);
50667 SDVTList X86SubVTs = DAG.getVTList(ZVT, MVT::i32);
50668 SDValue Neg = DAG.getNode(X86ISD::SUB, DL, X86SubVTs, Zero, Z);
50669 return DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
50670 DAG.getTargetConstant(X86::COND_B, DL, MVT::i8),
50671 SDValue(Neg.getNode(), 1));
50672 }
50673
50674 // cmp with 1 sets the carry flag when Z == 0, so create 0 or -1 using 'sbb'
50675 // with fake operands:
50676 // 0 - (Z == 0) --> sbb %eax, %eax, (cmp Z, 1)
50677 // -1 + (Z != 0) --> sbb %eax, %eax, (cmp Z, 1)
50678 if ((IsSub && CC == X86::COND_E && ConstantX->isZero()) ||
50679 (!IsSub && CC == X86::COND_NE && ConstantX->isAllOnes())) {
50680 SDValue One = DAG.getConstant(1, DL, ZVT);
50681 SDVTList X86SubVTs = DAG.getVTList(ZVT, MVT::i32);
50682 SDValue Cmp1 = DAG.getNode(X86ISD::SUB, DL, X86SubVTs, Z, One);
50683 return DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
50684 DAG.getTargetConstant(X86::COND_B, DL, MVT::i8),
50685 Cmp1.getValue(1));
50686 }
50687 }
50688
50689 // (cmp Z, 1) sets the carry flag if Z is 0.
50690 SDValue One = DAG.getConstant(1, DL, ZVT);
50691 SDVTList X86SubVTs = DAG.getVTList(ZVT, MVT::i32);
50692 SDValue Cmp1 = DAG.getNode(X86ISD::SUB, DL, X86SubVTs, Z, One);
50693
50694 // Add the flags type for ADC/SBB nodes.
50695 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
50696
50697 // X - (Z != 0) --> sub X, (zext(setne Z, 0)) --> adc X, -1, (cmp Z, 1)
50698 // X + (Z != 0) --> add X, (zext(setne Z, 0)) --> sbb X, -1, (cmp Z, 1)
50699 if (CC == X86::COND_NE)
50700 return DAG.getNode(IsSub ? X86ISD::ADC : X86ISD::SBB, DL, VTs, X,
50701 DAG.getConstant(-1ULL, DL, VT), Cmp1.getValue(1));
50702
50703 // X - (Z == 0) --> sub X, (zext(sete Z, 0)) --> sbb X, 0, (cmp Z, 1)
50704 // X + (Z == 0) --> add X, (zext(sete Z, 0)) --> adc X, 0, (cmp Z, 1)
50705 return DAG.getNode(IsSub ? X86ISD::SBB : X86ISD::ADC, DL, VTs, X,
50706 DAG.getConstant(0, DL, VT), Cmp1.getValue(1));
50707}
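
Illustrative aside: source-level shapes that feed this ADC/SBB combine, as a sketch only; whether CMP+ADC/SBB or SETCC_CARRY is actually selected depends on the conditions above, and the names are invented.

    #include <cassert>
    #include <cstdint>

    // 'x + (a < b)' can become CMP + ADC x, 0 (the compare result is the carry),
    // and '-1 + (a >= b)' is the carry flag materialized as 0 or -1 (SBB).
    uint64_t add_carry(uint64_t x, uint32_t a, uint32_t b) { return x + (a < b); }
    int32_t carry_to_mask(uint32_t a, uint32_t b) { return -1 + (a >= b); }

    int main() {
      assert(add_carry(10, 1, 2) == 11);
      assert(add_carry(10, 2, 1) == 10);
      assert(carry_to_mask(1, 2) == -1); // carry set  -> -1
      assert(carry_to_mask(2, 1) == 0);  // carry clear -> 0
      return 0;
    }
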
50708
50709/// If this is an add or subtract where one operand is produced by a cmp+setcc,
50710/// then try to convert it to an ADC or SBB. This replaces TEST+SET+{ADD/SUB}
50711/// with CMP+{ADC, SBB}.
50712static SDValue combineAddOrSubToADCOrSBB(SDNode *N, SelectionDAG &DAG) {
50713 bool IsSub = N->getOpcode() == ISD::SUB;
50714 SDValue X = N->getOperand(0);
50715 SDValue Y = N->getOperand(1);
50716 EVT VT = N->getValueType(0);
50717 SDLoc DL(N);
50718
50719 if (SDValue ADCOrSBB = combineAddOrSubToADCOrSBB(IsSub, DL, VT, X, Y, DAG))
50720 return ADCOrSBB;
50721
50722 // Commute and try again (negate the result for subtracts).
50723 if (SDValue ADCOrSBB = combineAddOrSubToADCOrSBB(IsSub, DL, VT, Y, X, DAG)) {
50724 if (IsSub)
50725 ADCOrSBB =
50726 DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), ADCOrSBB);
50727 return ADCOrSBB;
50728 }
50729
50730 return SDValue();
50731}
50732
50733static SDValue combineOrXorWithSETCC(SDNode *N, SDValue N0, SDValue N1,
50734 SelectionDAG &DAG) {
50735 assert((N->getOpcode() == ISD::XOR || N->getOpcode() == ISD::OR) &&
50736 "Unexpected opcode");
50737
50738 // Delegate to combineAddOrSubToADCOrSBB if we have:
50739 //
50740 // (xor/or (zero_extend (setcc)) imm)
50741 //
50742 // where imm is odd if and only if we have xor, in which case the XOR/OR are
50743 // equivalent to a SUB/ADD, respectively.
50744 if (N0.getOpcode() == ISD::ZERO_EXTEND &&
50745 N0.getOperand(0).getOpcode() == X86ISD::SETCC && N0.hasOneUse()) {
50746 if (auto *N1C = dyn_cast<ConstantSDNode>(N1)) {
50747 bool IsSub = N->getOpcode() == ISD::XOR;
50748 bool N1COdd = N1C->getZExtValue() & 1;
50749 if (IsSub ? N1COdd : !N1COdd) {
50750 SDLoc DL(N);
50751 EVT VT = N->getValueType(0);
50752 if (SDValue R = combineAddOrSubToADCOrSBB(IsSub, DL, VT, N1, N0, DAG))
50753 return R;
50754 }
50755 }
50756 }
50757
50758 return SDValue();
50759}
50760
50761static SDValue combineOr(SDNode *N, SelectionDAG &DAG,
50762 TargetLowering::DAGCombinerInfo &DCI,
50763 const X86Subtarget &Subtarget) {
50764 SDValue N0 = N->getOperand(0);
50765 SDValue N1 = N->getOperand(1);
50766 EVT VT = N->getValueType(0);
50767 SDLoc dl(N);
50768 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
50769
50770 // If this is SSE1 only convert to FOR to avoid scalarization.
50771 if (Subtarget.hasSSE1() && !Subtarget.hasSSE2() && VT == MVT::v4i32) {
50772 return DAG.getBitcast(MVT::v4i32,
50773 DAG.getNode(X86ISD::FOR, dl, MVT::v4f32,
50774 DAG.getBitcast(MVT::v4f32, N0),
50775 DAG.getBitcast(MVT::v4f32, N1)));
50776 }
50777
50778 // Match any-of bool scalar reductions into a bitcast/movmsk + cmp.
50779 // TODO: Support multiple SrcOps.
50780 if (VT == MVT::i1) {
50781 SmallVector<SDValue, 2> SrcOps;
50782 SmallVector<APInt, 2> SrcPartials;
50783 if (matchScalarReduction(SDValue(N, 0), ISD::OR, SrcOps, &SrcPartials) &&
50784 SrcOps.size() == 1) {
50785 unsigned NumElts = SrcOps[0].getValueType().getVectorNumElements();
50786 EVT MaskVT = EVT::getIntegerVT(*DAG.getContext(), NumElts);
50787 SDValue Mask = combineBitcastvxi1(DAG, MaskVT, SrcOps[0], dl, Subtarget);
50788 if (!Mask && TLI.isTypeLegal(SrcOps[0].getValueType()))
50789 Mask = DAG.getBitcast(MaskVT, SrcOps[0]);
50790 if (Mask) {
50791 assert(SrcPartials[0].getBitWidth() == NumElts &&
50792 "Unexpected partial reduction mask");
50793 SDValue ZeroBits = DAG.getConstant(0, dl, MaskVT);
50794 SDValue PartialBits = DAG.getConstant(SrcPartials[0], dl, MaskVT);
50795 Mask = DAG.getNode(ISD::AND, dl, MaskVT, Mask, PartialBits);
50796 return DAG.getSetCC(dl, MVT::i1, Mask, ZeroBits, ISD::SETNE);
50797 }
50798 }
50799 }
50800
50801 if (SDValue R = combineBitOpWithMOVMSK(N, DAG))
50802 return R;
50803
50804 if (SDValue R = combineBitOpWithShift(N, DAG))
50805 return R;
50806
50807 if (SDValue FPLogic = convertIntLogicToFPLogic(N, DAG, DCI, Subtarget))
50808 return FPLogic;
50809
50810 if (DCI.isBeforeLegalizeOps())
50811 return SDValue();
50812
50813 if (SDValue R = combineCompareEqual(N, DAG, DCI, Subtarget))
50814 return R;
50815
50816 if (SDValue R = canonicalizeBitSelect(N, DAG, Subtarget))
50817 return R;
50818
50819 if (SDValue R = combineLogicBlendIntoPBLENDV(N, DAG, Subtarget))
50820 return R;
50821
50822 // (0 - SetCC) | C -> (zext (not SetCC)) * (C + 1) - 1 if we can get a LEA out of it.
50823 if ((VT == MVT::i32 || VT == MVT::i64) &&
50824 N0.getOpcode() == ISD::SUB && N0.hasOneUse() &&
50825 isNullConstant(N0.getOperand(0))) {
50826 SDValue Cond = N0.getOperand(1);
50827 if (Cond.getOpcode() == ISD::ZERO_EXTEND && Cond.hasOneUse())
50828 Cond = Cond.getOperand(0);
50829
50830 if (Cond.getOpcode() == X86ISD::SETCC && Cond.hasOneUse()) {
50831 if (auto *CN = dyn_cast<ConstantSDNode>(N1)) {
50832 uint64_t Val = CN->getZExtValue();
50833 if (Val == 1 || Val == 2 || Val == 3 || Val == 4 || Val == 7 || Val == 8) {
50834 X86::CondCode CCode = (X86::CondCode)Cond.getConstantOperandVal(0);
50835 CCode = X86::GetOppositeBranchCondition(CCode);
50836 SDValue NotCond = getSETCC(CCode, Cond.getOperand(1), SDLoc(Cond), DAG);
50837
50838 SDValue R = DAG.getZExtOrTrunc(NotCond, dl, VT);
50839 R = DAG.getNode(ISD::MUL, dl, VT, R, DAG.getConstant(Val + 1, dl, VT));
50840 R = DAG.getNode(ISD::SUB, dl, VT, R, DAG.getConstant(1, dl, VT));
50841 return R;
50842 }
50843 }
50844 }
50845 }
50846
50847 // Combine OR(X,KSHIFTL(Y,Elts/2)) -> CONCAT_VECTORS(X,Y) == KUNPCK(X,Y).
50848 // Combine OR(KSHIFTL(X,Elts/2),Y) -> CONCAT_VECTORS(Y,X) == KUNPCK(Y,X).
50849 // iff the upper elements of the non-shifted arg are zero.
50850 // KUNPCK requires 16+ bool vector elements.
50851 if (N0.getOpcode() == X86ISD::KSHIFTL || N1.getOpcode() == X86ISD::KSHIFTL) {
50852 unsigned NumElts = VT.getVectorNumElements();
50853 unsigned HalfElts = NumElts / 2;
50854 APInt UpperElts = APInt::getHighBitsSet(NumElts, HalfElts);
50855 if (NumElts >= 16 && N1.getOpcode() == X86ISD::KSHIFTL &&
50856 N1.getConstantOperandAPInt(1) == HalfElts &&
50857 DAG.MaskedVectorIsZero(N0, UpperElts)) {
50858 return DAG.getNode(
50859 ISD::CONCAT_VECTORS, dl, VT,
50860 extractSubVector(N0, 0, DAG, dl, HalfElts),
50861 extractSubVector(N1.getOperand(0), 0, DAG, dl, HalfElts));
50862 }
50863 if (NumElts >= 16 && N0.getOpcode() == X86ISD::KSHIFTL &&
50864 N0.getConstantOperandAPInt(1) == HalfElts &&
50865 DAG.MaskedVectorIsZero(N1, UpperElts)) {
50866 return DAG.getNode(
50867 ISD::CONCAT_VECTORS, dl, VT,
50868 extractSubVector(N1, 0, DAG, dl, HalfElts),
50869 extractSubVector(N0.getOperand(0), 0, DAG, dl, HalfElts));
50870 }
50871 }
50872
50873 if (VT.isVector() && (VT.getScalarSizeInBits() % 8) == 0) {
50874 // Attempt to recursively combine an OR of shuffles.
50875 SDValue Op(N, 0);
50876 if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
50877 return Res;
50878
50879 // If either operand is a constant mask, then only the elements that aren't
50880 // allones are actually demanded by the other operand.
50881 auto SimplifyUndemandedElts = [&](SDValue Op, SDValue OtherOp) {
50882 APInt UndefElts;
50883 SmallVector<APInt> EltBits;
50884 int NumElts = VT.getVectorNumElements();
50885 int EltSizeInBits = VT.getScalarSizeInBits();
50886 if (!getTargetConstantBitsFromNode(Op, EltSizeInBits, UndefElts, EltBits))
50887 return false;
50888
50889 APInt DemandedElts = APInt::getZero(NumElts);
50890 for (int I = 0; I != NumElts; ++I)
50891 if (!EltBits[I].isAllOnes())
50892 DemandedElts.setBit(I);
50893
50894 return TLI.SimplifyDemandedVectorElts(OtherOp, DemandedElts, DCI);
50895 };
50896 if (SimplifyUndemandedElts(N0, N1) || SimplifyUndemandedElts(N1, N0)) {
50897 if (N->getOpcode() != ISD::DELETED_NODE)
50898 DCI.AddToWorklist(N);
50899 return SDValue(N, 0);
50900 }
50901 }
50902
50903 // We should fold "masked merge" patterns when `andn` is not available.
50904 if (!Subtarget.hasBMI() && VT.isScalarInteger() && VT != MVT::i1)
50905 if (SDValue R = foldMaskedMerge(N, DAG))
50906 return R;
50907
50908 if (SDValue R = combineOrXorWithSETCC(N, N0, N1, DAG))
50909 return R;
50910
50911 return SDValue();
50912}
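
Illustrative aside: a worked instance of the (0 - SetCC) | C fold above with C = 4 (one of the allowed constants), showing the algebraic identity behind the LEA-friendly form. The function names are invented.

    #include <cassert>
    #include <cstdint>

    // With b = (a < 10) being 0 or 1: (0 - b) | 4 equals (!b) * 5 - 1, which the
    // backend can build with an LEA instead of an OR against a 0/-1 value.
    int64_t or_form(int64_t a)  { int64_t b = (a < 10); return (0 - b) | 4; }
    int64_t lea_form(int64_t a) { int64_t nb = !(a < 10); return nb * 5 - 1; }

    int main() {
      assert(or_form(3) == lea_form(3));   // both -1
      assert(or_form(42) == lea_form(42)); // both 4
      return 0;
    }
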
50913
50914/// Try to turn tests against the signbit in the form of:
50915/// XOR(TRUNCATE(SRL(X, size(X)-1)), 1)
50916/// into:
50917/// SETGT(X, -1)
50918static SDValue foldXorTruncShiftIntoCmp(SDNode *N, SelectionDAG &DAG) {
50919 // This is only worth doing if the output type is i8 or i1.
50920 EVT ResultType = N->getValueType(0);
50921 if (ResultType != MVT::i8 && ResultType != MVT::i1)
50922 return SDValue();
50923
50924 SDValue N0 = N->getOperand(0);
50925 SDValue N1 = N->getOperand(1);
50926
50927 // We should be performing an xor against a truncated shift.
50928 if (N0.getOpcode() != ISD::TRUNCATE || !N0.hasOneUse())
50929 return SDValue();
50930
50931 // Make sure we are performing an xor against one.
50932 if (!isOneConstant(N1))
50933 return SDValue();
50934
50935 // SetCC on x86 zero extends so only act on this if it's a logical shift.
50936 SDValue Shift = N0.getOperand(0);
50937 if (Shift.getOpcode() != ISD::SRL || !Shift.hasOneUse())
50938 return SDValue();
50939
50940 // Make sure we are truncating from one of i16, i32 or i64.
50941 EVT ShiftTy = Shift.getValueType();
50942 if (ShiftTy != MVT::i16 && ShiftTy != MVT::i32 && ShiftTy != MVT::i64)
50943 return SDValue();
50944
50945 // Make sure the shift amount extracts the sign bit.
50946 if (!isa<ConstantSDNode>(Shift.getOperand(1)) ||
50947 Shift.getConstantOperandAPInt(1) != (ShiftTy.getSizeInBits() - 1))
50948 return SDValue();
50949
50950 // Create a greater-than comparison against -1.
50951 // N.B. Using SETGE against 0 works but we want a canonical-looking
50952 // comparison; using SETGT matches up with what TranslateX86CC does.
50953 SDLoc DL(N);
50954 SDValue ShiftOp = Shift.getOperand(0);
50955 EVT ShiftOpTy = ShiftOp.getValueType();
50956 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
50957 EVT SetCCResultType = TLI.getSetCCResultType(DAG.getDataLayout(),
50958 *DAG.getContext(), ResultType);
50959 SDValue Cond = DAG.getSetCC(DL, SetCCResultType, ShiftOp,
50960 DAG.getConstant(-1, DL, ShiftOpTy), ISD::SETGT);
50961 if (SetCCResultType != ResultType)
50962 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, ResultType, Cond);
50963 return Cond;
50964}
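As a rough scalar model of the fold this helper performs (assuming a 32-bit X; function names are illustrative only), both forms below test that the sign bit is clear:

  #include <cstdint>

  // The matched pattern: logically shift the sign bit down, truncate, XOR with 1.
  bool signClearShiftXor(int32_t X) {
    return ((static_cast<uint32_t>(X) >> 31) ^ 1u) != 0;
  }

  // The folded form: a single signed greater-than compare against -1 (SETGT).
  bool signClearSetGT(int32_t X) {
    return X > -1;
  }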
50965
50966/// Turn vector tests of the signbit in the form of:
50967/// xor (sra X, elt_size(X)-1), -1
50968/// into:
50969/// pcmpgt X, -1
50970///
50971/// This should be called before type legalization because the pattern may not
50972/// persist after that.
50973static SDValue foldVectorXorShiftIntoCmp(SDNode *N, SelectionDAG &DAG,
50974 const X86Subtarget &Subtarget) {
50975 EVT VT = N->getValueType(0);
50976 if (!VT.isSimple())
50977 return SDValue();
50978
50979 switch (VT.getSimpleVT().SimpleTy) {
50980 default: return SDValue();
50981 case MVT::v16i8:
50982 case MVT::v8i16:
50983 case MVT::v4i32:
50984 case MVT::v2i64: if (!Subtarget.hasSSE2()) return SDValue(); break;
50985 case MVT::v32i8:
50986 case MVT::v16i16:
50987 case MVT::v8i32:
50988 case MVT::v4i64: if (!Subtarget.hasAVX2()) return SDValue(); break;
50989 }
50990
50991 // There must be a shift right algebraic before the xor, and the xor must be a
50992 // 'not' operation.
50993 SDValue Shift = N->getOperand(0);
50994 SDValue Ones = N->getOperand(1);
50995 if (Shift.getOpcode() != ISD::SRA || !Shift.hasOneUse() ||
50996 !ISD::isBuildVectorAllOnes(Ones.getNode()))
50997 return SDValue();
50998
50999 // The shift should be smearing the sign bit across each vector element.
51000 auto *ShiftAmt =
51001 isConstOrConstSplat(Shift.getOperand(1), /*AllowUndefs*/ true);
51002 if (!ShiftAmt ||
51003 ShiftAmt->getAPIntValue() != (Shift.getScalarValueSizeInBits() - 1))
51004 return SDValue();
51005
51006 // Create a greater-than comparison against -1. We don't use the more obvious
51007 // greater-than-or-equal-to-zero because SSE/AVX don't have that instruction.
51008 return DAG.getSetCC(SDLoc(N), VT, Shift.getOperand(0), Ones, ISD::SETGT);
51009}
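For the v4i32 case, a small SSE2 intrinsics sketch of the equivalence (illustrative only; the real combine rewrites SelectionDAG nodes, not intrinsics):

  #include <emmintrin.h>

  // xor(sra(X, 31), -1): all-ones in each lane whose element is non-negative.
  __m128i signTestSraNot(__m128i X) {
    return _mm_xor_si128(_mm_srai_epi32(X, 31), _mm_set1_epi32(-1));
  }

  // The folded form: pcmpgtd X, -1 produces the same per-lane mask directly.
  __m128i signTestPcmpgt(__m128i X) {
    return _mm_cmpgt_epi32(X, _mm_set1_epi32(-1));
  }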
51010
51011/// Detect patterns of truncation with unsigned saturation:
51012///
51013/// 1. (truncate (umin (x, unsigned_max_of_dest_type)) to dest_type).
51014/// Return the source value x to be truncated or SDValue() if the pattern was
51015/// not matched.
51016///
51017/// 2. (truncate (smin (smax (x, C1), C2)) to dest_type),
51018/// where C1 >= 0 and C2 is unsigned max of destination type.
51019///
51020/// (truncate (smax (smin (x, C2), C1)) to dest_type)
51021/// where C1 >= 0, C2 is unsigned max of destination type and C1 <= C2.
51022///
51023/// These two patterns are equivalent to:
51024/// (truncate (umin (smax(x, C1), unsigned_max_of_dest_type)) to dest_type)
51025/// So return the smax(x, C1) value to be truncated or SDValue() if the
51026/// pattern was not matched.
51027static SDValue detectUSatPattern(SDValue In, EVT VT, SelectionDAG &DAG,
51028 const SDLoc &DL) {
51029 EVT InVT = In.getValueType();
51030
51031 // Saturation with truncation. We truncate from InVT to VT.
51032 assert(InVT.getScalarSizeInBits() > VT.getScalarSizeInBits() &&
51033 "Unexpected types for truncate operation");
51034
51035 // Match min/max and return limit value as a parameter.
51036 auto MatchMinMax = [](SDValue V, unsigned Opcode, APInt &Limit) -> SDValue {
51037 if (V.getOpcode() == Opcode &&
51038 ISD::isConstantSplatVector(V.getOperand(1).getNode(), Limit))
51039 return V.getOperand(0);
51040 return SDValue();
51041 };
51042
51043 APInt C1, C2;
51044 if (SDValue UMin = MatchMinMax(In, ISD::UMIN, C2))
51045 // C2 should be equal to UINT32_MAX / UINT16_MAX / UINT8_MAX according
51046 // to the element size of the destination type.
51047 if (C2.isMask(VT.getScalarSizeInBits()))
51048 return UMin;
51049
51050 if (SDValue SMin = MatchMinMax(In, ISD::SMIN, C2))
51051 if (MatchMinMax(SMin, ISD::SMAX, C1))
51052 if (C1.isNonNegative() && C2.isMask(VT.getScalarSizeInBits()))
51053 return SMin;
51054
51055 if (SDValue SMax = MatchMinMax(In, ISD::SMAX, C1))
51056 if (SDValue SMin = MatchMinMax(SMax, ISD::SMIN, C2))
51057 if (C1.isNonNegative() && C2.isMask(VT.getScalarSizeInBits()) &&
51058 C2.uge(C1)) {
51059 return DAG.getNode(ISD::SMAX, DL, InVT, SMin, In.getOperand(1));
51060 }
51061
51062 return SDValue();
51063}
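A scalar sketch of the two shapes this matcher accepts, here for a 32-bit source and an 8-bit destination with C1 = 0 (names are illustrative):

  #include <algorithm>
  #include <cstdint>

  // Pattern 1: umin against the destination's unsigned max, then truncate.
  uint8_t truncUSatUMin(uint32_t x) {
    return static_cast<uint8_t>(std::min<uint32_t>(x, 255u));
  }

  // Pattern 2: smin/smax clamp of a signed value to [C1, 255] with C1 >= 0,
  // then truncate; equivalent to truncating umin(smax(x, C1), 255).
  uint8_t truncUSatSMinSMax(int32_t x) {
    return static_cast<uint8_t>(std::min(std::max(x, 0), 255));
  }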
51064
51065/// Detect patterns of truncation with signed saturation:
51066/// (truncate (smin ((smax (x, signed_min_of_dest_type)),
51067/// signed_max_of_dest_type)) to dest_type)
51068/// or:
51069/// (truncate (smax ((smin (x, signed_max_of_dest_type)),
51070/// signed_min_of_dest_type)) to dest_type).
51071/// With MatchPackUS, the smax/smin range is [0, unsigned_max_of_dest_type].
51072/// Return the source value to be truncated or SDValue() if the pattern was not
51073/// matched.
51074static SDValue detectSSatPattern(SDValue In, EVT VT, bool MatchPackUS = false) {
51075 unsigned NumDstBits = VT.getScalarSizeInBits();
51076 unsigned NumSrcBits = In.getScalarValueSizeInBits();
51077 assert(NumSrcBits > NumDstBits && "Unexpected types for truncate operation");
51078
51079 auto MatchMinMax = [](SDValue V, unsigned Opcode,
51080 const APInt &Limit) -> SDValue {
51081 APInt C;
51082 if (V.getOpcode() == Opcode &&
51083 ISD::isConstantSplatVector(V.getOperand(1).getNode(), C) && C == Limit)
51084 return V.getOperand(0);
51085 return SDValue();
51086 };
51087
51088 APInt SignedMax, SignedMin;
51089 if (MatchPackUS) {
51090 SignedMax = APInt::getAllOnes(NumDstBits).zext(NumSrcBits);
51091 SignedMin = APInt(NumSrcBits, 0);
51092 } else {
51093 SignedMax = APInt::getSignedMaxValue(NumDstBits).sext(NumSrcBits);
51094 SignedMin = APInt::getSignedMinValue(NumDstBits).sext(NumSrcBits);
51095 }
51096
51097 if (SDValue SMin = MatchMinMax(In, ISD::SMIN, SignedMax))
51098 if (SDValue SMax = MatchMinMax(SMin, ISD::SMAX, SignedMin))
51099 return SMax;
51100
51101 if (SDValue SMax = MatchMinMax(In, ISD::SMAX, SignedMin))
51102 if (SDValue SMin = MatchMinMax(SMax, ISD::SMIN, SignedMax))
51103 return SMin;
51104
51105 return SDValue();
51106}
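A scalar sketch of the same matching for a 32-bit source and an 8-bit destination (illustrative only, assuming C++17 for std::clamp); with MatchPackUS the clamp range becomes [0, 255] instead of [-128, 127]:

  #include <algorithm>
  #include <cstdint>

  // Signed saturation: clamp to the destination's signed range, then truncate.
  int8_t truncSSat(int32_t x) {
    return static_cast<int8_t>(std::clamp(x, -128, 127));
  }

  // The MatchPackUS variant: clamp to [0, unsigned max of the destination].
  uint8_t truncSSatPackUS(int32_t x) {
    return static_cast<uint8_t>(std::clamp(x, 0, 255));
  }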
51107
51108static SDValue combineTruncateWithSat(SDValue In, EVT VT, const SDLoc &DL,
51109 SelectionDAG &DAG,
51110 const X86Subtarget &Subtarget) {
51111 if (!Subtarget.hasSSE2() || !VT.isVector())
51112 return SDValue();
51113
51114 EVT SVT = VT.getVectorElementType();
51115 EVT InVT = In.getValueType();
51116 EVT InSVT = InVT.getVectorElementType();
51117
51118 // If we're clamping a signed 32-bit vector to 0-255 and the 32-bit vector
51119 // is split across two registers, we can use a packusdw+perm to clamp to
51120 // 0-65535 and concatenate at the same time. Then we can use a final
51121 // vpmovuswb to clip to 0-255.
51122 if (Subtarget.hasBWI() && !Subtarget.useAVX512Regs() &&
51123 InVT == MVT::v16i32 && VT == MVT::v16i8) {
51124 if (SDValue USatVal = detectSSatPattern(In, VT, true)) {
51125 // Emit a VPACKUSDW+VPERMQ followed by a VPMOVUSWB.
51126 SDValue Mid = truncateVectorWithPACK(X86ISD::PACKUS, MVT::v16i16, USatVal,
51127 DL, DAG, Subtarget);
51128 assert(Mid && "Failed to pack!");
51129 return DAG.getNode(X86ISD::VTRUNCUS, DL, VT, Mid);
51130 }
51131 }
51132
51133 // vXi32 truncate instructions are available with AVX512F.
51134 // vXi16 truncate instructions are only available with AVX512BW.
51135 // For 256-bit or smaller vectors, we require VLX.
51136 // FIXME: We could widen truncates to 512 to remove the VLX restriction.
51137 // If the result type is 256 bits or larger and we have disabled 512-bit
51138 // registers, we should go ahead and use the pack instructions if possible.
51139 bool PreferAVX512 = ((Subtarget.hasAVX512() && InSVT == MVT::i32) ||
51140 (Subtarget.hasBWI() && InSVT == MVT::i16)) &&
51141 (InVT.getSizeInBits() > 128) &&
51142 (Subtarget.hasVLX() || InVT.getSizeInBits() > 256) &&
51143 !(!Subtarget.useAVX512Regs() && VT.getSizeInBits() >= 256);
51144
51145 if (isPowerOf2_32(VT.getVectorNumElements()) && !PreferAVX512 &&
51146 VT.getSizeInBits() >= 64 &&
51147 (SVT == MVT::i8 || SVT == MVT::i16) &&
51148 (InSVT == MVT::i16 || InSVT == MVT::i32)) {
51149 if (SDValue USatVal = detectSSatPattern(In, VT, true)) {
51150 // vXi32 -> vXi8 must be performed as PACKUSWB(PACKSSDW,PACKSSDW).
51151 // Only do this when the result is at least 64 bits or we'll leave
51152 // dangling PACKSSDW nodes.
51153 if (SVT == MVT::i8 && InSVT == MVT::i32) {
51154 EVT MidVT = VT.changeVectorElementType(MVT::i16);
51155 SDValue Mid = truncateVectorWithPACK(X86ISD::PACKSS, MidVT, USatVal, DL,
51156 DAG, Subtarget);
51157 assert(Mid && "Failed to pack!");
51158 SDValue V = truncateVectorWithPACK(X86ISD::PACKUS, VT, Mid, DL, DAG,
51159 Subtarget);
51160 assert(V && "Failed to pack!");
51161 return V;
51162 } else if (SVT == MVT::i8 || Subtarget.hasSSE41())
51163 return truncateVectorWithPACK(X86ISD::PACKUS, VT, USatVal, DL, DAG,
51164 Subtarget);
51165 }
51166 if (SDValue SSatVal = detectSSatPattern(In, VT))
51167 return truncateVectorWithPACK(X86ISD::PACKSS, VT, SSatVal, DL, DAG,
51168 Subtarget);
51169 }
51170
51171 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
51172 if (TLI.isTypeLegal(InVT) && InVT.isVector() && SVT != MVT::i1 &&
51173 Subtarget.hasAVX512() && (InSVT != MVT::i16 || Subtarget.hasBWI()) &&
51174 (SVT == MVT::i32 || SVT == MVT::i16 || SVT == MVT::i8)) {
51175 unsigned TruncOpc = 0;
51176 SDValue SatVal;
51177 if (SDValue SSatVal = detectSSatPattern(In, VT)) {
51178 SatVal = SSatVal;
51179 TruncOpc = X86ISD::VTRUNCS;
51180 } else if (SDValue USatVal = detectUSatPattern(In, VT, DAG, DL)) {
51181 SatVal = USatVal;
51182 TruncOpc = X86ISD::VTRUNCUS;
51183 }
51184 if (SatVal) {
51185 unsigned ResElts = VT.getVectorNumElements();
51186 // If the input type is less than 512 bits and we don't have VLX, we need
51187 // to widen to 512 bits.
51188 if (!Subtarget.hasVLX() && !InVT.is512BitVector()) {
51189 unsigned NumConcats = 512 / InVT.getSizeInBits();
51190 ResElts *= NumConcats;
51191 SmallVector<SDValue, 4> ConcatOps(NumConcats, DAG.getUNDEF(InVT));
51192 ConcatOps[0] = SatVal;
51193 InVT = EVT::getVectorVT(*DAG.getContext(), InSVT,
51194 NumConcats * InVT.getVectorNumElements());
51195 SatVal = DAG.getNode(ISD::CONCAT_VECTORS, DL, InVT, ConcatOps);
51196 }
51197 // Widen the result if it's narrower than 128 bits.
51198 if (ResElts * SVT.getSizeInBits() < 128)
51199 ResElts = 128 / SVT.getSizeInBits();
51200 EVT TruncVT = EVT::getVectorVT(*DAG.getContext(), SVT, ResElts);
51201 SDValue Res = DAG.getNode(TruncOpc, DL, TruncVT, SatVal);
51202 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
51203 DAG.getIntPtrConstant(0, DL));
51204 }
51205 }
51206
51207 return SDValue();
51208}
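The PACKUSWB(PACKSSDW,...) route used above can be modelled per element as two saturating narrowing steps; since [0, 255] lies inside the signed i16 range, the composition clamps an i32 value to [0, 255] exactly as the matched pattern requires (a sketch with illustrative names, assuming C++17 for std::clamp):

  #include <algorithm>
  #include <cstdint>

  // PACKSSDW per element: i32 -> i16 with signed saturation.
  int16_t packssdwElt(int32_t x) {
    return static_cast<int16_t>(std::clamp(x, -32768, 32767));
  }

  // PACKUSWB per element: i16 -> i8 with unsigned saturation.
  uint8_t packuswbElt(int16_t x) {
    return static_cast<uint8_t>(std::clamp<int>(x, 0, 255));
  }

  // Composition: for any i32 input this equals std::clamp(x, 0, 255).
  uint8_t i32ToU8Sat(int32_t x) {
    return packuswbElt(packssdwElt(x));
  }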
51209
51210/// This function detects the AVG pattern between vectors of unsigned i8/i16,
51211 /// which is c = (a + b + 1) / 2, and replaces this operation with the efficient
51212/// ISD::AVGCEILU (AVG) instruction.
51213static SDValue detectAVGPattern(SDValue In, EVT VT, SelectionDAG &DAG,
51214 const X86Subtarget &Subtarget,
51215 const SDLoc &DL) {
51216 if (!VT.isVector())
51217 return SDValue();
51218 EVT InVT = In.getValueType();
51219 unsigned NumElems = VT.getVectorNumElements();
51220
51221 EVT ScalarVT = VT.getVectorElementType();
51222 if (!((ScalarVT == MVT::i8 || ScalarVT == MVT::i16) && NumElems >= 2))
51223 return SDValue();
51224
51225 // InScalarVT is the intermediate type in the AVG pattern and it should be wider
51226 // than the original input type (i8/i16).
51227 EVT InScalarVT = InVT.getVectorElementType();
51228 if (InScalarVT.getFixedSizeInBits() <= ScalarVT.getFixedSizeInBits())
51229 return SDValue();
51230
51231 if (!Subtarget.hasSSE2())
51232 return SDValue();
51233
51234 // Detect the following pattern:
51235 //
51236 // %1 = zext <N x i8> %a to <N x i32>
51237 // %2 = zext <N x i8> %b to <N x i32>
51238 // %3 = add nuw nsw <N x i32> %1, <i32 1 x N>
51239 // %4 = add nuw nsw <N x i32> %3, %2
51240 // %5 = lshr <N x i32> %4, <i32 1 x N>
51241 // %6 = trunc <N x i32> %5 to <N x i8>
51242 //
51243 // In AVX512, the last instruction can also be a trunc store.
51244 if (In.getOpcode() != ISD::SRL)
51245 return SDValue();
51246
51247 // A lambda checking the given SDValue is a constant vector and each element
51248 // is in the range [Min, Max].
51249 auto IsConstVectorInRange = [](SDValue V, unsigned Min, unsigned Max) {
51250 return ISD::matchUnaryPredicate(V, [Min, Max](ConstantSDNode *C) {
51251 return !(C->getAPIntValue().ult(Min) || C->getAPIntValue().ugt(Max));
51252 });
51253 };
51254
51255 auto IsZExtLike = [DAG = &DAG, ScalarVT](SDValue V) {
51256 unsigned MaxActiveBits = DAG->computeKnownBits(V).countMaxActiveBits();
51257 return MaxActiveBits <= ScalarVT.getSizeInBits();
51258 };
51259
51260 // Check if each element of the vector is right-shifted by one.
51261 SDValue LHS = In.getOperand(0);
51262 SDValue RHS = In.getOperand(1);
51263 if (!IsConstVectorInRange(RHS, 1, 1))
51264 return SDValue();
51265 if (LHS.getOpcode() != ISD::ADD)
51266 return SDValue();
51267
51268 // Detect a pattern of a + b + 1 where the order doesn't matter.
51269 SDValue Operands[3];
51270 Operands[0] = LHS.getOperand(0);
51271 Operands[1] = LHS.getOperand(1);
51272
51273 auto AVGBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
51274 ArrayRef<SDValue> Ops) {
51275 return DAG.getNode(ISD::AVGCEILU, DL, Ops[0].getValueType(), Ops);
51276 };
51277
51278 auto AVGSplitter = [&](std::array<SDValue, 2> Ops) {
51279 for (SDValue &Op : Ops)
51280 if (Op.getValueType() != VT)
51281 Op = DAG.getNode(ISD::TRUNCATE, DL, VT, Op);
51282 // Pad to a power-of-2 vector, split+apply and extract the original vector.
51283 unsigned NumElemsPow2 = PowerOf2Ceil(NumElems);
51284 EVT Pow2VT = EVT::getVectorVT(*DAG.getContext(), ScalarVT, NumElemsPow2);
51285 if (NumElemsPow2 != NumElems) {
51286 for (SDValue &Op : Ops) {
51287 SmallVector<SDValue, 32> EltsOfOp(NumElemsPow2, DAG.getUNDEF(ScalarVT));
51288 for (unsigned i = 0; i != NumElems; ++i) {
51289 SDValue Idx = DAG.getIntPtrConstant(i, DL);
51290 EltsOfOp[i] =
51291 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ScalarVT, Op, Idx);
51292 }
51293 Op = DAG.getBuildVector(Pow2VT, DL, EltsOfOp);
51294 }
51295 }
51296 SDValue Res = SplitOpsAndApply(DAG, Subtarget, DL, Pow2VT, Ops, AVGBuilder);
51297 if (NumElemsPow2 == NumElems)
51298 return Res;
51299 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
51300 DAG.getIntPtrConstant(0, DL));
51301 };
51302
51303 // Take care of the case when one of the operands is a constant vector whose
51304 // element is in the range [1, 256].
51305 if (IsConstVectorInRange(Operands[1], 1, ScalarVT == MVT::i8 ? 256 : 65536) &&
51306 IsZExtLike(Operands[0])) {
51307 // The pattern is detected. Subtract one from the constant vector, then
51308 // demote it and emit the AVG (ISD::AVGCEILU) node.
51309 SDValue VecOnes = DAG.getConstant(1, DL, InVT);
51310 Operands[1] = DAG.getNode(ISD::SUB, DL, InVT, Operands[1], VecOnes);
51311 return AVGSplitter({Operands[0], Operands[1]});
51312 }
51313
51314 // Matches 'add like' patterns: add(Op0,Op1) or zext(or(Op0,Op1)).
51315 // Match the or case only if it's 'add-like', i.e. it can be replaced by an add.
51316 auto FindAddLike = [&](SDValue V, SDValue &Op0, SDValue &Op1) {
51317 if (ISD::ADD == V.getOpcode()) {
51318 Op0 = V.getOperand(0);
51319 Op1 = V.getOperand(1);
51320 return true;
51321 }
51322 if (ISD::ZERO_EXTEND != V.getOpcode())
51323 return false;
51324 V = V.getOperand(0);
51325 if (V.getValueType() != VT || ISD::OR != V.getOpcode() ||
51326 !DAG.haveNoCommonBitsSet(V.getOperand(0), V.getOperand(1)))
51327 return false;
51328 Op0 = V.getOperand(0);
51329 Op1 = V.getOperand(1);
51330 return true;
51331 };
51332
51333 SDValue Op0, Op1;
51334 if (FindAddLike(Operands[0], Op0, Op1))
51335 std::swap(Operands[0], Operands[1]);
51336 else if (!FindAddLike(Operands[1], Op0, Op1))
51337 return SDValue();
51338 Operands[2] = Op0;
51339 Operands[1] = Op1;
51340
51341 // Now we have three operands of two additions. Check that one of them is a
51342 // constant vector with ones, and the other two can be promoted from i8/i16.
51343 for (SDValue &Op : Operands) {
51344 if (!IsConstVectorInRange(Op, 1, 1))
51345 continue;
51346 std::swap(Op, Operands[2]);
51347
51348 // Check if Operands[0] and Operands[1] are results of type promotion.
51349 for (int j = 0; j < 2; ++j)
51350 if (Operands[j].getValueType() != VT)
51351 if (!IsZExtLike(Operands[j]))
51352 return SDValue();
51353
51354 // The pattern is detected, emit the AVG (ISD::AVGCEILU) node(s).
51355 return AVGSplitter({Operands[0], Operands[1]});
51356 }
51357
51358 return SDValue();
51359}
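A scalar i8 sketch of the rounding-average pattern the function recognizes (illustrative name); the widened add-one-and-shift form below is exactly what ISD::AVGCEILU computes in a single node:

  #include <cstdint>

  // The matched shape: zero-extend, add one, add the other operand, logical
  // shift right by one, truncate back to i8.
  uint8_t avgCeilWidened(uint8_t a, uint8_t b) {
    uint32_t wa = a, wb = b;                         // zext to i32
    return static_cast<uint8_t>((wa + wb + 1) >> 1); // add, add 1, lshr, trunc
  }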
51360
51361static SDValue combineLoad(SDNode *N, SelectionDAG &DAG,
51362 TargetLowering::DAGCombinerInfo &DCI,
51363 const X86Subtarget &Subtarget) {
51364 LoadSDNode *Ld = cast<LoadSDNode>(N);
51365 EVT RegVT = Ld->getValueType(0);
51366 EVT MemVT = Ld->getMemoryVT();
51367 SDLoc dl(Ld);
51368 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
51369
51370 // For chips with slow 32-byte unaligned loads, break the 32-byte operation
51371 // into two 16-byte operations. Also split non-temporal aligned loads on
51372 // pre-AVX2 targets as 32-byte loads will lower to regular temporal loads.
51373 ISD::LoadExtType Ext = Ld->getExtensionType();
51374 unsigned Fast;
51375 if (RegVT.is256BitVector() && !DCI.isBeforeLegalizeOps() &&
51376 Ext == ISD::NON_EXTLOAD &&
51377 ((Ld->isNonTemporal() && !Subtarget.hasInt256() &&
51378 Ld->getAlign() >= Align(16)) ||
51379 (TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), RegVT,
51380 *Ld->getMemOperand(), &Fast) &&
51381 !Fast))) {
51382 unsigned NumElems = RegVT.getVectorNumElements();
51383 if (NumElems < 2)
51384 return SDValue();
51385
51386 unsigned HalfOffset = 16;
51387 SDValue Ptr1 = Ld->getBasePtr();
51388 SDValue Ptr2 =
51389 DAG.getMemBasePlusOffset(Ptr1, TypeSize::Fixed(HalfOffset), dl);
51390 EVT HalfVT = EVT::getVectorVT(*DAG.getContext(), MemVT.getScalarType(),
51391 NumElems / 2);
51392 SDValue Load1 =
51393 DAG.getLoad(HalfVT, dl, Ld->getChain(), Ptr1, Ld->getPointerInfo(),
51394 Ld->getOriginalAlign(),
51395 Ld->getMemOperand()->getFlags());
51396 SDValue Load2 = DAG.getLoad(HalfVT, dl, Ld->getChain(), Ptr2,
51397 Ld->getPointerInfo().getWithOffset(HalfOffset),
51398 Ld->getOriginalAlign(),
51399 Ld->getMemOperand()->getFlags());
51400 SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
51401 Load1.getValue(1), Load2.getValue(1));
51402
51403 SDValue NewVec = DAG.getNode(ISD::CONCAT_VECTORS, dl, RegVT, Load1, Load2);
51404 return DCI.CombineTo(N, NewVec, TF, true);
51405 }
51406
51407 // Bool vector load - attempt to cast to an integer, as we have good
51408 // (vXiY *ext(vXi1 bitcast(iX))) handling.
51409 if (Ext == ISD::NON_EXTLOAD && !Subtarget.hasAVX512() && RegVT.isVector() &&
51410 RegVT.getScalarType() == MVT::i1 && DCI.isBeforeLegalize()) {
51411 unsigned NumElts = RegVT.getVectorNumElements();
51412 EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), NumElts);
51413 if (TLI.isTypeLegal(IntVT)) {
51414 SDValue IntLoad = DAG.getLoad(IntVT, dl, Ld->getChain(), Ld->getBasePtr(),
51415 Ld->getPointerInfo(),
51416 Ld->getOriginalAlign(),
51417 Ld->getMemOperand()->getFlags());
51418 SDValue BoolVec = DAG.getBitcast(RegVT, IntLoad);
51419 return DCI.CombineTo(N, BoolVec, IntLoad.getValue(1), true);
51420 }
51421 }
51422
51423 // If we also broadcast this as a subvector to a wider type, then just extract
51424 // the lowest subvector.
51425 if (Ext == ISD::NON_EXTLOAD && Subtarget.hasAVX() && Ld->isSimple() &&
51426 (RegVT.is128BitVector() || RegVT.is256BitVector())) {
51427 SDValue Ptr = Ld->getBasePtr();
51428 SDValue Chain = Ld->getChain();
51429 for (SDNode *User : Ptr->uses()) {
51430 if (User != N && User->getOpcode() == X86ISD::SUBV_BROADCAST_LOAD &&
51431 cast<MemIntrinsicSDNode>(User)->getBasePtr() == Ptr &&
51432 cast<MemIntrinsicSDNode>(User)->getChain() == Chain &&
51433 cast<MemIntrinsicSDNode>(User)->getMemoryVT().getSizeInBits() ==
51434 MemVT.getSizeInBits() &&
51435 !User->hasAnyUseOfValue(1) &&
51436 User->getValueSizeInBits(0).getFixedValue() >
51437 RegVT.getFixedSizeInBits()) {
51438 SDValue Extract = extractSubVector(SDValue(User, 0), 0, DAG, SDLoc(N),
51439 RegVT.getSizeInBits());
51440 Extract = DAG.getBitcast(RegVT, Extract);
51441 return DCI.CombineTo(N, Extract, SDValue(User, 1));
51442 }
51443 }
51444 }
51445
51446 // Cast ptr32 and ptr64 pointers to the default address space before a load.
51447 unsigned AddrSpace = Ld->getAddressSpace();
51448 if (AddrSpace == X86AS::PTR64 || AddrSpace == X86AS::PTR32_SPTR ||
51449 AddrSpace == X86AS::PTR32_UPTR) {
51450 MVT PtrVT = TLI.getPointerTy(DAG.getDataLayout());
51451 if (PtrVT != Ld->getBasePtr().getSimpleValueType()) {
51452 SDValue Cast =
51453 DAG.getAddrSpaceCast(dl, PtrVT, Ld->getBasePtr(), AddrSpace, 0);
51454 return DAG.getLoad(RegVT, dl, Ld->getChain(), Cast, Ld->getPointerInfo(),
51455 Ld->getOriginalAlign(),
51456 Ld->getMemOperand()->getFlags());
51457 }
51458 }
51459
51460 return SDValue();
51461}
51462
51463/// If V is a build vector of boolean constants and exactly one of those
51464/// constants is true, return the operand index of that true element.
51465/// Otherwise, return -1.
51466static int getOneTrueElt(SDValue V) {
51467 // This needs to be a build vector of booleans.
51468 // TODO: Checking for the i1 type matches the IR definition for the mask,
51469 // but the mask check could be loosened to i8 or other types. That might
51470 // also require checking more than 'allOnesValue'; eg, the x86 HW
51471 // instructions only require that the MSB is set for each mask element.
51472 // The ISD::MSTORE comments/definition do not specify how the mask operand
51473 // is formatted.
51474 auto *BV = dyn_cast<BuildVectorSDNode>(V);
51475 if (!BV || BV->getValueType(0).getVectorElementType() != MVT::i1)
51476 return -1;
51477
51478 int TrueIndex = -1;
51479 unsigned NumElts = BV->getValueType(0).getVectorNumElements();
51480 for (unsigned i = 0; i < NumElts; ++i) {
51481 const SDValue &Op = BV->getOperand(i);
51482 if (Op.isUndef())
51483 continue;
51484 auto *ConstNode = dyn_cast<ConstantSDNode>(Op);
51485 if (!ConstNode)
51486 return -1;
51487 if (ConstNode->getAPIntValue().countr_one() >= 1) {
51488 // If we already found a one, this is too many.
51489 if (TrueIndex >= 0)
51490 return -1;
51491 TrueIndex = i;
51492 }
51493 }
51494 return TrueIndex;
51495}
51496
51497/// Given a masked memory load/store operation, return true if it has one mask
51498/// bit set. If it has one mask bit set, then also return the memory address of
51499/// the scalar element to load/store, the vector index to insert/extract that
51500/// scalar element, and the alignment for the scalar memory access.
51501static bool getParamsForOneTrueMaskedElt(MaskedLoadStoreSDNode *MaskedOp,
51502 SelectionDAG &DAG, SDValue &Addr,
51503 SDValue &Index, Align &Alignment,
51504 unsigned &Offset) {
51505 int TrueMaskElt = getOneTrueElt(MaskedOp->getMask());
51506 if (TrueMaskElt < 0)
51507 return false;
51508
51509 // Get the address of the one scalar element that is specified by the mask
51510 // using the appropriate offset from the base pointer.
51511 EVT EltVT = MaskedOp->getMemoryVT().getVectorElementType();
51512 Offset = 0;
51513 Addr = MaskedOp->getBasePtr();
51514 if (TrueMaskElt != 0) {
51515 Offset = TrueMaskElt * EltVT.getStoreSize();
51516 Addr = DAG.getMemBasePlusOffset(Addr, TypeSize::Fixed(Offset),
51517 SDLoc(MaskedOp));
51518 }
51519
51520 Index = DAG.getIntPtrConstant(TrueMaskElt, SDLoc(MaskedOp));
51521 Alignment = commonAlignment(MaskedOp->getOriginalAlign(),
51522 EltVT.getStoreSize());
51523 return true;
51524}
51525
51526/// If exactly one element of the mask is set for a non-extending masked load,
51527 /// it can be reduced to a scalar load and a vector insert.
51528/// Note: It is expected that the degenerate cases of an all-zeros or all-ones
51529/// mask have already been optimized in IR, so we don't bother with those here.
51530static SDValue
51531reduceMaskedLoadToScalarLoad(MaskedLoadSDNode *ML, SelectionDAG &DAG,
51532 TargetLowering::DAGCombinerInfo &DCI,
51533 const X86Subtarget &Subtarget) {
51534 assert(ML->isUnindexed() && "Unexpected indexed masked load!");
51535 // TODO: This is not x86-specific, so it could be lifted to DAGCombiner.
51536 // However, some target hooks may need to be added to know when the transform
51537 // is profitable. Endianness would also have to be considered.
51538
51539 SDValue Addr, VecIndex;
51540 Align Alignment;
51541 unsigned Offset;
51542 if (!getParamsForOneTrueMaskedElt(ML, DAG, Addr, VecIndex, Alignment, Offset))
51543 return SDValue();
51544
51545 // Load the one scalar element that is specified by the mask using the
51546 // appropriate offset from the base pointer.
51547 SDLoc DL(ML);
51548 EVT VT = ML->getValueType(0);
51549 EVT EltVT = VT.getVectorElementType();
51550
51551 EVT CastVT = VT;
51552 if (EltVT == MVT::i64 && !Subtarget.is64Bit()) {
51553 EltVT = MVT::f64;
51554 CastVT = VT.changeVectorElementType(EltVT);
51555 }
51556
51557 SDValue Load =
51558 DAG.getLoad(EltVT, DL, ML->getChain(), Addr,
51559 ML->getPointerInfo().getWithOffset(Offset),
51560 Alignment, ML->getMemOperand()->getFlags());
51561
51562 SDValue PassThru = DAG.getBitcast(CastVT, ML->getPassThru());
51563
51564 // Insert the loaded element into the appropriate place in the vector.
51565 SDValue Insert =
51566 DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, CastVT, PassThru, Load, VecIndex);
51567 Insert = DAG.getBitcast(VT, Insert);
51568 return DCI.CombineTo(ML, Insert, Load.getValue(1), true);
51569}
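A behavioural sketch of the reduction (illustrative template, not DAG code): with exactly one mask bit set at TrueIdx, the masked load result is the pass-through vector with that single element replaced by a scalar load at the corresponding offset:

  #include <cstddef>

  template <typename T, size_t N>
  void maskedLoadOneTrueElt(const T *Base, size_t TrueIdx,
                            const T (&PassThru)[N], T (&Result)[N]) {
    for (size_t I = 0; I != N; ++I)
      Result[I] = PassThru[I];       // start from the pass-through value
    Result[TrueIdx] = Base[TrueIdx]; // the single scalar load + insert
  }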
51570
51571static SDValue
51572combineMaskedLoadConstantMask(MaskedLoadSDNode *ML, SelectionDAG &DAG,
51573 TargetLowering::DAGCombinerInfo &DCI) {
51574 assert(ML->isUnindexed() && "Unexpected indexed masked load!");
51575 if (!ISD::isBuildVectorOfConstantSDNodes(ML->getMask().getNode()))
51576 return SDValue();
51577
51578 SDLoc DL(ML);
51579 EVT VT = ML->getValueType(0);
51580
51581 // If we are loading the first and last elements of a vector, it is safe and
51582 // always faster to load the whole vector. Replace the masked load with a
51583 // vector load and select.
51584 unsigned NumElts = VT.getVectorNumElements();
51585 BuildVectorSDNode *MaskBV = cast<BuildVectorSDNode>(ML->getMask());
51586 bool LoadFirstElt = !isNullConstant(MaskBV->getOperand(0));
51587 bool LoadLastElt = !isNullConstant(MaskBV->getOperand(NumElts - 1));
51588 if (LoadFirstElt && LoadLastElt) {
51589 SDValue VecLd = DAG.getLoad(VT, DL, ML->getChain(), ML->getBasePtr(),
51590 ML->getMemOperand());
51591 SDValue Blend = DAG.getSelect(DL, VT, ML->getMask(), VecLd,
51592 ML->getPassThru());
51593 return DCI.CombineTo(ML, Blend, VecLd.getValue(1), true);
51594 }
51595
51596 // Convert a masked load with a constant mask into a masked load and a select.
51597 // This allows the select operation to use a faster kind of select instruction
51598 // (for example, vblendvps -> vblendps).
51599
51600 // Don't try this if the pass-through operand is already undefined. That would
51601 // cause an infinite loop because that's what we're about to create.
51602 if (ML->getPassThru().isUndef())
51603 return SDValue();
51604
51605 if (ISD::isBuildVectorAllZeros(ML->getPassThru().getNode()))
51606 return SDValue();
51607
51608 // The new masked load has an undef pass-through operand. The select uses the
51609 // original pass-through operand.
51610 SDValue NewML = DAG.getMaskedLoad(
51611 VT, DL, ML->getChain(), ML->getBasePtr(), ML->getOffset(), ML->getMask(),
51612 DAG.getUNDEF(VT), ML->getMemoryVT(), ML->getMemOperand(),
51613 ML->getAddressingMode(), ML->getExtensionType());
51614 SDValue Blend = DAG.getSelect(DL, VT, ML->getMask(), NewML,
51615 ML->getPassThru());
51616
51617 return DCI.CombineTo(ML, Blend, NewML.getValue(1), true);
51618}
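A behavioural sketch of the first rewrite above (illustrative template): when the first and last mask bits are known set, the whole vector may be loaded and then blended with the pass-through, which is what lets the lowering use a cheaper blend:

  #include <cstddef>

  template <typename T, size_t N>
  void maskedLoadAsLoadAndSelect(const T *Base, const bool (&Mask)[N],
                                 const T (&PassThru)[N], T (&Result)[N]) {
    T Loaded[N];
    for (size_t I = 0; I != N; ++I)
      Loaded[I] = Base[I];                            // plain vector load
    for (size_t I = 0; I != N; ++I)
      Result[I] = Mask[I] ? Loaded[I] : PassThru[I];  // select/blend
  }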
51619
51620static SDValue combineMaskedLoad(SDNode *N, SelectionDAG &DAG,
51621 TargetLowering::DAGCombinerInfo &DCI,
51622 const X86Subtarget &Subtarget) {
51623 auto *Mld = cast<MaskedLoadSDNode>(N);
51624
51625 // TODO: Expanding load with constant mask may be optimized as well.
51626 if (Mld->isExpandingLoad())
51627 return SDValue();
51628
51629 if (Mld->getExtensionType() == ISD::NON_EXTLOAD) {
51630 if (SDValue ScalarLoad =
51631 reduceMaskedLoadToScalarLoad(Mld, DAG, DCI, Subtarget))
51632 return ScalarLoad;
51633
51634 // TODO: Do some AVX512 subsets benefit from this transform?
51635 if (!Subtarget.hasAVX512())
51636 if (SDValue Blend = combineMaskedLoadConstantMask(Mld, DAG, DCI))
51637 return Blend;
51638 }
51639
51640 // If the mask value has been legalized to a non-boolean vector, try to
51641 // simplify ops leading up to it. We only demand the MSB of each lane.
51642 SDValue Mask = Mld->getMask();
51643 if (Mask.getScalarValueSizeInBits() != 1) {
51644 EVT VT = Mld->getValueType(0);
51645 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
51646 APInt DemandedBits(APInt::getSignMask(VT.getScalarSizeInBits()));
51647 if (TLI.SimplifyDemandedBits(Mask, DemandedBits, DCI)) {
51648 if (N->getOpcode() != ISD::DELETED_NODE)
51649 DCI.AddToWorklist(N);
51650 return SDValue(N, 0);
51651 }
51652 if (SDValue NewMask =
51653 TLI.SimplifyMultipleUseDemandedBits(Mask, DemandedBits, DAG))
51654 return DAG.getMaskedLoad(
51655 VT, SDLoc(N), Mld->getChain(), Mld->getBasePtr(), Mld->getOffset(),
51656 NewMask, Mld->getPassThru(), Mld->getMemoryVT(), Mld->getMemOperand(),
51657 Mld->getAddressingMode(), Mld->getExtensionType());
51658 }
51659
51660 return SDValue();
51661}
51662
51663/// If exactly one element of the mask is set for a non-truncating masked store,
51664 /// it can be reduced to a vector extract and a scalar store.
51665/// Note: It is expected that the degenerate cases of an all-zeros or all-ones
51666/// mask have already been optimized in IR, so we don't bother with those here.
51667static SDValue reduceMaskedStoreToScalarStore(MaskedStoreSDNode *MS,
51668 SelectionDAG &DAG,
51669 const X86Subtarget &Subtarget) {
51670 // TODO: This is not x86-specific, so it could be lifted to DAGCombiner.
51671 // However, some target hooks may need to be added to know when the transform
51672 // is profitable. Endianness would also have to be considered.
51673
51674 SDValue Addr, VecIndex;
51675 Align Alignment;
51676 unsigned Offset;
51677 if (!getParamsForOneTrueMaskedElt(MS, DAG, Addr, VecIndex, Alignment, Offset))
51678 return SDValue();
51679
51680 // Extract the one scalar element that is actually being stored.
51681 SDLoc DL(MS);
51682 SDValue Value = MS->getValue();
51683 EVT VT = Value.getValueType();
51684 EVT EltVT = VT.getVectorElementType();
51685 if (EltVT == MVT::i64 && !Subtarget.is64Bit()) {
51686 EltVT = MVT::f64;
51687 EVT CastVT = VT.changeVectorElementType(EltVT);
51688 Value = DAG.getBitcast(CastVT, Value);
51689 }
51690 SDValue Extract =
51691 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, Value, VecIndex);
51692
51693 // Store that element at the appropriate offset from the base pointer.
51694 return DAG.getStore(MS->getChain(), DL, Extract, Addr,
51695 MS->getPointerInfo().getWithOffset(Offset),
51696 Alignment, MS->getMemOperand()->getFlags());
51697}
51698
51699static SDValue combineMaskedStore(SDNode *N, SelectionDAG &DAG,
51700 TargetLowering::DAGCombinerInfo &DCI,
51701 const X86Subtarget &Subtarget) {
51702 MaskedStoreSDNode *Mst = cast<MaskedStoreSDNode>(N);
51703 if (Mst->isCompressingStore())
51704 return SDValue();
51705
51706 EVT VT = Mst->getValue().getValueType();
51707 SDLoc dl(Mst);
51708 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
51709
51710 if (Mst->isTruncatingStore())
51711 return SDValue();
51712
51713 if (SDValue ScalarStore = reduceMaskedStoreToScalarStore(Mst, DAG, Subtarget))
51714 return ScalarStore;
51715
51716 // If the mask value has been legalized to a non-boolean vector, try to
51717 // simplify ops leading up to it. We only demand the MSB of each lane.
51718 SDValue Mask = Mst->getMask();
51719 if (Mask.getScalarValueSizeInBits() != 1) {
51720 APInt DemandedBits(APInt::getSignMask(VT.getScalarSizeInBits()));
51721 if (TLI.SimplifyDemandedBits(Mask, DemandedBits, DCI)) {
51722 if (N->getOpcode() != ISD::DELETED_NODE)
51723 DCI.AddToWorklist(N);
51724 return SDValue(N, 0);
51725 }
51726 if (SDValue NewMask =
51727 TLI.SimplifyMultipleUseDemandedBits(Mask, DemandedBits, DAG))
51728 return DAG.getMaskedStore(Mst->getChain(), SDLoc(N), Mst->getValue(),
51729 Mst->getBasePtr(), Mst->getOffset(), NewMask,
51730 Mst->getMemoryVT(), Mst->getMemOperand(),
51731 Mst->getAddressingMode());
51732 }
51733
51734 SDValue Value = Mst->getValue();
51735 if (Value.getOpcode() == ISD::TRUNCATE && Value.getNode()->hasOneUse() &&
51736 TLI.isTruncStoreLegal(Value.getOperand(0).getValueType(),
51737 Mst->getMemoryVT())) {
51738 return DAG.getMaskedStore(Mst->getChain(), SDLoc(N), Value.getOperand(0),
51739 Mst->getBasePtr(), Mst->getOffset(), Mask,
51740 Mst->getMemoryVT(), Mst->getMemOperand(),
51741 Mst->getAddressingMode(), true);
51742 }
51743
51744 return SDValue();
51745}
51746
51747static SDValue combineStore(SDNode *N, SelectionDAG &DAG,
51748 TargetLowering::DAGCombinerInfo &DCI,
51749 const X86Subtarget &Subtarget) {
51750 StoreSDNode *St = cast<StoreSDNode>(N);
51751 EVT StVT = St->getMemoryVT();
51752 SDLoc dl(St);
51753 SDValue StoredVal = St->getValue();
51754 EVT VT = StoredVal.getValueType();
51755 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
51756
51757 // Convert a store of vXi1 into a store of iX and a bitcast.
51758 if (!Subtarget.hasAVX512() && VT == StVT && VT.isVector() &&
51759 VT.getVectorElementType() == MVT::i1) {
51760
51761 EVT NewVT = EVT::getIntegerVT(*DAG.getContext(), VT.getVectorNumElements());
51762 StoredVal = DAG.getBitcast(NewVT, StoredVal);
51763
51764 return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(),
51765 St->getPointerInfo(), St->getOriginalAlign(),
51766 St->getMemOperand()->getFlags());
51767 }
51768
51769 // If this is a store of a scalar_to_vector to v1i1, just use a scalar store.
51770 // This will avoid a copy to k-register.
51771 if (VT == MVT::v1i1 && VT == StVT && Subtarget.hasAVX512() &&
51772 StoredVal.getOpcode() == ISD::SCALAR_TO_VECTOR &&
51773 StoredVal.getOperand(0).getValueType() == MVT::i8) {
51774 SDValue Val = StoredVal.getOperand(0);
51775 // We must store zeros to the unused bits.
51776 Val = DAG.getZeroExtendInReg(Val, dl, MVT::i1);
51777 return DAG.getStore(St->getChain(), dl, Val,
51778 St->getBasePtr(), St->getPointerInfo(),
51779 St->getOriginalAlign(),
51780 St->getMemOperand()->getFlags());
51781 }
51782
51783 // Widen v2i1/v4i1 stores to v8i1.
51784 if ((VT == MVT::v1i1 || VT == MVT::v2i1 || VT == MVT::v4i1) && VT == StVT &&
51785 Subtarget.hasAVX512()) {
51786 unsigned NumConcats = 8 / VT.getVectorNumElements();
51787 // We must store zeros to the unused bits.
51788 SmallVector<SDValue, 4> Ops(NumConcats, DAG.getConstant(0, dl, VT));
51789 Ops[0] = StoredVal;
51790 StoredVal = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i1, Ops);
51791 return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(),
51792 St->getPointerInfo(), St->getOriginalAlign(),
51793 St->getMemOperand()->getFlags());
51794 }
51795
51796 // Turn vXi1 stores of constants into a scalar store.
51797 if ((VT == MVT::v8i1 || VT == MVT::v16i1 || VT == MVT::v32i1 ||
51798 VT == MVT::v64i1) && VT == StVT && TLI.isTypeLegal(VT) &&
51799 ISD::isBuildVectorOfConstantSDNodes(StoredVal.getNode())) {
51800 // If it's a v64i1 store without 64-bit support, we need two stores.
51801 if (!DCI.isBeforeLegalize() && VT == MVT::v64i1 && !Subtarget.is64Bit()) {
51802 SDValue Lo = DAG.getBuildVector(MVT::v32i1, dl,
51803 StoredVal->ops().slice(0, 32));
51804 Lo = combinevXi1ConstantToInteger(Lo, DAG);
51805 SDValue Hi = DAG.getBuildVector(MVT::v32i1, dl,
51806 StoredVal->ops().slice(32, 32));
51807 Hi = combinevXi1ConstantToInteger(Hi, DAG);
51808
51809 SDValue Ptr0 = St->getBasePtr();
51810 SDValue Ptr1 = DAG.getMemBasePlusOffset(Ptr0, TypeSize::Fixed(4), dl);
51811
51812 SDValue Ch0 =
51813 DAG.getStore(St->getChain(), dl, Lo, Ptr0, St->getPointerInfo(),
51814 St->getOriginalAlign(),
51815 St->getMemOperand()->getFlags());
51816 SDValue Ch1 =
51817 DAG.getStore(St->getChain(), dl, Hi, Ptr1,
51818 St->getPointerInfo().getWithOffset(4),
51819 St->getOriginalAlign(),
51820 St->getMemOperand()->getFlags());
51821 return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Ch0, Ch1);
51822 }
51823
51824 StoredVal = combinevXi1ConstantToInteger(StoredVal, DAG);
51825 return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(),
51826 St->getPointerInfo(), St->getOriginalAlign(),
51827 St->getMemOperand()->getFlags());
51828 }
51829
51830 // If we are saving a 32-byte vector and 32-byte stores are slow, such as on
51831 // Sandy Bridge, perform two 16-byte stores.
51832 unsigned Fast;
51833 if (VT.is256BitVector() && StVT == VT &&
51834 TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), VT,
51835 *St->getMemOperand(), &Fast) &&
51836 !Fast) {
51837 unsigned NumElems = VT.getVectorNumElements();
51838 if (NumElems < 2)
51839 return SDValue();
51840
51841 return splitVectorStore(St, DAG);
51842 }
51843
51844 // Split under-aligned vector non-temporal stores.
51845 if (St->isNonTemporal() && StVT == VT &&
51846 St->getAlign().value() < VT.getStoreSize()) {
51847 // ZMM/YMM nt-stores - either it can be stored as a series of shorter
51848 // vectors or the legalizer can scalarize it to use MOVNTI.
51849 if (VT.is256BitVector() || VT.is512BitVector()) {
51850 unsigned NumElems = VT.getVectorNumElements();
51851 if (NumElems < 2)
51852 return SDValue();
51853 return splitVectorStore(St, DAG);
51854 }
51855
51856 // XMM nt-stores - scalarize this to f64 nt-stores on SSE4A, else i32/i64
51857 // to use MOVNTI.
51858 if (VT.is128BitVector() && Subtarget.hasSSE2()) {
51859 MVT NTVT = Subtarget.hasSSE4A()
51860 ? MVT::v2f64
51861 : (TLI.isTypeLegal(MVT::i64) ? MVT::v2i64 : MVT::v4i32);
51862 return scalarizeVectorStore(St, NTVT, DAG);
51863 }
51864 }
51865
51866 // Try to optimize v16i16->v16i8 truncating stores when BWI is not
51867 // supported but AVX512F is, by extending to v16i32 and truncating.
51868 if (!St->isTruncatingStore() && VT == MVT::v16i8 && !Subtarget.hasBWI() &&
51869 St->getValue().getOpcode() == ISD::TRUNCATE &&
51870 St->getValue().getOperand(0).getValueType() == MVT::v16i16 &&
51871 TLI.isTruncStoreLegal(MVT::v16i32, MVT::v16i8) &&
51872 St->getValue().hasOneUse() && !DCI.isBeforeLegalizeOps()) {
51873 SDValue Ext = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::v16i32,
51874 St->getValue().getOperand(0));
51875 return DAG.getTruncStore(St->getChain(), dl, Ext, St->getBasePtr(),
51876 MVT::v16i8, St->getMemOperand());
51877 }
51878
51879 // Try to fold a VTRUNCUS or VTRUNCS into a truncating store.
51880 if (!St->isTruncatingStore() &&
51881 (StoredVal.getOpcode() == X86ISD::VTRUNCUS ||
51882 StoredVal.getOpcode() == X86ISD::VTRUNCS) &&
51883 StoredVal.hasOneUse() &&
51884 TLI.isTruncStoreLegal(StoredVal.getOperand(0).getValueType(), VT)) {
51885 bool IsSigned = StoredVal.getOpcode() == X86ISD::VTRUNCS;
51886 return EmitTruncSStore(IsSigned, St->getChain(),
51887 dl, StoredVal.getOperand(0), St->getBasePtr(),
51888 VT, St->getMemOperand(), DAG);
51889 }
51890
51891 // Try to fold an extract_element(VTRUNC) pattern into a truncating store.
51892 if (!St->isTruncatingStore()) {
51893 auto IsExtractedElement = [](SDValue V) {
51894 if (V.getOpcode() == ISD::TRUNCATE && V.hasOneUse())
51895 V = V.getOperand(0);
51896 unsigned Opc = V.getOpcode();
51897 if ((Opc == ISD::EXTRACT_VECTOR_ELT || Opc == X86ISD::PEXTRW) &&
51898 isNullConstant(V.getOperand(1)) && V.hasOneUse() &&
51899 V.getOperand(0).hasOneUse())
51900 return V.getOperand(0);
51901 return SDValue();
51902 };
51903 if (SDValue Extract = IsExtractedElement(StoredVal)) {
51904 SDValue Trunc = peekThroughOneUseBitcasts(Extract);
51905 if (Trunc.getOpcode() == X86ISD::VTRUNC) {
51906 SDValue Src = Trunc.getOperand(0);
51907 MVT DstVT = Trunc.getSimpleValueType();
51908 MVT SrcVT = Src.getSimpleValueType();
51909 unsigned NumSrcElts = SrcVT.getVectorNumElements();
51910 unsigned NumTruncBits = DstVT.getScalarSizeInBits() * NumSrcElts;
51911 MVT TruncVT = MVT::getVectorVT(DstVT.getScalarType(), NumSrcElts);
51912 if (NumTruncBits == VT.getSizeInBits() &&
51913 TLI.isTruncStoreLegal(SrcVT, TruncVT)) {
51914 return DAG.getTruncStore(St->getChain(), dl, Src, St->getBasePtr(),
51915 TruncVT, St->getMemOperand());
51916 }
51917 }
51918 }
51919 }
51920
51921 // Optimize trunc store (of multiple scalars) to shuffle and store.
51922 // First, pack all of the elements in one place. Next, store to memory
51923 // in fewer chunks.
51924 if (St->isTruncatingStore() && VT.isVector()) {
51925 // Check if we can detect an AVG pattern from the truncation. If yes,
51926 // replace the trunc store by a normal store with the result of the AVG
51927 // (ISD::AVGCEILU) node.
51928 if (DCI.isBeforeLegalize() || TLI.isTypeLegal(St->getMemoryVT()))
51929 if (SDValue Avg = detectAVGPattern(St->getValue(), St->getMemoryVT(), DAG,
51930 Subtarget, dl))
51931 return DAG.getStore(St->getChain(), dl, Avg, St->getBasePtr(),
51932 St->getPointerInfo(), St->getOriginalAlign(),
51933 St->getMemOperand()->getFlags());
51934
51935 if (TLI.isTruncStoreLegal(VT, StVT)) {
51936 if (SDValue Val = detectSSatPattern(St->getValue(), St->getMemoryVT()))
51937 return EmitTruncSStore(true /* Signed saturation */, St->getChain(),
51938 dl, Val, St->getBasePtr(),
51939 St->getMemoryVT(), St->getMemOperand(), DAG);
51940 if (SDValue Val = detectUSatPattern(St->getValue(), St->getMemoryVT(),
51941 DAG, dl))
51942 return EmitTruncSStore(false /* Unsigned saturation */, St->getChain(),
51943 dl, Val, St->getBasePtr(),
51944 St->getMemoryVT(), St->getMemOperand(), DAG);
51945 }
51946
51947 return SDValue();
51948 }
51949
51950 // Cast ptr32 and ptr64 pointers to the default address space before a store.
51951 unsigned AddrSpace = St->getAddressSpace();
51952 if (AddrSpace == X86AS::PTR64 || AddrSpace == X86AS::PTR32_SPTR ||
51953 AddrSpace == X86AS::PTR32_UPTR) {
51954 MVT PtrVT = TLI.getPointerTy(DAG.getDataLayout());
51955 if (PtrVT != St->getBasePtr().getSimpleValueType()) {
51956 SDValue Cast =
51957 DAG.getAddrSpaceCast(dl, PtrVT, St->getBasePtr(), AddrSpace, 0);
51958 return DAG.getStore(St->getChain(), dl, StoredVal, Cast,
51959 St->getPointerInfo(), St->getOriginalAlign(),
51960 St->getMemOperand()->getFlags(), St->getAAInfo());
51961 }
51962 }
51963
51964 // Turn load->store of MMX types into GPR load/stores. This avoids clobbering
51965 // the FP state in cases where an emms may be missing.
51966 // A preferable solution to the general problem is to figure out the right
51967 // places to insert EMMS. This qualifies as a quick hack.
51968
51969 // Similarly, turn load->store of i64 into double load/stores in 32-bit mode.
51970 if (VT.getSizeInBits() != 64)
51971 return SDValue();
51972
51973 const Function &F = DAG.getMachineFunction().getFunction();
51974 bool NoImplicitFloatOps = F.hasFnAttribute(Attribute::NoImplicitFloat);
51975 bool F64IsLegal =
51976 !Subtarget.useSoftFloat() && !NoImplicitFloatOps && Subtarget.hasSSE2();
51977 if ((VT == MVT::i64 && F64IsLegal && !Subtarget.is64Bit()) &&
51978 isa<LoadSDNode>(St->getValue()) &&
51979 cast<LoadSDNode>(St->getValue())->isSimple() &&
51980 St->getChain().hasOneUse() && St->isSimple()) {
51981 LoadSDNode *Ld = cast<LoadSDNode>(St->getValue().getNode());
51982
51983 if (!ISD::isNormalLoad(Ld))
51984 return SDValue();
51985
51986 // Avoid the transformation if there are multiple uses of the loaded value.
51987 if (!Ld->hasNUsesOfValue(1, 0))
51988 return SDValue();
51989
51990 SDLoc LdDL(Ld);
51991 SDLoc StDL(N);
51992 // Lower to a single movq load/store pair.
51993 SDValue NewLd = DAG.getLoad(MVT::f64, LdDL, Ld->getChain(),
51994 Ld->getBasePtr(), Ld->getMemOperand());
51995
51996 // Make sure new load is placed in same chain order.
51997 DAG.makeEquivalentMemoryOrdering(Ld, NewLd);
51998 return DAG.getStore(St->getChain(), StDL, NewLd, St->getBasePtr(),
51999 St->getMemOperand());
52000 }
52001
52002 // This is similar to the above case, but here we handle a scalar 64-bit
52003 // integer store that is extracted from a vector on a 32-bit target.
52004 // If we have SSE2, then we can treat it like a floating-point double
52005 // to get past legalization. The execution dependencies fixup pass will
52006 // choose the optimal machine instruction for the store if this really is
52007 // an integer or v2f32 rather than an f64.
52008 if (VT == MVT::i64 && F64IsLegal && !Subtarget.is64Bit() &&
52009 St->getOperand(1).getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
52010 SDValue OldExtract = St->getOperand(1);
52011 SDValue ExtOp0 = OldExtract.getOperand(0);
52012 unsigned VecSize = ExtOp0.getValueSizeInBits();
52013 EVT VecVT = EVT::getVectorVT(*DAG.getContext(), MVT::f64, VecSize / 64);
52014 SDValue BitCast = DAG.getBitcast(VecVT, ExtOp0);
52015 SDValue NewExtract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
52016 BitCast, OldExtract.getOperand(1));
52017 return DAG.getStore(St->getChain(), dl, NewExtract, St->getBasePtr(),
52018 St->getPointerInfo(), St->getOriginalAlign(),
52019 St->getMemOperand()->getFlags());
52020 }
52021
52022 return SDValue();
52023}
52024
52025static SDValue combineVEXTRACT_STORE(SDNode *N, SelectionDAG &DAG,
52026 TargetLowering::DAGCombinerInfo &DCI,
52027 const X86Subtarget &Subtarget) {
52028 auto *St = cast<MemIntrinsicSDNode>(N);
52029
52030 SDValue StoredVal = N->getOperand(1);
52031 MVT VT = StoredVal.getSimpleValueType();
52032 EVT MemVT = St->getMemoryVT();
52033
52034 // Figure out which elements we demand.
52035 unsigned StElts = MemVT.getSizeInBits() / VT.getScalarSizeInBits();
52036 APInt DemandedElts = APInt::getLowBitsSet(VT.getVectorNumElements(), StElts);
52037
52038 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
52039 if (TLI.SimplifyDemandedVectorElts(StoredVal, DemandedElts, DCI)) {
52040 if (N->getOpcode() != ISD::DELETED_NODE)
52041 DCI.AddToWorklist(N);
52042 return SDValue(N, 0);
52043 }
52044
52045 return SDValue();
52046}
52047
52048/// Return 'true' if this vector operation is "horizontal"
52049/// and return the operands for the horizontal operation in LHS and RHS. A
52050/// horizontal operation performs the binary operation on successive elements
52051/// of its first operand, then on successive elements of its second operand,
52052/// returning the resulting values in a vector. For example, if
52053/// A = < float a0, float a1, float a2, float a3 >
52054/// and
52055/// B = < float b0, float b1, float b2, float b3 >
52056/// then the result of doing a horizontal operation on A and B is
52057/// A horizontal-op B = < a0 op a1, a2 op a3, b0 op b1, b2 op b3 >.
52058/// In short, LHS and RHS are inspected to see if LHS op RHS is of the form
52059/// A horizontal-op B, for some already available A and B, and if so then LHS is
52060/// set to A, RHS to B, and the routine returns 'true'.
52061static bool isHorizontalBinOp(unsigned HOpcode, SDValue &LHS, SDValue &RHS,
52062 SelectionDAG &DAG, const X86Subtarget &Subtarget,
52063 bool IsCommutative,
52064 SmallVectorImpl<int> &PostShuffleMask) {
52065 // If either operand is undef, bail out. The binop should be simplified.
52066 if (LHS.isUndef() || RHS.isUndef())
52067 return false;
52068
52069 // Look for the following pattern:
52070 // A = < float a0, float a1, float a2, float a3 >
52071 // B = < float b0, float b1, float b2, float b3 >
52072 // and
52073 // LHS = VECTOR_SHUFFLE A, B, <0, 2, 4, 6>
52074 // RHS = VECTOR_SHUFFLE A, B, <1, 3, 5, 7>
52075 // then LHS op RHS = < a0 op a1, a2 op a3, b0 op b1, b2 op b3 >
52076 // which is A horizontal-op B.
52077
52078 MVT VT = LHS.getSimpleValueType();
52079 assert((VT.is128BitVector() || VT.is256BitVector()) &&
52080 "Unsupported vector type for horizontal add/sub");
52081 unsigned NumElts = VT.getVectorNumElements();
52082
52083 auto GetShuffle = [&](SDValue Op, SDValue &N0, SDValue &N1,
52084 SmallVectorImpl<int> &ShuffleMask) {
52085 bool UseSubVector = false;
52086 if (Op.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
52087 Op.getOperand(0).getValueType().is256BitVector() &&
52088 llvm::isNullConstant(Op.getOperand(1))) {
52089 Op = Op.getOperand(0);
52090 UseSubVector = true;
52091 }
52092 SmallVector<SDValue, 2> SrcOps;
52093 SmallVector<int, 16> SrcMask, ScaledMask;
52094 SDValue BC = peekThroughBitcasts(Op);
52095 if (getTargetShuffleInputs(BC, SrcOps, SrcMask, DAG) &&
52096 !isAnyZero(SrcMask) && all_of(SrcOps, [BC](SDValue Op) {
52097 return Op.getValueSizeInBits() == BC.getValueSizeInBits();
52098 })) {
52099 resolveTargetShuffleInputsAndMask(SrcOps, SrcMask);
52100 if (!UseSubVector && SrcOps.size() <= 2 &&
52101 scaleShuffleElements(SrcMask, NumElts, ScaledMask)) {
52102 N0 = !SrcOps.empty() ? SrcOps[0] : SDValue();
52103 N1 = SrcOps.size() > 1 ? SrcOps[1] : SDValue();
52104 ShuffleMask.assign(ScaledMask.begin(), ScaledMask.end());
52105 }
52106 if (UseSubVector && SrcOps.size() == 1 &&
52107 scaleShuffleElements(SrcMask, 2 * NumElts, ScaledMask)) {
52108 std::tie(N0, N1) = DAG.SplitVector(SrcOps[0], SDLoc(Op));
52109 ArrayRef<int> Mask = ArrayRef<int>(ScaledMask).slice(0, NumElts);
52110 ShuffleMask.assign(Mask.begin(), Mask.end());
52111 }
52112 }
52113 };
52114
52115 // View LHS in the form
52116 // LHS = VECTOR_SHUFFLE A, B, LMask
52117 // If LHS is not a shuffle, then pretend it is the identity shuffle:
52118 // LHS = VECTOR_SHUFFLE LHS, undef, <0, 1, ..., N-1>
52119 // NOTE: A default initialized SDValue represents an UNDEF of type VT.
52120 SDValue A, B;
52121 SmallVector<int, 16> LMask;
52122 GetShuffle(LHS, A, B, LMask);
52123
52124 // Likewise, view RHS in the form
52125 // RHS = VECTOR_SHUFFLE C, D, RMask
52126 SDValue C, D;
52127 SmallVector<int, 16> RMask;
52128 GetShuffle(RHS, C, D, RMask);
52129
52130 // At least one of the operands should be a vector shuffle.
52131 unsigned NumShuffles = (LMask.empty() ? 0 : 1) + (RMask.empty() ? 0 : 1);
52132 if (NumShuffles == 0)
52133 return false;
52134
52135 if (LMask.empty()) {
52136 A = LHS;
52137 for (unsigned i = 0; i != NumElts; ++i)
52138 LMask.push_back(i);
52139 }
52140
52141 if (RMask.empty()) {
52142 C = RHS;
52143 for (unsigned i = 0; i != NumElts; ++i)
52144 RMask.push_back(i);
52145 }
52146
52147 // If we have a unary mask, ensure the other op is set to null.
52148 if (isUndefOrInRange(LMask, 0, NumElts))
52149 B = SDValue();
52150 else if (isUndefOrInRange(LMask, NumElts, NumElts * 2))
52151 A = SDValue();
52152
52153 if (isUndefOrInRange(RMask, 0, NumElts))
52154 D = SDValue();
52155 else if (isUndefOrInRange(RMask, NumElts, NumElts * 2))
52156 C = SDValue();
52157
52158 // If A and B occur in reverse order in RHS, then canonicalize by commuting
52159 // RHS operands and shuffle mask.
52160 if (A != C) {
52161 std::swap(C, D);
52162 ShuffleVectorSDNode::commuteMask(RMask);
52163 }
52164 // Check that the shuffles are both shuffling the same vectors.
52165 if (!(A == C && B == D))
52166 return false;
52167
52168 PostShuffleMask.clear();
52169 PostShuffleMask.append(NumElts, SM_SentinelUndef);
52170
52171 // LHS and RHS are now:
52172 // LHS = shuffle A, B, LMask
52173 // RHS = shuffle A, B, RMask
52174 // Check that the masks correspond to performing a horizontal operation.
52175 // AVX defines horizontal add/sub to operate independently on 128-bit lanes,
52176 // so we just repeat the inner loop if this is a 256-bit op.
52177 unsigned Num128BitChunks = VT.getSizeInBits() / 128;
52178 unsigned NumEltsPer128BitChunk = NumElts / Num128BitChunks;
52179 unsigned NumEltsPer64BitChunk = NumEltsPer128BitChunk / 2;
52180 assert((NumEltsPer128BitChunk % 2 == 0) &&
52181 "Vector type should have an even number of elements in each lane");
52182 for (unsigned j = 0; j != NumElts; j += NumEltsPer128BitChunk) {
52183 for (unsigned i = 0; i != NumEltsPer128BitChunk; ++i) {
52184 // Ignore undefined components.
52185 int LIdx = LMask[i + j], RIdx = RMask[i + j];
52186 if (LIdx < 0 || RIdx < 0 ||
52187 (!A.getNode() && (LIdx < (int)NumElts || RIdx < (int)NumElts)) ||
52188 (!B.getNode() && (LIdx >= (int)NumElts || RIdx >= (int)NumElts)))
52189 continue;
52190
52191 // Check that successive odd/even elements are being operated on. If not,
52192 // this is not a horizontal operation.
52193 if (!((RIdx & 1) == 1 && (LIdx + 1) == RIdx) &&
52194 !((LIdx & 1) == 1 && (RIdx + 1) == LIdx && IsCommutative))
52195 return false;
52196
52197 // Compute the post-shuffle mask index based on where the element
52198 // is stored in the HOP result, and where it needs to be moved to.
52199 int Base = LIdx & ~1u;
52200 int Index = ((Base % NumEltsPer128BitChunk) / 2) +
52201 ((Base % NumElts) & ~(NumEltsPer128BitChunk - 1));
52202
52203 // The low half of the 128-bit result must choose from A.
52204 // The high half of the 128-bit result must choose from B,
52205 // unless B is undef. In that case, we are always choosing from A.
52206 if ((B && Base >= (int)NumElts) || (!B && i >= NumEltsPer64BitChunk))
52207 Index += NumEltsPer64BitChunk;
52208 PostShuffleMask[i + j] = Index;
52209 }
52210 }
52211
52212 SDValue NewLHS = A.getNode() ? A : B; // If A is 'UNDEF', use B for it.
52213 SDValue NewRHS = B.getNode() ? B : A; // If B is 'UNDEF', use A for it.
52214
52215 bool IsIdentityPostShuffle =
52216 isSequentialOrUndefInRange(PostShuffleMask, 0, NumElts, 0);
52217 if (IsIdentityPostShuffle)
52218 PostShuffleMask.clear();
52219
52220 // Avoid 128-bit multi lane shuffles if pre-AVX2 and FP (integer will split).
52221 if (!IsIdentityPostShuffle && !Subtarget.hasAVX2() && VT.isFloatingPoint() &&
52222 isMultiLaneShuffleMask(128, VT.getScalarSizeInBits(), PostShuffleMask))
52223 return false;
52224
52225 // If the source nodes are already used in HorizOps then always accept this.
52226 // Shuffle folding should merge these back together.
52227 bool FoundHorizLHS = llvm::any_of(NewLHS->uses(), [&](SDNode *User) {
52228 return User->getOpcode() == HOpcode && User->getValueType(0) == VT;
52229 });
52230 bool FoundHorizRHS = llvm::any_of(NewRHS->uses(), [&](SDNode *User) {
52231 return User->getOpcode() == HOpcode && User->getValueType(0) == VT;
52232 });
52233 bool ForceHorizOp = FoundHorizLHS && FoundHorizRHS;
52234
52235 // Assume a SingleSource HOP if we only shuffle one input and don't need to
52236 // shuffle the result.
52237 if (!ForceHorizOp &&
52238 !shouldUseHorizontalOp(NewLHS == NewRHS &&
52239 (NumShuffles < 2 || !IsIdentityPostShuffle),
52240 DAG, Subtarget))
52241 return false;
52242
52243 LHS = DAG.getBitcast(VT, NewLHS);
52244 RHS = DAG.getBitcast(VT, NewRHS);
52245 return true;
52246}
52247
52248// Try to synthesize horizontal (f)hadd/hsub from (f)adds/subs of shuffles.
52249static SDValue combineToHorizontalAddSub(SDNode *N, SelectionDAG &DAG,
52250 const X86Subtarget &Subtarget) {
52251 EVT VT = N->getValueType(0);
52252 unsigned Opcode = N->getOpcode();
52253 bool IsAdd = (Opcode == ISD::FADD) || (Opcode == ISD::ADD);
52254 SmallVector<int, 8> PostShuffleMask;
52255
52256 switch (Opcode) {
52257 case ISD::FADD:
52258 case ISD::FSUB:
52259 if ((Subtarget.hasSSE3() && (VT == MVT::v4f32 || VT == MVT::v2f64)) ||
52260 (Subtarget.hasAVX() && (VT == MVT::v8f32 || VT == MVT::v4f64))) {
52261 SDValue LHS = N->getOperand(0);
52262 SDValue RHS = N->getOperand(1);
52263 auto HorizOpcode = IsAdd ? X86ISD::FHADD : X86ISD::FHSUB;
52264 if (isHorizontalBinOp(HorizOpcode, LHS, RHS, DAG, Subtarget, IsAdd,
52265 PostShuffleMask)) {
52266 SDValue HorizBinOp = DAG.getNode(HorizOpcode, SDLoc(N), VT, LHS, RHS);
52267 if (!PostShuffleMask.empty())
52268 HorizBinOp = DAG.getVectorShuffle(VT, SDLoc(HorizBinOp), HorizBinOp,
52269 DAG.getUNDEF(VT), PostShuffleMask);
52270 return HorizBinOp;
52271 }
52272 }
52273 break;
52274 case ISD::ADD:
52275 case ISD::SUB:
52276 if (Subtarget.hasSSSE3() && (VT == MVT::v8i16 || VT == MVT::v4i32 ||
52277 VT == MVT::v16i16 || VT == MVT::v8i32)) {
52278 SDValue LHS = N->getOperand(0);
52279 SDValue RHS = N->getOperand(1);
52280 auto HorizOpcode = IsAdd ? X86ISD::HADD : X86ISD::HSUB;
52281 if (isHorizontalBinOp(HorizOpcode, LHS, RHS, DAG, Subtarget, IsAdd,
52282 PostShuffleMask)) {
52283 auto HOpBuilder = [HorizOpcode](SelectionDAG &DAG, const SDLoc &DL,
52284 ArrayRef<SDValue> Ops) {
52285 return DAG.getNode(HorizOpcode, DL, Ops[0].getValueType(), Ops);
52286 };
52287 SDValue HorizBinOp = SplitOpsAndApply(DAG, Subtarget, SDLoc(N), VT,
52288 {LHS, RHS}, HOpBuilder);
52289 if (!PostShuffleMask.empty())
52290 HorizBinOp = DAG.getVectorShuffle(VT, SDLoc(HorizBinOp), HorizBinOp,
52291 DAG.getUNDEF(VT), PostShuffleMask);
52292 return HorizBinOp;
52293 }
52294 }
52295 break;
52296 }
52297
52298 return SDValue();
52299}
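// --- Illustrative standalone sketch (plain C++17, not part of this file) ---
// A scalar model of the 128-bit horizontal add the combine above tries to
// form: each output element is the sum of an adjacent pair, low half taken
// from LHS and high half from RHS. Names and values are illustrative only.
#include <array>
#include <cassert>

static std::array<float, 4> haddps_model(const std::array<float, 4> &L,
                                         const std::array<float, 4> &R) {
  // Matches SSE3 HADDPS on a single 128-bit lane.
  return {L[0] + L[1], L[2] + L[3], R[0] + R[1], R[2] + R[3]};
}

static void haddps_model_example() {
  std::array<float, 4> A{1, 2, 3, 4}, B{10, 20, 30, 40};
  std::array<float, 4> H = haddps_model(A, B);
  assert(H[0] == 3 && H[1] == 7 && H[2] == 30 && H[3] == 70);
}
// ---------------------------------------------------------------------------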
52300
52301// Try to combine the following nodes
52302// t29: i64 = X86ISD::Wrapper TargetConstantPool:i64
52303// <i32 -2147483648[float -0.000000e+00]> 0
52304// t27: v16i32[v16f32],ch = X86ISD::VBROADCAST_LOAD
52305// <(load 4 from constant-pool)> t0, t29
52306// [t30: v16i32 = bitcast t27]
52307// t6: v16i32 = xor t7, t27[t30]
52308// t11: v16f32 = bitcast t6
52309// t21: v16f32 = X86ISD::VFMULC[X86ISD::VCFMULC] t11, t8
52310// into X86ISD::VFCMULC[X86ISD::VFMULC] if possible:
52311// t22: v16f32 = bitcast t7
52312// t23: v16f32 = X86ISD::VFCMULC[X86ISD::VFMULC] t8, t22
52313// t24: v32f16 = bitcast t23
52314static SDValue combineFMulcFCMulc(SDNode *N, SelectionDAG &DAG,
52315 const X86Subtarget &Subtarget) {
52316 EVT VT = N->getValueType(0);
52317 SDValue LHS = N->getOperand(0);
52318 SDValue RHS = N->getOperand(1);
52319 int CombineOpcode =
52320 N->getOpcode() == X86ISD::VFCMULC ? X86ISD::VFMULC : X86ISD::VFCMULC;
52321 auto isConjugationConstant = [](const Constant *c) {
52322 if (const auto *CI = dyn_cast<ConstantInt>(c)) {
52323 APInt ConjugationInt32 = APInt(32, 0x80000000, true);
52324 APInt ConjugationInt64 = APInt(64, 0x8000000080000000ULL, true);
52325 switch (CI->getBitWidth()) {
52326 case 16:
52327 return false;
52328 case 32:
52329 return CI->getValue() == ConjugationInt32;
52330 case 64:
52331 return CI->getValue() == ConjugationInt64;
52332 default:
52333      llvm_unreachable("Unexpected bit width");
52334 }
52335 }
52336 if (const auto *CF = dyn_cast<ConstantFP>(c))
52337 return CF->isNegativeZeroValue();
52338 return false;
52339 };
52340 auto combineConjugation = [&](SDValue &r) {
52341 if (LHS->getOpcode() == ISD::BITCAST && RHS.hasOneUse()) {
52342 SDValue XOR = LHS.getOperand(0);
52343 if (XOR->getOpcode() == ISD::XOR && XOR.hasOneUse()) {
52344 SDValue XORRHS = XOR.getOperand(1);
52345 if (XORRHS.getOpcode() == ISD::BITCAST && XORRHS.hasOneUse())
52346 XORRHS = XORRHS.getOperand(0);
52347 if (XORRHS.getOpcode() == X86ISD::VBROADCAST_LOAD &&
52348 XORRHS.getOperand(1).getNumOperands()) {
52349 ConstantPoolSDNode *CP =
52350 dyn_cast<ConstantPoolSDNode>(XORRHS.getOperand(1).getOperand(0));
52351 if (CP && isConjugationConstant(CP->getConstVal())) {
52352 SelectionDAG::FlagInserter FlagsInserter(DAG, N);
52353 SDValue I2F = DAG.getBitcast(VT, LHS.getOperand(0).getOperand(0));
52354 SDValue FCMulC = DAG.getNode(CombineOpcode, SDLoc(N), VT, RHS, I2F);
52355 r = DAG.getBitcast(VT, FCMulC);
52356 return true;
52357 }
52358 }
52359 }
52360 }
52361 return false;
52362 };
52363 SDValue Res;
52364 if (combineConjugation(Res))
52365 return Res;
52366 std::swap(LHS, RHS);
52367 if (combineConjugation(Res))
52368 return Res;
52369 return Res;
52370}
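// --- Illustrative standalone sketch (plain C++17, not part of this file) ---
// isConjugationConstant() above accepts 0x80000000 per 32-bit lane (or the
// FP constant -0.0). The sketch below shows why: 0x80000000 is exactly the
// IEEE-754 single-precision sign bit, so XOR-ing a lane with it flips only
// the sign. Under the complex-FP16 layout assumed here for illustration
// (imaginary half in the upper 16 bits of each 32-bit lane), flipping that
// bit negates the imaginary part, i.e. conjugates the value.
#include <cassert>
#include <cstdint>
#include <cstring>

static void conjugation_constant_example() {
  float NegZero = -0.0f;
  std::uint32_t SignMask;
  std::memcpy(&SignMask, &NegZero, sizeof(SignMask));
  assert(SignMask == 0x80000000u); // the 32-bit conjugation constant

  float X = 1.5f;
  std::uint32_t XBits;
  std::memcpy(&XBits, &X, sizeof(XBits));
  XBits ^= SignMask; // XOR with the sign mask negates the value bit-exactly
  float NegX;
  std::memcpy(&NegX, &XBits, sizeof(NegX));
  assert(NegX == -1.5f);
}
// ---------------------------------------------------------------------------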
52371
52372// Try to combine the following nodes:
52373// FADD(A, FMA(B, C, 0)) and FADD(A, FMUL(B, C)) to FMA(B, C, A)
52374static SDValue combineFaddCFmul(SDNode *N, SelectionDAG &DAG,
52375 const X86Subtarget &Subtarget) {
52376 auto AllowContract = [&DAG](const SDNodeFlags &Flags) {
52377 return DAG.getTarget().Options.AllowFPOpFusion == FPOpFusion::Fast ||
52378 Flags.hasAllowContract();
52379 };
52380
52381 auto HasNoSignedZero = [&DAG](const SDNodeFlags &Flags) {
52382 return DAG.getTarget().Options.NoSignedZerosFPMath ||
52383 Flags.hasNoSignedZeros();
52384 };
52385 auto IsVectorAllNegativeZero = [](const SDNode *N) {
52386 if (N->getOpcode() != X86ISD::VBROADCAST_LOAD)
52387 return false;
52388    assert(N->getSimpleValueType(0).getScalarType() == MVT::f32 &&
52389           "Unexpected vector type!");
52390 if (ConstantPoolSDNode *CP =
52391 dyn_cast<ConstantPoolSDNode>(N->getOperand(1)->getOperand(0))) {
52392 APInt AI = APInt(32, 0x80008000, true);
52393 if (const auto *CI = dyn_cast<ConstantInt>(CP->getConstVal()))
52394 return CI->getValue() == AI;
52395 if (const auto *CF = dyn_cast<ConstantFP>(CP->getConstVal()))
52396 return CF->getValue() == APFloat(APFloat::IEEEsingle(), AI);
52397 }
52398 return false;
52399 };
52400
52401 if (N->getOpcode() != ISD::FADD || !Subtarget.hasFP16() ||
52402 !AllowContract(N->getFlags()))
52403 return SDValue();
52404
52405 EVT VT = N->getValueType(0);
52406 if (VT != MVT::v8f16 && VT != MVT::v16f16 && VT != MVT::v32f16)
52407 return SDValue();
52408
52409 SDValue LHS = N->getOperand(0);
52410 SDValue RHS = N->getOperand(1);
52411 bool IsConj;
52412 SDValue FAddOp1, MulOp0, MulOp1;
52413 auto GetCFmulFrom = [&MulOp0, &MulOp1, &IsConj, &AllowContract,
52414 &IsVectorAllNegativeZero,
52415 &HasNoSignedZero](SDValue N) -> bool {
52416 if (!N.hasOneUse() || N.getOpcode() != ISD::BITCAST)
52417 return false;
52418 SDValue Op0 = N.getOperand(0);
52419 unsigned Opcode = Op0.getOpcode();
52420 if (Op0.hasOneUse() && AllowContract(Op0->getFlags())) {
52421 if ((Opcode == X86ISD::VFMULC || Opcode == X86ISD::VFCMULC)) {
52422 MulOp0 = Op0.getOperand(0);
52423 MulOp1 = Op0.getOperand(1);
52424 IsConj = Opcode == X86ISD::VFCMULC;
52425 return true;
52426 }
52427 if ((Opcode == X86ISD::VFMADDC || Opcode == X86ISD::VFCMADDC) &&
52428 ((ISD::isBuildVectorAllZeros(Op0->getOperand(2).getNode()) &&
52429 HasNoSignedZero(Op0->getFlags())) ||
52430 IsVectorAllNegativeZero(Op0->getOperand(2).getNode()))) {
52431 MulOp0 = Op0.getOperand(0);
52432 MulOp1 = Op0.getOperand(1);
52433 IsConj = Opcode == X86ISD::VFCMADDC;
52434 return true;
52435 }
52436 }
52437 return false;
52438 };
52439
52440 if (GetCFmulFrom(LHS))
52441 FAddOp1 = RHS;
52442 else if (GetCFmulFrom(RHS))
52443 FAddOp1 = LHS;
52444 else
52445 return SDValue();
52446
52447 MVT CVT = MVT::getVectorVT(MVT::f32, VT.getVectorNumElements() / 2);
52448 FAddOp1 = DAG.getBitcast(CVT, FAddOp1);
52449 unsigned NewOp = IsConj ? X86ISD::VFCMADDC : X86ISD::VFMADDC;
52450 // FIXME: How do we handle when fast math flags of FADD are different from
52451 // CFMUL's?
52452 SDValue CFmul =
52453 DAG.getNode(NewOp, SDLoc(N), CVT, MulOp0, MulOp1, FAddOp1, N->getFlags());
52454 return DAG.getBitcast(VT, CFmul);
52455}
52456
52457/// Do target-specific dag combines on floating-point adds/subs.
52458static SDValue combineFaddFsub(SDNode *N, SelectionDAG &DAG,
52459 const X86Subtarget &Subtarget) {
52460 if (SDValue HOp = combineToHorizontalAddSub(N, DAG, Subtarget))
52461 return HOp;
52462
52463 if (SDValue COp = combineFaddCFmul(N, DAG, Subtarget))
52464 return COp;
52465
52466 return SDValue();
52467}
52468
52469/// Attempt to pre-truncate inputs to arithmetic ops if it will simplify
52470/// the codegen.
52471/// e.g. TRUNC( BINOP( X, Y ) ) --> BINOP( TRUNC( X ), TRUNC( Y ) )
52472/// TODO: This overlaps with the generic combiner's visitTRUNCATE. Remove
52473/// anything that is guaranteed to be transformed by DAGCombiner.
52474static SDValue combineTruncatedArithmetic(SDNode *N, SelectionDAG &DAG,
52475 const X86Subtarget &Subtarget,
52476 const SDLoc &DL) {
52477  assert(N->getOpcode() == ISD::TRUNCATE && "Wrong opcode");
52478 SDValue Src = N->getOperand(0);
52479 unsigned SrcOpcode = Src.getOpcode();
52480 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
52481
52482 EVT VT = N->getValueType(0);
52483 EVT SrcVT = Src.getValueType();
52484
52485 auto IsFreeTruncation = [VT](SDValue Op) {
52486 unsigned TruncSizeInBits = VT.getScalarSizeInBits();
52487
52488 // See if this has been extended from a smaller/equal size to
52489 // the truncation size, allowing a truncation to combine with the extend.
52490 unsigned Opcode = Op.getOpcode();
52491 if ((Opcode == ISD::ANY_EXTEND || Opcode == ISD::SIGN_EXTEND ||
52492 Opcode == ISD::ZERO_EXTEND) &&
52493 Op.getOperand(0).getScalarValueSizeInBits() <= TruncSizeInBits)
52494 return true;
52495
52496 // See if this is a single use constant which can be constant folded.
52497    // NOTE: We don't peek through bitcasts here because there is currently
52498    // no support for constant folding truncate+bitcast+vector_of_constants. So
52499    // we'll just end up with a truncate on both operands which will
52500 // get turned back into (truncate (binop)) causing an infinite loop.
52501 return ISD::isBuildVectorOfConstantSDNodes(Op.getNode());
52502 };
52503
52504 auto TruncateArithmetic = [&](SDValue N0, SDValue N1) {
52505 SDValue Trunc0 = DAG.getNode(ISD::TRUNCATE, DL, VT, N0);
52506 SDValue Trunc1 = DAG.getNode(ISD::TRUNCATE, DL, VT, N1);
52507 return DAG.getNode(SrcOpcode, DL, VT, Trunc0, Trunc1);
52508 };
52509
52510 // Don't combine if the operation has other uses.
52511 if (!Src.hasOneUse())
52512 return SDValue();
52513
52514 // Only support vector truncation for now.
52515 // TODO: i64 scalar math would benefit as well.
52516 if (!VT.isVector())
52517 return SDValue();
52518
52519  // In most cases it's only worth pre-truncating if we're only facing the cost
52520 // of one truncation.
52521 // i.e. if one of the inputs will constant fold or the input is repeated.
52522 switch (SrcOpcode) {
52523 case ISD::MUL:
52524    // X86 is rubbish at scalar and vector i64 multiplies (until AVX512DQ) - it's
52525    // better to truncate if we have the chance.
52526 if (SrcVT.getScalarType() == MVT::i64 &&
52527 TLI.isOperationLegal(SrcOpcode, VT) &&
52528 !TLI.isOperationLegal(SrcOpcode, SrcVT))
52529 return TruncateArithmetic(Src.getOperand(0), Src.getOperand(1));
52530 [[fallthrough]];
52531 case ISD::AND:
52532 case ISD::XOR:
52533 case ISD::OR:
52534 case ISD::ADD:
52535 case ISD::SUB: {
52536 SDValue Op0 = Src.getOperand(0);
52537 SDValue Op1 = Src.getOperand(1);
52538 if (TLI.isOperationLegal(SrcOpcode, VT) &&
52539 (Op0 == Op1 || IsFreeTruncation(Op0) || IsFreeTruncation(Op1)))
52540 return TruncateArithmetic(Op0, Op1);
52541 break;
52542 }
52543 }
52544
52545 return SDValue();
52546}
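// --- Illustrative standalone sketch (plain C++17, not part of this file) ---
// Why pre-truncating the operands is sound for the opcodes handled above:
// truncation to N bits is reduction modulo 2^N, and ADD/SUB/MUL/AND/OR/XOR
// all commute with that reduction. Values below are illustrative only.
#include <cassert>
#include <cstdint>

static void truncate_commutes_example() {
  std::uint32_t X = 0x12345678u, Y = 0x9ABCDEF0u;
  assert(std::uint16_t(X + Y) == std::uint16_t(std::uint16_t(X) + std::uint16_t(Y)));
  assert(std::uint16_t(X * Y) == std::uint16_t(std::uint16_t(X) * std::uint16_t(Y)));
  assert(std::uint16_t(X ^ Y) == std::uint16_t(std::uint16_t(X) ^ std::uint16_t(Y)));
  // Note this does NOT hold for e.g. division or right shifts, which is why
  // those opcodes are not in the switch above.
}
// ---------------------------------------------------------------------------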
52547
52548/// Truncate using ISD::AND mask and X86ISD::PACKUS.
52549/// e.g. trunc <8 x i32> X to <8 x i16> -->
52550/// MaskX = X & 0xffff (clear high bits to prevent saturation)
52551/// packus (extract_subv MaskX, 0), (extract_subv MaskX, 1)
52552static SDValue combineVectorTruncationWithPACKUS(SDNode *N, const SDLoc &DL,
52553 const X86Subtarget &Subtarget,
52554 SelectionDAG &DAG) {
52555 SDValue In = N->getOperand(0);
52556 EVT InVT = In.getValueType();
52557 EVT OutVT = N->getValueType(0);
52558
52559 APInt Mask = APInt::getLowBitsSet(InVT.getScalarSizeInBits(),
52560 OutVT.getScalarSizeInBits());
52561 In = DAG.getNode(ISD::AND, DL, InVT, In, DAG.getConstant(Mask, DL, InVT));
52562 return truncateVectorWithPACK(X86ISD::PACKUS, OutVT, In, DL, DAG, Subtarget);
52563}
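// --- Illustrative standalone sketch (plain C++17, not part of this file) ---
// Per-element model of the PACKUS-based truncation above: PACKUSWB saturates
// a signed 16-bit input to the unsigned 8-bit range, so once the high bits
// have been masked off (value already in [0, 255]) the saturation is a no-op
// and the pack is a plain truncation. Names are illustrative only.
#include <algorithm>
#include <cassert>
#include <cstdint>

static std::uint8_t packus_i16_to_u8(std::int16_t V) {
  return static_cast<std::uint8_t>(std::clamp<int>(V, 0, 255));
}

static void packus_truncation_example() {
  std::int16_t Wide = 0x1234;          // arbitrary i16 element
  std::int16_t Masked = Wide & 0x00FF; // the ISD::AND step above
  assert(packus_i16_to_u8(Masked) == std::uint8_t(Wide)); // pure truncation
}
// ---------------------------------------------------------------------------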
52564
52565/// Truncate a group of v4i32 into v8i16 using X86ISD::PACKSS.
52566static SDValue combineVectorTruncationWithPACKSS(SDNode *N, const SDLoc &DL,
52567 const X86Subtarget &Subtarget,
52568 SelectionDAG &DAG) {
52569 SDValue In = N->getOperand(0);
52570 EVT InVT = In.getValueType();
52571 EVT OutVT = N->getValueType(0);
52572 In = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, InVT, In,
52573 DAG.getValueType(OutVT));
52574 return truncateVectorWithPACK(X86ISD::PACKSS, OutVT, In, DL, DAG, Subtarget);
52575}
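// --- Illustrative standalone sketch (plain C++17, not part of this file) ---
// Per-element model of the PACKSS-based truncation above: PACKSSDW saturates
// a signed 32-bit input to [-32768, 32767]. After the SIGN_EXTEND_INREG step
// the value is already in that range, so the saturating pack reduces to a
// truncation. Names are illustrative only.
#include <algorithm>
#include <cassert>
#include <cstdint>

static std::int16_t packss_i32_to_i16(std::int32_t V) {
  return static_cast<std::int16_t>(std::clamp<std::int32_t>(V, -32768, 32767));
}

static void packss_truncation_example() {
  std::int32_t Wide = 0x0001F234;          // arbitrary i32 element
  std::int32_t InReg = std::int16_t(Wide); // SIGN_EXTEND_INREG to i16
  assert(packss_i32_to_i16(InReg) == std::int16_t(Wide)); // pure truncation
}
// ---------------------------------------------------------------------------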
52576
52577/// This function transforms truncation from vXi32/vXi64 to vXi8/vXi16 into
52578/// X86ISD::PACKUS/X86ISD::PACKSS operations. We do it here because after type
52579/// legalization the truncation will be translated into a BUILD_VECTOR with each
52580/// element that is extracted from a vector and then truncated, and it is
52581/// difficult to do this optimization based on them.
52582static SDValue combineVectorTruncation(SDNode *N, SelectionDAG &DAG,
52583 const X86Subtarget &Subtarget) {
52584 EVT OutVT = N->getValueType(0);
52585 if (!OutVT.isVector())
52586 return SDValue();
52587
52588 SDValue In = N->getOperand(0);
52589 if (!In.getValueType().isSimple())
52590 return SDValue();
52591
52592 EVT InVT = In.getValueType();
52593 unsigned NumElems = OutVT.getVectorNumElements();
52594
52595 // AVX512 provides fast truncate ops.
52596 if (!Subtarget.hasSSE2() || Subtarget.hasAVX512())
52597 return SDValue();
52598
52599 EVT OutSVT = OutVT.getVectorElementType();
52600 EVT InSVT = InVT.getVectorElementType();
52601 if (!((InSVT == MVT::i16 || InSVT == MVT::i32 || InSVT == MVT::i64) &&
52602 (OutSVT == MVT::i8 || OutSVT == MVT::i16) && isPowerOf2_32(NumElems) &&
52603 NumElems >= 8))
52604 return SDValue();
52605
52606  // SSSE3's pshufb results in fewer instructions in the cases below.
52607 if (Subtarget.hasSSSE3() && NumElems == 8) {
52608 if (InSVT == MVT::i16)
52609 return SDValue();
52610 if (InSVT == MVT::i32 &&
52611 (OutSVT == MVT::i8 || !Subtarget.hasSSE41() || Subtarget.hasInt256()))
52612 return SDValue();
52613 }
52614
52615 SDLoc DL(N);
52616 // SSE2 provides PACKUS for only 2 x v8i16 -> v16i8 and SSE4.1 provides PACKUS
52617 // for 2 x v4i32 -> v8i16. For SSSE3 and below, we need to use PACKSS to
52618 // truncate 2 x v4i32 to v8i16.
52619 if (Subtarget.hasSSE41() || OutSVT == MVT::i8)
52620 return combineVectorTruncationWithPACKUS(N, DL, Subtarget, DAG);
52621 if (InSVT == MVT::i32)
52622 return combineVectorTruncationWithPACKSS(N, DL, Subtarget, DAG);
52623
52624 return SDValue();
52625}
52626
52627/// This function transforms vector truncation of 'extended sign-bits' or
52628/// 'extended zero-bits' values, i.e. vXi16/vXi32/vXi64 to vXi8/vXi16/vXi32,
52629/// into X86ISD::PACKSS/PACKUS operations.
52630static SDValue combineVectorSignBitsTruncation(SDNode *N, const SDLoc &DL,
52631 SelectionDAG &DAG,
52632 const X86Subtarget &Subtarget) {
52633 // Requires SSE2.
52634 if (!Subtarget.hasSSE2())
52635 return SDValue();
52636
52637 if (!N->getValueType(0).isVector() || !N->getValueType(0).isSimple())
52638 return SDValue();
52639
52640 SDValue In = N->getOperand(0);
52641 if (!In.getValueType().isSimple())
52642 return SDValue();
52643
52644 MVT VT = N->getValueType(0).getSimpleVT();
52645 MVT SVT = VT.getScalarType();
52646
52647 MVT InVT = In.getValueType().getSimpleVT();
52648 MVT InSVT = InVT.getScalarType();
52649
52650 // Check we have a truncation suited for PACKSS/PACKUS.
52651 if (!isPowerOf2_32(VT.getVectorNumElements()))
52652 return SDValue();
52653 if (SVT != MVT::i8 && SVT != MVT::i16 && SVT != MVT::i32)
52654 return SDValue();
52655 if (InSVT != MVT::i16 && InSVT != MVT::i32 && InSVT != MVT::i64)
52656 return SDValue();
52657
52658 // Truncation to sub-128bit vXi32 can be better handled with shuffles.
52659 if (SVT == MVT::i32 && VT.getSizeInBits() < 128)
52660 return SDValue();
52661
52662 // AVX512 has fast truncate, but if the input is already going to be split,
52663 // there's no harm in trying pack.
52664 if (Subtarget.hasAVX512() &&
52665 !(!Subtarget.useAVX512Regs() && VT.is256BitVector() &&
52666 InVT.is512BitVector())) {
52667 // PACK should still be worth it for 128-bit vectors if the sources were
52668 // originally concatenated from subvectors.
52669 SmallVector<SDValue> ConcatOps;
52670 if (VT.getSizeInBits() > 128 ||
52671 !collectConcatOps(In.getNode(), ConcatOps, DAG))
52672 return SDValue();
52673 }
52674
52675 unsigned NumPackedSignBits = std::min<unsigned>(SVT.getSizeInBits(), 16);
52676 unsigned NumPackedZeroBits = Subtarget.hasSSE41() ? NumPackedSignBits : 8;
52677
52678 // Use PACKUS if the input has zero-bits that extend all the way to the
52679 // packed/truncated value. e.g. masks, zext_in_reg, etc.
52680 KnownBits Known = DAG.computeKnownBits(In);
52681 unsigned NumLeadingZeroBits = Known.countMinLeadingZeros();
52682 if (NumLeadingZeroBits >= (InSVT.getSizeInBits() - NumPackedZeroBits))
52683 return truncateVectorWithPACK(X86ISD::PACKUS, VT, In, DL, DAG, Subtarget);
52684
52685 // Use PACKSS if the input has sign-bits that extend all the way to the
52686 // packed/truncated value. e.g. Comparison result, sext_in_reg, etc.
52687 unsigned NumSignBits = DAG.ComputeNumSignBits(In);
52688
52689 // Don't use PACKSS for vXi64 -> vXi32 truncations unless we're dealing with
52690 // a sign splat. ComputeNumSignBits struggles to see through BITCASTs later
52691 // on and combines/simplifications can't then use it.
52692 if (SVT == MVT::i32 && NumSignBits != InSVT.getSizeInBits())
52693 return SDValue();
52694
52695 unsigned MinSignBits = InSVT.getSizeInBits() - NumPackedSignBits;
52696 if (NumSignBits > MinSignBits)
52697 return truncateVectorWithPACK(X86ISD::PACKSS, VT, In, DL, DAG, Subtarget);
52698
52699 // If we have a srl that only generates signbits that we will discard in
52700 // the truncation then we can use PACKSS by converting the srl to a sra.
52701 // SimplifyDemandedBits often relaxes sra to srl so we need to reverse it.
52702 if (In.getOpcode() == ISD::SRL && N->isOnlyUserOf(In.getNode()))
52703 if (const APInt *ShAmt = DAG.getValidShiftAmountConstant(
52704 In, APInt::getAllOnes(VT.getVectorNumElements()))) {
52705 if (*ShAmt == MinSignBits) {
52706 SDValue NewIn = DAG.getNode(ISD::SRA, DL, InVT, In->ops());
52707 return truncateVectorWithPACK(X86ISD::PACKSS, VT, NewIn, DL, DAG,
52708 Subtarget);
52709 }
52710 }
52711
52712 return SDValue();
52713}
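// --- Illustrative standalone sketch (plain C++17, not part of this file) ---
// The "NumSignBits > MinSignBits" test above is equivalent, per element, to
// saying the wide value already round-trips through the narrow signed type,
// in which case the saturating PACKSS is a faithful truncation. Values below
// are illustrative only.
#include <cassert>
#include <cstdint>

static bool fits_in_i16(std::int32_t V) {
  return V == std::int32_t(std::int16_t(V)); // i.e. at least 17 "sign bits"
}

static void packss_signbits_example() {
  assert(fits_in_i16(-1) && fits_in_i16(32767) && fits_in_i16(-32768));
  assert(!fits_in_i16(40000)); // would be saturated, not truncated, by PACKSS
}
// ---------------------------------------------------------------------------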
52714
52715// Try to form a MULHU or MULHS node by looking for
52716// (trunc (srl (mul ext, ext), 16))
52717// TODO: This is X86 specific because we want to be able to handle wide types
52718// before type legalization. But we can only do it if the vector will be
52719// legalized via widening/splitting. Type legalization can't handle promotion
52720// of a MULHU/MULHS. There isn't a way to convey this to the generic DAG
52721// combiner.
52722static SDValue combinePMULH(SDValue Src, EVT VT, const SDLoc &DL,
52723 SelectionDAG &DAG, const X86Subtarget &Subtarget) {
52724 // First instruction should be a right shift of a multiply.
52725 if (Src.getOpcode() != ISD::SRL ||
52726 Src.getOperand(0).getOpcode() != ISD::MUL)
52727 return SDValue();
52728
52729 if (!Subtarget.hasSSE2())
52730 return SDValue();
52731
52732 // Only handle vXi16 types that are at least 128-bits unless they will be
52733 // widened.
52734 if (!VT.isVector() || VT.getVectorElementType() != MVT::i16)
52735 return SDValue();
52736
52737 // Input type should be at least vXi32.
52738 EVT InVT = Src.getValueType();
52739 if (InVT.getVectorElementType().getSizeInBits() < 32)
52740 return SDValue();
52741
52742 // Need a shift by 16.
52743 APInt ShiftAmt;
52744 if (!ISD::isConstantSplatVector(Src.getOperand(1).getNode(), ShiftAmt) ||
52745 ShiftAmt != 16)
52746 return SDValue();
52747
52748 SDValue LHS = Src.getOperand(0).getOperand(0);
52749 SDValue RHS = Src.getOperand(0).getOperand(1);
52750
52751 // Count leading sign/zero bits on both inputs - if there are enough then
52752 // truncation back to vXi16 will be cheap - either as a pack/shuffle
52753 // sequence or using AVX512 truncations. If the inputs are sext/zext then the
52754 // truncations may actually be free by peeking through to the ext source.
52755 auto IsSext = [&DAG](SDValue V) {
52756 return DAG.ComputeMaxSignificantBits(V) <= 16;
52757 };
52758 auto IsZext = [&DAG](SDValue V) {
52759 return DAG.computeKnownBits(V).countMaxActiveBits() <= 16;
52760 };
52761
52762 bool IsSigned = IsSext(LHS) && IsSext(RHS);
52763 bool IsUnsigned = IsZext(LHS) && IsZext(RHS);
52764 if (!IsSigned && !IsUnsigned)
52765 return SDValue();
52766
52767 // Check if both inputs are extensions, which will be removed by truncation.
52768 bool IsTruncateFree = (LHS.getOpcode() == ISD::SIGN_EXTEND ||
52769 LHS.getOpcode() == ISD::ZERO_EXTEND) &&
52770 (RHS.getOpcode() == ISD::SIGN_EXTEND ||
52771 RHS.getOpcode() == ISD::ZERO_EXTEND) &&
52772 LHS.getOperand(0).getScalarValueSizeInBits() <= 16 &&
52773 RHS.getOperand(0).getScalarValueSizeInBits() <= 16;
52774
52775 // For AVX2+ targets, with the upper bits known zero, we can perform MULHU on
52776 // the (bitcasted) inputs directly, and then cheaply pack/truncate the result
52777 // (upper elts will be zero). Don't attempt this with just AVX512F as MULHU
52778 // will have to split anyway.
52779 unsigned InSizeInBits = InVT.getSizeInBits();
52780 if (IsUnsigned && !IsTruncateFree && Subtarget.hasInt256() &&
52781 !(Subtarget.hasAVX512() && !Subtarget.hasBWI() && VT.is256BitVector()) &&
52782 (InSizeInBits % 16) == 0) {
52783 EVT BCVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16,
52784 InVT.getSizeInBits() / 16);
52785 SDValue Res = DAG.getNode(ISD::MULHU, DL, BCVT, DAG.getBitcast(BCVT, LHS),
52786 DAG.getBitcast(BCVT, RHS));
52787 return DAG.getNode(ISD::TRUNCATE, DL, VT, DAG.getBitcast(InVT, Res));
52788 }
52789
52790 // Truncate back to source type.
52791 LHS = DAG.getNode(ISD::TRUNCATE, DL, VT, LHS);
52792 RHS = DAG.getNode(ISD::TRUNCATE, DL, VT, RHS);
52793
52794 unsigned Opc = IsSigned ? ISD::MULHS : ISD::MULHU;
52795 return DAG.getNode(Opc, DL, VT, LHS, RHS);
52796}
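// --- Illustrative standalone sketch (plain C++17, not part of this file) ---
// Scalar model of the MULHU/MULHS nodes formed above: the high 16 bits of
// the widened 16x16 product, which is exactly what
// (trunc (srl (mul ext, ext), 16)) computes. Names are illustrative only;
// the signed case assumes an arithmetic right shift of negative values.
#include <cassert>
#include <cstdint>

static std::uint16_t mulhu16(std::uint16_t A, std::uint16_t B) {
  return std::uint16_t((std::uint32_t(A) * std::uint32_t(B)) >> 16);
}

static std::int16_t mulhs16(std::int16_t A, std::int16_t B) {
  return std::int16_t((std::int32_t(A) * std::int32_t(B)) >> 16);
}

static void pmulh_example() {
  assert(mulhu16(0xFFFF, 0xFFFF) == 0xFFFE); // 0xFFFF * 0xFFFF = 0xFFFE0001
  assert(mulhs16(-2, 3) == -1);              // high half of -6 is all ones
}
// ---------------------------------------------------------------------------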
52797
52798// Attempt to match PMADDUBSW, which multiplies corresponding unsigned bytes
52799// from one vector with signed bytes from another vector, adds together
52800// adjacent pairs of 16-bit products, and saturates the result before
52801// truncating to 16-bits.
52802//
52803// Which looks something like this:
52804// (i16 (ssat (add (mul (zext (even elts (i8 A))), (sext (even elts (i8 B)))),
52805// (mul (zext (odd elts (i8 A)), (sext (odd elts (i8 B))))))))
52806static SDValue detectPMADDUBSW(SDValue In, EVT VT, SelectionDAG &DAG,
52807 const X86Subtarget &Subtarget,
52808 const SDLoc &DL) {
52809 if (!VT.isVector() || !Subtarget.hasSSSE3())
52810 return SDValue();
52811
52812 unsigned NumElems = VT.getVectorNumElements();
52813 EVT ScalarVT = VT.getVectorElementType();
52814 if (ScalarVT != MVT::i16 || NumElems < 8 || !isPowerOf2_32(NumElems))
52815 return SDValue();
52816
52817 SDValue SSatVal = detectSSatPattern(In, VT);
52818 if (!SSatVal || SSatVal.getOpcode() != ISD::ADD)
52819 return SDValue();
52820
52821 // Ok this is a signed saturation of an ADD. See if this ADD is adding pairs
52822 // of multiplies from even/odd elements.
52823 SDValue N0 = SSatVal.getOperand(0);
52824 SDValue N1 = SSatVal.getOperand(1);
52825
52826 if (N0.getOpcode() != ISD::MUL || N1.getOpcode() != ISD::MUL)
52827 return SDValue();
52828
52829 SDValue N00 = N0.getOperand(0);
52830 SDValue N01 = N0.getOperand(1);
52831 SDValue N10 = N1.getOperand(0);
52832 SDValue N11 = N1.getOperand(1);
52833
52834 // TODO: Handle constant vectors and use knownbits/computenumsignbits?
52835 // Canonicalize zero_extend to LHS.
52836 if (N01.getOpcode() == ISD::ZERO_EXTEND)
52837 std::swap(N00, N01);
52838 if (N11.getOpcode() == ISD::ZERO_EXTEND)
52839 std::swap(N10, N11);
52840
52841 // Ensure we have a zero_extend and a sign_extend.
52842 if (N00.getOpcode() != ISD::ZERO_EXTEND ||
52843 N01.getOpcode() != ISD::SIGN_EXTEND ||
52844 N10.getOpcode() != ISD::ZERO_EXTEND ||
52845 N11.getOpcode() != ISD::SIGN_EXTEND)
52846 return SDValue();
52847
52848 // Peek through the extends.
52849 N00 = N00.getOperand(0);
52850 N01 = N01.getOperand(0);
52851 N10 = N10.getOperand(0);
52852 N11 = N11.getOperand(0);
52853
52854 // Ensure the extend is from vXi8.
52855 if (N00.getValueType().getVectorElementType() != MVT::i8 ||
52856 N01.getValueType().getVectorElementType() != MVT::i8 ||
52857 N10.getValueType().getVectorElementType() != MVT::i8 ||
52858 N11.getValueType().getVectorElementType() != MVT::i8)
52859 return SDValue();
52860
52861 // All inputs should be build_vectors.
52862 if (N00.getOpcode() != ISD::BUILD_VECTOR ||
52863 N01.getOpcode() != ISD::BUILD_VECTOR ||
52864 N10.getOpcode() != ISD::BUILD_VECTOR ||
52865 N11.getOpcode() != ISD::BUILD_VECTOR)
52866 return SDValue();
52867
52868 // N00/N10 are zero extended. N01/N11 are sign extended.
52869
52870 // For each element, we need to ensure we have an odd element from one vector
52871 // multiplied by the odd element of another vector and the even element from
52872 // one of the same vectors being multiplied by the even element from the
52873 // other vector. So we need to make sure for each element i, this operator
52874 // is being performed:
52875 // A[2 * i] * B[2 * i] + A[2 * i + 1] * B[2 * i + 1]
52876 SDValue ZExtIn, SExtIn;
52877 for (unsigned i = 0; i != NumElems; ++i) {
52878 SDValue N00Elt = N00.getOperand(i);
52879 SDValue N01Elt = N01.getOperand(i);
52880 SDValue N10Elt = N10.getOperand(i);
52881 SDValue N11Elt = N11.getOperand(i);
52882 // TODO: Be more tolerant to undefs.
52883 if (N00Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
52884 N01Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
52885 N10Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
52886 N11Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
52887 return SDValue();
52888 auto *ConstN00Elt = dyn_cast<ConstantSDNode>(N00Elt.getOperand(1));
52889 auto *ConstN01Elt = dyn_cast<ConstantSDNode>(N01Elt.getOperand(1));
52890 auto *ConstN10Elt = dyn_cast<ConstantSDNode>(N10Elt.getOperand(1));
52891 auto *ConstN11Elt = dyn_cast<ConstantSDNode>(N11Elt.getOperand(1));
52892 if (!ConstN00Elt || !ConstN01Elt || !ConstN10Elt || !ConstN11Elt)
52893 return SDValue();
52894 unsigned IdxN00 = ConstN00Elt->getZExtValue();
52895 unsigned IdxN01 = ConstN01Elt->getZExtValue();
52896 unsigned IdxN10 = ConstN10Elt->getZExtValue();
52897 unsigned IdxN11 = ConstN11Elt->getZExtValue();
52898 // Add is commutative so indices can be reordered.
52899 if (IdxN00 > IdxN10) {
52900 std::swap(IdxN00, IdxN10);
52901 std::swap(IdxN01, IdxN11);
52902 }
52903    // N0 indices must be the even element. N1 indices must be the next odd element.
52904 if (IdxN00 != 2 * i || IdxN10 != 2 * i + 1 ||
52905 IdxN01 != 2 * i || IdxN11 != 2 * i + 1)
52906 return SDValue();
52907 SDValue N00In = N00Elt.getOperand(0);
52908 SDValue N01In = N01Elt.getOperand(0);
52909 SDValue N10In = N10Elt.getOperand(0);
52910 SDValue N11In = N11Elt.getOperand(0);
52911 // First time we find an input capture it.
52912 if (!ZExtIn) {
52913 ZExtIn = N00In;
52914 SExtIn = N01In;
52915 }
52916 if (ZExtIn != N00In || SExtIn != N01In ||
52917 ZExtIn != N10In || SExtIn != N11In)
52918 return SDValue();
52919 }
52920
52921 auto PMADDBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
52922 ArrayRef<SDValue> Ops) {
52923 // Shrink by adding truncate nodes and let DAGCombine fold with the
52924 // sources.
52925 EVT InVT = Ops[0].getValueType();
52926    assert(InVT.getScalarType() == MVT::i8 &&
52927           "Unexpected scalar element type");
52928    assert(InVT == Ops[1].getValueType() && "Operands' types mismatch");
52929 EVT ResVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16,
52930 InVT.getVectorNumElements() / 2);
52931 return DAG.getNode(X86ISD::VPMADDUBSW, DL, ResVT, Ops[0], Ops[1]);
52932 };
52933 return SplitOpsAndApply(DAG, Subtarget, DL, VT, { ZExtIn, SExtIn },
52934 PMADDBuilder);
52935}
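// --- Illustrative standalone sketch (plain C++17, not part of this file) ---
// Per-pair scalar model of the PMADDUBSW node matched above: unsigned bytes
// from A times signed bytes from B, with the two adjacent 16-bit products
// added under signed saturation. Names are illustrative only.
#include <algorithm>
#include <cassert>
#include <cstdint>

static std::int16_t pmaddubsw_pair(std::uint8_t A0, std::uint8_t A1,
                                   std::int8_t B0, std::int8_t B1) {
  int Sum = int(A0) * int(B0) + int(A1) * int(B1); // products fit easily in int
  return std::int16_t(std::clamp(Sum, -32768, 32767)); // ssat to i16
}

static void pmaddubsw_example() {
  assert(pmaddubsw_pair(10, 20, 3, -4) == 10 * 3 + 20 * -4); // -50, no saturation
  assert(pmaddubsw_pair(255, 255, 127, 127) == 32767);       // saturates
}
// ---------------------------------------------------------------------------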
52936
52937static SDValue combineTruncate(SDNode *N, SelectionDAG &DAG,
52938 const X86Subtarget &Subtarget) {
52939 EVT VT = N->getValueType(0);
52940 SDValue Src = N->getOperand(0);
52941 SDLoc DL(N);
52942
52943 // Attempt to pre-truncate inputs to arithmetic ops instead.
52944 if (SDValue V = combineTruncatedArithmetic(N, DAG, Subtarget, DL))
52945 return V;
52946
52947 // Try to detect AVG pattern first.
52948 if (SDValue Avg = detectAVGPattern(Src, VT, DAG, Subtarget, DL))
52949 return Avg;
52950
52951 // Try to detect PMADD
52952 if (SDValue PMAdd = detectPMADDUBSW(Src, VT, DAG, Subtarget, DL))
52953 return PMAdd;
52954
52955 // Try to combine truncation with signed/unsigned saturation.
52956 if (SDValue Val = combineTruncateWithSat(Src, VT, DL, DAG, Subtarget))
52957 return Val;
52958
52959 // Try to combine PMULHUW/PMULHW for vXi16.
52960 if (SDValue V = combinePMULH(Src, VT, DL, DAG, Subtarget))
52961 return V;
52962
52963 // The bitcast source is a direct mmx result.
52964 // Detect bitcasts between i32 to x86mmx
52965 if (Src.getOpcode() == ISD::BITCAST && VT == MVT::i32) {
52966 SDValue BCSrc = Src.getOperand(0);
52967 if (BCSrc.getValueType() == MVT::x86mmx)
52968 return DAG.getNode(X86ISD::MMX_MOVD2W, DL, MVT::i32, BCSrc);
52969 }
52970
52971 // Try to truncate extended sign/zero bits with PACKSS/PACKUS.
52972 if (SDValue V = combineVectorSignBitsTruncation(N, DL, DAG, Subtarget))
52973 return V;
52974
52975 return combineVectorTruncation(N, DAG, Subtarget);
52976}
52977
52978static SDValue combineVTRUNC(SDNode *N, SelectionDAG &DAG,
52979 TargetLowering::DAGCombinerInfo &DCI) {
52980 EVT VT = N->getValueType(0);
52981 SDValue In = N->getOperand(0);
52982 SDLoc DL(N);
52983
52984 if (SDValue SSatVal = detectSSatPattern(In, VT))
52985 return DAG.getNode(X86ISD::VTRUNCS, DL, VT, SSatVal);
52986 if (SDValue USatVal = detectUSatPattern(In, VT, DAG, DL))
52987 return DAG.getNode(X86ISD::VTRUNCUS, DL, VT, USatVal);
52988
52989 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
52990 APInt DemandedMask(APInt::getAllOnes(VT.getScalarSizeInBits()));
52991 if (TLI.SimplifyDemandedBits(SDValue(N, 0), DemandedMask, DCI))
52992 return SDValue(N, 0);
52993
52994 return SDValue();
52995}
52996
52997/// Returns the negated value if the node \p N flips sign of FP value.
52998///
52999/// FP-negation node may have different forms: FNEG(x), FXOR (x, 0x80000000)
53000/// or FSUB(0, x)
53001/// AVX512F does not have FXOR, so FNEG is lowered as
53002/// (bitcast (xor (bitcast x), (bitcast ConstantFP(0x80000000)))).
53004/// In this case we go through all bitcasts.
53004/// This also recognizes splat of a negated value and returns the splat of that
53005/// value.
53006static SDValue isFNEG(SelectionDAG &DAG, SDNode *N, unsigned Depth = 0) {
53007 if (N->getOpcode() == ISD::FNEG)
53008 return N->getOperand(0);
53009
53010 // Don't recurse exponentially.
53011 if (Depth > SelectionDAG::MaxRecursionDepth)
53012 return SDValue();
53013
53014 unsigned ScalarSize = N->getValueType(0).getScalarSizeInBits();
53015
53016 SDValue Op = peekThroughBitcasts(SDValue(N, 0));
53017 EVT VT = Op->getValueType(0);
53018
53019 // Make sure the element size doesn't change.
53020 if (VT.getScalarSizeInBits() != ScalarSize)
53021 return SDValue();
53022
53023 unsigned Opc = Op.getOpcode();
53024 switch (Opc) {
53025 case ISD::VECTOR_SHUFFLE: {
53026 // For a VECTOR_SHUFFLE(VEC1, VEC2), if the VEC2 is undef, then the negate
53027 // of this is VECTOR_SHUFFLE(-VEC1, UNDEF). The mask can be anything here.
53028 if (!Op.getOperand(1).isUndef())
53029 return SDValue();
53030 if (SDValue NegOp0 = isFNEG(DAG, Op.getOperand(0).getNode(), Depth + 1))
53031 if (NegOp0.getValueType() == VT) // FIXME: Can we do better?
53032 return DAG.getVectorShuffle(VT, SDLoc(Op), NegOp0, DAG.getUNDEF(VT),
53033 cast<ShuffleVectorSDNode>(Op)->getMask());
53034 break;
53035 }
53036 case ISD::INSERT_VECTOR_ELT: {
53037 // Negate of INSERT_VECTOR_ELT(UNDEF, V, INDEX) is INSERT_VECTOR_ELT(UNDEF,
53038 // -V, INDEX).
53039 SDValue InsVector = Op.getOperand(0);
53040 SDValue InsVal = Op.getOperand(1);
53041 if (!InsVector.isUndef())
53042 return SDValue();
53043 if (SDValue NegInsVal = isFNEG(DAG, InsVal.getNode(), Depth + 1))
53044 if (NegInsVal.getValueType() == VT.getVectorElementType()) // FIXME
53045 return DAG.getNode(ISD::INSERT_VECTOR_ELT, SDLoc(Op), VT, InsVector,
53046 NegInsVal, Op.getOperand(2));
53047 break;
53048 }
53049 case ISD::FSUB:
53050 case ISD::XOR:
53051 case X86ISD::FXOR: {
53052 SDValue Op1 = Op.getOperand(1);
53053 SDValue Op0 = Op.getOperand(0);
53054
53055 // For XOR and FXOR, we want to check if constant
53056 // bits of Op1 are sign bit masks. For FSUB, we
53057 // have to check if constant bits of Op0 are sign
53058 // bit masks and hence we swap the operands.
53059 if (Opc == ISD::FSUB)
53060 std::swap(Op0, Op1);
53061
53062 APInt UndefElts;
53063 SmallVector<APInt, 16> EltBits;
53064 // Extract constant bits and see if they are all
53065 // sign bit masks. Ignore the undef elements.
53066 if (getTargetConstantBitsFromNode(Op1, ScalarSize, UndefElts, EltBits,
53067 /* AllowWholeUndefs */ true,
53068 /* AllowPartialUndefs */ false)) {
53069 for (unsigned I = 0, E = EltBits.size(); I < E; I++)
53070 if (!UndefElts[I] && !EltBits[I].isSignMask())
53071 return SDValue();
53072
53073 // Only allow bitcast from correctly-sized constant.
53074 Op0 = peekThroughBitcasts(Op0);
53075 if (Op0.getScalarValueSizeInBits() == ScalarSize)
53076 return Op0;
53077 }
53078 break;
53079 } // case
53080 } // switch
53081
53082 return SDValue();
53083}
53084
53085static unsigned negateFMAOpcode(unsigned Opcode, bool NegMul, bool NegAcc,
53086 bool NegRes) {
53087 if (NegMul) {
53088 switch (Opcode) {
53089    default: llvm_unreachable("Unexpected opcode");
53090 case ISD::FMA: Opcode = X86ISD::FNMADD; break;
53091 case ISD::STRICT_FMA: Opcode = X86ISD::STRICT_FNMADD; break;
53092 case X86ISD::FMADD_RND: Opcode = X86ISD::FNMADD_RND; break;
53093 case X86ISD::FMSUB: Opcode = X86ISD::FNMSUB; break;
53094 case X86ISD::STRICT_FMSUB: Opcode = X86ISD::STRICT_FNMSUB; break;
53095 case X86ISD::FMSUB_RND: Opcode = X86ISD::FNMSUB_RND; break;
53096 case X86ISD::FNMADD: Opcode = ISD::FMA; break;
53097 case X86ISD::STRICT_FNMADD: Opcode = ISD::STRICT_FMA; break;
53098 case X86ISD::FNMADD_RND: Opcode = X86ISD::FMADD_RND; break;
53099 case X86ISD::FNMSUB: Opcode = X86ISD::FMSUB; break;
53100 case X86ISD::STRICT_FNMSUB: Opcode = X86ISD::STRICT_FMSUB; break;
53101 case X86ISD::FNMSUB_RND: Opcode = X86ISD::FMSUB_RND; break;
53102 }
53103 }
53104
53105 if (NegAcc) {
53106 switch (Opcode) {
53107    default: llvm_unreachable("Unexpected opcode");
53108 case ISD::FMA: Opcode = X86ISD::FMSUB; break;
53109 case ISD::STRICT_FMA: Opcode = X86ISD::STRICT_FMSUB; break;
53110 case X86ISD::FMADD_RND: Opcode = X86ISD::FMSUB_RND; break;
53111 case X86ISD::FMSUB: Opcode = ISD::FMA; break;
53112 case X86ISD::STRICT_FMSUB: Opcode = ISD::STRICT_FMA; break;
53113 case X86ISD::FMSUB_RND: Opcode = X86ISD::FMADD_RND; break;
53114 case X86ISD::FNMADD: Opcode = X86ISD::FNMSUB; break;
53115 case X86ISD::STRICT_FNMADD: Opcode = X86ISD::STRICT_FNMSUB; break;
53116 case X86ISD::FNMADD_RND: Opcode = X86ISD::FNMSUB_RND; break;
53117 case X86ISD::FNMSUB: Opcode = X86ISD::FNMADD; break;
53118 case X86ISD::STRICT_FNMSUB: Opcode = X86ISD::STRICT_FNMADD; break;
53119 case X86ISD::FNMSUB_RND: Opcode = X86ISD::FNMADD_RND; break;
53120 case X86ISD::FMADDSUB: Opcode = X86ISD::FMSUBADD; break;
53121 case X86ISD::FMADDSUB_RND: Opcode = X86ISD::FMSUBADD_RND; break;
53122 case X86ISD::FMSUBADD: Opcode = X86ISD::FMADDSUB; break;
53123 case X86ISD::FMSUBADD_RND: Opcode = X86ISD::FMADDSUB_RND; break;
53124 }
53125 }
53126
53127 if (NegRes) {
53128 switch (Opcode) {
53129    // For accuracy reasons, we never combine fneg and fma under strict FP.
53130    default: llvm_unreachable("Unexpected opcode");
53131 case ISD::FMA: Opcode = X86ISD::FNMSUB; break;
53132 case X86ISD::FMADD_RND: Opcode = X86ISD::FNMSUB_RND; break;
53133 case X86ISD::FMSUB: Opcode = X86ISD::FNMADD; break;
53134 case X86ISD::FMSUB_RND: Opcode = X86ISD::FNMADD_RND; break;
53135 case X86ISD::FNMADD: Opcode = X86ISD::FMSUB; break;
53136 case X86ISD::FNMADD_RND: Opcode = X86ISD::FMSUB_RND; break;
53137 case X86ISD::FNMSUB: Opcode = ISD::FMA; break;
53138 case X86ISD::FNMSUB_RND: Opcode = X86ISD::FMADD_RND; break;
53139 }
53140 }
53141
53142 return Opcode;
53143}
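// --- Illustrative standalone sketch (plain C++17, not part of this file) ---
// The opcode table above encodes simple sign algebra over the x86 FMA
// family: FMADD = a*b+c, FMSUB = a*b-c, FNMADD = -(a*b)+c, FNMSUB = -(a*b)-c.
// For example, negating the multiplicand of FMADD yields FNMADD, and
// negating the whole FMADD result yields FNMSUB. The check below uses
// exactly representable values so FP rounding is not a concern.
#include <cassert>

static void fma_negation_example() {
  double A = 2.0, B = 3.0, C = 4.0;
  double FMADD = A * B + C, FNMADD = -(A * B) + C;
  double FMSUB = A * B - C, FNMSUB = -(A * B) - C;
  assert((-A) * B + C == FNMADD); // NegMul: FMADD -> FNMADD
  assert(-(FMADD) == FNMSUB);     // NegRes: FMADD -> FNMSUB
  assert(A * B + (-C) == FMSUB);  // NegAcc: FMADD -> FMSUB
}
// ---------------------------------------------------------------------------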
53144
53145/// Do target-specific dag combines on floating point negations.
53146static SDValue combineFneg(SDNode *N, SelectionDAG &DAG,
53147 TargetLowering::DAGCombinerInfo &DCI,
53148 const X86Subtarget &Subtarget) {
53149 EVT OrigVT = N->getValueType(0);
53150 SDValue Arg = isFNEG(DAG, N);
53151 if (!Arg)
53152 return SDValue();
53153
53154 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
53155 EVT VT = Arg.getValueType();
53156 EVT SVT = VT.getScalarType();
53157 SDLoc DL(N);
53158
53159 // Let legalize expand this if it isn't a legal type yet.
53160 if (!TLI.isTypeLegal(VT))
53161 return SDValue();
53162
53163 // If we're negating a FMUL node on a target with FMA, then we can avoid the
53164 // use of a constant by performing (-0 - A*B) instead.
53165 // FIXME: Check rounding control flags as well once it becomes available.
53166 if (Arg.getOpcode() == ISD::FMUL && (SVT == MVT::f32 || SVT == MVT::f64) &&
53167 Arg->getFlags().hasNoSignedZeros() && Subtarget.hasAnyFMA()) {
53168 SDValue Zero = DAG.getConstantFP(0.0, DL, VT);
53169 SDValue NewNode = DAG.getNode(X86ISD::FNMSUB, DL, VT, Arg.getOperand(0),
53170 Arg.getOperand(1), Zero);
53171 return DAG.getBitcast(OrigVT, NewNode);
53172 }
53173
53174 bool CodeSize = DAG.getMachineFunction().getFunction().hasOptSize();
53175 bool LegalOperations = !DCI.isBeforeLegalizeOps();
53176 if (SDValue NegArg =
53177 TLI.getNegatedExpression(Arg, DAG, LegalOperations, CodeSize))
53178 return DAG.getBitcast(OrigVT, NegArg);
53179
53180 return SDValue();
53181}
53182
53183SDValue X86TargetLowering::getNegatedExpression(SDValue Op, SelectionDAG &DAG,
53184 bool LegalOperations,
53185 bool ForCodeSize,
53186 NegatibleCost &Cost,
53187 unsigned Depth) const {
53188 // fneg patterns are removable even if they have multiple uses.
53189 if (SDValue Arg = isFNEG(DAG, Op.getNode(), Depth)) {
53190 Cost = NegatibleCost::Cheaper;
53191 return DAG.getBitcast(Op.getValueType(), Arg);
53192 }
53193
53194 EVT VT = Op.getValueType();
53195 EVT SVT = VT.getScalarType();
53196 unsigned Opc = Op.getOpcode();
53197 SDNodeFlags Flags = Op.getNode()->getFlags();
53198 switch (Opc) {
53199 case ISD::FMA:
53200 case X86ISD::FMSUB:
53201 case X86ISD::FNMADD:
53202 case X86ISD::FNMSUB:
53203 case X86ISD::FMADD_RND:
53204 case X86ISD::FMSUB_RND:
53205 case X86ISD::FNMADD_RND:
53206 case X86ISD::FNMSUB_RND: {
53207 if (!Op.hasOneUse() || !Subtarget.hasAnyFMA() || !isTypeLegal(VT) ||
53208 !(SVT == MVT::f32 || SVT == MVT::f64) ||
53209 !isOperationLegal(ISD::FMA, VT))
53210 break;
53211
53212 // Don't fold (fneg (fma (fneg x), y, (fneg z))) to (fma x, y, z)
53213 // if it may have signed zeros.
53214 if (!Flags.hasNoSignedZeros())
53215 break;
53216
53217 // This is always negatible for free but we might be able to remove some
53218 // extra operand negations as well.
53219 SmallVector<SDValue, 4> NewOps(Op.getNumOperands(), SDValue());
53220 for (int i = 0; i != 3; ++i)
53221 NewOps[i] = getCheaperNegatedExpression(
53222 Op.getOperand(i), DAG, LegalOperations, ForCodeSize, Depth + 1);
53223
53224 bool NegA = !!NewOps[0];
53225 bool NegB = !!NewOps[1];
53226 bool NegC = !!NewOps[2];
53227 unsigned NewOpc = negateFMAOpcode(Opc, NegA != NegB, NegC, true);
53228
53229 Cost = (NegA || NegB || NegC) ? NegatibleCost::Cheaper
53230 : NegatibleCost::Neutral;
53231
53232 // Fill in the non-negated ops with the original values.
53233 for (int i = 0, e = Op.getNumOperands(); i != e; ++i)
53234 if (!NewOps[i])
53235 NewOps[i] = Op.getOperand(i);
53236 return DAG.getNode(NewOpc, SDLoc(Op), VT, NewOps);
53237 }
53238 case X86ISD::FRCP:
53239 if (SDValue NegOp0 =
53240 getNegatedExpression(Op.getOperand(0), DAG, LegalOperations,
53241 ForCodeSize, Cost, Depth + 1))
53242 return DAG.getNode(Opc, SDLoc(Op), VT, NegOp0);
53243 break;
53244 }
53245
53246 return TargetLowering::getNegatedExpression(Op, DAG, LegalOperations,
53247 ForCodeSize, Cost, Depth);
53248}
53249
53250static SDValue lowerX86FPLogicOp(SDNode *N, SelectionDAG &DAG,
53251 const X86Subtarget &Subtarget) {
53252 MVT VT = N->getSimpleValueType(0);
53253 // If we have integer vector types available, use the integer opcodes.
53254 if (!VT.isVector() || !Subtarget.hasSSE2())
53255 return SDValue();
53256
53257 SDLoc dl(N);
53258
53259 unsigned IntBits = VT.getScalarSizeInBits();
53260 MVT IntSVT = MVT::getIntegerVT(IntBits);
53261 MVT IntVT = MVT::getVectorVT(IntSVT, VT.getSizeInBits() / IntBits);
53262
53263 SDValue Op0 = DAG.getBitcast(IntVT, N->getOperand(0));
53264 SDValue Op1 = DAG.getBitcast(IntVT, N->getOperand(1));
53265 unsigned IntOpcode;
53266 switch (N->getOpcode()) {
53267  default: llvm_unreachable("Unexpected FP logic op");
53268 case X86ISD::FOR: IntOpcode = ISD::OR; break;
53269 case X86ISD::FXOR: IntOpcode = ISD::XOR; break;
53270 case X86ISD::FAND: IntOpcode = ISD::AND; break;
53271 case X86ISD::FANDN: IntOpcode = X86ISD::ANDNP; break;
53272 }
53273 SDValue IntOp = DAG.getNode(IntOpcode, dl, IntVT, Op0, Op1);
53274 return DAG.getBitcast(VT, IntOp);
53275}
53276
53277
53278/// Fold a xor(setcc cond, val), 1 --> setcc (inverted(cond), val)
53279static SDValue foldXor1SetCC(SDNode *N, SelectionDAG &DAG) {
53280 if (N->getOpcode() != ISD::XOR)
53281 return SDValue();
53282
53283 SDValue LHS = N->getOperand(0);
53284 if (!isOneConstant(N->getOperand(1)) || LHS->getOpcode() != X86ISD::SETCC)
53285 return SDValue();
53286
53287 X86::CondCode NewCC = X86::GetOppositeBranchCondition(
53288 X86::CondCode(LHS->getConstantOperandVal(0)));
53289 SDLoc DL(N);
53290 return getSETCC(NewCC, LHS->getOperand(1), DL, DAG);
53291}
53292
53293static SDValue combineXorSubCTLZ(SDNode *N, SelectionDAG &DAG,
53294 const X86Subtarget &Subtarget) {
53295  assert((N->getOpcode() == ISD::XOR || N->getOpcode() == ISD::SUB) &&
53296         "Invalid opcode for combining with CTLZ");
53297 if (Subtarget.hasFastLZCNT())
53298 return SDValue();
53299
53300 EVT VT = N->getValueType(0);
53301 if (VT != MVT::i8 && VT != MVT::i16 && VT != MVT::i32 &&
53302 (VT != MVT::i64 || !Subtarget.is64Bit()))
53303 return SDValue();
53304
53305 SDValue N0 = N->getOperand(0);
53306 SDValue N1 = N->getOperand(1);
53307
53308 if (N0.getOpcode() != ISD::CTLZ_ZERO_UNDEF &&
53309 N1.getOpcode() != ISD::CTLZ_ZERO_UNDEF)
53310 return SDValue();
53311
53312 SDValue OpCTLZ;
53313 SDValue OpSizeTM1;
53314
53315 if (N1.getOpcode() == ISD::CTLZ_ZERO_UNDEF) {
53316 OpCTLZ = N1;
53317 OpSizeTM1 = N0;
53318 } else if (N->getOpcode() == ISD::SUB) {
53319 return SDValue();
53320 } else {
53321 OpCTLZ = N0;
53322 OpSizeTM1 = N1;
53323 }
53324
53325 if (!OpCTLZ.hasOneUse())
53326 return SDValue();
53327 auto *C = dyn_cast<ConstantSDNode>(OpSizeTM1);
53328 if (!C)
53329 return SDValue();
53330
53331 if (C->getZExtValue() != uint64_t(OpCTLZ.getValueSizeInBits() - 1))
53332 return SDValue();
53333 SDLoc DL(N);
53334 EVT OpVT = VT;
53335 SDValue Op = OpCTLZ.getOperand(0);
53336 if (VT == MVT::i8) {
53337 // Zero extend to i32 since there is not an i8 bsr.
53338 OpVT = MVT::i32;
53339 Op = DAG.getNode(ISD::ZERO_EXTEND, DL, OpVT, Op);
53340 }
53341
53342 SDVTList VTs = DAG.getVTList(OpVT, MVT::i32);
53343 Op = DAG.getNode(X86ISD::BSR, DL, VTs, Op);
53344 if (VT == MVT::i8)
53345 Op = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, Op);
53346
53347 return Op;
53348}
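// --- Illustrative standalone sketch (plain C++17, not part of this file) ---
// The combine above relies on the identity BSR(x) == (BitWidth - 1) - CTLZ(x)
// for x != 0, and since CTLZ(x) <= BitWidth - 1 and BitWidth - 1 is all ones
// in its low bits, the subtraction can equally be written as an XOR with
// BitWidth - 1. The loop implementation below stands in for the CTLZ node.
#include <cassert>
#include <cstdint>

static unsigned ctlz32(std::uint32_t X) { // X != 0 assumed (ctlz_zero_undef)
  unsigned N = 0;
  while (!(X & 0x80000000u)) { X <<= 1; ++N; }
  return N;
}

static void ctlz_bsr_example() {
  for (std::uint32_t X : {1u, 2u, 0x40u, 0x12345678u, 0x80000000u}) {
    unsigned BSR = 31 - ctlz32(X);   // index of the highest set bit
    assert((31 ^ ctlz32(X)) == BSR); // the XOR form matched by the combine
  }
}
// ---------------------------------------------------------------------------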
53349
53350static SDValue combineXor(SDNode *N, SelectionDAG &DAG,
53351 TargetLowering::DAGCombinerInfo &DCI,
53352 const X86Subtarget &Subtarget) {
53353 SDValue N0 = N->getOperand(0);
53354 SDValue N1 = N->getOperand(1);
53355 EVT VT = N->getValueType(0);
53356
53357 // If this is SSE1 only convert to FXOR to avoid scalarization.
53358 if (Subtarget.hasSSE1() && !Subtarget.hasSSE2() && VT == MVT::v4i32) {
53359 return DAG.getBitcast(MVT::v4i32,
53360 DAG.getNode(X86ISD::FXOR, SDLoc(N), MVT::v4f32,
53361 DAG.getBitcast(MVT::v4f32, N0),
53362 DAG.getBitcast(MVT::v4f32, N1)));
53363 }
53364
53365 if (SDValue Cmp = foldVectorXorShiftIntoCmp(N, DAG, Subtarget))
53366 return Cmp;
53367
53368 if (SDValue R = combineBitOpWithMOVMSK(N, DAG))
53369 return R;
53370
53371 if (SDValue R = combineBitOpWithShift(N, DAG))
53372 return R;
53373
53374 if (SDValue FPLogic = convertIntLogicToFPLogic(N, DAG, DCI, Subtarget))
53375 return FPLogic;
53376
53377 if (SDValue R = combineXorSubCTLZ(N, DAG, Subtarget))
53378 return R;
53379
53380 if (DCI.isBeforeLegalizeOps())
53381 return SDValue();
53382
53383 if (SDValue SetCC = foldXor1SetCC(N, DAG))
53384 return SetCC;
53385
53386 if (SDValue R = combineOrXorWithSETCC(N, N0, N1, DAG))
53387 return R;
53388
53389 if (SDValue RV = foldXorTruncShiftIntoCmp(N, DAG))
53390 return RV;
53391
53392 // Fold not(iX bitcast(vXi1)) -> (iX bitcast(not(vec))) for legal boolvecs.
53393 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
53394 if (llvm::isAllOnesConstant(N1) && N0.getOpcode() == ISD::BITCAST &&
53395 N0.getOperand(0).getValueType().isVector() &&
53396 N0.getOperand(0).getValueType().getVectorElementType() == MVT::i1 &&
53397 TLI.isTypeLegal(N0.getOperand(0).getValueType()) && N0.hasOneUse()) {
53398 return DAG.getBitcast(VT, DAG.getNOT(SDLoc(N), N0.getOperand(0),
53399 N0.getOperand(0).getValueType()));
53400 }
53401
53402 // Handle AVX512 mask widening.
53403 // Fold not(insert_subvector(undef,sub)) -> insert_subvector(undef,not(sub))
53404 if (ISD::isBuildVectorAllOnes(N1.getNode()) && VT.isVector() &&
53405 VT.getVectorElementType() == MVT::i1 &&
53406 N0.getOpcode() == ISD::INSERT_SUBVECTOR && N0.getOperand(0).isUndef() &&
53407 TLI.isTypeLegal(N0.getOperand(1).getValueType())) {
53408 return DAG.getNode(
53409 ISD::INSERT_SUBVECTOR, SDLoc(N), VT, N0.getOperand(0),
53410 DAG.getNOT(SDLoc(N), N0.getOperand(1), N0.getOperand(1).getValueType()),
53411 N0.getOperand(2));
53412 }
53413
53414 // Fold xor(zext(xor(x,c1)),c2) -> xor(zext(x),xor(zext(c1),c2))
53415 // Fold xor(truncate(xor(x,c1)),c2) -> xor(truncate(x),xor(truncate(c1),c2))
53416 // TODO: Under what circumstances could this be performed in DAGCombine?
53417 if ((N0.getOpcode() == ISD::TRUNCATE || N0.getOpcode() == ISD::ZERO_EXTEND) &&
53418 N0.getOperand(0).getOpcode() == N->getOpcode()) {
53419 SDValue TruncExtSrc = N0.getOperand(0);
53420 auto *N1C = dyn_cast<ConstantSDNode>(N1);
53421 auto *N001C = dyn_cast<ConstantSDNode>(TruncExtSrc.getOperand(1));
53422 if (N1C && !N1C->isOpaque() && N001C && !N001C->isOpaque()) {
53423 SDLoc DL(N);
53424 SDValue LHS = DAG.getZExtOrTrunc(TruncExtSrc.getOperand(0), DL, VT);
53425 SDValue RHS = DAG.getZExtOrTrunc(TruncExtSrc.getOperand(1), DL, VT);
53426 return DAG.getNode(ISD::XOR, DL, VT, LHS,
53427 DAG.getNode(ISD::XOR, DL, VT, RHS, N1));
53428 }
53429 }
53430
53431 if (SDValue R = combineBMILogicOp(N, DAG, Subtarget))
53432 return R;
53433
53434 return combineFneg(N, DAG, DCI, Subtarget);
53435}
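// --- Illustrative standalone sketch (plain C++17, not part of this file) ---
// The xor(zext(xor(x,c1)),c2) fold above is plain Boolean algebra: zext
// distributes over xor, so the two constants can be merged into one.
// Constants below are illustrative only.
#include <cassert>
#include <cstdint>

static void xor_zext_fold_example() {
  std::uint8_t X = 0x5A, C1 = 0x0F;
  std::uint32_t C2 = 0x12340088u;
  std::uint32_t Before = std::uint32_t(std::uint8_t(X ^ C1)) ^ C2;
  std::uint32_t After = std::uint32_t(X) ^ (std::uint32_t(C1) ^ C2);
  assert(Before == After);
}
// ---------------------------------------------------------------------------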
53436
53437static SDValue combineBEXTR(SDNode *N, SelectionDAG &DAG,
53438 TargetLowering::DAGCombinerInfo &DCI,
53439 const X86Subtarget &Subtarget) {
53440 EVT VT = N->getValueType(0);
53441 unsigned NumBits = VT.getSizeInBits();
53442
53443 // TODO - Constant Folding.
53444
53445 // Simplify the inputs.
53446 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
53447 APInt DemandedMask(APInt::getAllOnes(NumBits));
53448 if (TLI.SimplifyDemandedBits(SDValue(N, 0), DemandedMask, DCI))
53449 return SDValue(N, 0);
53450
53451 return SDValue();
53452}
53453
53454static bool isNullFPScalarOrVectorConst(SDValue V) {
53455 return isNullFPConstant(V) || ISD::isBuildVectorAllZeros(V.getNode());
53456}
53457
53458/// If a value is a scalar FP zero or a vector FP zero (potentially including
53459/// undefined elements), return a zero constant that may be used to fold away
53460/// that value. In the case of a vector, the returned constant will not contain
53461/// undefined elements even if the input parameter does. This makes it suitable
53462/// to be used as a replacement operand with operations (eg, bitwise-and) where
53463/// an undef should not propagate.
53464static SDValue getNullFPConstForNullVal(SDValue V, SelectionDAG &DAG,
53465 const X86Subtarget &Subtarget) {
53466 if (!isNullFPScalarOrVectorConst(V))
53467 return SDValue();
53468
53469 if (V.getValueType().isVector())
53470 return getZeroVector(V.getSimpleValueType(), Subtarget, DAG, SDLoc(V));
53471
53472 return V;
53473}
53474
53475static SDValue combineFAndFNotToFAndn(SDNode *N, SelectionDAG &DAG,
53476 const X86Subtarget &Subtarget) {
53477 SDValue N0 = N->getOperand(0);
53478 SDValue N1 = N->getOperand(1);
53479 EVT VT = N->getValueType(0);
53480 SDLoc DL(N);
53481
53482 // Vector types are handled in combineANDXORWithAllOnesIntoANDNP().
53483 if (!((VT == MVT::f32 && Subtarget.hasSSE1()) ||
53484 (VT == MVT::f64 && Subtarget.hasSSE2()) ||
53485 (VT == MVT::v4f32 && Subtarget.hasSSE1() && !Subtarget.hasSSE2())))
53486 return SDValue();
53487
53488 auto isAllOnesConstantFP = [](SDValue V) {
53489 if (V.getSimpleValueType().isVector())
53490 return ISD::isBuildVectorAllOnes(V.getNode());
53491 auto *C = dyn_cast<ConstantFPSDNode>(V);
53492 return C && C->getConstantFPValue()->isAllOnesValue();
53493 };
53494
53495 // fand (fxor X, -1), Y --> fandn X, Y
53496 if (N0.getOpcode() == X86ISD::FXOR && isAllOnesConstantFP(N0.getOperand(1)))
53497 return DAG.getNode(X86ISD::FANDN, DL, VT, N0.getOperand(0), N1);
53498
53499 // fand X, (fxor Y, -1) --> fandn Y, X
53500 if (N1.getOpcode() == X86ISD::FXOR && isAllOnesConstantFP(N1.getOperand(1)))
53501 return DAG.getNode(X86ISD::FANDN, DL, VT, N1.getOperand(0), N0);
53502
53503 return SDValue();
53504}
53505
53506/// Do target-specific dag combines on X86ISD::FAND nodes.
53507static SDValue combineFAnd(SDNode *N, SelectionDAG &DAG,
53508 const X86Subtarget &Subtarget) {
53509 // FAND(0.0, x) -> 0.0
53510 if (SDValue V = getNullFPConstForNullVal(N->getOperand(0), DAG, Subtarget))
53511 return V;
53512
53513 // FAND(x, 0.0) -> 0.0
53514 if (SDValue V = getNullFPConstForNullVal(N->getOperand(1), DAG, Subtarget))
53515 return V;
53516
53517 if (SDValue V = combineFAndFNotToFAndn(N, DAG, Subtarget))
53518 return V;
53519
53520 return lowerX86FPLogicOp(N, DAG, Subtarget);
53521}
53522
53523/// Do target-specific dag combines on X86ISD::FANDN nodes.
53524static SDValue combineFAndn(SDNode *N, SelectionDAG &DAG,
53525 const X86Subtarget &Subtarget) {
53526 // FANDN(0.0, x) -> x
53527 if (isNullFPScalarOrVectorConst(N->getOperand(0)))
53528 return N->getOperand(1);
53529
53530 // FANDN(x, 0.0) -> 0.0
53531 if (SDValue V = getNullFPConstForNullVal(N->getOperand(1), DAG, Subtarget))
53532 return V;
53533
53534 return lowerX86FPLogicOp(N, DAG, Subtarget);
53535}
53536
53537/// Do target-specific dag combines on X86ISD::FOR and X86ISD::FXOR nodes.
53538static SDValue combineFOr(SDNode *N, SelectionDAG &DAG,
53539 TargetLowering::DAGCombinerInfo &DCI,
53540 const X86Subtarget &Subtarget) {
53541  assert(N->getOpcode() == X86ISD::FOR || N->getOpcode() == X86ISD::FXOR);
53542
53543 // F[X]OR(0.0, x) -> x
53544 if (isNullFPScalarOrVectorConst(N->getOperand(0)))
53545 return N->getOperand(1);
53546
53547 // F[X]OR(x, 0.0) -> x
53548 if (isNullFPScalarOrVectorConst(N->getOperand(1)))
53549 return N->getOperand(0);
53550
53551 if (SDValue NewVal = combineFneg(N, DAG, DCI, Subtarget))
53552 return NewVal;
53553
53554 return lowerX86FPLogicOp(N, DAG, Subtarget);
53555}
53556
53557/// Do target-specific dag combines on X86ISD::FMIN and X86ISD::FMAX nodes.
53558static SDValue combineFMinFMax(SDNode *N, SelectionDAG &DAG) {
53559  assert(N->getOpcode() == X86ISD::FMIN || N->getOpcode() == X86ISD::FMAX);
53560
53561 // FMIN/FMAX are commutative if no NaNs and no negative zeros are allowed.
53562 if (!DAG.getTarget().Options.NoNaNsFPMath ||
53563 !DAG.getTarget().Options.NoSignedZerosFPMath)
53564 return SDValue();
53565
53566 // If we run in unsafe-math mode, then convert the FMAX and FMIN nodes
53567 // into FMINC and FMAXC, which are Commutative operations.
53568 unsigned NewOp = 0;
53569 switch (N->getOpcode()) {
53570  default: llvm_unreachable("unknown opcode");
53571 case X86ISD::FMIN: NewOp = X86ISD::FMINC; break;
53572 case X86ISD::FMAX: NewOp = X86ISD::FMAXC; break;
53573 }
53574
53575 return DAG.getNode(NewOp, SDLoc(N), N->getValueType(0),
53576 N->getOperand(0), N->getOperand(1));
53577}
53578
53579static SDValue combineFMinNumFMaxNum(SDNode *N, SelectionDAG &DAG,
53580 const X86Subtarget &Subtarget) {
53581 EVT VT = N->getValueType(0);
53582 if (Subtarget.useSoftFloat() || isSoftFP16(VT, Subtarget))
53583 return SDValue();
53584
53585 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
53586
53587 if (!((Subtarget.hasSSE1() && VT == MVT::f32) ||
53588 (Subtarget.hasSSE2() && VT == MVT::f64) ||
53589 (Subtarget.hasFP16() && VT == MVT::f16) ||
53590 (VT.isVector() && TLI.isTypeLegal(VT))))
53591 return SDValue();
53592
53593 SDValue Op0 = N->getOperand(0);
53594 SDValue Op1 = N->getOperand(1);
53595 SDLoc DL(N);
53596 auto MinMaxOp = N->getOpcode() == ISD::FMAXNUM ? X86ISD::FMAX : X86ISD::FMIN;
53597
53598 // If we don't have to respect NaN inputs, this is a direct translation to x86
53599 // min/max instructions.
53600 if (DAG.getTarget().Options.NoNaNsFPMath || N->getFlags().hasNoNaNs())
53601 return DAG.getNode(MinMaxOp, DL, VT, Op0, Op1, N->getFlags());
53602
53603 // If one of the operands is known non-NaN use the native min/max instructions
53604 // with the non-NaN input as second operand.
53605 if (DAG.isKnownNeverNaN(Op1))
53606 return DAG.getNode(MinMaxOp, DL, VT, Op0, Op1, N->getFlags());
53607 if (DAG.isKnownNeverNaN(Op0))
53608 return DAG.getNode(MinMaxOp, DL, VT, Op1, Op0, N->getFlags());
53609
53610 // If we have to respect NaN inputs, this takes at least 3 instructions.
53611 // Favor a library call when operating on a scalar and minimizing code size.
53612 if (!VT.isVector() && DAG.getMachineFunction().getFunction().hasMinSize())
53613 return SDValue();
53614
53615 EVT SetCCType = TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(),
53616 VT);
53617
53618 // There are 4 possibilities involving NaN inputs, and these are the required
53619 // outputs:
53620 // Op1
53621 // Num NaN
53622 // ----------------
53623 // Num | Max | Op0 |
53624 // Op0 ----------------
53625 // NaN | Op1 | NaN |
53626 // ----------------
53627 //
53628 // The SSE FP max/min instructions were not designed for this case, but rather
53629 // to implement:
53630 // Min = Op1 < Op0 ? Op1 : Op0
53631 // Max = Op1 > Op0 ? Op1 : Op0
53632 //
53633 // So they always return Op0 if either input is a NaN. However, we can still
53634 // use those instructions for fmaxnum by selecting away a NaN input.
53635
53636 // If either operand is NaN, the 2nd source operand (Op0) is passed through.
53637 SDValue MinOrMax = DAG.getNode(MinMaxOp, DL, VT, Op1, Op0);
53638 SDValue IsOp0Nan = DAG.getSetCC(DL, SetCCType, Op0, Op0, ISD::SETUO);
53639
53640  // If Op0 is a NaN, select Op1. Otherwise, select the min/max result. If both
53641  // operands are NaN, the NaN value of Op1 is the result.
53642 return DAG.getSelect(DL, VT, IsOp0Nan, Op1, MinOrMax);
53643}
53644
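
A standalone model of the selection sequence above, assuming scalar doubles instead of DAG nodes: sseMax below is a simplified stand-in for MAXSS/MAXPS behavior (the second source operand is returned whenever the comparison is not true, including on NaN), and the final isnan select recovers fmaxnum semantics:

#include <cassert>
#include <cmath>
#include <limits>

// Simplified model of SSE max: NaN compares false, so the second source
// operand falls through whenever either input is a NaN.
static double sseMax(double a, double b) {
  return (a > b) ? a : b;
}

// The lowering above: max(Op1, Op0), then select Op1 if Op0 is NaN.
static double fmaxnumLowered(double op0, double op1) {
  double minOrMax = sseMax(op1, op0);
  return std::isnan(op0) ? op1 : minOrMax;
}

int main() {
  const double qnan = std::numeric_limits<double>::quiet_NaN();
  assert(fmaxnumLowered(1.0, 2.0) == 2.0);
  assert(fmaxnumLowered(qnan, 2.0) == 2.0);  // NaN in Op0 -> take Op1.
  assert(fmaxnumLowered(1.0, qnan) == 1.0);  // NaN in Op1 -> hw max returns Op0.
  assert(std::isnan(fmaxnumLowered(qnan, qnan)));
  return 0;
}
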
53645static SDValue combineX86INT_TO_FP(SDNode *N, SelectionDAG &DAG,
53646 TargetLowering::DAGCombinerInfo &DCI) {
53647 EVT VT = N->getValueType(0);
53648 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
53649
53650 APInt DemandedElts = APInt::getAllOnes(VT.getVectorNumElements());
53651 if (TLI.SimplifyDemandedVectorElts(SDValue(N, 0), DemandedElts, DCI))
53652 return SDValue(N, 0);
53653
53654 // Convert a full vector load into vzload when not all bits are needed.
53655 SDValue In = N->getOperand(0);
53656 MVT InVT = In.getSimpleValueType();
53657 if (VT.getVectorNumElements() < InVT.getVectorNumElements() &&
53658 ISD::isNormalLoad(In.getNode()) && In.hasOneUse()) {
53659    assert(InVT.is128BitVector() && "Expected 128-bit input vector");
53660 LoadSDNode *LN = cast<LoadSDNode>(N->getOperand(0));
53661 unsigned NumBits = InVT.getScalarSizeInBits() * VT.getVectorNumElements();
53662 MVT MemVT = MVT::getIntegerVT(NumBits);
53663 MVT LoadVT = MVT::getVectorVT(MemVT, 128 / NumBits);
53664 if (SDValue VZLoad = narrowLoadToVZLoad(LN, MemVT, LoadVT, DAG)) {
53665 SDLoc dl(N);
53666 SDValue Convert = DAG.getNode(N->getOpcode(), dl, VT,
53667 DAG.getBitcast(InVT, VZLoad));
53668 DCI.CombineTo(N, Convert);
53669 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), VZLoad.getValue(1));
53670 DCI.recursivelyDeleteUnusedNodes(LN);
53671 return SDValue(N, 0);
53672 }
53673 }
53674
53675 return SDValue();
53676}
53677
53678static SDValue combineCVTP2I_CVTTP2I(SDNode *N, SelectionDAG &DAG,
53679 TargetLowering::DAGCombinerInfo &DCI) {
53680 bool IsStrict = N->isTargetStrictFPOpcode();
53681 EVT VT = N->getValueType(0);
53682
53683 // Convert a full vector load into vzload when not all bits are needed.
53684 SDValue In = N->getOperand(IsStrict ? 1 : 0);
53685 MVT InVT = In.getSimpleValueType();
53686 if (VT.getVectorNumElements() < InVT.getVectorNumElements() &&
53687 ISD::isNormalLoad(In.getNode()) && In.hasOneUse()) {
53688    assert(InVT.is128BitVector() && "Expected 128-bit input vector");
53689 LoadSDNode *LN = cast<LoadSDNode>(In);
53690 unsigned NumBits = InVT.getScalarSizeInBits() * VT.getVectorNumElements();
53691 MVT MemVT = MVT::getFloatingPointVT(NumBits);
53692 MVT LoadVT = MVT::getVectorVT(MemVT, 128 / NumBits);
53693 if (SDValue VZLoad = narrowLoadToVZLoad(LN, MemVT, LoadVT, DAG)) {
53694 SDLoc dl(N);
53695 if (IsStrict) {
53696 SDValue Convert =
53697 DAG.getNode(N->getOpcode(), dl, {VT, MVT::Other},
53698 {N->getOperand(0), DAG.getBitcast(InVT, VZLoad)});
53699 DCI.CombineTo(N, Convert, Convert.getValue(1));
53700 } else {
53701 SDValue Convert =
53702 DAG.getNode(N->getOpcode(), dl, VT, DAG.getBitcast(InVT, VZLoad));
53703 DCI.CombineTo(N, Convert);
53704 }
53705 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), VZLoad.getValue(1));
53706 DCI.recursivelyDeleteUnusedNodes(LN);
53707 return SDValue(N, 0);
53708 }
53709 }
53710
53711 return SDValue();
53712}
53713
53714/// Do target-specific dag combines on X86ISD::ANDNP nodes.
53715static SDValue combineAndnp(SDNode *N, SelectionDAG &DAG,
53716 TargetLowering::DAGCombinerInfo &DCI,
53717 const X86Subtarget &Subtarget) {
53718 SDValue N0 = N->getOperand(0);
53719 SDValue N1 = N->getOperand(1);
53720 MVT VT = N->getSimpleValueType(0);
53721 int NumElts = VT.getVectorNumElements();
53722 unsigned EltSizeInBits = VT.getScalarSizeInBits();
53723
53724 // ANDNP(undef, x) -> 0
53725 // ANDNP(x, undef) -> 0
53726 if (N0.isUndef() || N1.isUndef())
53727 return DAG.getConstant(0, SDLoc(N), VT);
53728
53729 // ANDNP(0, x) -> x
53730 if (ISD::isBuildVectorAllZeros(N0.getNode()))
53731 return N1;
53732
53733 // ANDNP(x, 0) -> 0
53734 if (ISD::isBuildVectorAllZeros(N1.getNode()))
53735 return DAG.getConstant(0, SDLoc(N), VT);
53736
53737 // Turn ANDNP back to AND if input is inverted.
53738 if (SDValue Not = IsNOT(N0, DAG))
53739 return DAG.getNode(ISD::AND, SDLoc(N), VT, DAG.getBitcast(VT, Not), N1);
53740
53741 // Constant Folding
53742 APInt Undefs0, Undefs1;
53743 SmallVector<APInt> EltBits0, EltBits1;
53744 if (getTargetConstantBitsFromNode(N0, EltSizeInBits, Undefs0, EltBits0)) {
53745 SDLoc DL(N);
53746 APInt ResultUndefs = APInt::getZero(NumElts);
53747
53748 if (getTargetConstantBitsFromNode(N1, EltSizeInBits, Undefs1, EltBits1)) {
53749 SmallVector<APInt> ResultBits;
53750 for (int I = 0; I != NumElts; ++I)
53751 ResultBits.push_back(~EltBits0[I] & EltBits1[I]);
53752 return getConstVector(ResultBits, ResultUndefs, VT, DAG, DL);
53753 }
53754
53755 // Constant fold NOT(N0) to allow us to use AND.
53756    // Ensure this is only performed if we can confirm that the bitcasted source
53757    // has one use to prevent an infinite loop with canonicalizeBitSelect.
53758 if (N0->hasOneUse()) {
53759 SDValue BC0 = peekThroughOneUseBitcasts(N0);
53760 if (BC0.getOpcode() != ISD::BITCAST) {
53761 for (APInt &Elt : EltBits0)
53762 Elt = ~Elt;
53763 SDValue Not = getConstVector(EltBits0, ResultUndefs, VT, DAG, DL);
53764 return DAG.getNode(ISD::AND, DL, VT, Not, N1);
53765 }
53766 }
53767 }
53768
53769 // Attempt to recursively combine a bitmask ANDNP with shuffles.
53770 if (VT.isVector() && (VT.getScalarSizeInBits() % 8) == 0) {
53771 SDValue Op(N, 0);
53772 if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
53773 return Res;
53774
53775 // If either operand is a constant mask, then only the elements that aren't
53776 // zero are actually demanded by the other operand.
53777 auto GetDemandedMasks = [&](SDValue Op, bool Invert = false) {
53778 APInt UndefElts;
53779 SmallVector<APInt> EltBits;
53780 APInt DemandedBits = APInt::getAllOnes(EltSizeInBits);
53781 APInt DemandedElts = APInt::getAllOnes(NumElts);
53782 if (getTargetConstantBitsFromNode(Op, EltSizeInBits, UndefElts,
53783 EltBits)) {
53784 DemandedBits.clearAllBits();
53785 DemandedElts.clearAllBits();
53786 for (int I = 0; I != NumElts; ++I) {
53787 if (UndefElts[I]) {
53788 // We can't assume an undef src element gives an undef dst - the
53789 // other src might be zero.
53790 DemandedBits.setAllBits();
53791 DemandedElts.setBit(I);
53792 } else if ((Invert && !EltBits[I].isAllOnes()) ||
53793 (!Invert && !EltBits[I].isZero())) {
53794 DemandedBits |= Invert ? ~EltBits[I] : EltBits[I];
53795 DemandedElts.setBit(I);
53796 }
53797 }
53798 }
53799 return std::make_pair(DemandedBits, DemandedElts);
53800 };
53801 APInt Bits0, Elts0;
53802 APInt Bits1, Elts1;
53803 std::tie(Bits0, Elts0) = GetDemandedMasks(N1);
53804 std::tie(Bits1, Elts1) = GetDemandedMasks(N0, true);
53805
53806 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
53807 if (TLI.SimplifyDemandedVectorElts(N0, Elts0, DCI) ||
53808 TLI.SimplifyDemandedVectorElts(N1, Elts1, DCI) ||
53809 TLI.SimplifyDemandedBits(N0, Bits0, Elts0, DCI) ||
53810 TLI.SimplifyDemandedBits(N1, Bits1, Elts1, DCI)) {
53811 if (N->getOpcode() != ISD::DELETED_NODE)
53812 DCI.AddToWorklist(N);
53813 return SDValue(N, 0);
53814 }
53815 }
53816
53817 return SDValue();
53818}
53819
53820static SDValue combineBT(SDNode *N, SelectionDAG &DAG,
53821 TargetLowering::DAGCombinerInfo &DCI) {
53822 SDValue N1 = N->getOperand(1);
53823
53824 // BT ignores high bits in the bit index operand.
53825 unsigned BitWidth = N1.getValueSizeInBits();
53826 APInt DemandedMask = APInt::getLowBitsSet(BitWidth, Log2_32(BitWidth));
53827 if (DAG.getTargetLoweringInfo().SimplifyDemandedBits(N1, DemandedMask, DCI)) {
53828 if (N->getOpcode() != ISD::DELETED_NODE)
53829 DCI.AddToWorklist(N);
53830 return SDValue(N, 0);
53831 }
53832
53833 return SDValue();
53834}
53835
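
A quick standalone check of the property the BT combine relies on: for a register operand, only the low log2(width) bits of the bit index affect the result, so the high bits of the index can be simplified away (assumption in the sketch: a 32-bit operand):

#include <cassert>
#include <cstdint>

// For a 32-bit operand, the bit-test result only depends on idx % 32,
// which is why the combine may clear the high bits of the index.
static bool bitTest32(uint32_t value, uint32_t idx) {
  return (value >> (idx & 31u)) & 1u;
}

int main() {
  uint32_t v = 0x00010004u;                          // bits 2 and 16 set
  assert(bitTest32(v, 2));
  assert(bitTest32(v, 2 + 32) == bitTest32(v, 2));   // high index bits ignored
  assert(bitTest32(v, 16 + 64) == bitTest32(v, 16));
  assert(!bitTest32(v, 3));
  return 0;
}
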
53836static SDValue combineCVTPH2PS(SDNode *N, SelectionDAG &DAG,
53837 TargetLowering::DAGCombinerInfo &DCI) {
53838 bool IsStrict = N->getOpcode() == X86ISD::STRICT_CVTPH2PS;
53839 SDValue Src = N->getOperand(IsStrict ? 1 : 0);
53840
53841 if (N->getValueType(0) == MVT::v4f32 && Src.getValueType() == MVT::v8i16) {
53842 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
53843 APInt DemandedElts = APInt::getLowBitsSet(8, 4);
53844 if (TLI.SimplifyDemandedVectorElts(Src, DemandedElts, DCI)) {
53845 if (N->getOpcode() != ISD::DELETED_NODE)
53846 DCI.AddToWorklist(N);
53847 return SDValue(N, 0);
53848 }
53849
53850 // Convert a full vector load into vzload when not all bits are needed.
53851 if (ISD::isNormalLoad(Src.getNode()) && Src.hasOneUse()) {
53852 LoadSDNode *LN = cast<LoadSDNode>(N->getOperand(IsStrict ? 1 : 0));
53853 if (SDValue VZLoad = narrowLoadToVZLoad(LN, MVT::i64, MVT::v2i64, DAG)) {
53854 SDLoc dl(N);
53855 if (IsStrict) {
53856 SDValue Convert = DAG.getNode(
53857 N->getOpcode(), dl, {MVT::v4f32, MVT::Other},
53858 {N->getOperand(0), DAG.getBitcast(MVT::v8i16, VZLoad)});
53859 DCI.CombineTo(N, Convert, Convert.getValue(1));
53860 } else {
53861 SDValue Convert = DAG.getNode(N->getOpcode(), dl, MVT::v4f32,
53862 DAG.getBitcast(MVT::v8i16, VZLoad));
53863 DCI.CombineTo(N, Convert);
53864 }
53865
53866 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), VZLoad.getValue(1));
53867 DCI.recursivelyDeleteUnusedNodes(LN);
53868 return SDValue(N, 0);
53869 }
53870 }
53871 }
53872
53873 return SDValue();
53874}
53875
53876// Try to combine sext_in_reg of a cmov of constants by extending the constants.
53877static SDValue combineSextInRegCmov(SDNode *N, SelectionDAG &DAG) {
53878  assert(N->getOpcode() == ISD::SIGN_EXTEND_INREG);
53879
53880 EVT DstVT = N->getValueType(0);
53881
53882 SDValue N0 = N->getOperand(0);
53883 SDValue N1 = N->getOperand(1);
53884 EVT ExtraVT = cast<VTSDNode>(N1)->getVT();
53885
53886 if (ExtraVT != MVT::i8 && ExtraVT != MVT::i16)
53887 return SDValue();
53888
53889 // Look through single use any_extends / truncs.
53890 SDValue IntermediateBitwidthOp;
53891 if ((N0.getOpcode() == ISD::ANY_EXTEND || N0.getOpcode() == ISD::TRUNCATE) &&
53892 N0.hasOneUse()) {
53893 IntermediateBitwidthOp = N0;
53894 N0 = N0.getOperand(0);
53895 }
53896
53897 // See if we have a single use cmov.
53898 if (N0.getOpcode() != X86ISD::CMOV || !N0.hasOneUse())
53899 return SDValue();
53900
53901 SDValue CMovOp0 = N0.getOperand(0);
53902 SDValue CMovOp1 = N0.getOperand(1);
53903
53904 // Make sure both operands are constants.
53905 if (!isa<ConstantSDNode>(CMovOp0.getNode()) ||
53906 !isa<ConstantSDNode>(CMovOp1.getNode()))
53907 return SDValue();
53908
53909 SDLoc DL(N);
53910
53911  // If we looked through an any_extend/trunc above, apply the same op to the constants.
53912 if (IntermediateBitwidthOp) {
53913 unsigned IntermediateOpc = IntermediateBitwidthOp.getOpcode();
53914 CMovOp0 = DAG.getNode(IntermediateOpc, DL, DstVT, CMovOp0);
53915 CMovOp1 = DAG.getNode(IntermediateOpc, DL, DstVT, CMovOp1);
53916 }
53917
53918 CMovOp0 = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, DstVT, CMovOp0, N1);
53919 CMovOp1 = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, DstVT, CMovOp1, N1);
53920
53921 EVT CMovVT = DstVT;
53922 // We do not want i16 CMOV's. Promote to i32 and truncate afterwards.
53923 if (DstVT == MVT::i16) {
53924 CMovVT = MVT::i32;
53925 CMovOp0 = DAG.getNode(ISD::ZERO_EXTEND, DL, CMovVT, CMovOp0);
53926 CMovOp1 = DAG.getNode(ISD::ZERO_EXTEND, DL, CMovVT, CMovOp1);
53927 }
53928
53929 SDValue CMov = DAG.getNode(X86ISD::CMOV, DL, CMovVT, CMovOp0, CMovOp1,
53930 N0.getOperand(2), N0.getOperand(3));
53931
53932 if (CMovVT != DstVT)
53933 CMov = DAG.getNode(ISD::TRUNCATE, DL, DstVT, CMov);
53934
53935 return CMov;
53936}
53937
53938static SDValue combineSignExtendInReg(SDNode *N, SelectionDAG &DAG,
53939 const X86Subtarget &Subtarget) {
53940  assert(N->getOpcode() == ISD::SIGN_EXTEND_INREG);
53941
53942 if (SDValue V = combineSextInRegCmov(N, DAG))
53943 return V;
53944
53945 EVT VT = N->getValueType(0);
53946 SDValue N0 = N->getOperand(0);
53947 SDValue N1 = N->getOperand(1);
53948 EVT ExtraVT = cast<VTSDNode>(N1)->getVT();
53949 SDLoc dl(N);
53950
53951  // The SIGN_EXTEND_INREG to v4i64 is an expensive operation on both
53952  // SSE and AVX2 since there is no sign-extended shift right
53953  // operation on a vector with 64-bit elements.
53954 //(sext_in_reg (v4i64 anyext (v4i32 x )), ExtraVT) ->
53955 // (v4i64 sext (v4i32 sext_in_reg (v4i32 x , ExtraVT)))
53956 if (VT == MVT::v4i64 && (N0.getOpcode() == ISD::ANY_EXTEND ||
53957 N0.getOpcode() == ISD::SIGN_EXTEND)) {
53958 SDValue N00 = N0.getOperand(0);
53959
53960    // EXTLOAD has a better solution on AVX2:
53961    // it may be replaced with an X86ISD::VSEXT node.
53962 if (N00.getOpcode() == ISD::LOAD && Subtarget.hasInt256())
53963 if (!ISD::isNormalLoad(N00.getNode()))
53964 return SDValue();
53965
53966    // Attempt to promote any comparison mask ops before the
53967    // SIGN_EXTEND_INREG gets in the way.
53968 if (SDValue Promote = PromoteMaskArithmetic(N0.getNode(), DAG, Subtarget))
53969 return DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, VT, Promote, N1);
53970
53971 if (N00.getValueType() == MVT::v4i32 && ExtraVT.getSizeInBits() < 128) {
53972 SDValue Tmp =
53973 DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, MVT::v4i32, N00, N1);
53974 return DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i64, Tmp);
53975 }
53976 }
53977 return SDValue();
53978}
53979
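
For reference, sign_extend_inreg on a scalar is just a shift-left/arithmetic-shift-right pair over the container width; the v4i64 special case above exists because SSE/AVX2 have no 64-bit arithmetic vector shift right. A standalone scalar sketch (assumes a 32-bit container and an arithmetic right shift, which is guaranteed from C++20):

#include <cassert>
#include <cstdint>

// sign_extend_inreg: treat the low k bits of x as a signed k-bit value
// and sign-extend it back into the full 32-bit container.
static int32_t signExtendInReg(int32_t x, unsigned k) {
  const unsigned shift = 32 - k;
  return (int32_t)((uint32_t)x << shift) >> shift;
}

int main() {
  assert(signExtendInReg(0x000000FF, 8) == -1);       // i8 0xFF -> -1
  assert(signExtendInReg(0x0000007F, 8) == 127);
  assert(signExtendInReg(0x00008001, 16) == -32767);  // i16 0x8001
  return 0;
}
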
53980/// sext(add_nsw(x, C)) --> add(sext(x), C_sext)
53981/// zext(add_nuw(x, C)) --> add(zext(x), C_zext)
53982/// Promoting a sign/zero extension ahead of a no overflow 'add' exposes
53983/// opportunities to combine math ops, use an LEA, or use a complex addressing
53984/// mode. This can eliminate extend, add, and shift instructions.
53985static SDValue promoteExtBeforeAdd(SDNode *Ext, SelectionDAG &DAG,
53986 const X86Subtarget &Subtarget) {
53987 if (Ext->getOpcode() != ISD::SIGN_EXTEND &&
53988 Ext->getOpcode() != ISD::ZERO_EXTEND)
53989 return SDValue();
53990
53991 // TODO: This should be valid for other integer types.
53992 EVT VT = Ext->getValueType(0);
53993 if (VT != MVT::i64)
53994 return SDValue();
53995
53996 SDValue Add = Ext->getOperand(0);
53997 if (Add.getOpcode() != ISD::ADD)
53998 return SDValue();
53999
54000 bool Sext = Ext->getOpcode() == ISD::SIGN_EXTEND;
54001 bool NSW = Add->getFlags().hasNoSignedWrap();
54002 bool NUW = Add->getFlags().hasNoUnsignedWrap();
54003
54004 // We need an 'add nsw' feeding into the 'sext' or 'add nuw' feeding
54005 // into the 'zext'
54006 if ((Sext && !NSW) || (!Sext && !NUW))
54007 return SDValue();
54008
54009 // Having a constant operand to the 'add' ensures that we are not increasing
54010 // the instruction count because the constant is extended for free below.
54011 // A constant operand can also become the displacement field of an LEA.
54012 auto *AddOp1 = dyn_cast<ConstantSDNode>(Add.getOperand(1));
54013 if (!AddOp1)
54014 return SDValue();
54015
54016 // Don't make the 'add' bigger if there's no hope of combining it with some
54017 // other 'add' or 'shl' instruction.
54018 // TODO: It may be profitable to generate simpler LEA instructions in place
54019 // of single 'add' instructions, but the cost model for selecting an LEA
54020 // currently has a high threshold.
54021 bool HasLEAPotential = false;
54022 for (auto *User : Ext->uses()) {
54023 if (User->getOpcode() == ISD::ADD || User->getOpcode() == ISD::SHL) {
54024 HasLEAPotential = true;
54025 break;
54026 }
54027 }
54028 if (!HasLEAPotential)
54029 return SDValue();
54030
54031 // Everything looks good, so pull the '{s|z}ext' ahead of the 'add'.
54032 int64_t AddConstant = Sext ? AddOp1->getSExtValue() : AddOp1->getZExtValue();
54033 SDValue AddOp0 = Add.getOperand(0);
54034 SDValue NewExt = DAG.getNode(Ext->getOpcode(), SDLoc(Ext), VT, AddOp0);
54035 SDValue NewConstant = DAG.getConstant(AddConstant, SDLoc(Add), VT);
54036
54037 // The wider add is guaranteed to not wrap because both operands are
54038 // sign-extended.
54039 SDNodeFlags Flags;
54040 Flags.setNoSignedWrap(NSW);
54041 Flags.setNoUnsignedWrap(NUW);
54042 return DAG.getNode(ISD::ADD, SDLoc(Add), VT, NewExt, NewConstant, Flags);
54043}
54044
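
A standalone arithmetic check of the identity used above: with no wrap on the narrow add, the extension distributes over it, and a wrapping counterexample shows why the nsw/nuw flags are required (values are arbitrary; two's-complement wrap assumed in the cast):

#include <cassert>
#include <cstdint>

int main() {
  // With no signed wrap, the sign extension distributes over the add.
  int32_t x = 100000;
  int32_t c = 42;                                    // constant operand of the add
  int64_t narrowThenExt = (int64_t)(x + c);          // sext(add_nsw(x, C))
  int64_t extThenWide   = (int64_t)x + (int64_t)c;   // add(sext(x), C_sext)
  assert(narrowThenExt == extThenWide);

  // Counterexample when the narrow add wraps (no nsw): the fold is invalid.
  int32_t big = INT32_MAX;
  int64_t wrapped = (int64_t)(int32_t)((uint32_t)big + 1u);  // wraps to INT32_MIN
  int64_t wide    = (int64_t)big + 1;
  assert(wrapped != wide);
  return 0;
}
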
54045// If we face {ANY,SIGN,ZERO}_EXTEND that is applied to a CMOV with constant
54046// operands and the result of CMOV is not used anywhere else - promote CMOV
54047// itself instead of promoting its result. This could be beneficial, because:
54048// 1) X86TargetLowering::EmitLoweredSelect later can do merging of two
54049// (or more) pseudo-CMOVs only when they go one-after-another and
54050// getting rid of result extension code after CMOV will help that.
54051// 2) Promotion of constant CMOV arguments is free, hence the
54052// {ANY,SIGN,ZERO}_EXTEND will just be deleted.
54053// 3) 16-bit CMOV encoding is 4 bytes, 32-bit CMOV is 3-byte, so this
54054// promotion is also good in terms of code-size.
54055// (64-bit CMOV is 4-bytes, that's why we don't do 32-bit => 64-bit
54056// promotion).
54057static SDValue combineToExtendCMOV(SDNode *Extend, SelectionDAG &DAG) {
54058 SDValue CMovN = Extend->getOperand(0);
54059 if (CMovN.getOpcode() != X86ISD::CMOV || !CMovN.hasOneUse())
54060 return SDValue();
54061
54062 EVT TargetVT = Extend->getValueType(0);
54063 unsigned ExtendOpcode = Extend->getOpcode();
54064 SDLoc DL(Extend);
54065
54066 EVT VT = CMovN.getValueType();
54067 SDValue CMovOp0 = CMovN.getOperand(0);
54068 SDValue CMovOp1 = CMovN.getOperand(1);
54069
54070 if (!isa<ConstantSDNode>(CMovOp0.getNode()) ||
54071 !isa<ConstantSDNode>(CMovOp1.getNode()))
54072 return SDValue();
54073
54074 // Only extend to i32 or i64.
54075 if (TargetVT != MVT::i32 && TargetVT != MVT::i64)
54076 return SDValue();
54077
54078  // Only extend from i16 unless it's a sign_extend from i32. Zext/aext from i32
54079 // are free.
54080 if (VT != MVT::i16 && !(ExtendOpcode == ISD::SIGN_EXTEND && VT == MVT::i32))
54081 return SDValue();
54082
54083  // If this is a zero extend to i64, we should only extend to i32 and use a free
54084 // zero extend to finish.
54085 EVT ExtendVT = TargetVT;
54086 if (TargetVT == MVT::i64 && ExtendOpcode != ISD::SIGN_EXTEND)
54087 ExtendVT = MVT::i32;
54088
54089 CMovOp0 = DAG.getNode(ExtendOpcode, DL, ExtendVT, CMovOp0);
54090 CMovOp1 = DAG.getNode(ExtendOpcode, DL, ExtendVT, CMovOp1);
54091
54092 SDValue Res = DAG.getNode(X86ISD::CMOV, DL, ExtendVT, CMovOp0, CMovOp1,
54093 CMovN.getOperand(2), CMovN.getOperand(3));
54094
54095 // Finish extending if needed.
54096 if (ExtendVT != TargetVT)
54097 Res = DAG.getNode(ExtendOpcode, DL, TargetVT, Res);
54098
54099 return Res;
54100}
54101
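
The promotion above is legal because an extension distributes over a select of constants, and extending the constants costs nothing at compile time. A trivial standalone check with made-up values:

#include <cassert>
#include <cstdint>

static int64_t extAfterCmov(bool cond, int16_t c0, int16_t c1) {
  return (int64_t)(cond ? c0 : c1);             // sext(cmov(c0, c1))
}

static int64_t cmovOfExtended(bool cond, int16_t c0, int16_t c1) {
  return cond ? (int64_t)c0 : (int64_t)c1;      // cmov(sext(c0), sext(c1))
}

int main() {
  for (bool cond : {false, true})
    assert(extAfterCmov(cond, -7, 12345) == cmovOfExtended(cond, -7, 12345));
  return 0;
}
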
54102// Attempt to combine a (sext/zext (setcc)) to a setcc with a xmm/ymm/zmm
54103// result type.
54104static SDValue combineExtSetcc(SDNode *N, SelectionDAG &DAG,
54105 const X86Subtarget &Subtarget) {
54106 SDValue N0 = N->getOperand(0);
54107 EVT VT = N->getValueType(0);
54108 SDLoc dl(N);
54109
54110 // Only do this combine with AVX512 for vector extends.
54111 if (!Subtarget.hasAVX512() || !VT.isVector() || N0.getOpcode() != ISD::SETCC)
54112 return SDValue();
54113
54114 // Only combine legal element types.
54115 EVT SVT = VT.getVectorElementType();
54116 if (SVT != MVT::i8 && SVT != MVT::i16 && SVT != MVT::i32 &&
54117 SVT != MVT::i64 && SVT != MVT::f32 && SVT != MVT::f64)
54118 return SDValue();
54119
54120  // We don't have a CMPP instruction for vXf16.
54121 if (N0.getOperand(0).getValueType().getVectorElementType() == MVT::f16)
54122 return SDValue();
54123  // We can only do this if the vector size is 256 bits or less.
54124 unsigned Size = VT.getSizeInBits();
54125 if (Size > 256 && Subtarget.useAVX512Regs())
54126 return SDValue();
54127
54128 // Don't fold if the condition code can't be handled by PCMPEQ/PCMPGT since
54129  // those are the only integer compares we have.
54130 ISD::CondCode CC = cast<CondCodeSDNode>(N0.getOperand(2))->get();
54131 if (ISD::isUnsignedIntSetCC(CC))
54132 return SDValue();
54133
54134 // Only do this combine if the extension will be fully consumed by the setcc.
54135 EVT N00VT = N0.getOperand(0).getValueType();
54136 EVT MatchingVecType = N00VT.changeVectorElementTypeToInteger();
54137 if (Size != MatchingVecType.getSizeInBits())
54138 return SDValue();
54139
54140 SDValue Res = DAG.getSetCC(dl, VT, N0.getOperand(0), N0.getOperand(1), CC);
54141
54142 if (N->getOpcode() == ISD::ZERO_EXTEND)
54143 Res = DAG.getZeroExtendInReg(Res, dl, N0.getValueType());
54144
54145 return Res;
54146}
54147
54148static SDValue combineSext(SDNode *N, SelectionDAG &DAG,
54149 TargetLowering::DAGCombinerInfo &DCI,
54150 const X86Subtarget &Subtarget) {
54151 SDValue N0 = N->getOperand(0);
54152 EVT VT = N->getValueType(0);
54153 SDLoc DL(N);
54154
54155 // (i32 (sext (i8 (x86isd::setcc_carry)))) -> (i32 (x86isd::setcc_carry))
54156 if (!DCI.isBeforeLegalizeOps() &&
54157 N0.getOpcode() == X86ISD::SETCC_CARRY) {
54158 SDValue Setcc = DAG.getNode(X86ISD::SETCC_CARRY, DL, VT, N0->getOperand(0),
54159 N0->getOperand(1));
54160 bool ReplaceOtherUses = !N0.hasOneUse();
54161 DCI.CombineTo(N, Setcc);
54162 // Replace other uses with a truncate of the widened setcc_carry.
54163 if (ReplaceOtherUses) {
54164 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SDLoc(N0),
54165 N0.getValueType(), Setcc);
54166 DCI.CombineTo(N0.getNode(), Trunc);
54167 }
54168
54169 return SDValue(N, 0);
54170 }
54171
54172 if (SDValue NewCMov = combineToExtendCMOV(N, DAG))
54173 return NewCMov;
54174
54175 if (!DCI.isBeforeLegalizeOps())
54176 return SDValue();
54177
54178 if (SDValue V = combineExtSetcc(N, DAG, Subtarget))
54179 return V;
54180
54181 if (SDValue V = combineToExtendBoolVectorInReg(N->getOpcode(), DL, VT, N0,
54182 DAG, DCI, Subtarget))
54183 return V;
54184
54185 if (VT.isVector()) {
54186 if (SDValue R = PromoteMaskArithmetic(N, DAG, Subtarget))
54187 return R;
54188
54189 if (N0.getOpcode() == ISD::SIGN_EXTEND_VECTOR_INREG)
54190 return DAG.getNode(N0.getOpcode(), DL, VT, N0.getOperand(0));
54191 }
54192
54193 if (SDValue NewAdd = promoteExtBeforeAdd(N, DAG, Subtarget))
54194 return NewAdd;
54195
54196 return SDValue();
54197}
54198
54199static SDValue combineFMA(SDNode *N, SelectionDAG &DAG,
54200 TargetLowering::DAGCombinerInfo &DCI,
54201 const X86Subtarget &Subtarget) {
54202 SDLoc dl(N);
54203 EVT VT = N->getValueType(0);
54204 bool IsStrict = N->isStrictFPOpcode() || N->isTargetStrictFPOpcode();
54205
54206 // Let legalize expand this if it isn't a legal type yet.
54207 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
54208 if (!TLI.isTypeLegal(VT))
54209 return SDValue();
54210
54211 SDValue A = N->getOperand(IsStrict ? 1 : 0);
54212 SDValue B = N->getOperand(IsStrict ? 2 : 1);
54213 SDValue C = N->getOperand(IsStrict ? 3 : 2);
54214
54215 // If the operation allows fast-math and the target does not support FMA,
54216 // split this into mul+add to avoid libcall(s).
54217 SDNodeFlags Flags = N->getFlags();
54218 if (!IsStrict && Flags.hasAllowReassociation() &&
54219 TLI.isOperationExpand(ISD::FMA, VT)) {
54220 SDValue Fmul = DAG.getNode(ISD::FMUL, dl, VT, A, B, Flags);
54221 return DAG.getNode(ISD::FADD, dl, VT, Fmul, C, Flags);
54222 }
54223
54224 EVT ScalarVT = VT.getScalarType();
54225 if (((ScalarVT != MVT::f32 && ScalarVT != MVT::f64) ||
54226 !Subtarget.hasAnyFMA()) &&
54227 !(ScalarVT == MVT::f16 && Subtarget.hasFP16()))
54228 return SDValue();
54229
54230 auto invertIfNegative = [&DAG, &TLI, &DCI](SDValue &V) {
54231 bool CodeSize = DAG.getMachineFunction().getFunction().hasOptSize();
54232 bool LegalOperations = !DCI.isBeforeLegalizeOps();
54233 if (SDValue NegV = TLI.getCheaperNegatedExpression(V, DAG, LegalOperations,
54234 CodeSize)) {
54235 V = NegV;
54236 return true;
54237 }
54238 // Look through extract_vector_elts. If it comes from an FNEG, create a
54239 // new extract from the FNEG input.
54240 if (V.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
54241 isNullConstant(V.getOperand(1))) {
54242 SDValue Vec = V.getOperand(0);
54243 if (SDValue NegV = TLI.getCheaperNegatedExpression(
54244 Vec, DAG, LegalOperations, CodeSize)) {
54245 V = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SDLoc(V), V.getValueType(),
54246 NegV, V.getOperand(1));
54247 return true;
54248 }
54249 }
54250
54251 return false;
54252 };
54253
54254 // Do not convert the passthru input of scalar intrinsics.
54255 // FIXME: We could allow negations of the lower element only.
54256 bool NegA = invertIfNegative(A);
54257 bool NegB = invertIfNegative(B);
54258 bool NegC = invertIfNegative(C);
54259
54260 if (!NegA && !NegB && !NegC)
54261 return SDValue();
54262
54263 unsigned NewOpcode =
54264 negateFMAOpcode(N->getOpcode(), NegA != NegB, NegC, false);
54265
54266 // Propagate fast-math-flags to new FMA node.
54267 SelectionDAG::FlagInserter FlagsInserter(DAG, Flags);
54268 if (IsStrict) {
54269    assert(N->getNumOperands() == 4 && "Shouldn't be greater than 4");
54270 return DAG.getNode(NewOpcode, dl, {VT, MVT::Other},
54271 {N->getOperand(0), A, B, C});
54272 } else {
54273 if (N->getNumOperands() == 4)
54274 return DAG.getNode(NewOpcode, dl, VT, A, B, C, N->getOperand(3));
54275 return DAG.getNode(NewOpcode, dl, VT, A, B, C);
54276 }
54277}
54278
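
A standalone check of the negation folding above: because an FMA rounds only once, negating an input is the same as switching to the corresponding negated opcode, e.g. FMA(FNEG(a), b, c) behaves like an FNMADD. The values below are chosen so every intermediate is exactly representable:

#include <cassert>
#include <cmath>

int main() {
  // Exactly representable values: 1.25 * -3.5 = -4.375 with no rounding.
  double a = 1.25, b = -3.5, c = 10.0;

  // FMA(FNEG(a), b, c) corresponds to an FNMADD-style node: -(a*b) + c.
  assert(std::fma(-a, b, c) == -(a * b) + c);

  // FMA(a, b, FNEG(c)) corresponds to an FMSUB-style node: a*b - c.
  assert(std::fma(a, b, -c) == a * b - c);
  return 0;
}
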
54279// Combine FMADDSUB(A, B, FNEG(C)) -> FMSUBADD(A, B, C)
54280// Combine FMSUBADD(A, B, FNEG(C)) -> FMADDSUB(A, B, C)
54281static SDValue combineFMADDSUB(SDNode *N, SelectionDAG &DAG,
54282 TargetLowering::DAGCombinerInfo &DCI) {
54283 SDLoc dl(N);
54284 EVT VT = N->getValueType(0);
54285 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
54286 bool CodeSize = DAG.getMachineFunction().getFunction().hasOptSize();
54287 bool LegalOperations = !DCI.isBeforeLegalizeOps();
54288
54289 SDValue N2 = N->getOperand(2);
54290
54291 SDValue NegN2 =
54292 TLI.getCheaperNegatedExpression(N2, DAG, LegalOperations, CodeSize);
54293 if (!NegN2)
54294 return SDValue();
54295 unsigned NewOpcode = negateFMAOpcode(N->getOpcode(), false, true, false);
54296
54297 if (N->getNumOperands() == 4)
54298 return DAG.getNode(NewOpcode, dl, VT, N->getOperand(0), N->getOperand(1),
54299 NegN2, N->getOperand(3));
54300 return DAG.getNode(NewOpcode, dl, VT, N->getOperand(0), N->getOperand(1),
54301 NegN2);
54302}
54303
54304static SDValue combineZext(SDNode *N, SelectionDAG &DAG,
54305 TargetLowering::DAGCombinerInfo &DCI,
54306 const X86Subtarget &Subtarget) {
54307 SDLoc dl(N);
54308 SDValue N0 = N->getOperand(0);
54309 EVT VT = N->getValueType(0);
54310
54311 // (i32 (aext (i8 (x86isd::setcc_carry)))) -> (i32 (x86isd::setcc_carry))
54312 // FIXME: Is this needed? We don't seem to have any tests for it.
54313 if (!DCI.isBeforeLegalizeOps() && N->getOpcode() == ISD::ANY_EXTEND &&
54314 N0.getOpcode() == X86ISD::SETCC_CARRY) {
54315 SDValue Setcc = DAG.getNode(X86ISD::SETCC_CARRY, dl, VT, N0->getOperand(0),
54316 N0->getOperand(1));
54317 bool ReplaceOtherUses = !N0.hasOneUse();
54318 DCI.CombineTo(N, Setcc);
54319 // Replace other uses with a truncate of the widened setcc_carry.
54320 if (ReplaceOtherUses) {
54321 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SDLoc(N0),
54322 N0.getValueType(), Setcc);
54323 DCI.CombineTo(N0.getNode(), Trunc);
54324 }
54325
54326 return SDValue(N, 0);
54327 }
54328
54329 if (SDValue NewCMov = combineToExtendCMOV(N, DAG))
54330 return NewCMov;
54331
54332 if (DCI.isBeforeLegalizeOps())
54333 if (SDValue V = combineExtSetcc(N, DAG, Subtarget))
54334 return V;
54335
54336 if (SDValue V = combineToExtendBoolVectorInReg(N->getOpcode(), dl, VT, N0,
54337 DAG, DCI, Subtarget))
54338 return V;
54339
54340 if (VT.isVector())
54341 if (SDValue R = PromoteMaskArithmetic(N, DAG, Subtarget))
54342 return R;
54343
54344 if (SDValue NewAdd = promoteExtBeforeAdd(N, DAG, Subtarget))
54345 return NewAdd;
54346
54347 if (SDValue R = combineOrCmpEqZeroToCtlzSrl(N, DAG, DCI, Subtarget))
54348 return R;
54349
54350 // TODO: Combine with any target/faux shuffle.
54351 if (N0.getOpcode() == X86ISD::PACKUS && N0.getValueSizeInBits() == 128 &&
54352 VT.getScalarSizeInBits() == N0.getOperand(0).getScalarValueSizeInBits()) {
54353 SDValue N00 = N0.getOperand(0);
54354 SDValue N01 = N0.getOperand(1);
54355 unsigned NumSrcEltBits = N00.getScalarValueSizeInBits();
54356 APInt ZeroMask = APInt::getHighBitsSet(NumSrcEltBits, NumSrcEltBits / 2);
54357 if ((N00.isUndef() || DAG.MaskedValueIsZero(N00, ZeroMask)) &&
54358 (N01.isUndef() || DAG.MaskedValueIsZero(N01, ZeroMask))) {
54359 return concatSubVectors(N00, N01, DAG, dl);
54360 }
54361 }
54362
54363 return SDValue();
54364}
54365
54366/// If we have AVX512, but not BWI and this is a vXi16/vXi8 setcc, just
54367/// pre-promote its result type since vXi1 vectors don't get promoted
54368/// during type legalization.
54369static SDValue truncateAVX512SetCCNoBWI(EVT VT, EVT OpVT, SDValue LHS,
54370 SDValue RHS, ISD::CondCode CC,
54371 const SDLoc &DL, SelectionDAG &DAG,
54372 const X86Subtarget &Subtarget) {
54373 if (Subtarget.hasAVX512() && !Subtarget.hasBWI() && VT.isVector() &&
54374 VT.getVectorElementType() == MVT::i1 &&
54375 (OpVT.getVectorElementType() == MVT::i8 ||
54376 OpVT.getVectorElementType() == MVT::i16)) {
54377 SDValue Setcc = DAG.getSetCC(DL, OpVT, LHS, RHS, CC);
54378 return DAG.getNode(ISD::TRUNCATE, DL, VT, Setcc);
54379 }
54380 return SDValue();
54381}
54382
54383static SDValue combineSetCC(SDNode *N, SelectionDAG &DAG,
54384 TargetLowering::DAGCombinerInfo &DCI,
54385 const X86Subtarget &Subtarget) {
54386 const ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get();
54387 const SDValue LHS = N->getOperand(0);
54388 const SDValue RHS = N->getOperand(1);
54389 EVT VT = N->getValueType(0);
54390 EVT OpVT = LHS.getValueType();
54391 SDLoc DL(N);
54392
54393 if (CC == ISD::SETNE || CC == ISD::SETEQ) {
54394 if (SDValue V = combineVectorSizedSetCCEquality(VT, LHS, RHS, CC, DL, DAG,
54395 Subtarget))
54396 return V;
54397
54398 if (VT == MVT::i1) {
54399 X86::CondCode X86CC;
54400 if (SDValue V =
54401 MatchVectorAllEqualTest(LHS, RHS, CC, DL, Subtarget, DAG, X86CC))
54402 return DAG.getNode(ISD::TRUNCATE, DL, VT, getSETCC(X86CC, V, DL, DAG));
54403 }
54404
54405 if (OpVT.isScalarInteger()) {
54406 // cmpeq(or(X,Y),X) --> cmpeq(and(~X,Y),0)
54407 // cmpne(or(X,Y),X) --> cmpne(and(~X,Y),0)
54408 auto MatchOrCmpEq = [&](SDValue N0, SDValue N1) {
54409 if (N0.getOpcode() == ISD::OR && N0->hasOneUse()) {
54410 if (N0.getOperand(0) == N1)
54411 return DAG.getNode(ISD::AND, DL, OpVT, DAG.getNOT(DL, N1, OpVT),
54412 N0.getOperand(1));
54413 if (N0.getOperand(1) == N1)
54414 return DAG.getNode(ISD::AND, DL, OpVT, DAG.getNOT(DL, N1, OpVT),
54415 N0.getOperand(0));
54416 }
54417 return SDValue();
54418 };
54419 if (SDValue AndN = MatchOrCmpEq(LHS, RHS))
54420 return DAG.getSetCC(DL, VT, AndN, DAG.getConstant(0, DL, OpVT), CC);
54421 if (SDValue AndN = MatchOrCmpEq(RHS, LHS))
54422 return DAG.getSetCC(DL, VT, AndN, DAG.getConstant(0, DL, OpVT), CC);
54423
54424 // cmpeq(and(X,Y),Y) --> cmpeq(and(~X,Y),0)
54425 // cmpne(and(X,Y),Y) --> cmpne(and(~X,Y),0)
54426 auto MatchAndCmpEq = [&](SDValue N0, SDValue N1) {
54427 if (N0.getOpcode() == ISD::AND && N0->hasOneUse()) {
54428 if (N0.getOperand(0) == N1)
54429 return DAG.getNode(ISD::AND, DL, OpVT, N1,
54430 DAG.getNOT(DL, N0.getOperand(1), OpVT));
54431 if (N0.getOperand(1) == N1)
54432 return DAG.getNode(ISD::AND, DL, OpVT, N1,
54433 DAG.getNOT(DL, N0.getOperand(0), OpVT));
54434 }
54435 return SDValue();
54436 };
54437 if (SDValue AndN = MatchAndCmpEq(LHS, RHS))
54438 return DAG.getSetCC(DL, VT, AndN, DAG.getConstant(0, DL, OpVT), CC);
54439 if (SDValue AndN = MatchAndCmpEq(RHS, LHS))
54440 return DAG.getSetCC(DL, VT, AndN, DAG.getConstant(0, DL, OpVT), CC);
54441
54442 // cmpeq(trunc(x),0) --> cmpeq(x,0)
54443 // cmpne(trunc(x),0) --> cmpne(x,0)
54444 // iff x upper bits are zero.
54445 // TODO: Add support for RHS to be truncate as well?
54446 if (LHS.getOpcode() == ISD::TRUNCATE &&
54447 LHS.getOperand(0).getScalarValueSizeInBits() >= 32 &&
54448 isNullConstant(RHS) && !DCI.isBeforeLegalize()) {
54449 EVT SrcVT = LHS.getOperand(0).getValueType();
54450 APInt UpperBits = APInt::getBitsSetFrom(SrcVT.getScalarSizeInBits(),
54451 OpVT.getScalarSizeInBits());
54452 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
54453 if (DAG.MaskedValueIsZero(LHS.getOperand(0), UpperBits) &&
54454 TLI.isTypeLegal(LHS.getOperand(0).getValueType()))
54455 return DAG.getSetCC(DL, VT, LHS.getOperand(0),
54456 DAG.getConstant(0, DL, SrcVT), CC);
54457 }
54458
54459      // With C as a power of 2 and C != 0 and C != INT_MIN:
54460      //    icmp eq Abs(X), C ->
54461      //        (icmp eq X, C) | (icmp eq X, -C)
54462      //    icmp ne Abs(X), C ->
54463      //        (icmp ne X, C) & (icmp ne X, -C)
54464 // Both of these patterns can be better optimized in
54465 // DAGCombiner::foldAndOrOfSETCC. Note this only applies for scalar
54466 // integers which is checked above.
54467 if (LHS.getOpcode() == ISD::ABS && LHS.hasOneUse()) {
54468 if (auto *C = dyn_cast<ConstantSDNode>(RHS)) {
54469 const APInt &CInt = C->getAPIntValue();
54470 // We can better optimize this case in DAGCombiner::foldAndOrOfSETCC.
54471 if (CInt.isPowerOf2() && !CInt.isMinSignedValue()) {
54472 SDValue BaseOp = LHS.getOperand(0);
54473 SDValue SETCC0 = DAG.getSetCC(DL, VT, BaseOp, RHS, CC);
54474 SDValue SETCC1 = DAG.getSetCC(
54475 DL, VT, BaseOp, DAG.getConstant(-CInt, DL, OpVT), CC);
54476 return DAG.getNode(CC == ISD::SETEQ ? ISD::OR : ISD::AND, DL, VT,
54477 SETCC0, SETCC1);
54478 }
54479 }
54480 }
54481 }
54482 }
54483
54484 if (VT.isVector() && VT.getVectorElementType() == MVT::i1 &&
54485 (CC == ISD::SETNE || CC == ISD::SETEQ || ISD::isSignedIntSetCC(CC))) {
54486 // Using temporaries to avoid messing up operand ordering for later
54487 // transformations if this doesn't work.
54488 SDValue Op0 = LHS;
54489 SDValue Op1 = RHS;
54490 ISD::CondCode TmpCC = CC;
54491 // Put build_vector on the right.
54492 if (Op0.getOpcode() == ISD::BUILD_VECTOR) {
54493 std::swap(Op0, Op1);
54494 TmpCC = ISD::getSetCCSwappedOperands(TmpCC);
54495 }
54496
54497 bool IsSEXT0 =
54498 (Op0.getOpcode() == ISD::SIGN_EXTEND) &&
54499 (Op0.getOperand(0).getValueType().getVectorElementType() == MVT::i1);
54500 bool IsVZero1 = ISD::isBuildVectorAllZeros(Op1.getNode());
54501
54502 if (IsSEXT0 && IsVZero1) {
54503      assert(VT == Op0.getOperand(0).getValueType() &&
54504             "Unexpected operand type");
54505 if (TmpCC == ISD::SETGT)
54506 return DAG.getConstant(0, DL, VT);
54507 if (TmpCC == ISD::SETLE)
54508 return DAG.getConstant(1, DL, VT);
54509 if (TmpCC == ISD::SETEQ || TmpCC == ISD::SETGE)
54510 return DAG.getNOT(DL, Op0.getOperand(0), VT);
54511
54512      assert((TmpCC == ISD::SETNE || TmpCC == ISD::SETLT) &&
54513             "Unexpected condition code!");
54514 return Op0.getOperand(0);
54515 }
54516 }
54517
54518  // Try to make an unsigned vector comparison signed. On pre-AVX512 targets the
54519  // only ordered integer comparison available is the signed `PCMPGT`, and on
54520  // AVX512 it is often better to use `PCMPGT` if the result is meant to stay in
54521  // a vector (and if it is going to a mask, there are signed AVX512 comparisons).
54522 if (VT.isVector() && OpVT.isVector() && OpVT.isInteger()) {
54523 bool CanMakeSigned = false;
54524 if (ISD::isUnsignedIntSetCC(CC)) {
54525 KnownBits CmpKnown = KnownBits::commonBits(DAG.computeKnownBits(LHS),
54526 DAG.computeKnownBits(RHS));
54527 // If we know LHS/RHS share the same sign bit at each element we can
54528 // make this signed.
54529 // NOTE: `computeKnownBits` on a vector type aggregates common bits
54530 // across all lanes. So a pattern where the sign varies from lane to
54531 // lane, but at each lane Sign(LHS) is known to equal Sign(RHS), will be
54532 // missed. We could get around this by demanding each lane
54533 // independently, but this isn't the most important optimization and
54534 // that may eat into compile time.
54535 CanMakeSigned =
54536 CmpKnown.Zero.isSignBitSet() || CmpKnown.One.isSignBitSet();
54537 }
54538 if (CanMakeSigned || ISD::isSignedIntSetCC(CC)) {
54539 SDValue LHSOut = LHS;
54540 SDValue RHSOut = RHS;
54541 ISD::CondCode NewCC = CC;
54542 switch (CC) {
54543 case ISD::SETGE:
54544 case ISD::SETUGE:
54545 if (SDValue NewLHS = incDecVectorConstant(LHS, DAG, /*IsInc*/ true,
54546 /*NSW*/ true))
54547 LHSOut = NewLHS;
54548 else if (SDValue NewRHS = incDecVectorConstant(
54549 RHS, DAG, /*IsInc*/ false, /*NSW*/ true))
54550 RHSOut = NewRHS;
54551 else
54552 break;
54553
54554 [[fallthrough]];
54555 case ISD::SETUGT:
54556 NewCC = ISD::SETGT;
54557 break;
54558
54559 case ISD::SETLE:
54560 case ISD::SETULE:
54561 if (SDValue NewLHS = incDecVectorConstant(LHS, DAG, /*IsInc*/ false,
54562 /*NSW*/ true))
54563 LHSOut = NewLHS;
54564 else if (SDValue NewRHS = incDecVectorConstant(RHS, DAG, /*IsInc*/ true,
54565 /*NSW*/ true))
54566 RHSOut = NewRHS;
54567 else
54568 break;
54569
54570 [[fallthrough]];
54571 case ISD::SETULT:
54572 // Will be swapped to SETGT in LowerVSETCC*.
54573 NewCC = ISD::SETLT;
54574 break;
54575 default:
54576 break;
54577 }
54578 if (NewCC != CC) {
54579 if (SDValue R = truncateAVX512SetCCNoBWI(VT, OpVT, LHSOut, RHSOut,
54580 NewCC, DL, DAG, Subtarget))
54581 return R;
54582 return DAG.getSetCC(DL, VT, LHSOut, RHSOut, NewCC);
54583 }
54584 }
54585 }
54586
54587 if (SDValue R =
54588 truncateAVX512SetCCNoBWI(VT, OpVT, LHS, RHS, CC, DL, DAG, Subtarget))
54589 return R;
54590
54591 // For an SSE1-only target, lower a comparison of v4f32 to X86ISD::CMPP early
54592 // to avoid scalarization via legalization because v4i32 is not a legal type.
54593 if (Subtarget.hasSSE1() && !Subtarget.hasSSE2() && VT == MVT::v4i32 &&
54594 LHS.getValueType() == MVT::v4f32)
54595 return LowerVSETCC(SDValue(N, 0), Subtarget, DAG);
54596
54597 // X pred 0.0 --> X pred -X
54598 // If the negation of X already exists, use it in the comparison. This removes
54599 // the need to materialize 0.0 and allows matching to SSE's MIN/MAX
54600 // instructions in patterns with a 'select' node.
54601 if (isNullFPScalarOrVectorConst(RHS)) {
54602 SDVTList FNegVT = DAG.getVTList(OpVT);
54603 if (SDNode *FNeg = DAG.getNodeIfExists(ISD::FNEG, FNegVT, {LHS}))
54604 return DAG.getSetCC(DL, VT, LHS, SDValue(FNeg, 0), CC);
54605 }
54606
54607 return SDValue();
54608}
54609
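
The scalar-integer rewrites above, cmpeq(or(X,Y),X) --> cmpeq(and(~X,Y),0) and cmpeq(and(X,Y),Y) --> cmpeq(and(~X,Y),0), are plain bit identities; a brute-force standalone check over small values:

#include <cassert>
#include <cstdint>

int main() {
  for (uint32_t x = 0; x < 32; ++x) {
    for (uint32_t y = 0; y < 32; ++y) {
      // cmpeq(or(X,Y),X) <=> cmpeq(and(~X,Y),0)
      assert(((x | y) == x) == ((~x & y) == 0));
      // cmpeq(and(X,Y),Y) <=> cmpeq(and(~X,Y),0)
      assert(((x & y) == y) == ((~x & y) == 0));
    }
  }
  return 0;
}
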
54610static SDValue combineMOVMSK(SDNode *N, SelectionDAG &DAG,
54611 TargetLowering::DAGCombinerInfo &DCI,
54612 const X86Subtarget &Subtarget) {
54613 SDValue Src = N->getOperand(0);
54614 MVT SrcVT = Src.getSimpleValueType();
54615 MVT VT = N->getSimpleValueType(0);
54616 unsigned NumBits = VT.getScalarSizeInBits();
54617 unsigned NumElts = SrcVT.getVectorNumElements();
54618 unsigned NumBitsPerElt = SrcVT.getScalarSizeInBits();
54619  assert(VT == MVT::i32 && NumElts <= NumBits && "Unexpected MOVMSK types");
54620
54621 // Perform constant folding.
54622 APInt UndefElts;
54623 SmallVector<APInt, 32> EltBits;
54624 if (getTargetConstantBitsFromNode(Src, NumBitsPerElt, UndefElts, EltBits)) {
54625 APInt Imm(32, 0);
54626 for (unsigned Idx = 0; Idx != NumElts; ++Idx)
54627 if (!UndefElts[Idx] && EltBits[Idx].isNegative())
54628 Imm.setBit(Idx);
54629
54630 return DAG.getConstant(Imm, SDLoc(N), VT);
54631 }
54632
54633 // Look through int->fp bitcasts that don't change the element width.
54634 unsigned EltWidth = SrcVT.getScalarSizeInBits();
54635 if (Subtarget.hasSSE2() && Src.getOpcode() == ISD::BITCAST &&
54636 Src.getOperand(0).getScalarValueSizeInBits() == EltWidth)
54637 return DAG.getNode(X86ISD::MOVMSK, SDLoc(N), VT, Src.getOperand(0));
54638
54639 // Fold movmsk(not(x)) -> not(movmsk(x)) to improve folding of movmsk results
54640 // with scalar comparisons.
54641 if (SDValue NotSrc = IsNOT(Src, DAG)) {
54642 SDLoc DL(N);
54643 APInt NotMask = APInt::getLowBitsSet(NumBits, NumElts);
54644 NotSrc = DAG.getBitcast(SrcVT, NotSrc);
54645 return DAG.getNode(ISD::XOR, DL, VT,
54646 DAG.getNode(X86ISD::MOVMSK, DL, VT, NotSrc),
54647 DAG.getConstant(NotMask, DL, VT));
54648 }
54649
54650 // Fold movmsk(icmp_sgt(x,-1)) -> not(movmsk(x)) to improve folding of movmsk
54651 // results with scalar comparisons.
54652 if (Src.getOpcode() == X86ISD::PCMPGT &&
54653 ISD::isBuildVectorAllOnes(Src.getOperand(1).getNode())) {
54654 SDLoc DL(N);
54655 APInt NotMask = APInt::getLowBitsSet(NumBits, NumElts);
54656 return DAG.getNode(ISD::XOR, DL, VT,
54657 DAG.getNode(X86ISD::MOVMSK, DL, VT, Src.getOperand(0)),
54658 DAG.getConstant(NotMask, DL, VT));
54659 }
54660
54661 // Fold movmsk(icmp_eq(and(x,c1),c1)) -> movmsk(shl(x,c2))
54662 // Fold movmsk(icmp_eq(and(x,c1),0)) -> movmsk(not(shl(x,c2)))
54663 // iff pow2splat(c1).
54664 // Use KnownBits to determine if only a single bit is non-zero
54665 // in each element (pow2 or zero), and shift that bit to the msb.
54666 if (Src.getOpcode() == X86ISD::PCMPEQ) {
54667 KnownBits KnownLHS = DAG.computeKnownBits(Src.getOperand(0));
54668 KnownBits KnownRHS = DAG.computeKnownBits(Src.getOperand(1));
54669 unsigned ShiftAmt = KnownLHS.countMinLeadingZeros();
54670 if (KnownLHS.countMaxPopulation() == 1 &&
54671 (KnownRHS.isZero() || (KnownRHS.countMaxPopulation() == 1 &&
54672 ShiftAmt == KnownRHS.countMinLeadingZeros()))) {
54673 SDLoc DL(N);
54674 MVT ShiftVT = SrcVT;
54675 SDValue ShiftLHS = Src.getOperand(0);
54676 SDValue ShiftRHS = Src.getOperand(1);
54677 if (ShiftVT.getScalarType() == MVT::i8) {
54678 // vXi8 shifts - we only care about the signbit so can use PSLLW.
54679 ShiftVT = MVT::getVectorVT(MVT::i16, NumElts / 2);
54680 ShiftLHS = DAG.getBitcast(ShiftVT, ShiftLHS);
54681 ShiftRHS = DAG.getBitcast(ShiftVT, ShiftRHS);
54682 }
54683 ShiftLHS = getTargetVShiftByConstNode(X86ISD::VSHLI, DL, ShiftVT,
54684 ShiftLHS, ShiftAmt, DAG);
54685 ShiftRHS = getTargetVShiftByConstNode(X86ISD::VSHLI, DL, ShiftVT,
54686 ShiftRHS, ShiftAmt, DAG);
54687 ShiftLHS = DAG.getBitcast(SrcVT, ShiftLHS);
54688 ShiftRHS = DAG.getBitcast(SrcVT, ShiftRHS);
54689 SDValue Res = DAG.getNode(ISD::XOR, DL, SrcVT, ShiftLHS, ShiftRHS);
54690 return DAG.getNode(X86ISD::MOVMSK, DL, VT, DAG.getNOT(DL, Res, SrcVT));
54691 }
54692 }
54693
54694 // Fold movmsk(logic(X,C)) -> logic(movmsk(X),C)
54695 if (N->isOnlyUserOf(Src.getNode())) {
54696 SDValue SrcBC = peekThroughOneUseBitcasts(Src);
54697 if (ISD::isBitwiseLogicOp(SrcBC.getOpcode())) {
54698 APInt UndefElts;
54699 SmallVector<APInt, 32> EltBits;
54700 if (getTargetConstantBitsFromNode(SrcBC.getOperand(1), NumBitsPerElt,
54701 UndefElts, EltBits)) {
54702 APInt Mask = APInt::getZero(NumBits);
54703 for (unsigned Idx = 0; Idx != NumElts; ++Idx) {
54704 if (!UndefElts[Idx] && EltBits[Idx].isNegative())
54705 Mask.setBit(Idx);
54706 }
54707 SDLoc DL(N);
54708 SDValue NewSrc = DAG.getBitcast(SrcVT, SrcBC.getOperand(0));
54709 SDValue NewMovMsk = DAG.getNode(X86ISD::MOVMSK, DL, VT, NewSrc);
54710 return DAG.getNode(SrcBC.getOpcode(), DL, VT, NewMovMsk,
54711 DAG.getConstant(Mask, DL, VT));
54712 }
54713 }
54714 }
54715
54716 // Simplify the inputs.
54717 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
54718 APInt DemandedMask(APInt::getAllOnes(NumBits));
54719 if (TLI.SimplifyDemandedBits(SDValue(N, 0), DemandedMask, DCI))
54720 return SDValue(N, 0);
54721
54722 return SDValue();
54723}
54724
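
A scalar model of the movmsk(not(x)) fold above: collecting sign bits of an inverted vector equals inverting the collected mask within the low NumElts bits. movemask8 below is a made-up 8-lane model, not an intrinsic:

#include <array>
#include <cassert>
#include <cstdint>

// Made-up model of an 8-lane byte MOVMSK: one bit per element sign bit.
static uint32_t movemask8(const std::array<int8_t, 8> &v) {
  uint32_t mask = 0;
  for (unsigned i = 0; i != 8; ++i)
    if (v[i] < 0)
      mask |= 1u << i;
  return mask;
}

int main() {
  std::array<int8_t, 8> src = {-1, 3, -7, 0, 127, -128, 5, -2};
  std::array<int8_t, 8> inverted{};
  for (unsigned i = 0; i != 8; ++i)
    inverted[i] = (int8_t)~src[i];           // bitwise NOT flips each sign bit

  // movmsk(not(x)) == xor(movmsk(x), low-bits mask of NumElts bits)
  uint32_t lowMask = (1u << 8) - 1;
  assert(movemask8(inverted) == (movemask8(src) ^ lowMask));
  return 0;
}
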
54725static SDValue combineTESTP(SDNode *N, SelectionDAG &DAG,
54726 TargetLowering::DAGCombinerInfo &DCI,
54727 const X86Subtarget &Subtarget) {
54728 MVT VT = N->getSimpleValueType(0);
54729 unsigned NumBits = VT.getScalarSizeInBits();
54730
54731 // Simplify the inputs.
54732 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
54733 APInt DemandedMask(APInt::getAllOnes(NumBits));
54734 if (TLI.SimplifyDemandedBits(SDValue(N, 0), DemandedMask, DCI))
54735 return SDValue(N, 0);
54736
54737 return SDValue();
54738}
54739
54740static SDValue combineX86GatherScatter(SDNode *N, SelectionDAG &DAG,
54741 TargetLowering::DAGCombinerInfo &DCI,
54742 const X86Subtarget &Subtarget) {
54743 auto *MemOp = cast<X86MaskedGatherScatterSDNode>(N);
54744 SDValue BasePtr = MemOp->getBasePtr();
54745 SDValue Index = MemOp->getIndex();
54746 SDValue Scale = MemOp->getScale();
54747 SDValue Mask = MemOp->getMask();
54748
54749 // Attempt to fold an index scale into the scale value directly.
54750 // For smaller indices, implicit sext is performed BEFORE scale, preventing
54751 // this fold under most circumstances.
54752 // TODO: Move this into X86DAGToDAGISel::matchVectorAddressRecursively?
54753 if ((Index.getOpcode() == X86ISD::VSHLI ||
54754 (Index.getOpcode() == ISD::ADD &&
54755 Index.getOperand(0) == Index.getOperand(1))) &&
54756 isa<ConstantSDNode>(Scale) &&
54757 BasePtr.getScalarValueSizeInBits() == Index.getScalarValueSizeInBits()) {
54758 unsigned ShiftAmt =
54759 Index.getOpcode() == ISD::ADD ? 1 : Index.getConstantOperandVal(1);
54760 uint64_t ScaleAmt = cast<ConstantSDNode>(Scale)->getZExtValue();
54761 uint64_t NewScaleAmt = ScaleAmt * (1ULL << ShiftAmt);
54762 if (isPowerOf2_64(NewScaleAmt) && NewScaleAmt <= 8) {
54763 SDValue NewIndex = Index.getOperand(0);
54764 SDValue NewScale =
54765 DAG.getTargetConstant(NewScaleAmt, SDLoc(N), Scale.getValueType());
54766 if (N->getOpcode() == X86ISD::MGATHER)
54767 return getAVX2GatherNode(N->getOpcode(), SDValue(N, 0), DAG,
54768 MemOp->getOperand(1), Mask,
54769 MemOp->getBasePtr(), NewIndex, NewScale,
54770 MemOp->getChain(), Subtarget);
54771 if (N->getOpcode() == X86ISD::MSCATTER)
54772 return getScatterNode(N->getOpcode(), SDValue(N, 0), DAG,
54773 MemOp->getOperand(1), Mask, MemOp->getBasePtr(),
54774 NewIndex, NewScale, MemOp->getChain(), Subtarget);
54775 }
54776 }
54777
54778 // With vector masks we only demand the upper bit of the mask.
54779 if (Mask.getScalarValueSizeInBits() != 1) {
54780 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
54781 APInt DemandedMask(APInt::getSignMask(Mask.getScalarValueSizeInBits()));
54782 if (TLI.SimplifyDemandedBits(Mask, DemandedMask, DCI)) {
54783 if (N->getOpcode() != ISD::DELETED_NODE)
54784 DCI.AddToWorklist(N);
54785 return SDValue(N, 0);
54786 }
54787 }
54788
54789 return SDValue();
54790}
54791
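
The scale fold above uses the address identity base + (index << s) * scale == base + index * (scale << s), where the Index+Index form is just the s == 1 case; it is only kept when the new scale is still a power of two no larger than 8. A standalone arithmetic check with arbitrary values:

#include <cassert>
#include <cstdint>

int main() {
  uint64_t base = 0x1000, index = 37, scale = 2, shift = 1;
  uint64_t oldAddr = base + (index << shift) * scale;
  uint64_t newScale = scale << shift;             // 4, still a legal x86 scale
  uint64_t newAddr = base + index * newScale;
  assert(oldAddr == newAddr);
  assert(newScale <= 8);                          // precondition checked above
  return 0;
}
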
54792static SDValue rebuildGatherScatter(MaskedGatherScatterSDNode *GorS,
54793 SDValue Index, SDValue Base, SDValue Scale,
54794 SelectionDAG &DAG) {
54795 SDLoc DL(GorS);
54796
54797 if (auto *Gather = dyn_cast<MaskedGatherSDNode>(GorS)) {
54798 SDValue Ops[] = { Gather->getChain(), Gather->getPassThru(),
54799 Gather->getMask(), Base, Index, Scale } ;
54800 return DAG.getMaskedGather(Gather->getVTList(),
54801 Gather->getMemoryVT(), DL, Ops,
54802 Gather->getMemOperand(),
54803 Gather->getIndexType(),
54804 Gather->getExtensionType());
54805 }
54806 auto *Scatter = cast<MaskedScatterSDNode>(GorS);
54807 SDValue Ops[] = { Scatter->getChain(), Scatter->getValue(),
54808 Scatter->getMask(), Base, Index, Scale };
54809 return DAG.getMaskedScatter(Scatter->getVTList(),
54810 Scatter->getMemoryVT(), DL,
54811 Ops, Scatter->getMemOperand(),
54812 Scatter->getIndexType(),
54813 Scatter->isTruncatingStore());
54814}
54815
54816static SDValue combineGatherScatter(SDNode *N, SelectionDAG &DAG,
54817 TargetLowering::DAGCombinerInfo &DCI) {
54818 SDLoc DL(N);
54819 auto *GorS = cast<MaskedGatherScatterSDNode>(N);
54820 SDValue Index = GorS->getIndex();
54821 SDValue Base = GorS->getBasePtr();
54822 SDValue Scale = GorS->getScale();
54823
54824 if (DCI.isBeforeLegalize()) {
54825 unsigned IndexWidth = Index.getScalarValueSizeInBits();
54826
54827 // Shrink constant indices if they are larger than 32-bits.
54828 // Only do this before legalize types since v2i64 could become v2i32.
54829 // FIXME: We could check that the type is legal if we're after legalize
54830 // types, but then we would need to construct test cases where that happens.
54831    // FIXME: We could support more than just constant vectors, but we need to be
54832    // careful with costing. A truncate that can be optimized out would be fine.
54833 // Otherwise we might only want to create a truncate if it avoids a split.
54834 if (auto *BV = dyn_cast<BuildVectorSDNode>(Index)) {
54835 if (BV->isConstant() && IndexWidth > 32 &&
54836 DAG.ComputeNumSignBits(Index) > (IndexWidth - 32)) {
54837 EVT NewVT = Index.getValueType().changeVectorElementType(MVT::i32);
54838 Index = DAG.getNode(ISD::TRUNCATE, DL, NewVT, Index);
54839 return rebuildGatherScatter(GorS, Index, Base, Scale, DAG);
54840 }
54841 }
54842
54843 // Shrink any sign/zero extends from 32 or smaller to larger than 32 if
54844 // there are sufficient sign bits. Only do this before legalize types to
54845 // avoid creating illegal types in truncate.
54846 if ((Index.getOpcode() == ISD::SIGN_EXTEND ||
54847 Index.getOpcode() == ISD::ZERO_EXTEND) &&
54848 IndexWidth > 32 &&
54849 Index.getOperand(0).getScalarValueSizeInBits() <= 32 &&
54850 DAG.ComputeNumSignBits(Index) > (IndexWidth - 32)) {
54851 EVT NewVT = Index.getValueType().changeVectorElementType(MVT::i32);
54852 Index = DAG.getNode(ISD::TRUNCATE, DL, NewVT, Index);
54853 return rebuildGatherScatter(GorS, Index, Base, Scale, DAG);
54854 }
54855 }
54856
54857 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
54858 EVT PtrVT = TLI.getPointerTy(DAG.getDataLayout());
54859 // Try to move splat constant adders from the index operand to the base
54860 // pointer operand. Taking care to multiply by the scale. We can only do
54861 // this when index element type is the same as the pointer type.
54862 // Otherwise we need to be sure the math doesn't wrap before the scale.
54863 if (Index.getOpcode() == ISD::ADD &&
54864 Index.getValueType().getVectorElementType() == PtrVT &&
54865 isa<ConstantSDNode>(Scale)) {
54866 uint64_t ScaleAmt = cast<ConstantSDNode>(Scale)->getZExtValue();
54867 if (auto *BV = dyn_cast<BuildVectorSDNode>(Index.getOperand(1))) {
54868 BitVector UndefElts;
54869 if (ConstantSDNode *C = BV->getConstantSplatNode(&UndefElts)) {
54870 // FIXME: Allow non-constant?
54871 if (UndefElts.none()) {
54872 // Apply the scale.
54873 APInt Adder = C->getAPIntValue() * ScaleAmt;
54874 // Add it to the existing base.
54875 Base = DAG.getNode(ISD::ADD, DL, PtrVT, Base,
54876 DAG.getConstant(Adder, DL, PtrVT));
54877 Index = Index.getOperand(0);
54878 return rebuildGatherScatter(GorS, Index, Base, Scale, DAG);
54879 }
54880 }
54881
54882 // It's also possible base is just a constant. In that case, just
54883 // replace it with 0 and move the displacement into the index.
54884 if (BV->isConstant() && isa<ConstantSDNode>(Base) &&
54885 isOneConstant(Scale)) {
54886 SDValue Splat = DAG.getSplatBuildVector(Index.getValueType(), DL, Base);
54887 // Combine the constant build_vector and the constant base.
54888 Splat = DAG.getNode(ISD::ADD, DL, Index.getValueType(),
54889 Index.getOperand(1), Splat);
54890 // Add to the LHS of the original Index add.
54891 Index = DAG.getNode(ISD::ADD, DL, Index.getValueType(),
54892 Index.getOperand(0), Splat);
54893 Base = DAG.getConstant(0, DL, Base.getValueType());
54894 return rebuildGatherScatter(GorS, Index, Base, Scale, DAG);
54895 }
54896 }
54897 }
54898
54899 if (DCI.isBeforeLegalizeOps()) {
54900 unsigned IndexWidth = Index.getScalarValueSizeInBits();
54901
54902 // Make sure the index is either i32 or i64
54903 if (IndexWidth != 32 && IndexWidth != 64) {
54904 MVT EltVT = IndexWidth > 32 ? MVT::i64 : MVT::i32;
54905 EVT IndexVT = Index.getValueType().changeVectorElementType(EltVT);
54906 Index = DAG.getSExtOrTrunc(Index, DL, IndexVT);
54907 return rebuildGatherScatter(GorS, Index, Base, Scale, DAG);
54908 }
54909 }
54910
54911 // With vector masks we only demand the upper bit of the mask.
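  // (Only the sign bit of each mask element is tested by the underlying
  // vector-masked gather/scatter instructions, so the remaining bits of each
  // element can be simplified away.)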
54912 SDValue Mask = GorS->getMask();
54913 if (Mask.getScalarValueSizeInBits() != 1) {
54914 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
54915 APInt DemandedMask(APInt::getSignMask(Mask.getScalarValueSizeInBits()));
54916 if (TLI.SimplifyDemandedBits(Mask, DemandedMask, DCI)) {
54917 if (N->getOpcode() != ISD::DELETED_NODE)
54918 DCI.AddToWorklist(N);
54919 return SDValue(N, 0);
54920 }
54921 }
54922
54923 return SDValue();
54924}
54925
54926// Optimize RES = X86ISD::SETCC CONDCODE, EFLAG_INPUT
54927static SDValue combineX86SetCC(SDNode *N, SelectionDAG &DAG,
54928 const X86Subtarget &Subtarget) {
54929 SDLoc DL(N);
54930 X86::CondCode CC = X86::CondCode(N->getConstantOperandVal(0));
54931 SDValue EFLAGS = N->getOperand(1);
54932
54933 // Try to simplify the EFLAGS and condition code operands.
54934 if (SDValue Flags = combineSetCCEFLAGS(EFLAGS, CC, DAG, Subtarget))
54935 return getSETCC(CC, Flags, DL, DAG);
54936
54937 return SDValue();
54938}
54939
54940/// Optimize branch condition evaluation.
54941static SDValue combineBrCond(SDNode *N, SelectionDAG &DAG,
54942 const X86Subtarget &Subtarget) {
54943 SDLoc DL(N);
54944 SDValue EFLAGS = N->getOperand(3);
54945 X86::CondCode CC = X86::CondCode(N->getConstantOperandVal(2));
54946
54947 // Try to simplify the EFLAGS and condition code operands.
54948 // Make sure to not keep references to operands, as combineSetCCEFLAGS can
54949 // RAUW them under us.
54950 if (SDValue Flags = combineSetCCEFLAGS(EFLAGS, CC, DAG, Subtarget)) {
54951 SDValue Cond = DAG.getTargetConstant(CC, DL, MVT::i8);
54952 return DAG.getNode(X86ISD::BRCOND, DL, N->getVTList(), N->getOperand(0),
54953 N->getOperand(1), Cond, Flags);
54954 }
54955
54956 return SDValue();
54957}
54958
54959// TODO: Could we move this to DAGCombine?
54960static SDValue combineVectorCompareAndMaskUnaryOp(SDNode *N,
54961 SelectionDAG &DAG) {
54962 // Take advantage of vector comparisons (etc.) producing 0 or -1 in each lane
54963  // to optimize away the operation when it's from a constant.
54964 //
54965 // The general transformation is:
54966 // UNARYOP(AND(VECTOR_CMP(x,y), constant)) -->
54967 // AND(VECTOR_CMP(x,y), constant2)
54968 // constant2 = UNARYOP(constant)
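  //
  // This is safe because each lane of VECTOR_CMP is either all-zeros or
  // all-ones, so each lane of the AND is either 0 or the constant. Applying
  // the unary op to the constant up front and bitcasting around an integer
  // AND yields the same two possible lane values, e.g. for sint_to_fp:
  //   sint_to_fp(and(vector_cmp(x,y), <42,42,42,42>))
  //     --> bitcast(and(vector_cmp(x,y), bitcast(<42.0,42.0,42.0,42.0>)))
  // where each lane ends up as either 0.0 (all-zero bits) or 42.0.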
54969
54970 // Early exit if this isn't a vector operation, the operand of the
54971 // unary operation isn't a bitwise AND, or if the sizes of the operations
54972 // aren't the same.
54973 EVT VT = N->getValueType(0);
54974 bool IsStrict = N->isStrictFPOpcode();
54975 unsigned NumEltBits = VT.getScalarSizeInBits();
54976 SDValue Op0 = N->getOperand(IsStrict ? 1 : 0);
54977 if (!VT.isVector() || Op0.getOpcode() != ISD::AND ||
54978 DAG.ComputeNumSignBits(Op0.getOperand(0)) != NumEltBits ||
54979 VT.getSizeInBits() != Op0.getValueSizeInBits())
54980 return SDValue();
54981
54982 // Now check that the other operand of the AND is a constant. We could
54983 // make the transformation for non-constant splats as well, but it's unclear
54984 // that would be a benefit as it would not eliminate any operations, just
54985 // perform one more step in scalar code before moving to the vector unit.
54986 if (auto *BV = dyn_cast<BuildVectorSDNode>(Op0.getOperand(1))) {
54987 // Bail out if the vector isn't a constant.
54988 if (!BV->isConstant())
54989 return SDValue();
54990
54991 // Everything checks out. Build up the new and improved node.
54992 SDLoc DL(N);
54993 EVT IntVT = BV->getValueType(0);
54994 // Create a new constant of the appropriate type for the transformed
54995 // DAG.
54996 SDValue SourceConst;
54997 if (IsStrict)
54998 SourceConst = DAG.getNode(N->getOpcode(), DL, {VT, MVT::Other},
54999 {N->getOperand(0), SDValue(BV, 0)});
55000 else
55001 SourceConst = DAG.getNode(N->getOpcode(), DL, VT, SDValue(BV, 0));
55002 // The AND node needs bitcasts to/from an integer vector type around it.
55003 SDValue MaskConst = DAG.getBitcast(IntVT, SourceConst);
55004 SDValue NewAnd = DAG.getNode(ISD::AND, DL, IntVT, Op0->getOperand(0),
55005 MaskConst);
55006 SDValue Res = DAG.getBitcast(VT, NewAnd);
55007 if (IsStrict)
55008 return DAG.getMergeValues({Res, SourceConst.getValue(1)}, DL);
55009 return Res;
55010 }
55011
55012 return SDValue();
55013}
55014
55015/// If we are converting a value to floating-point, try to replace scalar
55016/// truncate of an extracted vector element with a bitcast. This tries to keep
55017/// the sequence on XMM registers rather than moving between vector and GPRs.
55018static SDValue combineToFPTruncExtElt(SDNode *N, SelectionDAG &DAG) {
55019 // TODO: This is currently only used by combineSIntToFP, but it is generalized
55020 // to allow being called by any similar cast opcode.
55021 // TODO: Consider merging this into lowering: vectorizeExtractedCast().
55022 SDValue Trunc = N->getOperand(0);
55023 if (!Trunc.hasOneUse() || Trunc.getOpcode() != ISD::TRUNCATE)
55024 return SDValue();
55025
55026 SDValue ExtElt = Trunc.getOperand(0);
55027 if (!ExtElt.hasOneUse() || ExtElt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
55028 !isNullConstant(ExtElt.getOperand(1)))
55029 return SDValue();
55030
55031 EVT TruncVT = Trunc.getValueType();
55032 EVT SrcVT = ExtElt.getValueType();
55033 unsigned DestWidth = TruncVT.getSizeInBits();
55034 unsigned SrcWidth = SrcVT.getSizeInBits();
55035 if (SrcWidth % DestWidth != 0)
55036 return SDValue();
55037
55038 // inttofp (trunc (extelt X, 0)) --> inttofp (extelt (bitcast X), 0)
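  // e.g. (f64 sint_to_fp (i32 trunc (i64 extractelt (v2i64 X), 0)))
  //        --> (f64 sint_to_fp (i32 extractelt (v4i32 bitcast X), 0))
  // This is equivalent on little-endian x86 because element 0 of the v4i32
  // bitcast holds exactly the low 32 bits of element 0 of X.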
55039 EVT SrcVecVT = ExtElt.getOperand(0).getValueType();
55040 unsigned VecWidth = SrcVecVT.getSizeInBits();
55041 unsigned NumElts = VecWidth / DestWidth;
55042 EVT BitcastVT = EVT::getVectorVT(*DAG.getContext(), TruncVT, NumElts);
55043 SDValue BitcastVec = DAG.getBitcast(BitcastVT, ExtElt.getOperand(0));
55044 SDLoc DL(N);
55045 SDValue NewExtElt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, TruncVT,
55046 BitcastVec, ExtElt.getOperand(1));
55047 return DAG.getNode(N->getOpcode(), DL, N->getValueType(0), NewExtElt);
55048}
55049
55050static SDValue combineUIntToFP(SDNode *N, SelectionDAG &DAG,
55051 const X86Subtarget &Subtarget) {
55052 bool IsStrict = N->isStrictFPOpcode();
55053 SDValue Op0 = N->getOperand(IsStrict ? 1 : 0);
55054 EVT VT = N->getValueType(0);
55055 EVT InVT = Op0.getValueType();
55056
55057 // UINT_TO_FP(vXi1~15) -> UINT_TO_FP(ZEXT(vXi1~15 to vXi16))
55058 // UINT_TO_FP(vXi17~31) -> UINT_TO_FP(ZEXT(vXi17~31 to vXi32))
55059 // UINT_TO_FP(vXi33~63) -> UINT_TO_FP(ZEXT(vXi33~63 to vXi64))
55060 if (InVT.isVector() && VT.getVectorElementType() == MVT::f16) {
55061 unsigned ScalarSize = InVT.getScalarSizeInBits();
55062 if (ScalarSize == 16 || ScalarSize == 32 || ScalarSize >= 64)
55063 return SDValue();
55064 SDLoc dl(N);
55065 EVT DstVT = EVT::getVectorVT(*DAG.getContext(),
55066 ScalarSize < 16 ? MVT::i16
55067 : ScalarSize < 32 ? MVT::i32
55068 : MVT::i64,
55069 InVT.getVectorNumElements());
55070 SDValue P = DAG.getNode(ISD::ZERO_EXTEND, dl, DstVT, Op0);
55071 if (IsStrict)
55072 return DAG.getNode(ISD::STRICT_UINT_TO_FP, dl, {VT, MVT::Other},
55073 {N->getOperand(0), P});
55074 return DAG.getNode(ISD::UINT_TO_FP, dl, VT, P);
55075 }
55076
55077 // UINT_TO_FP(vXi1) -> SINT_TO_FP(ZEXT(vXi1 to vXi32))
55078 // UINT_TO_FP(vXi8) -> SINT_TO_FP(ZEXT(vXi8 to vXi32))
55079 // UINT_TO_FP(vXi16) -> SINT_TO_FP(ZEXT(vXi16 to vXi32))
55080 if (InVT.isVector() && InVT.getScalarSizeInBits() < 32 &&
55081 VT.getScalarType() != MVT::f16) {
55082 SDLoc dl(N);
55083 EVT DstVT = InVT.changeVectorElementType(MVT::i32);
55084 SDValue P = DAG.getNode(ISD::ZERO_EXTEND, dl, DstVT, Op0);
55085
55086 // UINT_TO_FP isn't legal without AVX512 so use SINT_TO_FP.
55087 if (IsStrict)
55088 return DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {VT, MVT::Other},
55089 {N->getOperand(0), P});
55090 return DAG.getNode(ISD::SINT_TO_FP, dl, VT, P);
55091 }
55092
55093 // Since UINT_TO_FP is legal (it's marked custom), dag combiner won't
55094 // optimize it to a SINT_TO_FP when the sign bit is known zero. Perform
55095 // the optimization here.
55096 if (DAG.SignBitIsZero(Op0)) {
55097 if (IsStrict)
55098 return DAG.getNode(ISD::STRICT_SINT_TO_FP, SDLoc(N), {VT, MVT::Other},
55099 {N->getOperand(0), Op0});
55100 return DAG.getNode(ISD::SINT_TO_FP, SDLoc(N), VT, Op0);
55101 }
55102
55103 return SDValue();
55104}
55105
55106static SDValue combineSIntToFP(SDNode *N, SelectionDAG &DAG,
55107 TargetLowering::DAGCombinerInfo &DCI,
55108 const X86Subtarget &Subtarget) {
55109 // First try to optimize away the conversion entirely when it's
55110 // conditionally from a constant. Vectors only.
55111 bool IsStrict = N->isStrictFPOpcode();
55112 if (SDValue Res = combineVectorCompareAndMaskUnaryOp(N, DAG))
55113 return Res;
55114
55115 // Now move on to more general possibilities.
55116 SDValue Op0 = N->getOperand(IsStrict ? 1 : 0);
55117 EVT VT = N->getValueType(0);
55118 EVT InVT = Op0.getValueType();
55119
55120 // SINT_TO_FP(vXi1~15) -> SINT_TO_FP(SEXT(vXi1~15 to vXi16))
55121 // SINT_TO_FP(vXi17~31) -> SINT_TO_FP(SEXT(vXi17~31 to vXi32))
55122 // SINT_TO_FP(vXi33~63) -> SINT_TO_FP(SEXT(vXi33~63 to vXi64))
55123 if (InVT.isVector() && VT.getVectorElementType() == MVT::f16) {
55124 unsigned ScalarSize = InVT.getScalarSizeInBits();
55125 if (ScalarSize == 16 || ScalarSize == 32 || ScalarSize >= 64)
55126 return SDValue();
55127 SDLoc dl(N);
55128 EVT DstVT = EVT::getVectorVT(*DAG.getContext(),
55129 ScalarSize < 16 ? MVT::i16
55130 : ScalarSize < 32 ? MVT::i32
55131 : MVT::i64,
55132 InVT.getVectorNumElements());
55133 SDValue P = DAG.getNode(ISD::SIGN_EXTEND, dl, DstVT, Op0);
55134 if (IsStrict)
55135 return DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {VT, MVT::Other},
55136 {N->getOperand(0), P});
55137 return DAG.getNode(ISD::SINT_TO_FP, dl, VT, P);
55138 }
55139
55140 // SINT_TO_FP(vXi1) -> SINT_TO_FP(SEXT(vXi1 to vXi32))
55141 // SINT_TO_FP(vXi8) -> SINT_TO_FP(SEXT(vXi8 to vXi32))
55142 // SINT_TO_FP(vXi16) -> SINT_TO_FP(SEXT(vXi16 to vXi32))
55143 if (InVT.isVector() && InVT.getScalarSizeInBits() < 32 &&
55144 VT.getScalarType() != MVT::f16) {
55145 SDLoc dl(N);
55146 EVT DstVT = InVT.changeVectorElementType(MVT::i32);
55147 SDValue P = DAG.getNode(ISD::SIGN_EXTEND, dl, DstVT, Op0);
55148 if (IsStrict)
55149 return DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {VT, MVT::Other},
55150 {N->getOperand(0), P});
55151 return DAG.getNode(ISD::SINT_TO_FP, dl, VT, P);
55152 }
55153
55154 // Without AVX512DQ we only support i64 to float scalar conversion. For both
55155 // vectors and scalars, see if we know that the upper bits are all the sign
55156 // bit, in which case we can truncate the input to i32 and convert from that.
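  // (For i64, "at least 33 sign bits" means the value lies in [-2^31, 2^31),
  // so converting the truncated low 32 bits gives the same FP result.)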
55157 if (InVT.getScalarSizeInBits() > 32 && !Subtarget.hasDQI()) {
55158 unsigned BitWidth = InVT.getScalarSizeInBits();
55159 unsigned NumSignBits = DAG.ComputeNumSignBits(Op0);
55160 if (NumSignBits >= (BitWidth - 31)) {
55161 EVT TruncVT = MVT::i32;
55162 if (InVT.isVector())
55163 TruncVT = InVT.changeVectorElementType(TruncVT);
55164 SDLoc dl(N);
55165 if (DCI.isBeforeLegalize() || TruncVT != MVT::v2i32) {
55166 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, dl, TruncVT, Op0);
55167 if (IsStrict)
55168 return DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {VT, MVT::Other},
55169 {N->getOperand(0), Trunc});
55170 return DAG.getNode(ISD::SINT_TO_FP, dl, VT, Trunc);
55171 }
55172 // If we're after legalize and the type is v2i32 we need to shuffle and
55173 // use CVTSI2P.
55174      assert(InVT == MVT::v2i64 && "Unexpected VT!");
55175 SDValue Cast = DAG.getBitcast(MVT::v4i32, Op0);
55176 SDValue Shuf = DAG.getVectorShuffle(MVT::v4i32, dl, Cast, Cast,
55177 { 0, 2, -1, -1 });
55178 if (IsStrict)
55179 return DAG.getNode(X86ISD::STRICT_CVTSI2P, dl, {VT, MVT::Other},
55180 {N->getOperand(0), Shuf});
55181 return DAG.getNode(X86ISD::CVTSI2P, dl, VT, Shuf);
55182 }
55183 }
55184
55185 // Transform (SINT_TO_FP (i64 ...)) into an x87 operation if we have
55186 // a 32-bit target where SSE doesn't support i64->FP operations.
55187 if (!Subtarget.useSoftFloat() && Subtarget.hasX87() &&
55188 Op0.getOpcode() == ISD::LOAD) {
55189 LoadSDNode *Ld = cast<LoadSDNode>(Op0.getNode());
55190
55191 // This transformation is not supported if the result type is f16 or f128.
55192 if (VT == MVT::f16 || VT == MVT::f128)
55193 return SDValue();
55194
55195 // If we have AVX512DQ we can use packed conversion instructions unless
55196 // the VT is f80.
55197 if (Subtarget.hasDQI() && VT != MVT::f80)
55198 return SDValue();
55199
55200 if (Ld->isSimple() && !VT.isVector() && ISD::isNormalLoad(Op0.getNode()) &&
55201 Op0.hasOneUse() && !Subtarget.is64Bit() && InVT == MVT::i64) {
55202 std::pair<SDValue, SDValue> Tmp =
55203 Subtarget.getTargetLowering()->BuildFILD(
55204 VT, InVT, SDLoc(N), Ld->getChain(), Ld->getBasePtr(),
55205 Ld->getPointerInfo(), Ld->getOriginalAlign(), DAG);
55206 DAG.ReplaceAllUsesOfValueWith(Op0.getValue(1), Tmp.second);
55207 return Tmp.first;
55208 }
55209 }
55210
55211 if (IsStrict)
55212 return SDValue();
55213
55214 if (SDValue V = combineToFPTruncExtElt(N, DAG))
55215 return V;
55216
55217 return SDValue();
55218}
55219
55220static bool needCarryOrOverflowFlag(SDValue Flags) {
55221  assert(Flags.getValueType() == MVT::i32 && "Unexpected VT!");
55222
55223 for (const SDNode *User : Flags->uses()) {
55224 X86::CondCode CC;
55225 switch (User->getOpcode()) {
55226 default:
55227 // Be conservative.
55228 return true;
55229 case X86ISD::SETCC:
55230 case X86ISD::SETCC_CARRY:
55231 CC = (X86::CondCode)User->getConstantOperandVal(0);
55232 break;
55233 case X86ISD::BRCOND:
55234 case X86ISD::CMOV:
55235 CC = (X86::CondCode)User->getConstantOperandVal(2);
55236 break;
55237 }
55238
55239 switch (CC) {
55240 default: break;
55241 case X86::COND_A: case X86::COND_AE:
55242 case X86::COND_B: case X86::COND_BE:
55243 case X86::COND_O: case X86::COND_NO:
55244 case X86::COND_G: case X86::COND_GE:
55245 case X86::COND_L: case X86::COND_LE:
55246 return true;
55247 }
55248 }
55249
55250 return false;
55251}
55252
55253static bool onlyZeroFlagUsed(SDValue Flags) {
55254  assert(Flags.getValueType() == MVT::i32 && "Unexpected VT!");
55255
55256 for (const SDNode *User : Flags->uses()) {
55257 unsigned CCOpNo;
55258 switch (User->getOpcode()) {
55259 default:
55260 // Be conservative.
55261 return false;
55262 case X86ISD::SETCC:
55263 case X86ISD::SETCC_CARRY:
55264 CCOpNo = 0;
55265 break;
55266 case X86ISD::BRCOND:
55267 case X86ISD::CMOV:
55268 CCOpNo = 2;
55269 break;
55270 }
55271
55272 X86::CondCode CC = (X86::CondCode)User->getConstantOperandVal(CCOpNo);
55273 if (CC != X86::COND_E && CC != X86::COND_NE)
55274 return false;
55275 }
55276
55277 return true;
55278}
55279
55280static SDValue combineCMP(SDNode *N, SelectionDAG &DAG) {
55281 // Only handle test patterns.
55282 if (!isNullConstant(N->getOperand(1)))
55283 return SDValue();
55284
55285 // If we have a CMP of a truncated binop, see if we can make a smaller binop
55286 // and use its flags directly.
55287 // TODO: Maybe we should try promoting compares that only use the zero flag
55288 // first if we can prove the upper bits with computeKnownBits?
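  // For example, (X86ISD::CMP (i32 trunc (i64 add X, Y)), 0) can become a
  // 32-bit X86ISD::ADD of the truncated operands whose EFLAGS result is used
  // directly, as long as no user needs the carry or overflow flags.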
55289 SDLoc dl(N);
55290 SDValue Op = N->getOperand(0);
55291 EVT VT = Op.getValueType();
55292
55293 // If we have a constant logical shift that's only used in a comparison
55294  // against zero, turn it into an equivalent AND. This allows turning it into
55295 // a TEST instruction later.
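  // e.g. for i32: (srl X, 8) == 0 iff (X & 0xFFFFFF00) == 0, and
  //               (shl X, 8) == 0 iff (X & 0x00FFFFFF) == 0.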
55296 if ((Op.getOpcode() == ISD::SRL || Op.getOpcode() == ISD::SHL) &&
55297 Op.hasOneUse() && isa<ConstantSDNode>(Op.getOperand(1)) &&
55298 onlyZeroFlagUsed(SDValue(N, 0))) {
55299 unsigned BitWidth = VT.getSizeInBits();
55300 const APInt &ShAmt = Op.getConstantOperandAPInt(1);
55301 if (ShAmt.ult(BitWidth)) { // Avoid undefined shifts.
55302 unsigned MaskBits = BitWidth - ShAmt.getZExtValue();
55303 APInt Mask = Op.getOpcode() == ISD::SRL
55304 ? APInt::getHighBitsSet(BitWidth, MaskBits)
55305 : APInt::getLowBitsSet(BitWidth, MaskBits);
55306 if (Mask.isSignedIntN(32)) {
55307 Op = DAG.getNode(ISD::AND, dl, VT, Op.getOperand(0),
55308 DAG.getConstant(Mask, dl, VT));
55309 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
55310 DAG.getConstant(0, dl, VT));
55311 }
55312 }
55313 }
55314
55315 // Peek through any zero-extend if we're only testing for a zero result.
55316 if (Op.getOpcode() == ISD::ZERO_EXTEND && onlyZeroFlagUsed(SDValue(N, 0))) {
55317 SDValue Src = Op.getOperand(0);
55318 EVT SrcVT = Src.getValueType();
55319 if (SrcVT.getScalarSizeInBits() >= 8 &&
55320 DAG.getTargetLoweringInfo().isTypeLegal(SrcVT))
55321 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Src,
55322 DAG.getConstant(0, dl, SrcVT));
55323 }
55324
55325 // Look for a truncate.
55326 if (Op.getOpcode() != ISD::TRUNCATE)
55327 return SDValue();
55328
55329 SDValue Trunc = Op;
55330 Op = Op.getOperand(0);
55331
55332 // See if we can compare with zero against the truncation source,
55333 // which should help using the Z flag from many ops. Only do this for
55334 // i32 truncated op to prevent partial-reg compares of promoted ops.
55335 EVT OpVT = Op.getValueType();
55336 APInt UpperBits =
55337 APInt::getBitsSetFrom(OpVT.getSizeInBits(), VT.getSizeInBits());
55338 if (OpVT == MVT::i32 && DAG.MaskedValueIsZero(Op, UpperBits) &&
55339 onlyZeroFlagUsed(SDValue(N, 0))) {
55340 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
55341 DAG.getConstant(0, dl, OpVT));
55342 }
55343
55344 // After this the truncate and arithmetic op must have a single use.
55345 if (!Trunc.hasOneUse() || !Op.hasOneUse())
55346 return SDValue();
55347
55348 unsigned NewOpc;
55349 switch (Op.getOpcode()) {
55350 default: return SDValue();
55351 case ISD::AND:
55352    // Skip AND with a constant. We have special handling for AND with an
55353    // immediate during isel to generate TEST instructions.
55354 if (isa<ConstantSDNode>(Op.getOperand(1)))
55355 return SDValue();
55356 NewOpc = X86ISD::AND;
55357 break;
55358 case ISD::OR: NewOpc = X86ISD::OR; break;
55359 case ISD::XOR: NewOpc = X86ISD::XOR; break;
55360 case ISD::ADD:
55361 // If the carry or overflow flag is used, we can't truncate.
55362 if (needCarryOrOverflowFlag(SDValue(N, 0)))
55363 return SDValue();
55364 NewOpc = X86ISD::ADD;
55365 break;
55366 case ISD::SUB:
55367 // If the carry or overflow flag is used, we can't truncate.
55368 if (needCarryOrOverflowFlag(SDValue(N, 0)))
55369 return SDValue();
55370 NewOpc = X86ISD::SUB;
55371 break;
55372 }
55373
55374 // We found an op we can narrow. Truncate its inputs.
55375 SDValue Op0 = DAG.getNode(ISD::TRUNCATE, dl, VT, Op.getOperand(0));
55376 SDValue Op1 = DAG.getNode(ISD::TRUNCATE, dl, VT, Op.getOperand(1));
55377
55378  // Use an X86-specific opcode to avoid DAG combine messing with it.
55379 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
55380 Op = DAG.getNode(NewOpc, dl, VTs, Op0, Op1);
55381
55382 // For AND, keep a CMP so that we can match the test pattern.
55383 if (NewOpc == X86ISD::AND)
55384 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
55385 DAG.getConstant(0, dl, VT));
55386
55387 // Return the flags.
55388 return Op.getValue(1);
55389}
55390
55391static SDValue combineX86AddSub(SDNode *N, SelectionDAG &DAG,
55392 TargetLowering::DAGCombinerInfo &DCI) {
55393  assert((X86ISD::ADD == N->getOpcode() || X86ISD::SUB == N->getOpcode()) &&
55394         "Expected X86ISD::ADD or X86ISD::SUB");
55395
55396 SDLoc DL(N);
55397 SDValue LHS = N->getOperand(0);
55398 SDValue RHS = N->getOperand(1);
55399 MVT VT = LHS.getSimpleValueType();
55400 bool IsSub = X86ISD::SUB == N->getOpcode();
55401 unsigned GenericOpc = IsSub ? ISD::SUB : ISD::ADD;
55402
55403 // If we don't use the flag result, simplify back to a generic ADD/SUB.
55404 if (!N->hasAnyUseOfValue(1)) {
55405 SDValue Res = DAG.getNode(GenericOpc, DL, VT, LHS, RHS);
55406 return DAG.getMergeValues({Res, DAG.getConstant(0, DL, MVT::i32)}, DL);
55407 }
55408
55409 // Fold any similar generic ADD/SUB opcodes to reuse this node.
55410 auto MatchGeneric = [&](SDValue N0, SDValue N1, bool Negate) {
55411 SDValue Ops[] = {N0, N1};
55412 SDVTList VTs = DAG.getVTList(N->getValueType(0));
55413 if (SDNode *GenericAddSub = DAG.getNodeIfExists(GenericOpc, VTs, Ops)) {
55414 SDValue Op(N, 0);
55415 if (Negate)
55416 Op = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), Op);
55417 DCI.CombineTo(GenericAddSub, Op);
55418 }
55419 };
55420 MatchGeneric(LHS, RHS, false);
55421 MatchGeneric(RHS, LHS, X86ISD::SUB == N->getOpcode());
55422
55423 // TODO: Can we drop the ZeroSecondOpOnly limit? This is to guarantee that the
55424 // EFLAGS result doesn't change.
55425 return combineAddOrSubToADCOrSBB(IsSub, DL, VT, LHS, RHS, DAG,
55426 /*ZeroSecondOpOnly*/ true);
55427}
55428
55429static SDValue combineSBB(SDNode *N, SelectionDAG &DAG) {
55430 SDValue LHS = N->getOperand(0);
55431 SDValue RHS = N->getOperand(1);
55432 SDValue BorrowIn = N->getOperand(2);
55433
55434 if (SDValue Flags = combineCarryThroughADD(BorrowIn, DAG)) {
55435 MVT VT = N->getSimpleValueType(0);
55436 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
55437 return DAG.getNode(X86ISD::SBB, SDLoc(N), VTs, LHS, RHS, Flags);
55438 }
55439
55440 // Fold SBB(SUB(X,Y),0,Carry) -> SBB(X,Y,Carry)
55441 // iff the flag result is dead.
55442 if (LHS.getOpcode() == ISD::SUB && isNullConstant(RHS) &&
55443 !N->hasAnyUseOfValue(1))
55444 return DAG.getNode(X86ISD::SBB, SDLoc(N), N->getVTList(), LHS.getOperand(0),
55445 LHS.getOperand(1), BorrowIn);
55446
55447 return SDValue();
55448}
55449
55450// Optimize RES, EFLAGS = X86ISD::ADC LHS, RHS, EFLAGS
55451static SDValue combineADC(SDNode *N, SelectionDAG &DAG,
55452 TargetLowering::DAGCombinerInfo &DCI) {
55453 SDValue LHS = N->getOperand(0);
55454 SDValue RHS = N->getOperand(1);
55455 SDValue CarryIn = N->getOperand(2);
55456 auto *LHSC = dyn_cast<ConstantSDNode>(LHS);
55457 auto *RHSC = dyn_cast<ConstantSDNode>(RHS);
55458
55459 // Canonicalize constant to RHS.
55460 if (LHSC && !RHSC)
55461 return DAG.getNode(X86ISD::ADC, SDLoc(N), N->getVTList(), RHS, LHS,
55462 CarryIn);
55463
55464 // If the LHS and RHS of the ADC node are zero, then it can't overflow and
55465 // the result is either zero or one (depending on the input carry bit).
55466 // Strength reduce this down to a "set on carry" aka SETCC_CARRY&1.
55467 if (LHSC && RHSC && LHSC->isZero() && RHSC->isZero() &&
55468 // We don't have a good way to replace an EFLAGS use, so only do this when
55469 // dead right now.
55470 SDValue(N, 1).use_empty()) {
55471 SDLoc DL(N);
55472 EVT VT = N->getValueType(0);
55473 SDValue CarryOut = DAG.getConstant(0, DL, N->getValueType(1));
55474 SDValue Res1 = DAG.getNode(
55475 ISD::AND, DL, VT,
55476 DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
55477 DAG.getTargetConstant(X86::COND_B, DL, MVT::i8), CarryIn),
55478 DAG.getConstant(1, DL, VT));
55479 return DCI.CombineTo(N, Res1, CarryOut);
55480 }
55481
55482 // Fold ADC(C1,C2,Carry) -> ADC(0,C1+C2,Carry)
55483 // iff the flag result is dead.
55484 // TODO: Allow flag result if C1+C2 doesn't signed/unsigned overflow.
55485 if (LHSC && RHSC && !LHSC->isZero() && !N->hasAnyUseOfValue(1)) {
55486 SDLoc DL(N);
55487 APInt Sum = LHSC->getAPIntValue() + RHSC->getAPIntValue();
55488 return DAG.getNode(X86ISD::ADC, DL, N->getVTList(),
55489 DAG.getConstant(0, DL, LHS.getValueType()),
55490 DAG.getConstant(Sum, DL, LHS.getValueType()), CarryIn);
55491 }
55492
55493 if (SDValue Flags = combineCarryThroughADD(CarryIn, DAG)) {
55494 MVT VT = N->getSimpleValueType(0);
55495 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
55496 return DAG.getNode(X86ISD::ADC, SDLoc(N), VTs, LHS, RHS, Flags);
55497 }
55498
55499 // Fold ADC(ADD(X,Y),0,Carry) -> ADC(X,Y,Carry)
55500 // iff the flag result is dead.
55501 if (LHS.getOpcode() == ISD::ADD && RHSC && RHSC->isZero() &&
55502 !N->hasAnyUseOfValue(1))
55503 return DAG.getNode(X86ISD::ADC, SDLoc(N), N->getVTList(), LHS.getOperand(0),
55504 LHS.getOperand(1), CarryIn);
55505
55506 return SDValue();
55507}
55508
55509static SDValue matchPMADDWD(SelectionDAG &DAG, SDValue Op0, SDValue Op1,
55510 const SDLoc &DL, EVT VT,
55511 const X86Subtarget &Subtarget) {
55512 // Example of pattern we try to detect:
55513 // t := (v8i32 mul (sext (v8i16 x0), (sext (v8i16 x1))))
55514 //(add (build_vector (extract_elt t, 0),
55515 // (extract_elt t, 2),
55516 // (extract_elt t, 4),
55517 // (extract_elt t, 6)),
55518 // (build_vector (extract_elt t, 1),
55519 // (extract_elt t, 3),
55520 // (extract_elt t, 5),
55521 // (extract_elt t, 7)))
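  // For the example above this folds to (v4i32 X86ISD::VPMADDWD x0, x1)
  // (wider results are split as needed), relying on PMADDWD's implicit
  // pairwise add of adjacent 16-bit products.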
55522
55523 if (!Subtarget.hasSSE2())
55524 return SDValue();
55525
55526 if (Op0.getOpcode() != ISD::BUILD_VECTOR ||
55527 Op1.getOpcode() != ISD::BUILD_VECTOR)
55528 return SDValue();
55529
55530 if (!VT.isVector() || VT.getVectorElementType() != MVT::i32 ||
55531 VT.getVectorNumElements() < 4 ||
55532 !isPowerOf2_32(VT.getVectorNumElements()))
55533 return SDValue();
55534
55535 // Check if one of Op0,Op1 is of the form:
55536 // (build_vector (extract_elt Mul, 0),
55537 // (extract_elt Mul, 2),
55538 // (extract_elt Mul, 4),
55539 // ...
55540 // the other is of the form:
55541 // (build_vector (extract_elt Mul, 1),
55542 // (extract_elt Mul, 3),
55543 // (extract_elt Mul, 5),
55544 // ...
55545 // and identify Mul.
55546 SDValue Mul;
55547 for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; i += 2) {
55548 SDValue Op0L = Op0->getOperand(i), Op1L = Op1->getOperand(i),
55549 Op0H = Op0->getOperand(i + 1), Op1H = Op1->getOperand(i + 1);
55550 // TODO: Be more tolerant to undefs.
55551 if (Op0L.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
55552 Op1L.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
55553 Op0H.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
55554 Op1H.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
55555 return SDValue();
55556 auto *Const0L = dyn_cast<ConstantSDNode>(Op0L->getOperand(1));
55557 auto *Const1L = dyn_cast<ConstantSDNode>(Op1L->getOperand(1));
55558 auto *Const0H = dyn_cast<ConstantSDNode>(Op0H->getOperand(1));
55559 auto *Const1H = dyn_cast<ConstantSDNode>(Op1H->getOperand(1));
55560 if (!Const0L || !Const1L || !Const0H || !Const1H)
55561 return SDValue();
55562 unsigned Idx0L = Const0L->getZExtValue(), Idx1L = Const1L->getZExtValue(),
55563 Idx0H = Const0H->getZExtValue(), Idx1H = Const1H->getZExtValue();
55564 // Commutativity of mul allows factors of a product to reorder.
55565 if (Idx0L > Idx1L)
55566 std::swap(Idx0L, Idx1L);
55567 if (Idx0H > Idx1H)
55568 std::swap(Idx0H, Idx1H);
55569 // Commutativity of add allows pairs of factors to reorder.
55570 if (Idx0L > Idx0H) {
55571 std::swap(Idx0L, Idx0H);
55572 std::swap(Idx1L, Idx1H);
55573 }
55574 if (Idx0L != 2 * i || Idx1L != 2 * i + 1 || Idx0H != 2 * i + 2 ||
55575 Idx1H != 2 * i + 3)
55576 return SDValue();
55577 if (!Mul) {
55578 // First time an extract_elt's source vector is visited. Must be a MUL
55579 // with 2X number of vector elements than the BUILD_VECTOR.
55580 // Both extracts must be from same MUL.
55581 Mul = Op0L->getOperand(0);
55582 if (Mul->getOpcode() != ISD::MUL ||
55583 Mul.getValueType().getVectorNumElements() != 2 * e)
55584 return SDValue();
55585 }
55586 // Check that the extract is from the same MUL previously seen.
55587 if (Mul != Op0L->getOperand(0) || Mul != Op1L->getOperand(0) ||
55588 Mul != Op0H->getOperand(0) || Mul != Op1H->getOperand(0))
55589 return SDValue();
55590 }
55591
55592 // Check if the Mul source can be safely shrunk.
55593 ShrinkMode Mode;
55594 if (!canReduceVMulWidth(Mul.getNode(), DAG, Mode) ||
55595 Mode == ShrinkMode::MULU16)
55596 return SDValue();
55597
55598 EVT TruncVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16,
55599 VT.getVectorNumElements() * 2);
55600 SDValue N0 = DAG.getNode(ISD::TRUNCATE, DL, TruncVT, Mul.getOperand(0));
55601 SDValue N1 = DAG.getNode(ISD::TRUNCATE, DL, TruncVT, Mul.getOperand(1));
55602
55603 auto PMADDBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
55604 ArrayRef<SDValue> Ops) {
55605 EVT InVT = Ops[0].getValueType();
55606    assert(InVT == Ops[1].getValueType() && "Operands' types mismatch");
55607 EVT ResVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
55608 InVT.getVectorNumElements() / 2);
55609 return DAG.getNode(X86ISD::VPMADDWD, DL, ResVT, Ops[0], Ops[1]);
55610 };
55611 return SplitOpsAndApply(DAG, Subtarget, DL, VT, { N0, N1 }, PMADDBuilder);
55612}
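
// Illustrative sketch (not used by the backend; the helper name is
// hypothetical): a scalar model of the per-lane computation performed by the
// X86ISD::VPMADDWD node built above, assuming the <cstdint> fixed-width types
// are available in this translation unit.
static inline int32_t pmaddwdLaneModel(int16_t A0, int16_t A1, int16_t B0,
                                       int16_t B1) {
  // Each 32-bit result lane is A[2*i]*B[2*i] + A[2*i+1]*B[2*i+1]; compute in
  // 64 bits to avoid signed overflow in the single case where the sum exceeds
  // INT32_MAX (all four inputs equal to -32768).
  return static_cast<int32_t>(int64_t(A0) * B0 + int64_t(A1) * B1);
}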
55613
55614// Attempt to turn this pattern into PMADDWD.
55615// (add (mul (sext (build_vector)), (sext (build_vector))),
55616// (mul (sext (build_vector)), (sext (build_vector)))
55617static SDValue matchPMADDWD_2(SelectionDAG &DAG, SDValue N0, SDValue N1,
55618 const SDLoc &DL, EVT VT,
55619 const X86Subtarget &Subtarget) {
55620 if (!Subtarget.hasSSE2())
55621 return SDValue();
55622
55623 if (N0.getOpcode() != ISD::MUL || N1.getOpcode() != ISD::MUL)
55624 return SDValue();
55625
55626 if (!VT.isVector() || VT.getVectorElementType() != MVT::i32 ||
55627 VT.getVectorNumElements() < 4 ||
55628 !isPowerOf2_32(VT.getVectorNumElements()))
55629 return SDValue();
55630
55631 SDValue N00 = N0.getOperand(0);
55632 SDValue N01 = N0.getOperand(1);
55633 SDValue N10 = N1.getOperand(0);
55634 SDValue N11 = N1.getOperand(1);
55635
55636 // All inputs need to be sign extends.
55637 // TODO: Support ZERO_EXTEND from known positive?
55638 if (N00.getOpcode() != ISD::SIGN_EXTEND ||
55639 N01.getOpcode() != ISD::SIGN_EXTEND ||
55640 N10.getOpcode() != ISD::SIGN_EXTEND ||
55641 N11.getOpcode() != ISD::SIGN_EXTEND)
55642 return SDValue();
55643
55644 // Peek through the extends.
55645 N00 = N00.getOperand(0);
55646 N01 = N01.getOperand(0);
55647 N10 = N10.getOperand(0);
55648 N11 = N11.getOperand(0);
55649
55650 // Must be extending from vXi16.
55651 EVT InVT = N00.getValueType();
55652 if (InVT.getVectorElementType() != MVT::i16 || N01.getValueType() != InVT ||
55653 N10.getValueType() != InVT || N11.getValueType() != InVT)
55654 return SDValue();
55655
55656 // All inputs should be build_vectors.
55657 if (N00.getOpcode() != ISD::BUILD_VECTOR ||
55658 N01.getOpcode() != ISD::BUILD_VECTOR ||
55659 N10.getOpcode() != ISD::BUILD_VECTOR ||
55660 N11.getOpcode() != ISD::BUILD_VECTOR)
55661 return SDValue();
55662
55663 // For each element, we need to ensure we have an odd element from one vector
55664 // multiplied by the odd element of another vector and the even element from
55665 // one of the same vectors being multiplied by the even element from the
55666 // other vector. So we need to make sure for each element i, this operator
55667 // is being performed:
55668 // A[2 * i] * B[2 * i] + A[2 * i + 1] * B[2 * i + 1]
55669 SDValue In0, In1;
55670 for (unsigned i = 0; i != N00.getNumOperands(); ++i) {
55671 SDValue N00Elt = N00.getOperand(i);
55672 SDValue N01Elt = N01.getOperand(i);
55673 SDValue N10Elt = N10.getOperand(i);
55674 SDValue N11Elt = N11.getOperand(i);
55675 // TODO: Be more tolerant to undefs.
55676 if (N00Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
55677 N01Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
55678 N10Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
55679 N11Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
55680 return SDValue();
55681 auto *ConstN00Elt = dyn_cast<ConstantSDNode>(N00Elt.getOperand(1));
55682 auto *ConstN01Elt = dyn_cast<ConstantSDNode>(N01Elt.getOperand(1));
55683 auto *ConstN10Elt = dyn_cast<ConstantSDNode>(N10Elt.getOperand(1));
55684 auto *ConstN11Elt = dyn_cast<ConstantSDNode>(N11Elt.getOperand(1));
55685 if (!ConstN00Elt || !ConstN01Elt || !ConstN10Elt || !ConstN11Elt)
55686 return SDValue();
55687 unsigned IdxN00 = ConstN00Elt->getZExtValue();
55688 unsigned IdxN01 = ConstN01Elt->getZExtValue();
55689 unsigned IdxN10 = ConstN10Elt->getZExtValue();
55690 unsigned IdxN11 = ConstN11Elt->getZExtValue();
55691 // Add is commutative so indices can be reordered.
55692 if (IdxN00 > IdxN10) {
55693 std::swap(IdxN00, IdxN10);
55694 std::swap(IdxN01, IdxN11);
55695 }
55696    // N0 indices must be the even element. N1 indices must be the next odd element.
55697 if (IdxN00 != 2 * i || IdxN10 != 2 * i + 1 ||
55698 IdxN01 != 2 * i || IdxN11 != 2 * i + 1)
55699 return SDValue();
55700 SDValue N00In = N00Elt.getOperand(0);
55701 SDValue N01In = N01Elt.getOperand(0);
55702 SDValue N10In = N10Elt.getOperand(0);
55703 SDValue N11In = N11Elt.getOperand(0);
55704
55705 // First time we find an input capture it.
55706 if (!In0) {
55707 In0 = N00In;
55708 In1 = N01In;
55709
55710 // The input vectors must be at least as wide as the output.
55711 // If they are larger than the output, we extract subvector below.
55712 if (In0.getValueSizeInBits() < VT.getSizeInBits() ||
55713 In1.getValueSizeInBits() < VT.getSizeInBits())
55714 return SDValue();
55715 }
55716 // Mul is commutative so the input vectors can be in any order.
55717 // Canonicalize to make the compares easier.
55718 if (In0 != N00In)
55719 std::swap(N00In, N01In);
55720 if (In0 != N10In)
55721 std::swap(N10In, N11In);
55722 if (In0 != N00In || In1 != N01In || In0 != N10In || In1 != N11In)
55723 return SDValue();
55724 }
55725
55726 auto PMADDBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
55727 ArrayRef<SDValue> Ops) {
55728 EVT OpVT = Ops[0].getValueType();
55729    assert(OpVT.getScalarType() == MVT::i16 &&
55730           "Unexpected scalar element type");
55731    assert(OpVT == Ops[1].getValueType() && "Operands' types mismatch");
55732 EVT ResVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
55733 OpVT.getVectorNumElements() / 2);
55734 return DAG.getNode(X86ISD::VPMADDWD, DL, ResVT, Ops[0], Ops[1]);
55735 };
55736
55737 // If the output is narrower than an input, extract the low part of the input
55738 // vector.
55739 EVT OutVT16 = EVT::getVectorVT(*DAG.getContext(), MVT::i16,
55740 VT.getVectorNumElements() * 2);
55741 if (OutVT16.bitsLT(In0.getValueType())) {
55742 In0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OutVT16, In0,
55743 DAG.getIntPtrConstant(0, DL));
55744 }
55745 if (OutVT16.bitsLT(In1.getValueType())) {
55746 In1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OutVT16, In1,
55747 DAG.getIntPtrConstant(0, DL));
55748 }
55749 return SplitOpsAndApply(DAG, Subtarget, DL, VT, { In0, In1 },
55750 PMADDBuilder);
55751}
55752
55753// ADD(VPMADDWD(X,Y),VPMADDWD(Z,W)) -> VPMADDWD(SHUFFLE(X,Z), SHUFFLE(Y,W))
55754// If the upper element in each pair of both VPMADDWD operands is zero then we
55755// can merge the operand elements and use the implicit add of VPMADDWD.
55756// TODO: Add support for VPMADDUBSW (which isn't commutable).
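// For example (v4i32 result, v8i16 operands), if every odd i16 element of the
// inputs is known to be zero:
//   add(vpmaddwd(<a0,0,a1,0,...>, <b0,0,b1,0,...>),
//       vpmaddwd(<c0,0,c1,0,...>, <d0,0,d1,0,...>))
//     --> vpmaddwd(<a0,c0,a1,c1,...>, <b0,d0,b1,d1,...>)
// since each result lane becomes a*b + c*d, the same as the sum of the two
// original lanes (a*b + 0) + (c*d + 0).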
55757static SDValue combineAddOfPMADDWD(SelectionDAG &DAG, SDValue N0, SDValue N1,
55758 const SDLoc &DL, EVT VT) {
55759 if (N0.getOpcode() != N1.getOpcode() || N0.getOpcode() != X86ISD::VPMADDWD)
55760 return SDValue();
55761
55762 // TODO: Add 256/512-bit support once VPMADDWD combines with shuffles.
55763 if (VT.getSizeInBits() > 128)
55764 return SDValue();
55765
55766 unsigned NumElts = VT.getVectorNumElements();
55767 MVT OpVT = N0.getOperand(0).getSimpleValueType();
55768 APInt DemandedBits = APInt::getAllOnes(OpVT.getScalarSizeInBits());
55769 APInt DemandedHiElts = APInt::getSplat(2 * NumElts, APInt(2, 2));
55770
55771 bool Op0HiZero =
55772 DAG.MaskedValueIsZero(N0.getOperand(0), DemandedBits, DemandedHiElts) ||
55773 DAG.MaskedValueIsZero(N0.getOperand(1), DemandedBits, DemandedHiElts);
55774 bool Op1HiZero =
55775 DAG.MaskedValueIsZero(N1.getOperand(0), DemandedBits, DemandedHiElts) ||
55776 DAG.MaskedValueIsZero(N1.getOperand(1), DemandedBits, DemandedHiElts);
55777
55778 // TODO: Check for zero lower elements once we have actual codegen that
55779 // creates them.
55780 if (!Op0HiZero || !Op1HiZero)
55781 return SDValue();
55782
55783 // Create a shuffle mask packing the lower elements from each VPMADDWD.
55784 SmallVector<int> Mask;
55785 for (int i = 0; i != (int)NumElts; ++i) {
55786 Mask.push_back(2 * i);
55787 Mask.push_back(2 * (i + NumElts));
55788 }
55789
55790 SDValue LHS =
55791 DAG.getVectorShuffle(OpVT, DL, N0.getOperand(0), N1.getOperand(0), Mask);
55792 SDValue RHS =
55793 DAG.getVectorShuffle(OpVT, DL, N0.getOperand(1), N1.getOperand(1), Mask);
55794 return DAG.getNode(X86ISD::VPMADDWD, DL, VT, LHS, RHS);
55795}
55796
55797/// CMOV of constants requires materializing constant operands in registers.
55798/// Try to fold those constants into an 'add' instruction to reduce instruction
55799/// count. We do this with CMOV rather than the generic 'select' because there are
55800/// earlier folds that may be used to turn select-of-constants into logic hacks.
55801static SDValue pushAddIntoCmovOfConsts(SDNode *N, SelectionDAG &DAG,
55802 const X86Subtarget &Subtarget) {
55803 // If an operand is zero, add-of-0 gets simplified away, so that's clearly
55804 // better because we eliminate 1-2 instructions. This transform is still
55805 // an improvement without zero operands because we trade 2 move constants and
55806 // 1 add for 2 adds (LEA) as long as the constants can be represented as
55807 // immediate asm operands (fit in 32-bits).
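  // e.g. add (cmov 0, 42), X --> cmov X, (X + 42): the zero arm's add folds
  // away entirely and the other arm can be a single LEA.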
55808 auto isSuitableCmov = [](SDValue V) {
55809 if (V.getOpcode() != X86ISD::CMOV || !V.hasOneUse())
55810 return false;
55811 if (!isa<ConstantSDNode>(V.getOperand(0)) ||
55812 !isa<ConstantSDNode>(V.getOperand(1)))
55813 return false;
55814 return isNullConstant(V.getOperand(0)) || isNullConstant(V.getOperand(1)) ||
55815 (V.getConstantOperandAPInt(0).isSignedIntN(32) &&
55816 V.getConstantOperandAPInt(1).isSignedIntN(32));
55817 };
55818
55819 // Match an appropriate CMOV as the first operand of the add.
55820 SDValue Cmov = N->getOperand(0);
55821 SDValue OtherOp = N->getOperand(1);
55822 if (!isSuitableCmov(Cmov))
55823 std::swap(Cmov, OtherOp);
55824 if (!isSuitableCmov(Cmov))
55825 return SDValue();
55826
55827 // Don't remove a load folding opportunity for the add. That would neutralize
55828 // any improvements from removing constant materializations.
55829 if (X86::mayFoldLoad(OtherOp, Subtarget))
55830 return SDValue();
55831
55832 EVT VT = N->getValueType(0);
55833 SDLoc DL(N);
55834 SDValue FalseOp = Cmov.getOperand(0);
55835 SDValue TrueOp = Cmov.getOperand(1);
55836
55837 // We will push the add through the select, but we can potentially do better
55838 // if we know there is another add in the sequence and this is pointer math.
55839 // In that case, we can absorb an add into the trailing memory op and avoid
55840 // a 3-operand LEA which is likely slower than a 2-operand LEA.
55841 // TODO: If target has "slow3OpsLEA", do this even without the trailing memop?
55842 if (OtherOp.getOpcode() == ISD::ADD && OtherOp.hasOneUse() &&
55843 !isa<ConstantSDNode>(OtherOp.getOperand(0)) &&
55844 all_of(N->uses(), [&](SDNode *Use) {
55845 auto *MemNode = dyn_cast<MemSDNode>(Use);
55846 return MemNode && MemNode->getBasePtr().getNode() == N;
55847 })) {
55848 // add (cmov C1, C2), add (X, Y) --> add (cmov (add X, C1), (add X, C2)), Y
55849 // TODO: We are arbitrarily choosing op0 as the 1st piece of the sum, but
55850 // it is possible that choosing op1 might be better.
55851 SDValue X = OtherOp.getOperand(0), Y = OtherOp.getOperand(1);
55852 FalseOp = DAG.getNode(ISD::ADD, DL, VT, X, FalseOp);
55853 TrueOp = DAG.getNode(ISD::ADD, DL, VT, X, TrueOp);
55854 Cmov = DAG.getNode(X86ISD::CMOV, DL, VT, FalseOp, TrueOp,
55855 Cmov.getOperand(2), Cmov.getOperand(3));
55856 return DAG.getNode(ISD::ADD, DL, VT, Cmov, Y);
55857 }
55858
55859 // add (cmov C1, C2), OtherOp --> cmov (add OtherOp, C1), (add OtherOp, C2)
55860 FalseOp = DAG.getNode(ISD::ADD, DL, VT, OtherOp, FalseOp);
55861 TrueOp = DAG.getNode(ISD::ADD, DL, VT, OtherOp, TrueOp);
55862 return DAG.getNode(X86ISD::CMOV, DL, VT, FalseOp, TrueOp, Cmov.getOperand(2),
55863 Cmov.getOperand(3));
55864}
55865
55866static SDValue combineAdd(SDNode *N, SelectionDAG &DAG,
55867 TargetLowering::DAGCombinerInfo &DCI,
55868 const X86Subtarget &Subtarget) {
55869 EVT VT = N->getValueType(0);
55870 SDValue Op0 = N->getOperand(0);
55871 SDValue Op1 = N->getOperand(1);
55872 SDLoc DL(N);
55873
55874 if (SDValue Select = pushAddIntoCmovOfConsts(N, DAG, Subtarget))
55875 return Select;
55876
55877 if (SDValue MAdd = matchPMADDWD(DAG, Op0, Op1, DL, VT, Subtarget))
55878 return MAdd;
55879 if (SDValue MAdd = matchPMADDWD_2(DAG, Op0, Op1, DL, VT, Subtarget))
55880 return MAdd;
55881 if (SDValue MAdd = combineAddOfPMADDWD(DAG, Op0, Op1, DL, VT))
55882 return MAdd;
55883
55884 // Try to synthesize horizontal adds from adds of shuffles.
55885 if (SDValue V = combineToHorizontalAddSub(N, DAG, Subtarget))
55886 return V;
55887
55888 // If vectors of i1 are legal, turn (add (zext (vXi1 X)), Y) into
55889 // (sub Y, (sext (vXi1 X))).
55890 // FIXME: We have the (sub Y, (zext (vXi1 X))) -> (add (sext (vXi1 X)), Y) in
55891 // generic DAG combine without a legal type check, but adding this there
55892 // caused regressions.
55893 if (VT.isVector()) {
55894 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
55895 if (Op0.getOpcode() == ISD::ZERO_EXTEND &&
55896 Op0.getOperand(0).getValueType().getVectorElementType() == MVT::i1 &&
55897 TLI.isTypeLegal(Op0.getOperand(0).getValueType())) {
55898 SDValue SExt = DAG.getNode(ISD::SIGN_EXTEND, DL, VT, Op0.getOperand(0));
55899 return DAG.getNode(ISD::SUB, DL, VT, Op1, SExt);
55900 }
55901
55902 if (Op1.getOpcode() == ISD::ZERO_EXTEND &&
55903 Op1.getOperand(0).getValueType().getVectorElementType() == MVT::i1 &&
55904 TLI.isTypeLegal(Op1.getOperand(0).getValueType())) {
55905 SDValue SExt = DAG.getNode(ISD::SIGN_EXTEND, DL, VT, Op1.getOperand(0));
55906 return DAG.getNode(ISD::SUB, DL, VT, Op0, SExt);
55907 }
55908 }
55909
55910 // Fold ADD(ADC(Y,0,W),X) -> ADC(X,Y,W)
55911 if (Op0.getOpcode() == X86ISD::ADC && Op0->hasOneUse() &&
55912 X86::isZeroNode(Op0.getOperand(1))) {
55913     assert(!Op0->hasAnyUseOfValue(1) && "Overflow bit in use");
55914 return DAG.getNode(X86ISD::ADC, SDLoc(Op0), Op0->getVTList(), Op1,
55915 Op0.getOperand(0), Op0.getOperand(2));
55916 }
55917
55918 return combineAddOrSubToADCOrSBB(N, DAG);
55919}
55920
55921// Try to fold (sub Y, cmovns X, -X) -> (add Y, cmovns -X, X) if the cmov
55922// condition comes from the subtract node that produced -X. This matches the
55923// cmov expansion for absolute value. By swapping the operands we convert abs
55924// to nabs.
55925static SDValue combineSubABS(SDNode *N, SelectionDAG &DAG) {
55926 SDValue N0 = N->getOperand(0);
55927 SDValue N1 = N->getOperand(1);
55928
55929 if (N1.getOpcode() != X86ISD::CMOV || !N1.hasOneUse())
55930 return SDValue();
55931
55932 X86::CondCode CC = (X86::CondCode)N1.getConstantOperandVal(2);
55933 if (CC != X86::COND_S && CC != X86::COND_NS)
55934 return SDValue();
55935
55936 // Condition should come from a negate operation.
55937 SDValue Cond = N1.getOperand(3);
55938 if (Cond.getOpcode() != X86ISD::SUB || !isNullConstant(Cond.getOperand(0)))
55939 return SDValue();
55940  assert(Cond.getResNo() == 1 && "Unexpected result number");
55941
55942 // Get the X and -X from the negate.
55943 SDValue NegX = Cond.getValue(0);
55944 SDValue X = Cond.getOperand(1);
55945
55946 SDValue FalseOp = N1.getOperand(0);
55947 SDValue TrueOp = N1.getOperand(1);
55948
55949 // Cmov operands should be X and NegX. Order doesn't matter.
55950 if (!(TrueOp == X && FalseOp == NegX) && !(TrueOp == NegX && FalseOp == X))
55951 return SDValue();
55952
55953 // Build a new CMOV with the operands swapped.
55954 SDLoc DL(N);
55955 MVT VT = N->getSimpleValueType(0);
55956 SDValue Cmov = DAG.getNode(X86ISD::CMOV, DL, VT, TrueOp, FalseOp,
55957 N1.getOperand(2), Cond);
55958 // Convert sub to add.
55959 return DAG.getNode(ISD::ADD, DL, VT, N0, Cmov);
55960}
55961
55962static SDValue combineSubSetcc(SDNode *N, SelectionDAG &DAG) {
55963 SDValue Op0 = N->getOperand(0);
55964 SDValue Op1 = N->getOperand(1);
55965
55966 // (sub C (zero_extend (setcc)))
55967 // =>
55968  // (add (zero_extend (setcc inverted)) C-1) if C is a nonzero immediate
55969 // Don't disturb (sub 0 setcc), which is easily done with neg.
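  // e.g. 5 - (x == 0 ? 1 : 0) == (x != 0 ? 1 : 0) + 4.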
55970 EVT VT = N->getValueType(0);
55971 auto *Op0C = dyn_cast<ConstantSDNode>(Op0);
55972 if (Op1.getOpcode() == ISD::ZERO_EXTEND && Op1.hasOneUse() && Op0C &&
55973 !Op0C->isZero() && Op1.getOperand(0).getOpcode() == X86ISD::SETCC &&
55974 Op1.getOperand(0).hasOneUse()) {
55975 SDValue SetCC = Op1.getOperand(0);
55976 X86::CondCode CC = (X86::CondCode)SetCC.getConstantOperandVal(0);
55977 X86::CondCode NewCC = X86::GetOppositeBranchCondition(CC);
55978 uint64_t NewImm = Op0C->getZExtValue() - 1;
55979 SDLoc DL(Op1);
55980 SDValue NewSetCC = getSETCC(NewCC, SetCC.getOperand(1), DL, DAG);
55981 NewSetCC = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, NewSetCC);
55982 return DAG.getNode(X86ISD::ADD, DL, DAG.getVTList(VT, VT), NewSetCC,
55983 DAG.getConstant(NewImm, DL, VT));
55984 }
55985
55986 return SDValue();
55987}
55988
55989static SDValue combineSub(SDNode *N, SelectionDAG &DAG,
55990 TargetLowering::DAGCombinerInfo &DCI,
55991 const X86Subtarget &Subtarget) {
55992 SDValue Op0 = N->getOperand(0);
55993 SDValue Op1 = N->getOperand(1);
55994
55995 // TODO: Add NoOpaque handling to isConstantIntBuildVectorOrConstantInt.
55996 auto IsNonOpaqueConstant = [&](SDValue Op) {
55997 if (SDNode *C = DAG.isConstantIntBuildVectorOrConstantInt(Op)) {
55998 if (auto *Cst = dyn_cast<ConstantSDNode>(C))
55999 return !Cst->isOpaque();
56000 return true;
56001 }
56002 return false;
56003 };
56004
56005 // X86 can't encode an immediate LHS of a sub. See if we can push the
56006 // negation into a preceding instruction. If the RHS of the sub is a XOR with
56007 // one use and a constant, invert the immediate, saving one register.
56008 // sub(C1, xor(X, C2)) -> add(xor(X, ~C2), C1+1)
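  // This holds because -V == ~V + 1 and ~(X ^ C2) == X ^ ~C2, so
  //   C1 - (X ^ C2) == C1 + ~(X ^ C2) + 1 == (X ^ ~C2) + (C1 + 1).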
56009 if (Op1.getOpcode() == ISD::XOR && IsNonOpaqueConstant(Op0) &&
56010 IsNonOpaqueConstant(Op1.getOperand(1)) && Op1->hasOneUse()) {
56011 SDLoc DL(N);
56012 EVT VT = Op0.getValueType();
56013 SDValue NewXor = DAG.getNode(ISD::XOR, SDLoc(Op1), VT, Op1.getOperand(0),
56014 DAG.getNOT(SDLoc(Op1), Op1.getOperand(1), VT));
56015 SDValue NewAdd =
56016 DAG.getNode(ISD::ADD, DL, VT, Op0, DAG.getConstant(1, DL, VT));
56017 return DAG.getNode(ISD::ADD, DL, VT, NewXor, NewAdd);
56018 }
56019
56020 if (SDValue V = combineSubABS(N, DAG))
56021 return V;
56022
56023 // Try to synthesize horizontal subs from subs of shuffles.
56024 if (SDValue V = combineToHorizontalAddSub(N, DAG, Subtarget))
56025 return V;
56026
56027 // Fold SUB(X,ADC(Y,0,W)) -> SBB(X,Y,W)
56028 if (Op1.getOpcode() == X86ISD::ADC && Op1->hasOneUse() &&
56029 X86::isZeroNode(Op1.getOperand(1))) {
56030     assert(!Op1->hasAnyUseOfValue(1) && "Overflow bit in use");
56031 return DAG.getNode(X86ISD::SBB, SDLoc(Op1), Op1->getVTList(), Op0,
56032 Op1.getOperand(0), Op1.getOperand(2));
56033 }
56034
56035 // Fold SUB(X,SBB(Y,Z,W)) -> SUB(ADC(X,Z,W),Y)
56036 // Don't fold to ADC(0,0,W)/SETCC_CARRY pattern which will prevent more folds.
56037 if (Op1.getOpcode() == X86ISD::SBB && Op1->hasOneUse() &&
56038 !(X86::isZeroNode(Op0) && X86::isZeroNode(Op1.getOperand(1)))) {
56039     assert(!Op1->hasAnyUseOfValue(1) && "Overflow bit in use");
56040 SDValue ADC = DAG.getNode(X86ISD::ADC, SDLoc(Op1), Op1->getVTList(), Op0,
56041 Op1.getOperand(1), Op1.getOperand(2));
56042 return DAG.getNode(ISD::SUB, SDLoc(N), Op0.getValueType(), ADC.getValue(0),
56043 Op1.getOperand(0));
56044 }
56045
56046 if (SDValue V = combineXorSubCTLZ(N, DAG, Subtarget))
56047 return V;
56048
56049 if (SDValue V = combineAddOrSubToADCOrSBB(N, DAG))
56050 return V;
56051
56052 return combineSubSetcc(N, DAG);
56053}
56054
56055static SDValue combineVectorCompare(SDNode *N, SelectionDAG &DAG,
56056 const X86Subtarget &Subtarget) {
56057 MVT VT = N->getSimpleValueType(0);
56058 SDLoc DL(N);
56059
56060 if (N->getOperand(0) == N->getOperand(1)) {
56061 if (N->getOpcode() == X86ISD::PCMPEQ)
56062 return DAG.getConstant(-1, DL, VT);
56063 if (N->getOpcode() == X86ISD::PCMPGT)
56064 return DAG.getConstant(0, DL, VT);
56065 }
56066
56067 return SDValue();
56068}
56069
56070/// Helper that combines an array of subvector ops as if they were the operands
56071/// of a ISD::CONCAT_VECTORS node, but may have come from another source (e.g.
56072/// ISD::INSERT_SUBVECTOR). The ops are assumed to be of the same type.
56073static SDValue combineConcatVectorOps(const SDLoc &DL, MVT VT,
56074 ArrayRef<SDValue> Ops, SelectionDAG &DAG,
56075 TargetLowering::DAGCombinerInfo &DCI,
56076 const X86Subtarget &Subtarget) {
56077   assert(Subtarget.hasAVX() && "AVX assumed for concat_vectors");
56078 unsigned EltSizeInBits = VT.getScalarSizeInBits();
56079
56080 if (llvm::all_of(Ops, [](SDValue Op) { return Op.isUndef(); }))
56081 return DAG.getUNDEF(VT);
56082
56083 if (llvm::all_of(Ops, [](SDValue Op) {
56084 return ISD::isBuildVectorAllZeros(Op.getNode());
56085 }))
56086 return getZeroVector(VT, Subtarget, DAG, DL);
56087
56088 SDValue Op0 = Ops[0];
56089 bool IsSplat = llvm::all_equal(Ops);
56090
56091 // Repeated subvectors.
56092 if (IsSplat &&
56093 (VT.is256BitVector() || (VT.is512BitVector() && Subtarget.hasAVX512()))) {
56094 // If this broadcast is inserted into both halves, use a larger broadcast.
56095 if (Op0.getOpcode() == X86ISD::VBROADCAST)
56096 return DAG.getNode(Op0.getOpcode(), DL, VT, Op0.getOperand(0));
56097
56098 // If this simple subvector or scalar/subvector broadcast_load is inserted
56099 // into both halves, use a larger broadcast_load. Update other uses to use
56100 // an extracted subvector.
56101 if (ISD::isNormalLoad(Op0.getNode()) ||
56102 Op0.getOpcode() == X86ISD::VBROADCAST_LOAD ||
56103 Op0.getOpcode() == X86ISD::SUBV_BROADCAST_LOAD) {
56104 auto *Mem = cast<MemSDNode>(Op0);
56105 unsigned Opc = Op0.getOpcode() == X86ISD::VBROADCAST_LOAD
56106 ? X86ISD::VBROADCAST_LOAD
56107 : X86ISD::SUBV_BROADCAST_LOAD;
56108 if (SDValue BcastLd =
56109 getBROADCAST_LOAD(Opc, DL, VT, Mem->getMemoryVT(), Mem, 0, DAG)) {
56110 SDValue BcastSrc =
56111 extractSubVector(BcastLd, 0, DAG, DL, Op0.getValueSizeInBits());
56112 DAG.ReplaceAllUsesOfValueWith(Op0, BcastSrc);
56113 return BcastLd;
56114 }
56115 }
56116
56117 // concat_vectors(movddup(x),movddup(x)) -> broadcast(x)
56118 if (Op0.getOpcode() == X86ISD::MOVDDUP && VT == MVT::v4f64 &&
56119 (Subtarget.hasAVX2() ||
56120 X86::mayFoldLoadIntoBroadcastFromMem(Op0.getOperand(0),
56121 VT.getScalarType(), Subtarget)))
56122 return DAG.getNode(X86ISD::VBROADCAST, DL, VT,
56123 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f64,
56124 Op0.getOperand(0),
56125 DAG.getIntPtrConstant(0, DL)));
56126
56127 // concat_vectors(scalar_to_vector(x),scalar_to_vector(x)) -> broadcast(x)
56128 if (Op0.getOpcode() == ISD::SCALAR_TO_VECTOR &&
56129 (Subtarget.hasAVX2() ||
56130 (EltSizeInBits >= 32 &&
56131 X86::mayFoldLoad(Op0.getOperand(0), Subtarget))) &&
56132 Op0.getOperand(0).getValueType() == VT.getScalarType())
56133 return DAG.getNode(X86ISD::VBROADCAST, DL, VT, Op0.getOperand(0));
56134
56135 // concat_vectors(extract_subvector(broadcast(x)),
56136 // extract_subvector(broadcast(x))) -> broadcast(x)
56137 if (Op0.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
56138 Op0.getOperand(0).getValueType() == VT) {
56139 if (Op0.getOperand(0).getOpcode() == X86ISD::VBROADCAST ||
56140 Op0.getOperand(0).getOpcode() == X86ISD::VBROADCAST_LOAD)
56141 return Op0.getOperand(0);
56142 }
56143 }
56144
56145 // concat(extract_subvector(v0,c0), extract_subvector(v1,c1)) -> vperm2x128.
56146  // Only concat subvector high halves, which vperm2x128 is best at.
56147 // TODO: This should go in combineX86ShufflesRecursively eventually.
56148 if (VT.is256BitVector() && Ops.size() == 2) {
56149 SDValue Src0 = peekThroughBitcasts(Ops[0]);
56150 SDValue Src1 = peekThroughBitcasts(Ops[1]);
56151 if (Src0.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
56152 Src1.getOpcode() == ISD::EXTRACT_SUBVECTOR) {
56153 EVT SrcVT0 = Src0.getOperand(0).getValueType();
56154 EVT SrcVT1 = Src1.getOperand(0).getValueType();
56155 unsigned NumSrcElts0 = SrcVT0.getVectorNumElements();
56156 unsigned NumSrcElts1 = SrcVT1.getVectorNumElements();
56157 if (SrcVT0.is256BitVector() && SrcVT1.is256BitVector() &&
56158 Src0.getConstantOperandAPInt(1) == (NumSrcElts0 / 2) &&
56159 Src1.getConstantOperandAPInt(1) == (NumSrcElts1 / 2)) {
56160 return DAG.getNode(X86ISD::VPERM2X128, DL, VT,
56161 DAG.getBitcast(VT, Src0.getOperand(0)),
56162 DAG.getBitcast(VT, Src1.getOperand(0)),
56163 DAG.getTargetConstant(0x31, DL, MVT::i8));
56164 }
56165 }
56166 }
56167
56168 // Repeated opcode.
56169 // TODO - combineX86ShufflesRecursively should handle shuffle concatenation
56170 // but it currently struggles with different vector widths.
56171 if (llvm::all_of(Ops, [Op0](SDValue Op) {
56172 return Op.getOpcode() == Op0.getOpcode();
56173 })) {
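// ConcatSubOperand builds a single wide vector from operand I of every node
// in SubOps; IsConcatFree returns true if operand Op of each node is already
// an in-order extract_subvector of a vector of the concatenated type, i.e.
// re-concatenating those operands would be free.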
56174 auto ConcatSubOperand = [&](EVT VT, ArrayRef<SDValue> SubOps, unsigned I) {
56175 SmallVector<SDValue> Subs;
56176 for (SDValue SubOp : SubOps)
56177 Subs.push_back(SubOp.getOperand(I));
56178 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Subs);
56179 };
56180 auto IsConcatFree = [](MVT VT, ArrayRef<SDValue> SubOps, unsigned Op) {
56181 for (unsigned I = 0, E = SubOps.size(); I != E; ++I) {
56182 SDValue Sub = SubOps[I].getOperand(Op);
56183 unsigned NumSubElts = Sub.getValueType().getVectorNumElements();
56184 if (Sub.getOpcode() != ISD::EXTRACT_SUBVECTOR ||
56185 Sub.getOperand(0).getValueType() != VT ||
56186 Sub.getConstantOperandAPInt(1) != (I * NumSubElts))
56187 return false;
56188 }
56189 return true;
56190 };
56191
56192 unsigned NumOps = Ops.size();
56193 switch (Op0.getOpcode()) {
56194 case X86ISD::VBROADCAST: {
56195 if (!IsSplat && llvm::all_of(Ops, [](SDValue Op) {
56196 return Op.getOperand(0).getValueType().is128BitVector();
56197 })) {
56198 if (VT == MVT::v4f64 || VT == MVT::v4i64)
56199 return DAG.getNode(X86ISD::UNPCKL, DL, VT,
56200 ConcatSubOperand(VT, Ops, 0),
56201 ConcatSubOperand(VT, Ops, 0));
56202 // TODO: Add pseudo v8i32 PSHUFD handling to AVX1Only targets.
56203 if (VT == MVT::v8f32 || (VT == MVT::v8i32 && Subtarget.hasInt256()))
56204 return DAG.getNode(VT == MVT::v8f32 ? X86ISD::VPERMILPI
56205 : X86ISD::PSHUFD,
56206 DL, VT, ConcatSubOperand(VT, Ops, 0),
56207 getV4X86ShuffleImm8ForMask({0, 0, 0, 0}, DL, DAG));
56208 }
56209 break;
56210 }
56211 case X86ISD::MOVDDUP:
56212 case X86ISD::MOVSHDUP:
56213 case X86ISD::MOVSLDUP: {
56214 if (!IsSplat)
56215 return DAG.getNode(Op0.getOpcode(), DL, VT,
56216 ConcatSubOperand(VT, Ops, 0));
56217 break;
56218 }
56219 case X86ISD::SHUFP: {
56220 // Add SHUFPD support if/when necessary.
56221 if (!IsSplat && VT.getScalarType() == MVT::f32 &&
56222 llvm::all_of(Ops, [Op0](SDValue Op) {
56223 return Op.getOperand(2) == Op0.getOperand(2);
56224 })) {
56225 return DAG.getNode(Op0.getOpcode(), DL, VT,
56226 ConcatSubOperand(VT, Ops, 0),
56227 ConcatSubOperand(VT, Ops, 1), Op0.getOperand(2));
56228 }
56229 break;
56230 }
56231 case X86ISD::PSHUFHW:
56232 case X86ISD::PSHUFLW:
56233 case X86ISD::PSHUFD:
56234 if (!IsSplat && NumOps == 2 && VT.is256BitVector() &&
56235 Subtarget.hasInt256() && Op0.getOperand(1) == Ops[1].getOperand(1)) {
56236 return DAG.getNode(Op0.getOpcode(), DL, VT,
56237 ConcatSubOperand(VT, Ops, 0), Op0.getOperand(1));
56238 }
56239 [[fallthrough]];
56240 case X86ISD::VPERMILPI:
56241 if (!IsSplat && VT.getScalarSizeInBits() == 32 &&
56242 (VT.is256BitVector() ||
56243 (VT.is512BitVector() && Subtarget.useAVX512Regs())) &&
56244 all_of(Ops, [&Op0](SDValue Op) {
56245 return Op0.getOperand(1) == Op.getOperand(1);
56246 })) {
56247 MVT FloatVT = VT.changeVectorElementType(MVT::f32);
56248 SDValue Res = DAG.getBitcast(FloatVT, ConcatSubOperand(VT, Ops, 0));
56249 Res =
56250 DAG.getNode(X86ISD::VPERMILPI, DL, FloatVT, Res, Op0.getOperand(1));
56251 return DAG.getBitcast(VT, Res);
56252 }
56253 if (!IsSplat && NumOps == 2 && VT == MVT::v4f64) {
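// Merge the two 2-bit v2f64 VPERMILPD immediates into one 4-bit v4f64
// immediate: bits 0-1 select within the low 128-bit lane, bits 2-3 within
// the high lane.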
56254 uint64_t Idx0 = Ops[0].getConstantOperandVal(1);
56255 uint64_t Idx1 = Ops[1].getConstantOperandVal(1);
56256 uint64_t Idx = ((Idx1 & 3) << 2) | (Idx0 & 3);
56257 return DAG.getNode(Op0.getOpcode(), DL, VT,
56258 ConcatSubOperand(VT, Ops, 0),
56259 DAG.getTargetConstant(Idx, DL, MVT::i8));
56260 }
56261 break;
56262 case X86ISD::PSHUFB:
56263 if (!IsSplat && ((VT.is256BitVector() && Subtarget.hasInt256()) ||
56264 (VT.is512BitVector() && Subtarget.useBWIRegs()))) {
56265 return DAG.getNode(Op0.getOpcode(), DL, VT,
56266 ConcatSubOperand(VT, Ops, 0),
56267 ConcatSubOperand(VT, Ops, 1));
56268 }
56269 break;
56270 case X86ISD::VPERMV:
56271 if (!IsSplat && NumOps == 2 &&
56272 (VT.is512BitVector() && Subtarget.useAVX512Regs())) {
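// Concatenate the two VPERMV sources and merge their shuffle masks,
// offsetting each subvector's indices by the lane it came from, so one wide
// VPERMV can replace the concat.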
56273 MVT OpVT = Op0.getSimpleValueType();
56274 int NumSrcElts = OpVT.getVectorNumElements();
56275 SmallVector<int, 64> ConcatMask;
56276 for (unsigned i = 0; i != NumOps; ++i) {
56277 SmallVector<int, 64> SubMask;
56278 SmallVector<SDValue, 2> SubOps;
56279 if (!getTargetShuffleMask(Ops[i].getNode(), OpVT, false, SubOps,
56280 SubMask))
56281 break;
56282 for (int M : SubMask) {
56283 if (0 <= M)
56284 M += i * NumSrcElts;
56285 ConcatMask.push_back(M);
56286 }
56287 }
56288 if (ConcatMask.size() == (NumOps * NumSrcElts)) {
56289 SDValue Src = concatSubVectors(Ops[0].getOperand(1),
56290 Ops[1].getOperand(1), DAG, DL);
56291 MVT IntMaskSVT = MVT::getIntegerVT(EltSizeInBits);
56292 MVT IntMaskVT = MVT::getVectorVT(IntMaskSVT, NumOps * NumSrcElts);
56293 SDValue Mask = getConstVector(ConcatMask, IntMaskVT, DAG, DL, true);
56294 return DAG.getNode(X86ISD::VPERMV, DL, VT, Mask, Src);
56295 }
56296 }
56297 break;
56298 case X86ISD::VPERMV3:
56299 if (!IsSplat && NumOps == 2 && VT.is512BitVector()) {
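// Same idea as VPERMV above, but VPERMV3 reads from two sources: indices
// that refer to the second source get an extra NumSrcElts added so that,
// after the per-subvector offset, they index past the concatenated first
// source.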
56300 MVT OpVT = Op0.getSimpleValueType();
56301 int NumSrcElts = OpVT.getVectorNumElements();
56302 SmallVector<int, 64> ConcatMask;
56303 for (unsigned i = 0; i != NumOps; ++i) {
56304 SmallVector<int, 64> SubMask;
56305 SmallVector<SDValue, 2> SubOps;
56306 if (!getTargetShuffleMask(Ops[i].getNode(), OpVT, false, SubOps,
56307 SubMask))
56308 break;
56309 for (int M : SubMask) {
56310 if (0 <= M) {
56311 M += M < NumSrcElts ? 0 : NumSrcElts;
56312 M += i * NumSrcElts;
56313 }
56314 ConcatMask.push_back(M);
56315 }
56316 }
56317 if (ConcatMask.size() == (NumOps * NumSrcElts)) {
56318 SDValue Src0 = concatSubVectors(Ops[0].getOperand(0),
56319 Ops[1].getOperand(0), DAG, DL);
56320 SDValue Src1 = concatSubVectors(Ops[0].getOperand(2),
56321 Ops[1].getOperand(2), DAG, DL);
56322 MVT IntMaskSVT = MVT::getIntegerVT(EltSizeInBits);
56323 MVT IntMaskVT = MVT::getVectorVT(IntMaskSVT, NumOps * NumSrcElts);
56324 SDValue Mask = getConstVector(ConcatMask, IntMaskVT, DAG, DL, true);
56325 return DAG.getNode(X86ISD::VPERMV3, DL, VT, Src0, Mask, Src1);
56326 }
56327 }
56328 break;
56329 case ISD::TRUNCATE:
56330 if (!IsSplat && NumOps == 2 && VT.is256BitVector()) {
56331 EVT SrcVT = Ops[0].getOperand(0).getValueType();
56332 if (SrcVT.is256BitVector() && SrcVT.isSimple() &&
56333 SrcVT == Ops[1].getOperand(0).getValueType() &&
56334 Subtarget.useAVX512Regs() &&
56335 Subtarget.getPreferVectorWidth() >= 512 &&
56336 (SrcVT.getScalarSizeInBits() > 16 || Subtarget.useBWIRegs())) {
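// Truncating the concatenated 512-bit source in one go needs only a single
// AVX-512 truncate instead of two 256-bit truncates plus a concat.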
56337 EVT NewSrcVT = SrcVT.getDoubleNumVectorElementsVT(*DAG.getContext());
56338 return DAG.getNode(ISD::TRUNCATE, DL, VT,
56339 ConcatSubOperand(NewSrcVT, Ops, 0));
56340 }
56341 }
56342 break;
56343 case X86ISD::VSHLI:
56344 case X86ISD::VSRLI:
56345 // Special case: an AVX1 v4i64 SHL/SRL by 32 bits can lower as a shuffle.
56346 // TODO: Move this to LowerShiftByScalarImmediate?
56347 if (VT == MVT::v4i64 && !Subtarget.hasInt256() &&
56348 llvm::all_of(Ops, [](SDValue Op) {
56349 return Op.getConstantOperandAPInt(1) == 32;
56350 })) {
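// Shifting a 64-bit element left/right by exactly 32 bits just moves one
// 32-bit half and zeroes the other, so it can be expressed as a v8i32
// shuffle interleaved with a zero vector.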
56351 SDValue Res = DAG.getBitcast(MVT::v8i32, ConcatSubOperand(VT, Ops, 0));
56352 SDValue Zero = getZeroVector(MVT::v8i32, Subtarget, DAG, DL);
56353 if (Op0.getOpcode() == X86ISD::VSHLI) {
56354 Res = DAG.getVectorShuffle(MVT::v8i32, DL, Res, Zero,
56355 {8, 0, 8, 2, 8, 4, 8, 6});
56356 } else {
56357 Res = DAG.getVectorShuffle(MVT::v8i32, DL, Res, Zero,
56358 {1, 8, 3, 8, 5, 8, 7, 8});
56359 }
56360 return DAG.getBitcast(VT, Res);
56361 }
56362 [[fallthrough]];
56363 case X86ISD::VSRAI:
56364 case X86ISD::VSHL:
56365 case X86ISD::VSRL:
56366 case X86ISD::VSRA:
56367 if (((VT.is256BitVector() && Subtarget.hasInt256()) ||
56368 (VT.is512BitVector() && Subtarget.useAVX512Regs() &&
56369 (EltSizeInBits >= 32 || Subtarget.useBWIRegs()))) &&
56370 llvm::all_of(Ops, [Op0](SDValue Op) {
56371 return Op0.getOperand(1) == Op.getOperand(1);
56372 })) {
56373 return DAG.getNode(Op0.getOpcode(), DL, VT,
56374 ConcatSubOperand(VT, Ops, 0), Op0.getOperand(1));
56375 }
56376 break;
56377 case X86ISD::VPERMI:
56378 case X86ISD::VROTLI:
56379 case X86ISD::VROTRI:
56380 if (VT.is512BitVector() && Subtarget.useAVX512Regs() &&
56381 llvm::all_of(Ops, [Op0](SDValue Op) {
56382 return Op0.getOperand(1) == Op.getOperand(1);
56383 })) {
56384 return DAG.getNode(Op0.getOpcode(), DL, VT,
56385 ConcatSubOperand(VT, Ops, 0), Op0.getOperand(1));
56386 }
56387 break;
56388 case ISD::AND:
56389 case ISD::OR:
56390 case ISD::XOR:
56391 case X86ISD::ANDNP:
56392 if (!IsSplat && ((VT.is256BitVector() && Subtarget.hasInt256()) ||
56393 (VT.is512BitVector() && Subtarget.useAVX512Regs()))) {
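// Bitwise ops are elementwise, so the concatenation can simply be pushed
// through both operands when a wide enough integer op is available.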
56394 MVT SrcVT = Op0.getOperand(0).getSimpleValueType();
56395 SrcVT = MVT::getVectorVT(SrcVT.getScalarType(),
56396 NumOps * SrcVT.getVectorNumElements());
56397 return DAG.getNode(Op0.getOpcode(), DL, VT,
56398 ConcatSubOperand(SrcVT, Ops, 0),
56399 ConcatSubOperand(SrcVT, Ops, 1));
56400 }
56401 break;
56402 case X86ISD::GF2P8AFFINEQB:
56403 if (!IsSplat &&
56404 (VT.is256BitVector() ||
56405 (VT.is512BitVector() && Subtarget.useAVX512Regs())) &&
56406 llvm::all_of(Ops, [Op0](SDValue Op) {
56407 return Op0.getOperand(2) == Op.getOperand(2);
56408 })) {
56409 return DAG.getNode(Op0.getOpcode(), DL, VT,
56410 ConcatSubOperand(VT, Ops, 0),
56411 ConcatSubOperand(VT, Ops, 1), Op0.getOperand(2));
56412 }
56413 break;
56414 case ISD::ADD:
56415 case ISD::SUB:
56416 case ISD::MUL:
56417 if (!IsSplat && ((VT.is256BitVector() && Subtarget.hasInt256()) ||
56418 (VT.is512BitVector() && Subtarget.useAVX512Regs() &&
56419 (EltSizeInBits >= 32 || Subtarget.useBWIRegs())))) {
56420 MVT SrcVT = Op0.getOperand(0).getSimpleValueType();
56421 SrcVT = MVT::getVectorVT(SrcVT.getScalarType(),
56422 NumOps * SrcVT.getVectorNumElements());
56423 return DAG.getNode(Op0.getOpcode(), DL, VT,
56424 ConcatSubOperand(SrcVT, Ops, 0),
56425 ConcatSubOperand(SrcVT, Ops, 1));
56426 }
56427 break;
56428 case ISD::FADD:
56429 case ISD::FSUB:
56430 case ISD::FMUL:
56431 case ISD::FDIV:
56432 if (!IsSplat && (VT.is256BitVector() ||
56433 (VT.is512BitVector() && Subtarget.useAVX512Regs()))) {
56434 MVT SrcVT = Op0.getOperand(0).getSimpleValueType();
56435 SrcVT = MVT::getVectorVT(SrcVT.getScalarType(),
56436 NumOps * SrcVT.getVectorNumElements());
56437 return DAG.getNode(Op0.getOpcode(), DL, VT,
56438 ConcatSubOperand(SrcVT, Ops, 0),
56439 ConcatSubOperand(SrcVT, Ops, 1));
56440 }
56441 break;
56442 case X86ISD::HADD:
56443 case X86ISD::HSUB:
56444 case X86ISD::FHADD:
56445 case X86ISD::FHSUB:
56446 case X86ISD::PACKSS:
56447 case X86ISD::PACKUS:
56448 if (!IsSplat && VT.is256BitVector() &&
56449 (VT.isFloatingPoint() || Subtarget.hasInt256())) {
56450 MVT SrcVT = Op0.getOperand(0).getSimpleValueType();
56451 SrcVT = MVT::getVectorVT(SrcVT.getScalarType(),
56452 NumOps * SrcVT.getVectorNumElements());
56453 return DAG.getNode(Op0.getOpcode(), DL, VT,
56454 ConcatSubOperand(SrcVT, Ops, 0),
56455 ConcatSubOperand(SrcVT, Ops, 1));
56456 }
56457 break;
56458 case X86ISD::PALIGNR:
56459 if (!IsSplat &&
56460 ((VT.is256BitVector() && Subtarget.hasInt256()) ||
56461 (VT.is512BitVector() && Subtarget.useBWIRegs())) &&
56462 llvm::all_of(Ops, [Op0](SDValue Op) {
56463 return Op0.getOperand(2) == Op.getOperand(2);
56464 })) {
56465 return DAG.getNode(Op0.getOpcode(), DL, VT,
56466 ConcatSubOperand(VT, Ops, 0),
56467 ConcatSubOperand(VT, Ops, 1), Op0.getOperand(2));
56468 }
56469 break;
56470 case ISD::VSELECT:
56471 if (!IsSplat && Subtarget.hasAVX512() &&
56472 (VT.is256BitVector() ||
56473 (VT.is512BitVector() && Subtarget.useAVX512Regs())) &&
56474 (EltSizeInBits >= 32 || Subtarget.hasBWI())) {
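// With AVX512 the select condition is a vXi1 mask; if concatenating the
// condition masks gives a legal wider mask type, the whole select can be
// widened as well.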
56475 EVT SelVT = Ops[0].getOperand(0).getValueType();
56476 if (SelVT.getVectorElementType() == MVT::i1) {
56477 SelVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
56478 Ops.size() * SelVT.getVectorNumElements());
56479 if (DAG.getTargetLoweringInfo().isTypeLegal(SelVT))
56480 return DAG.getNode(Op0.getOpcode(), DL, VT,
56481 ConcatSubOperand(SelVT.getSimpleVT(), Ops, 0),
56482 ConcatSubOperand(VT, Ops, 1),
56483 ConcatSubOperand(VT, Ops, 2));
56484 }
56485 }
56486 [[fallthrough]];
56487 case X86ISD::BLENDV:
56488 if (!IsSplat && VT.is256BitVector() && Ops.size() == 2 &&
56489 (EltSizeInBits >= 32 || Subtarget.hasInt256()) &&
56490 IsConcatFree(VT, Ops, 1) && IsConcatFree(VT, Ops, 2)) {
56491 EVT SelVT = Ops[0].getOperand(0).getValueType();
56492 SelVT = SelVT.getDoubleNumVectorElementsVT(*DAG.getContext());
56493 if (DAG.getTargetLoweringInfo().isTypeLegal(SelVT))
56494 return DAG.getNode(Op0.getOpcode(), DL, VT,
56495 ConcatSubOperand(SelVT.getSimpleVT(), Ops, 0),
56496 ConcatSubOperand(VT, Ops, 1),
56497 ConcatSubOperand(VT, Ops, 2));
56498 }
56499 break;
56500 }
56501 }
56502
56503 // Fold subvector loads into one.
56504 // If needed, look through bitcasts to get to the load.
56505 if (auto *FirstLd = dyn_cast<LoadSDNode>(peekThroughBitcasts(Op0))) {
56506 unsigned Fast;
56507 const X86TargetLowering *TLI = Subtarget.getTargetLowering();
56508 if (TLI->allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), VT,
56509 *FirstLd->getMemOperand(), &Fast) &&
56510 Fast) {
56511 if (SDValue Ld =
56512 EltsFromConsecutiveLoads(VT, Ops, DL, DAG, Subtarget, false))
56513 return Ld;
56514 }
56515 }
56516
56517 // Attempt to fold target constant loads.
56518 if (all_of(Ops, [](SDValue Op) { return getTargetConstantFromNode(Op); })) {
56519 SmallVector<APInt> EltBits;
56520 APInt UndefElts = APInt::getZero(VT.getVectorNumElements());
56521 for (unsigned I = 0, E = Ops.size(); I != E; ++I) {
56522 APInt OpUndefElts;
56523 SmallVector<APInt> OpEltBits;
56524 if (!getTargetConstantBitsFromNode(Ops[I], EltSizeInBits, OpUndefElts,
56525 OpEltBits, true, false))
56526 break;
56527 EltBits.append(OpEltBits);
56528 UndefElts.insertBits(OpUndefElts, I * OpUndefElts.getBitWidth());
56529 }
56530 if (EltBits.size() == VT.getVectorNumElements())
56531 return getConstVector(EltBits, UndefElts, VT, DAG, DL);
56532 }
56533
56534 return SDValue();
56535}
56536
56537static SDValue combineCONCAT_VECTORS(SDNode *N, SelectionDAG &DAG,
56538 TargetLowering::DAGCombinerInfo &DCI,
56539 const X86Subtarget &Subtarget) {
56540 EVT VT = N->getValueType(0);
56541 EVT SrcVT = N->getOperand(0).getValueType();
56542 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
56543 SmallVector<SDValue, 4> Ops(N->op_begin(), N->op_end());
56544
56545 if (VT.getVectorElementType() == MVT::i1) {
56546 // Attempt to constant fold.
56547 unsigned SubSizeInBits = SrcVT.getSizeInBits();
56548 APInt Constant = APInt::getZero(VT.getSizeInBits());
56549 for (unsigned I = 0, E = Ops.size(); I != E; ++I) {
56550 auto *C = dyn_cast<ConstantSDNode>(peekThroughBitcasts(Ops[I]));
56551 if (!C) break;
56552 Constant.insertBits(C->getAPIntValue(), I * SubSizeInBits);
56553 if (I == (E - 1)) {
56554 EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), VT.getSizeInBits());
56555 if (TLI.isTypeLegal(IntVT))
56556 return DAG.getBitcast(VT, DAG.getConstant(Constant, SDLoc(N), IntVT));
56557 }
56558 }
56559
56560 // Don't do anything else for i1 vectors.
56561 return SDValue();
56562 }
56563
56564 if (Subtarget.hasAVX() && TLI.isTypeLegal(VT) && TLI.isTypeLegal(SrcVT)) {
56565 if (SDValue R = combineConcatVectorOps(SDLoc(N), VT.getSimpleVT(), Ops, DAG,
56566 DCI, Subtarget))
56567 return R;
56568 }
56569
56570 return SDValue();
56571}
56572
56573static SDValue combineINSERT_SUBVECTOR(SDNode *N, SelectionDAG &DAG,
56574 TargetLowering::DAGCombinerInfo &DCI,
56575 const X86Subtarget &Subtarget) {
56576 if (DCI.isBeforeLegalizeOps())
56577 return SDValue();
56578
56579 MVT OpVT = N->getSimpleValueType(0);
56580
56581 bool IsI1Vector = OpVT.getVectorElementType() == MVT::i1;
56582
56583 SDLoc dl(N);
56584 SDValue Vec = N->getOperand(0);
56585 SDValue SubVec = N->getOperand(1);
56586
56587 uint64_t IdxVal = N->getConstantOperandVal(2);
56588 MVT SubVecVT = SubVec.getSimpleValueType();
56589
56590 if (Vec.isUndef() && SubVec.isUndef())
56591 return DAG.getUNDEF(OpVT);
56592
56593 // Inserting undefs/zeros into zeros/undefs is a zero vector.
56594 if ((Vec.isUndef() || ISD::isBuildVectorAllZeros(Vec.getNode())) &&
56595 (SubVec.isUndef() || ISD::isBuildVectorAllZeros(SubVec.getNode())))
56596 return getZeroVector(OpVT, Subtarget, DAG, dl);
56597
56598 if (ISD::isBuildVectorAllZeros(Vec.getNode())) {
56599 // If we're inserting into a zero vector and then into a larger zero vector,
56600 // just insert into the larger zero vector directly.
56601 if (SubVec.getOpcode() == ISD::INSERT_SUBVECTOR &&
56602 ISD::isBuildVectorAllZeros(SubVec.getOperand(0).getNode())) {
56603 uint64_t Idx2Val = SubVec.getConstantOperandVal(2);
56604 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT,
56605 getZeroVector(OpVT, Subtarget, DAG, dl),
56606 SubVec.getOperand(1),
56607 DAG.getIntPtrConstant(IdxVal + Idx2Val, dl));
56608 }
56609
56610 // If we're inserting into a zero vector, our input was extracted from an
56611 // insert into a zero vector of the same type, and the extraction was at
56612 // least as large as the original insertion, just insert the original
56613 // subvector into a zero vector.
56614 if (SubVec.getOpcode() == ISD::EXTRACT_SUBVECTOR && IdxVal == 0 &&
56615 isNullConstant(SubVec.getOperand(1)) &&
56616 SubVec.getOperand(0).getOpcode() == ISD::INSERT_SUBVECTOR) {
56617 SDValue Ins = SubVec.getOperand(0);
56618 if (isNullConstant(Ins.getOperand(2)) &&
56619 ISD::isBuildVectorAllZeros(Ins.getOperand(0).getNode()) &&
56620 Ins.getOperand(1).getValueSizeInBits().getFixedValue() <=
56621 SubVecVT.getFixedSizeInBits())
56622 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT,
56623 getZeroVector(OpVT, Subtarget, DAG, dl),
56624 Ins.getOperand(1), N->getOperand(2));
56625 }
56626 }
56627
56628 // Stop here if this is an i1 vector.
56629 if (IsI1Vector)
56630 return SDValue();
56631
56632 // Eliminate an intermediate vector widening:
56633 // insert_subvector X, (insert_subvector undef, Y, 0), Idx -->
56634 // insert_subvector X, Y, Idx
56635 // TODO: This is a more general version of a DAGCombiner fold, can we move it
56636 // there?
56637 if (SubVec.getOpcode() == ISD::INSERT_SUBVECTOR &&
56638 SubVec.getOperand(0).isUndef() && isNullConstant(SubVec.getOperand(2)))
56639 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT, Vec,
56640 SubVec.getOperand(1), N->getOperand(2));
56641
56642 // If this is an insert of an extract, combine to a shuffle. Don't do this
56643 // if the insert or extract can be represented with a subregister operation.
56644 if (SubVec.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
56645 SubVec.getOperand(0).getSimpleValueType() == OpVT &&
56646 (IdxVal != 0 ||
56647 !(Vec.isUndef() || ISD::isBuildVectorAllZeros(Vec.getNode())))) {
56648 int ExtIdxVal = SubVec.getConstantOperandVal(1);
56649 if (ExtIdxVal != 0) {
56650 int VecNumElts = OpVT.getVectorNumElements();
56651 int SubVecNumElts = SubVecVT.getVectorNumElements();
56652 SmallVector<int, 64> Mask(VecNumElts);
56653 // First create an identity shuffle mask.
56654 for (int i = 0; i != VecNumElts; ++i)
56655 Mask[i] = i;
56656 // Now insert the extracted portion.
56657 for (int i = 0; i != SubVecNumElts; ++i)
56658 Mask[i + IdxVal] = i + ExtIdxVal + VecNumElts;
56659
56660 return DAG.getVectorShuffle(OpVT, dl, Vec, SubVec.getOperand(0), Mask);
56661 }
56662 }
56663
56664 // Match concat_vector style patterns.
56665 SmallVector<SDValue, 2> SubVectorOps;
56666 if (collectConcatOps(N, SubVectorOps, DAG)) {
56667 if (SDValue Fold =
56668 combineConcatVectorOps(dl, OpVT, SubVectorOps, DAG, DCI, Subtarget))
56669 return Fold;
56670
56671 // If we're inserting all zeros into the upper half, change this to
56672 // a concat with zero. We will match this to a move
56673 // with implicit upper bit zeroing during isel.
56674 // We do this here because we don't want combineConcatVectorOps to
56675 // create INSERT_SUBVECTOR from CONCAT_VECTORS.
56676 if (SubVectorOps.size() == 2 &&
56677 ISD::isBuildVectorAllZeros(SubVectorOps[1].getNode()))
56678 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT,
56679 getZeroVector(OpVT, Subtarget, DAG, dl),
56680 SubVectorOps[0], DAG.getIntPtrConstant(0, dl));
56681 }
56682
56683 // If this is a broadcast insert into an upper undef, use a larger broadcast.
56684 if (Vec.isUndef() && IdxVal != 0 && SubVec.getOpcode() == X86ISD::VBROADCAST)
56685 return DAG.getNode(X86ISD::VBROADCAST, dl, OpVT, SubVec.getOperand(0));
56686
56687 // If this is a broadcast load inserted into an upper undef, use a larger
56688 // broadcast load.
56689 if (Vec.isUndef() && IdxVal != 0 && SubVec.hasOneUse() &&
56690 SubVec.getOpcode() == X86ISD::VBROADCAST_LOAD) {
56691 auto *MemIntr = cast<MemIntrinsicSDNode>(SubVec);
56692 SDVTList Tys = DAG.getVTList(OpVT, MVT::Other);
56693 SDValue Ops[] = { MemIntr->getChain(), MemIntr->getBasePtr() };
56694 SDValue BcastLd =
56695 DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, dl, Tys, Ops,
56696 MemIntr->getMemoryVT(),
56697 MemIntr->getMemOperand());
56698 DAG.ReplaceAllUsesOfValueWith(SDValue(MemIntr, 1), BcastLd.getValue(1));
56699 return BcastLd;
56700 }
56701
56702 // If we're splatting the lower half subvector of a full vector load into the
56703 // upper half, attempt to create a subvector broadcast.
56704 if (IdxVal == (OpVT.getVectorNumElements() / 2) && SubVec.hasOneUse() &&
56705 Vec.getValueSizeInBits() == (2 * SubVec.getValueSizeInBits())) {
56706 auto *VecLd = dyn_cast<LoadSDNode>(Vec);
56707 auto *SubLd = dyn_cast<LoadSDNode>(SubVec);
56708 if (VecLd && SubLd &&
56709 DAG.areNonVolatileConsecutiveLoads(SubLd, VecLd,
56710 SubVec.getValueSizeInBits() / 8, 0))
56711 return getBROADCAST_LOAD(X86ISD::SUBV_BROADCAST_LOAD, dl, OpVT, SubVecVT,
56712 SubLd, 0, DAG);
56713 }
56714
56715 return SDValue();
56716}
56717
56718/// If we are extracting a subvector of a vector select and the select condition
56719/// is composed of concatenated vectors, try to narrow the select width. This
56720/// is a common pattern for AVX1 integer code because 256-bit selects may be
56721/// legal, but there is almost no integer math/logic available for 256-bit.
56722/// This function should only be called with legal types (otherwise, the calls
56723/// to get simple value types will assert).
56724static SDValue narrowExtractedVectorSelect(SDNode *Ext, SelectionDAG &DAG) {
56725 SDValue Sel = Ext->getOperand(0);
56726 SmallVector<SDValue, 4> CatOps;
56727 if (Sel.getOpcode() != ISD::VSELECT ||
56728 !collectConcatOps(Sel.getOperand(0).getNode(), CatOps, DAG))
56729 return SDValue();
56730
56731 // Note: We assume simple value types because this should only be called with
56732 // legal operations/types.
56733 // TODO: This can be extended to handle extraction to 256-bits.
56734 MVT VT = Ext->getSimpleValueType(0);
56735 if (!VT.is128BitVector())
56736 return SDValue();
56737
56738 MVT SelCondVT = Sel.getOperand(0).getSimpleValueType();
56739 if (!SelCondVT.is256BitVector() && !SelCondVT.is512BitVector())
56740 return SDValue();
56741
56742 MVT WideVT = Ext->getOperand(0).getSimpleValueType();
56743 MVT SelVT = Sel.getSimpleValueType();
56744 assert((SelVT.is256BitVector() || SelVT.is512BitVector()) &&
56745 "Unexpected vector type with legal operations");
56746
56747 unsigned SelElts = SelVT.getVectorNumElements();
56748 unsigned CastedElts = WideVT.getVectorNumElements();
56749 unsigned ExtIdx = Ext->getConstantOperandVal(1);
56750 if (SelElts % CastedElts == 0) {
56751 // The select has the same or more (narrower) elements than the extract
56752 // operand. The extraction index gets scaled by that factor.
56753 ExtIdx *= (SelElts / CastedElts);
56754 } else if (CastedElts % SelElts == 0) {
56755 // The select has fewer (wider) elements than the extract operand. Make sure
56756 // that the extraction index can be divided evenly.
56757 unsigned IndexDivisor = CastedElts / SelElts;
56758 if (ExtIdx % IndexDivisor != 0)
56759 return SDValue();
56760 ExtIdx /= IndexDivisor;
56761 } else {
56762 llvm_unreachable("Element count of simple vector types are not divisible?");
56763 }
56764
56765 unsigned NarrowingFactor = WideVT.getSizeInBits() / VT.getSizeInBits();
56766 unsigned NarrowElts = SelElts / NarrowingFactor;
56767 MVT NarrowSelVT = MVT::getVectorVT(SelVT.getVectorElementType(), NarrowElts);
56768 SDLoc DL(Ext);
56769 SDValue ExtCond = extract128BitVector(Sel.getOperand(0), ExtIdx, DAG, DL);
56770 SDValue ExtT = extract128BitVector(Sel.getOperand(1), ExtIdx, DAG, DL);
56771 SDValue ExtF = extract128BitVector(Sel.getOperand(2), ExtIdx, DAG, DL);
56772 SDValue NarrowSel = DAG.getSelect(DL, NarrowSelVT, ExtCond, ExtT, ExtF);
56773 return DAG.getBitcast(VT, NarrowSel);
56774}
56775
56776static SDValue combineEXTRACT_SUBVECTOR(SDNode *N, SelectionDAG &DAG,
56777 TargetLowering::DAGCombinerInfo &DCI,
56778 const X86Subtarget &Subtarget) {
56779 // For AVX1 only, if we are extracting from a 256-bit and+not (which will
56780 // eventually get combined/lowered into ANDNP) with a concatenated operand,
56781 // split the 'and' into 128-bit ops to avoid the concatenate and extract.
56782 // We let generic combining take over from there to simplify the
56783 // insert/extract and 'not'.
56784 // This pattern emerges during AVX1 legalization. We handle it before lowering
56785 // to avoid complications like splitting constant vector loads.
56786
56787 // Capture the original wide type in the likely case that we need to bitcast
56788 // back to this type.
56789 if (!N->getValueType(0).isSimple())
56790 return SDValue();
56791
56792 MVT VT = N->getSimpleValueType(0);
56793 SDValue InVec = N->getOperand(0);
56794 unsigned IdxVal = N->getConstantOperandVal(1);
56795 SDValue InVecBC = peekThroughBitcasts(InVec);
56796 EVT InVecVT = InVec.getValueType();
56797 unsigned SizeInBits = VT.getSizeInBits();
56798 unsigned InSizeInBits = InVecVT.getSizeInBits();
56799 unsigned NumSubElts = VT.getVectorNumElements();
56800 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
56801
56802 if (Subtarget.hasAVX() && !Subtarget.hasAVX2() &&
56803 TLI.isTypeLegal(InVecVT) &&
56804 InSizeInBits == 256 && InVecBC.getOpcode() == ISD::AND) {
56805 auto isConcatenatedNot = [](SDValue V) {
56806 V = peekThroughBitcasts(V);
56807 if (!isBitwiseNot(V))
56808 return false;
56809 SDValue NotOp = V->getOperand(0);
56810 return peekThroughBitcasts(NotOp).getOpcode() == ISD::CONCAT_VECTORS;
56811 };
56812 if (isConcatenatedNot(InVecBC.getOperand(0)) ||
56813 isConcatenatedNot(InVecBC.getOperand(1))) {
56814 // extract (and v4i64 X, (not (concat Y1, Y2))), n -> andnp v2i64 X(n), Y1
56815 SDValue Concat = splitVectorIntBinary(InVecBC, DAG);
56816 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, SDLoc(N), VT,
56817 DAG.getBitcast(InVecVT, Concat), N->getOperand(1));
56818 }
56819 }
56820
56821 if (DCI.isBeforeLegalizeOps())
56822 return SDValue();
56823
56824 if (SDValue V = narrowExtractedVectorSelect(N, DAG))
56825 return V;
56826
56827 if (ISD::isBuildVectorAllZeros(InVec.getNode()))
56828 return getZeroVector(VT, Subtarget, DAG, SDLoc(N));
56829
56830 if (ISD::isBuildVectorAllOnes(InVec.getNode())) {
56831 if (VT.getScalarType() == MVT::i1)
56832 return DAG.getConstant(1, SDLoc(N), VT);
56833 return getOnesVector(VT, DAG, SDLoc(N));
56834 }
56835
56836 if (InVec.getOpcode() == ISD::BUILD_VECTOR)
56837 return DAG.getBuildVector(VT, SDLoc(N),
56838 InVec->ops().slice(IdxVal, NumSubElts));
56839
56840 // If we are extracting from an insert into a larger vector, replace with a
56841 // smaller insert, provided the extraction is no smaller than the original
56842 // inserted subvector. Don't do this for i1 vectors.
56843 // TODO: Relax the matching indices requirement?
56844 if (VT.getVectorElementType() != MVT::i1 &&
56845 InVec.getOpcode() == ISD::INSERT_SUBVECTOR && InVec.hasOneUse() &&
56846 IdxVal == InVec.getConstantOperandVal(2) &&
56847 InVec.getOperand(1).getValueSizeInBits() <= SizeInBits) {
56848 SDLoc DL(N);
56849 SDValue NewExt = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT,
56850 InVec.getOperand(0), N->getOperand(1));
56851 unsigned NewIdxVal = InVec.getConstantOperandVal(2) - IdxVal;
56852 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, NewExt,
56853 InVec.getOperand(1),
56854 DAG.getVectorIdxConstant(NewIdxVal, DL));
56855 }
56856
56857 // If we're extracting an upper subvector from a broadcast, we should just
56858 // extract the lowest subvector instead, which should allow
56859 // SimplifyDemandedVectorElts to do more simplifications.
56860 if (IdxVal != 0 && (InVec.getOpcode() == X86ISD::VBROADCAST ||
56861 InVec.getOpcode() == X86ISD::VBROADCAST_LOAD ||
56862 DAG.isSplatValue(InVec, /*AllowUndefs*/ false)))
56863 return extractSubVector(InVec, 0, DAG, SDLoc(N), SizeInBits);
56864
56865 // If we're extracting a broadcasted subvector, just use the lowest subvector.
56866 if (IdxVal != 0 && InVec.getOpcode() == X86ISD::SUBV_BROADCAST_LOAD &&
56867 cast<MemIntrinsicSDNode>(InVec)->getMemoryVT() == VT)
56868 return extractSubVector(InVec, 0, DAG, SDLoc(N), SizeInBits);
56869
56870 // Attempt to extract from the source of a shuffle vector.
56871 if ((InSizeInBits % SizeInBits) == 0 && (IdxVal % NumSubElts) == 0) {
56872 SmallVector<int, 32> ShuffleMask;
56873 SmallVector<int, 32> ScaledMask;
56874 SmallVector<SDValue, 2> ShuffleInputs;
56875 unsigned NumSubVecs = InSizeInBits / SizeInBits;
56876 // Decode the shuffle mask and scale it so it's shuffling subvectors.
56877 if (getTargetShuffleInputs(InVecBC, ShuffleInputs, ShuffleMask, DAG) &&
56878 scaleShuffleElements(ShuffleMask, NumSubVecs, ScaledMask)) {
56879 unsigned SubVecIdx = IdxVal / NumSubElts;
56880 if (ScaledMask[SubVecIdx] == SM_SentinelUndef)
56881 return DAG.getUNDEF(VT);
56882 if (ScaledMask[SubVecIdx] == SM_SentinelZero)
56883 return getZeroVector(VT, Subtarget, DAG, SDLoc(N));
56884 SDValue Src = ShuffleInputs[ScaledMask[SubVecIdx] / NumSubVecs];
56885 if (Src.getValueSizeInBits() == InSizeInBits) {
56886 unsigned SrcSubVecIdx = ScaledMask[SubVecIdx] % NumSubVecs;
56887 unsigned SrcEltIdx = SrcSubVecIdx * NumSubElts;
56888 return extractSubVector(DAG.getBitcast(InVecVT, Src), SrcEltIdx, DAG,
56889 SDLoc(N), SizeInBits);
56890 }
56891 }
56892 }
56893
56894 // If we're extracting the lowest subvector and we're the only user,
56895 // we may be able to perform this with a smaller vector width.
56896 unsigned InOpcode = InVec.getOpcode();
56897 if (InVec.hasOneUse()) {
56898 if (IdxVal == 0 && VT == MVT::v2f64 && InVecVT == MVT::v4f64) {
56899 // v2f64 CVTDQ2PD(v4i32).
56900 if (InOpcode == ISD::SINT_TO_FP &&
56901 InVec.getOperand(0).getValueType() == MVT::v4i32) {
56902 return DAG.getNode(X86ISD::CVTSI2P, SDLoc(N), VT, InVec.getOperand(0));
56903 }
56904 // v2f64 CVTUDQ2PD(v4i32).
56905 if (InOpcode == ISD::UINT_TO_FP && Subtarget.hasVLX() &&
56906 InVec.getOperand(0).getValueType() == MVT::v4i32) {
56907 return DAG.getNode(X86ISD::CVTUI2P, SDLoc(N), VT, InVec.getOperand(0));
56908 }
56909 // v2f64 CVTPS2PD(v4f32).
56910 if (InOpcode == ISD::FP_EXTEND &&
56911 InVec.getOperand(0).getValueType() == MVT::v4f32) {
56912 return DAG.getNode(X86ISD::VFPEXT, SDLoc(N), VT, InVec.getOperand(0));
56913 }
56914 }
56915 if (IdxVal == 0 &&
56916 (InOpcode == ISD::ANY_EXTEND ||
56917 InOpcode == ISD::ANY_EXTEND_VECTOR_INREG ||
56918 InOpcode == ISD::ZERO_EXTEND ||
56919 InOpcode == ISD::ZERO_EXTEND_VECTOR_INREG ||
56920 InOpcode == ISD::SIGN_EXTEND ||
56921 InOpcode == ISD::SIGN_EXTEND_VECTOR_INREG) &&
56922 (SizeInBits == 128 || SizeInBits == 256) &&
56923 InVec.getOperand(0).getValueSizeInBits() >= SizeInBits) {
56924 SDLoc DL(N);
56925 SDValue Ext = InVec.getOperand(0);
56926 if (Ext.getValueSizeInBits() > SizeInBits)
56927 Ext = extractSubVector(Ext, 0, DAG, DL, SizeInBits);
56928 unsigned ExtOp = DAG.getOpcode_EXTEND_VECTOR_INREG(InOpcode);
56929 return DAG.getNode(ExtOp, DL, VT, Ext);
56930 }
56931 if (IdxVal == 0 && InOpcode == ISD::VSELECT &&
56932 InVec.getOperand(0).getValueType().is256BitVector() &&
56933 InVec.getOperand(1).getValueType().is256BitVector() &&
56934 InVec.getOperand(2).getValueType().is256BitVector()) {
56935 SDLoc DL(N);
56936 SDValue Ext0 = extractSubVector(InVec.getOperand(0), 0, DAG, DL, 128);
56937 SDValue Ext1 = extractSubVector(InVec.getOperand(1), 0, DAG, DL, 128);
56938 SDValue Ext2 = extractSubVector(InVec.getOperand(2), 0, DAG, DL, 128);
56939 return DAG.getNode(InOpcode, DL, VT, Ext0, Ext1, Ext2);
56940 }
56941 if (IdxVal == 0 && InOpcode == ISD::TRUNCATE && Subtarget.hasVLX() &&
56942 (VT.is128BitVector() || VT.is256BitVector())) {
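// Truncate just the low part of the (wider) source directly to the narrower
// result type instead of truncating everything and then extracting.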
56943 SDLoc DL(N);
56944 SDValue InVecSrc = InVec.getOperand(0);
56945 unsigned Scale = InVecSrc.getValueSizeInBits() / InSizeInBits;
56946 SDValue Ext = extractSubVector(InVecSrc, 0, DAG, DL, Scale * SizeInBits);
56947 return DAG.getNode(InOpcode, DL, VT, Ext);
56948 }
56949 if (InOpcode == X86ISD::MOVDDUP &&
56950 (VT.is128BitVector() || VT.is256BitVector())) {
56951 SDLoc DL(N);
56952 SDValue Ext0 =
56953 extractSubVector(InVec.getOperand(0), IdxVal, DAG, DL, SizeInBits);
56954 return DAG.getNode(InOpcode, DL, VT, Ext0);
56955 }
56956 }
56957
56958 // Always split vXi64 logical shifts where we're extracting the upper 32 bits,
56959 // as this is very likely to fold into a shuffle/truncation.
56960 if ((InOpcode == X86ISD::VSHLI || InOpcode == X86ISD::VSRLI) &&
56961 InVecVT.getScalarSizeInBits() == 64 &&
56962 InVec.getConstantOperandAPInt(1) == 32) {
56963 SDLoc DL(N);
56964 SDValue Ext =
56965 extractSubVector(InVec.getOperand(0), IdxVal, DAG, DL, SizeInBits);
56966 return DAG.getNode(InOpcode, DL, VT, Ext, InVec.getOperand(1));
56967 }
56968
56969 return SDValue();
56970}
56971
56972static SDValue combineScalarToVector(SDNode *N, SelectionDAG &DAG) {
56973 EVT VT = N->getValueType(0);
56974 SDValue Src = N->getOperand(0);
56975 SDLoc DL(N);
56976
56977 // If this is a scalar to vector to v1i1 from an AND with 1, bypass the and.
56978 // This occurs frequently in our masked scalar intrinsic code and our
56979 // floating point select lowering with AVX512.
56980 // TODO: SimplifyDemandedBits instead?
56981 if (VT == MVT::v1i1 && Src.getOpcode() == ISD::AND && Src.hasOneUse() &&
56982 isOneConstant(Src.getOperand(1)))
56983 return DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v1i1, Src.getOperand(0));
56984
56985 // Combine scalar_to_vector of an extract_vector_elt into an extract_subvec.
56986 if (VT == MVT::v1i1 && Src.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
56987 Src.hasOneUse() && Src.getOperand(0).getValueType().isVector() &&
56988 Src.getOperand(0).getValueType().getVectorElementType() == MVT::i1)
56989 if (auto *C = dyn_cast<ConstantSDNode>(Src.getOperand(1)))
56990 if (C->isZero())
56991 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Src.getOperand(0),
56992 Src.getOperand(1));
56993
56994 // Reduce v2i64 to v4i32 if we don't need the upper bits or they are known zero.
56995 // TODO: Move to DAGCombine/SimplifyDemandedBits?
56996 if ((VT == MVT::v2i64 || VT == MVT::v2f64) && Src.hasOneUse()) {
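// IsExt64 recognizes an i64 value whose upper 32 bits are unused (any-extend
// case) or known zero (zero-extend case), looking through extend nodes,
// extending loads, and known-bits analysis.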
56997 auto IsExt64 = [&DAG](SDValue Op, bool IsZeroExt) {
56998 if (Op.getValueType() != MVT::i64)
56999 return SDValue();
57000 unsigned Opc = IsZeroExt ? ISD::ZERO_EXTEND : ISD::ANY_EXTEND;
57001 if (Op.getOpcode() == Opc &&
57002 Op.getOperand(0).getScalarValueSizeInBits() <= 32)
57003 return Op.getOperand(0);
57004 unsigned Ext = IsZeroExt ? ISD::ZEXTLOAD : ISD::EXTLOAD;
57005 if (auto *Ld = dyn_cast<LoadSDNode>(Op))
57006 if (Ld->getExtensionType() == Ext &&
57007 Ld->getMemoryVT().getScalarSizeInBits() <= 32)
57008 return Op;
57009 if (IsZeroExt) {
57010 KnownBits Known = DAG.computeKnownBits(Op);
57011 if (!Known.isConstant() && Known.countMinLeadingZeros() >= 32)
57012 return Op;
57013 }
57014 return SDValue();
57015 };
57016
57017 if (SDValue AnyExt = IsExt64(peekThroughOneUseBitcasts(Src), false))
57018 return DAG.getBitcast(
57019 VT, DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v4i32,
57020 DAG.getAnyExtOrTrunc(AnyExt, DL, MVT::i32)));
57021
57022 if (SDValue ZeroExt = IsExt64(peekThroughOneUseBitcasts(Src), true))
57023 return DAG.getBitcast(
57024 VT,
57025 DAG.getNode(X86ISD::VZEXT_MOVL, DL, MVT::v4i32,
57026 DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v4i32,
57027 DAG.getZExtOrTrunc(ZeroExt, DL, MVT::i32))));
57028 }
57029
57030 // Combine (v2i64 (scalar_to_vector (i64 (bitconvert (mmx))))) to MOVQ2DQ.
57031 if (VT == MVT::v2i64 && Src.getOpcode() == ISD::BITCAST &&
57032 Src.getOperand(0).getValueType() == MVT::x86mmx)
57033 return DAG.getNode(X86ISD::MOVQ2DQ, DL, VT, Src.getOperand(0));
57034
57035 // See if we're broadcasting the scalar value, in which case just reuse that.
57036 // Ensure the same SDValue from the SDNode use is being used.
57037 if (VT.getScalarType() == Src.getValueType())
57038 for (SDNode *User : Src->uses())
57039 if (User->getOpcode() == X86ISD::VBROADCAST &&
57040 Src == User->getOperand(0)) {
57041 unsigned SizeInBits = VT.getFixedSizeInBits();
57042 unsigned BroadcastSizeInBits =
57043 User->getValueSizeInBits(0).getFixedValue();
57044 if (BroadcastSizeInBits == SizeInBits)
57045 return SDValue(User, 0);
57046 if (BroadcastSizeInBits > SizeInBits)
57047 return extractSubVector(SDValue(User, 0), 0, DAG, DL, SizeInBits);
57048 // TODO: Handle BroadcastSizeInBits < SizeInBits when we have test
57049 // coverage.
57050 }
57051
57052 return SDValue();
57053}
57054
57055// Simplify PMULDQ and PMULUDQ operations.
57056static SDValue combinePMULDQ(SDNode *N, SelectionDAG &DAG,
57057 TargetLowering::DAGCombinerInfo &DCI,
57058 const X86Subtarget &Subtarget) {
57059 SDValue LHS = N->getOperand(0);
57060 SDValue RHS = N->getOperand(1);
57061
57062 // Canonicalize constant to RHS.
57063 if (DAG.isConstantIntBuildVectorOrConstantInt(LHS) &&
57064 !DAG.isConstantIntBuildVectorOrConstantInt(RHS))
57065 return DAG.getNode(N->getOpcode(), SDLoc(N), N->getValueType(0), RHS, LHS);
57066
57067 // Multiply by zero.
57068 // Don't return RHS as it may contain UNDEFs.
57069 if (ISD::isBuildVectorAllZeros(RHS.getNode()))
57070 return DAG.getConstant(0, SDLoc(N), N->getValueType(0));
57071
57072 // PMULDQ/PMULUDQ only use the lower 32 bits of each vector element.
57073 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
57074 if (TLI.SimplifyDemandedBits(SDValue(N, 0), APInt::getAllOnes(64), DCI))
57075 return SDValue(N, 0);
57076
57077 // If the input is an extend_invec and the SimplifyDemandedBits call didn't
57078 // convert it to any_extend_invec, due to the LegalOperations check, do the
57079 // conversion directly to a vector shuffle manually. This exposes combine
57080 // opportunities missed by combineEXTEND_VECTOR_INREG not calling
57081 // combineX86ShufflesRecursively on SSE4.1 targets.
57082 // FIXME: This is basically a hack around several other issues related to
57083 // ANY_EXTEND_VECTOR_INREG.
57084 if (N->getValueType(0) == MVT::v2i64 && LHS.hasOneUse() &&
57085 (LHS.getOpcode() == ISD::ZERO_EXTEND_VECTOR_INREG ||
57086 LHS.getOpcode() == ISD::SIGN_EXTEND_VECTOR_INREG) &&
57087 LHS.getOperand(0).getValueType() == MVT::v4i32) {
57088 SDLoc dl(N);
57089 LHS = DAG.getVectorShuffle(MVT::v4i32, dl, LHS.getOperand(0),
57090 LHS.getOperand(0), { 0, -1, 1, -1 });
57091 LHS = DAG.getBitcast(MVT::v2i64, LHS);
57092 return DAG.getNode(N->getOpcode(), dl, MVT::v2i64, LHS, RHS);
57093 }
57094 if (N->getValueType(0) == MVT::v2i64 && RHS.hasOneUse() &&
57095 (RHS.getOpcode() == ISD::ZERO_EXTEND_VECTOR_INREG ||
57096 RHS.getOpcode() == ISD::SIGN_EXTEND_VECTOR_INREG) &&
57097 RHS.getOperand(0).getValueType() == MVT::v4i32) {
57098 SDLoc dl(N);
57099 RHS = DAG.getVectorShuffle(MVT::v4i32, dl, RHS.getOperand(0),
57100 RHS.getOperand(0), { 0, -1, 1, -1 });
57101 RHS = DAG.getBitcast(MVT::v2i64, RHS);
57102 return DAG.getNode(N->getOpcode(), dl, MVT::v2i64, LHS, RHS);
57103 }
57104
57105 return SDValue();
57106}
57107
57108// Simplify VPMADDUBSW/VPMADDWD operations.
57109static SDValue combineVPMADD(SDNode *N, SelectionDAG &DAG,
57110 TargetLowering::DAGCombinerInfo &DCI) {
57111 EVT VT = N->getValueType(0);
57112 SDValue LHS = N->getOperand(0);
57113 SDValue RHS = N->getOperand(1);
57114
57115 // Multiply by zero.
57116 // Don't return LHS/RHS as it may contain UNDEFs.
57117 if (ISD::isBuildVectorAllZeros(LHS.getNode()) ||
57118 ISD::isBuildVectorAllZeros(RHS.getNode()))
57119 return DAG.getConstant(0, SDLoc(N), VT);
57120
57121 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
57122 APInt DemandedElts = APInt::getAllOnes(VT.getVectorNumElements());
57123 if (TLI.SimplifyDemandedVectorElts(SDValue(N, 0), DemandedElts, DCI))
57124 return SDValue(N, 0);
57125
57126 return SDValue();
57127}
57128
57129static SDValue combineEXTEND_VECTOR_INREG(SDNode *N, SelectionDAG &DAG,
57130 TargetLowering::DAGCombinerInfo &DCI,
57131 const X86Subtarget &Subtarget) {
57132 EVT VT = N->getValueType(0);
57133 SDValue In = N->getOperand(0);
57134 unsigned Opcode = N->getOpcode();
57135 unsigned InOpcode = In.getOpcode();
57136 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
57137 SDLoc DL(N);
57138
57139 // Try to merge vector loads and extend_inreg to an extload.
57140 if (!DCI.isBeforeLegalizeOps() && ISD::isNormalLoad(In.getNode()) &&
57141 In.hasOneUse()) {
57142 auto *Ld = cast<LoadSDNode>(In);
57143 if (Ld->isSimple()) {
57144 MVT SVT = In.getSimpleValueType().getVectorElementType();
57145 ISD::LoadExtType Ext = Opcode == ISD::SIGN_EXTEND_VECTOR_INREG
57146 ? ISD::SEXTLOAD
57147 : ISD::ZEXTLOAD;
57148 EVT MemVT = VT.changeVectorElementType(SVT);
57149 if (TLI.isLoadExtLegal(Ext, VT, MemVT)) {
57150 SDValue Load = DAG.getExtLoad(
57151 Ext, DL, VT, Ld->getChain(), Ld->getBasePtr(), Ld->getPointerInfo(),
57152 MemVT, Ld->getOriginalAlign(), Ld->getMemOperand()->getFlags());
57153 DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), Load.getValue(1));
57154 return Load;
57155 }
57156 }
57157 }
57158
57159 // Fold EXTEND_VECTOR_INREG(EXTEND_VECTOR_INREG(X)) -> EXTEND_VECTOR_INREG(X).
57160 if (Opcode == InOpcode)
57161 return DAG.getNode(Opcode, DL, VT, In.getOperand(0));
57162
57163 // Fold EXTEND_VECTOR_INREG(EXTRACT_SUBVECTOR(EXTEND(X),0))
57164 // -> EXTEND_VECTOR_INREG(X).
57165 // TODO: Handle non-zero subvector indices.
57166 if (InOpcode == ISD::EXTRACT_SUBVECTOR && In.getConstantOperandVal(1) == 0 &&
57167 In.getOperand(0).getOpcode() == DAG.getOpcode_EXTEND(Opcode) &&
57168 In.getOperand(0).getOperand(0).getValueSizeInBits() ==
57169 In.getValueSizeInBits())
57170 return DAG.getNode(Opcode, DL, VT, In.getOperand(0).getOperand(0));
57171
57172 // Fold EXTEND_VECTOR_INREG(BUILD_VECTOR(X,Y,?,?)) -> BUILD_VECTOR(X,0,Y,0).
57173 // TODO: Move to DAGCombine?
57174 if (!DCI.isBeforeLegalizeOps() && Opcode == ISD::ZERO_EXTEND_VECTOR_INREG &&
57175 In.getOpcode() == ISD::BUILD_VECTOR && In.hasOneUse() &&
57176 In.getValueSizeInBits() == VT.getSizeInBits()) {
57177 unsigned NumElts = VT.getVectorNumElements();
57178 unsigned Scale = VT.getScalarSizeInBits() / In.getScalarValueSizeInBits();
57179 EVT EltVT = In.getOperand(0).getValueType();
57180 SmallVector<SDValue> Elts(Scale * NumElts, DAG.getConstant(0, DL, EltVT));
57181 for (unsigned I = 0; I != NumElts; ++I)
57182 Elts[I * Scale] = In.getOperand(I);
57183 return DAG.getBitcast(VT, DAG.getBuildVector(In.getValueType(), DL, Elts));
57184 }
57185
57186 // Attempt to combine as a shuffle on SSE41+ targets.
57187 if ((Opcode == ISD::ANY_EXTEND_VECTOR_INREG ||
57188 Opcode == ISD::ZERO_EXTEND_VECTOR_INREG) &&
57189 Subtarget.hasSSE41()) {
57190 SDValue Op(N, 0);
57191 if (TLI.isTypeLegal(VT) && TLI.isTypeLegal(In.getValueType()))
57192 if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
57193 return Res;
57194 }
57195
57196 return SDValue();
57197}
57198
57199static SDValue combineKSHIFT(SDNode *N, SelectionDAG &DAG,
57200 TargetLowering::DAGCombinerInfo &DCI) {
57201 EVT VT = N->getValueType(0);
57202
57203 if (ISD::isBuildVectorAllZeros(N->getOperand(0).getNode()))
57204 return DAG.getConstant(0, SDLoc(N), VT);
57205
57206 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
57207 APInt DemandedElts = APInt::getAllOnes(VT.getVectorNumElements());
57208 if (TLI.SimplifyDemandedVectorElts(SDValue(N, 0), DemandedElts, DCI))
57209 return SDValue(N, 0);
57210
57211 return SDValue();
57212}
57213
57214// Optimize (fp16_to_fp (fp_to_fp16 X)) to VCVTPS2PH followed by VCVTPH2PS.
57215 // Done as a combine because the lowerings for fp16_to_fp and fp_to_fp16
57216 // produce extra instructions between the conversions due to going to scalar and back.
57217static SDValue combineFP16_TO_FP(SDNode *N, SelectionDAG &DAG,
57218 const X86Subtarget &Subtarget) {
57219 if (Subtarget.useSoftFloat() || !Subtarget.hasF16C())
57220 return SDValue();
57221
57222 if (N->getOperand(0).getOpcode() != ISD::FP_TO_FP16)
57223 return SDValue();
57224
57225 if (N->getValueType(0) != MVT::f32 ||
57226 N->getOperand(0).getOperand(0).getValueType() != MVT::f32)
57227 return SDValue();
57228
57229 SDLoc dl(N);
57230 SDValue Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32,
57231 N->getOperand(0).getOperand(0));
57232 Res = DAG.getNode(X86ISD::CVTPS2PH, dl, MVT::v8i16, Res,
57233 DAG.getTargetConstant(4, dl, MVT::i32));
57234 Res = DAG.getNode(X86ISD::CVTPH2PS, dl, MVT::v4f32, Res);
57235 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f32, Res,
57236 DAG.getIntPtrConstant(0, dl));
57237}
57238
57239static SDValue combineFP_EXTEND(SDNode *N, SelectionDAG &DAG,
57240 const X86Subtarget &Subtarget) {
57241 if (!Subtarget.hasF16C() || Subtarget.useSoftFloat())
57242 return SDValue();
57243
57244 if (Subtarget.hasFP16())
57245 return SDValue();
57246
57247 bool IsStrict = N->isStrictFPOpcode();
57248 EVT VT = N->getValueType(0);
57249 SDValue Src = N->getOperand(IsStrict ? 1 : 0);
57250 EVT SrcVT = Src.getValueType();
57251
57252 if (!SrcVT.isVector() || SrcVT.getVectorElementType() != MVT::f16)
57253 return SDValue();
57254
57255 if (VT.getVectorElementType() != MVT::f32 &&
57256 VT.getVectorElementType() != MVT::f64)
57257 return SDValue();
57258
57259 unsigned NumElts = VT.getVectorNumElements();
57260 if (NumElts == 1 || !isPowerOf2_32(NumElts))
57261 return SDValue();
57262
57263 SDLoc dl(N);
57264
57265 // Convert the input to vXi16.
57266 EVT IntVT = SrcVT.changeVectorElementTypeToInteger();
57267 Src = DAG.getBitcast(IntVT, Src);
57268
57269 // Widen to at least 8 input elements.
57270 if (NumElts < 8) {
57271 unsigned NumConcats = 8 / NumElts;
57272 SDValue Fill = NumElts == 4 ? DAG.getUNDEF(IntVT)
57273 : DAG.getConstant(0, dl, IntVT);
57274 SmallVector<SDValue, 4> Ops(NumConcats, Fill);
57275 Ops[0] = Src;
57276 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i16, Ops);
57277 }
57278
57279 // Destination is vXf32 with at least 4 elements.
57280 EVT CvtVT = EVT::getVectorVT(*DAG.getContext(), MVT::f32,
57281 std::max(4U, NumElts));
57282 SDValue Cvt, Chain;
57283 if (IsStrict) {
57284 Cvt = DAG.getNode(X86ISD::STRICT_CVTPH2PS, dl, {CvtVT, MVT::Other},
57285 {N->getOperand(0), Src});
57286 Chain = Cvt.getValue(1);
57287 } else {
57288 Cvt = DAG.getNode(X86ISD::CVTPH2PS, dl, CvtVT, Src);
57289 }
57290
57291 if (NumElts < 4) {
57292 assert(NumElts == 2 && "Unexpected size");
57293 Cvt = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2f32, Cvt,
57294 DAG.getIntPtrConstant(0, dl));
57295 }
57296
57297 if (IsStrict) {
57298 // Extend to the original VT if necessary.
57299 if (Cvt.getValueType() != VT) {
57300 Cvt = DAG.getNode(ISD::STRICT_FP_EXTEND, dl, {VT, MVT::Other},
57301 {Chain, Cvt});
57302 Chain = Cvt.getValue(1);
57303 }
57304 return DAG.getMergeValues({Cvt, Chain}, dl);
57305 }
57306
57307 // Extend to the original VT if necessary.
57308 return DAG.getNode(ISD::FP_EXTEND, dl, VT, Cvt);
57309}
57310
57311// Try to find a larger VBROADCAST_LOAD/SUBV_BROADCAST_LOAD that we can extract
57312// from. Limit this to cases where the loads have the same input chain and the
57313// output chains are unused. This avoids any memory ordering issues.
57314static SDValue combineBROADCAST_LOAD(SDNode *N, SelectionDAG &DAG,
57315 TargetLowering::DAGCombinerInfo &DCI) {
57316 assert((N->getOpcode() == X86ISD::VBROADCAST_LOAD ||
57317 N->getOpcode() == X86ISD::SUBV_BROADCAST_LOAD) &&
57318 "Unknown broadcast load type");
57319
57320 // Only do this if the chain result is unused.
57321 if (N->hasAnyUseOfValue(1))
57322 return SDValue();
57323
57324 auto *MemIntrin = cast<MemIntrinsicSDNode>(N);
57325
57326 SDValue Ptr = MemIntrin->getBasePtr();
57327 SDValue Chain = MemIntrin->getChain();
57328 EVT VT = N->getSimpleValueType(0);
57329 EVT MemVT = MemIntrin->getMemoryVT();
57330
57331 // Look at other users of our base pointer and try to find a wider broadcast.
57332 // The input chain and the size of the memory VT must match.
57333 for (SDNode *User : Ptr->uses())
57334 if (User != N && User->getOpcode() == N->getOpcode() &&
57335 cast<MemIntrinsicSDNode>(User)->getBasePtr() == Ptr &&
57336 cast<MemIntrinsicSDNode>(User)->getChain() == Chain &&
57337 cast<MemIntrinsicSDNode>(User)->getMemoryVT().getSizeInBits() ==
57338 MemVT.getSizeInBits() &&
57339 !User->hasAnyUseOfValue(1) &&
57340 User->getValueSizeInBits(0).getFixedValue() > VT.getFixedSizeInBits()) {
57341 SDValue Extract = extractSubVector(SDValue(User, 0), 0, DAG, SDLoc(N),
57342 VT.getSizeInBits());
57343 Extract = DAG.getBitcast(VT, Extract);
57344 return DCI.CombineTo(N, Extract, SDValue(User, 1));
57345 }
57346
57347 return SDValue();
57348}
57349
57350static SDValue combineFP_ROUND(SDNode *N, SelectionDAG &DAG,
57351 const X86Subtarget &Subtarget) {
57352 if (!Subtarget.hasF16C() || Subtarget.useSoftFloat())
57353 return SDValue();
57354
57355 bool IsStrict = N->isStrictFPOpcode();
57356 EVT VT = N->getValueType(0);
57357 SDValue Src = N->getOperand(IsStrict ? 1 : 0);
57358 EVT SrcVT = Src.getValueType();
57359
57360 if (!VT.isVector() || VT.getVectorElementType() != MVT::f16 ||
57361 SrcVT.getVectorElementType() != MVT::f32)
57362 return SDValue();
57363
57364 SDLoc dl(N);
57365
57366 SDValue Cvt, Chain;
57367 unsigned NumElts = VT.getVectorNumElements();
57368 if (Subtarget.hasFP16()) {
57369 // Combine (v8f16 fp_round(concat_vectors(v4f32 (xint_to_fp v4i64), ..)))
57370 // into (v8f16 vector_shuffle(v8f16 (CVTXI2P v4i64), ..))
57371 if (NumElts == 8 && Src.getOpcode() == ISD::CONCAT_VECTORS) {
57372 SDValue Cvt0, Cvt1;
57373 SDValue Op0 = Src.getOperand(0);
57374 SDValue Op1 = Src.getOperand(1);
57375 bool IsOp0Strict = Op0->isStrictFPOpcode();
57376 if (Op0.getOpcode() != Op1.getOpcode() ||
57377 Op0.getOperand(IsOp0Strict ? 1 : 0).getValueType() != MVT::v4i64 ||
57378 Op1.getOperand(IsOp0Strict ? 1 : 0).getValueType() != MVT::v4i64) {
57379 return SDValue();
57380 }
57381 int Mask[8] = {0, 1, 2, 3, 8, 9, 10, 11};
57382 if (IsStrict) {
57383 assert(IsOp0Strict && "Op0 must be strict node");
57384 unsigned Opc = Op0.getOpcode() == ISD::STRICT_SINT_TO_FP
57385 ? X86ISD::STRICT_CVTSI2P
57386 : X86ISD::STRICT_CVTUI2P;
57387 Cvt0 = DAG.getNode(Opc, dl, {MVT::v8f16, MVT::Other},
57388 {Op0.getOperand(0), Op0.getOperand(1)});
57389 Cvt1 = DAG.getNode(Opc, dl, {MVT::v8f16, MVT::Other},
57390 {Op1.getOperand(0), Op1.getOperand(1)});
57391 Cvt = DAG.getVectorShuffle(MVT::v8f16, dl, Cvt0, Cvt1, Mask);
57392 return DAG.getMergeValues({Cvt, Cvt0.getValue(1)}, dl);
57393 }
57394 unsigned Opc = Op0.getOpcode() == ISD::SINT_TO_FP ? X86ISD::CVTSI2P
57395 : X86ISD::CVTUI2P;
57396 Cvt0 = DAG.getNode(Opc, dl, MVT::v8f16, Op0.getOperand(0));
57397 Cvt1 = DAG.getNode(Opc, dl, MVT::v8f16, Op1.getOperand(0));
57398 return Cvt = DAG.getVectorShuffle(MVT::v8f16, dl, Cvt0, Cvt1, Mask);
57399 }
57400 return SDValue();
57401 }
57402
57403 if (NumElts == 1 || !isPowerOf2_32(NumElts))
57404 return SDValue();
57405
57406 // Widen to at least 4 input elements.
57407 if (NumElts < 4)
57408 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, Src,
57409 DAG.getConstantFP(0.0, dl, SrcVT));
57410
57411 // Destination is v8i16 with at least 8 elements.
57412 EVT CvtVT =
57413 EVT::getVectorVT(*DAG.getContext(), MVT::i16, std::max(8U, NumElts));
57414 SDValue Rnd = DAG.getTargetConstant(4, dl, MVT::i32);
57415 if (IsStrict) {
57416 Cvt = DAG.getNode(X86ISD::STRICT_CVTPS2PH, dl, {CvtVT, MVT::Other},
57417 {N->getOperand(0), Src, Rnd});
57418 Chain = Cvt.getValue(1);
57419 } else {
57420 Cvt = DAG.getNode(X86ISD::CVTPS2PH, dl, CvtVT, Src, Rnd);
57421 }
57422
57423 // Extract down to real number of elements.
57424 if (NumElts < 8) {
57425 EVT IntVT = VT.changeVectorElementTypeToInteger();
57426 Cvt = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, IntVT, Cvt,
57427 DAG.getIntPtrConstant(0, dl));
57428 }
57429
57430 Cvt = DAG.getBitcast(VT, Cvt);
57431
57432 if (IsStrict)
57433 return DAG.getMergeValues({Cvt, Chain}, dl);
57434
57435 return Cvt;
57436}
57437
57438static SDValue combineMOVDQ2Q(SDNode *N, SelectionDAG &DAG) {
57439 SDValue Src = N->getOperand(0);
57440
57441 // Turn MOVDQ2Q+simple_load into an mmx load.
57442 if (ISD::isNormalLoad(Src.getNode()) && Src.hasOneUse()) {
57443 LoadSDNode *LN = cast<LoadSDNode>(Src.getNode());
57444
57445 if (LN->isSimple()) {
57446 SDValue NewLd = DAG.getLoad(MVT::x86mmx, SDLoc(N), LN->getChain(),
57447 LN->getBasePtr(),
57448 LN->getPointerInfo(),
57449 LN->getOriginalAlign(),
57450 LN->getMemOperand()->getFlags());
57451 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), NewLd.getValue(1));
57452 return NewLd;
57453 }
57454 }
57455
57456 return SDValue();
57457}
57458
57459static SDValue combinePDEP(SDNode *N, SelectionDAG &DAG,
57460 TargetLowering::DAGCombinerInfo &DCI) {
57461 unsigned NumBits = N->getSimpleValueType(0).getSizeInBits();
57462 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
57463 if (TLI.SimplifyDemandedBits(SDValue(N, 0), APInt::getAllOnes(NumBits), DCI))
57464 return SDValue(N, 0);
57465
57466 return SDValue();
57467}
57468
57469SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
57470 DAGCombinerInfo &DCI) const {
57471 SelectionDAG &DAG = DCI.DAG;
57472 switch (N->getOpcode()) {
57473 default: break;
57474 case ISD::SCALAR_TO_VECTOR:
57475 return combineScalarToVector(N, DAG);
57476 case ISD::EXTRACT_VECTOR_ELT:
57477 case X86ISD::PEXTRW:
57478 case X86ISD::PEXTRB:
57479 return combineExtractVectorElt(N, DAG, DCI, Subtarget);
57480 case ISD::CONCAT_VECTORS:
57481 return combineCONCAT_VECTORS(N, DAG, DCI, Subtarget);
57482 case ISD::INSERT_SUBVECTOR:
57483 return combineINSERT_SUBVECTOR(N, DAG, DCI, Subtarget);
57484 case ISD::EXTRACT_SUBVECTOR:
57485 return combineEXTRACT_SUBVECTOR(N, DAG, DCI, Subtarget);
57486 case ISD::VSELECT:
57487 case ISD::SELECT:
57488 case X86ISD::BLENDV: return combineSelect(N, DAG, DCI, Subtarget);
57489 case ISD::BITCAST: return combineBitcast(N, DAG, DCI, Subtarget);
57490 case X86ISD::CMOV: return combineCMov(N, DAG, DCI, Subtarget);
57491 case X86ISD::CMP: return combineCMP(N, DAG);
57492 case ISD::ADD: return combineAdd(N, DAG, DCI, Subtarget);
57493 case ISD::SUB: return combineSub(N, DAG, DCI, Subtarget);
57494 case X86ISD::ADD:
57495 case X86ISD::SUB: return combineX86AddSub(N, DAG, DCI);
57496 case X86ISD::SBB: return combineSBB(N, DAG);
57497 case X86ISD::ADC: return combineADC(N, DAG, DCI);
57498 case ISD::MUL: return combineMul(N, DAG, DCI, Subtarget);
57499 case ISD::SHL: return combineShiftLeft(N, DAG);
57500 case ISD::SRA: return combineShiftRightArithmetic(N, DAG, Subtarget);
57501 case ISD::SRL: return combineShiftRightLogical(N, DAG, DCI, Subtarget);
57502 case ISD::AND: return combineAnd(N, DAG, DCI, Subtarget);
57503 case ISD::OR: return combineOr(N, DAG, DCI, Subtarget);
57504 case ISD::XOR: return combineXor(N, DAG, DCI, Subtarget);
57505 case X86ISD::BEXTR:
57506 case X86ISD::BEXTRI: return combineBEXTR(N, DAG, DCI, Subtarget);
57507 case ISD::LOAD: return combineLoad(N, DAG, DCI, Subtarget);
57508 case ISD::MLOAD: return combineMaskedLoad(N, DAG, DCI, Subtarget);
57509 case ISD::STORE: return combineStore(N, DAG, DCI, Subtarget);
57510 case ISD::MSTORE: return combineMaskedStore(N, DAG, DCI, Subtarget);
57511 case X86ISD::VEXTRACT_STORE:
57512 return combineVEXTRACT_STORE(N, DAG, DCI, Subtarget);
57513 case ISD::SINT_TO_FP:
57514 case ISD::STRICT_SINT_TO_FP:
57515 return combineSIntToFP(N, DAG, DCI, Subtarget);
57516 case ISD::UINT_TO_FP:
57517 case ISD::STRICT_UINT_TO_FP:
57518 return combineUIntToFP(N, DAG, Subtarget);
57519 case ISD::FADD:
57520 case ISD::FSUB: return combineFaddFsub(N, DAG, Subtarget);
57521 case X86ISD::VFCMULC:
57522 case X86ISD::VFMULC: return combineFMulcFCMulc(N, DAG, Subtarget);
57523 case ISD::FNEG: return combineFneg(N, DAG, DCI, Subtarget);
57524 case ISD::TRUNCATE: return combineTruncate(N, DAG, Subtarget);
57525 case X86ISD::VTRUNC: return combineVTRUNC(N, DAG, DCI);
57526 case X86ISD::ANDNP: return combineAndnp(N, DAG, DCI, Subtarget);
57527 case X86ISD::FAND: return combineFAnd(N, DAG, Subtarget);
57528 case X86ISD::FANDN: return combineFAndn(N, DAG, Subtarget);
57529 case X86ISD::FXOR:
57530 case X86ISD::FOR: return combineFOr(N, DAG, DCI, Subtarget);
57531 case X86ISD::FMIN:
57532 case X86ISD::FMAX: return combineFMinFMax(N, DAG);
57533 case ISD::FMINNUM:
57534 case ISD::FMAXNUM: return combineFMinNumFMaxNum(N, DAG, Subtarget);
57535 case X86ISD::CVTSI2P:
57536 case X86ISD::CVTUI2P: return combineX86INT_TO_FP(N, DAG, DCI);
57537 case X86ISD::CVTP2SI:
57538 case X86ISD::CVTP2UI:
57539 case X86ISD::STRICT_CVTTP2SI:
57540 case X86ISD::CVTTP2SI:
57541 case X86ISD::STRICT_CVTTP2UI:
57542 case X86ISD::CVTTP2UI:
57543 return combineCVTP2I_CVTTP2I(N, DAG, DCI);
57544 case X86ISD::STRICT_CVTPH2PS:
57545 case X86ISD::CVTPH2PS: return combineCVTPH2PS(N, DAG, DCI);
57546 case X86ISD::BT: return combineBT(N, DAG, DCI);
57547 case ISD::ANY_EXTEND:
57548 case ISD::ZERO_EXTEND: return combineZext(N, DAG, DCI, Subtarget);
57549 case ISD::SIGN_EXTEND: return combineSext(N, DAG, DCI, Subtarget);
57550 case ISD::SIGN_EXTEND_INREG: return combineSignExtendInReg(N, DAG, Subtarget);
57551 case ISD::ANY_EXTEND_VECTOR_INREG:
57552 case ISD::SIGN_EXTEND_VECTOR_INREG:
57553 case ISD::ZERO_EXTEND_VECTOR_INREG:
57554 return combineEXTEND_VECTOR_INREG(N, DAG, DCI, Subtarget);
57555 case ISD::SETCC: return combineSetCC(N, DAG, DCI, Subtarget);
57556 case X86ISD::SETCC: return combineX86SetCC(N, DAG, Subtarget);
57557 case X86ISD::BRCOND: return combineBrCond(N, DAG, Subtarget);
57558 case X86ISD::PACKSS:
57559 case X86ISD::PACKUS: return combineVectorPack(N, DAG, DCI, Subtarget);
57560 case X86ISD::HADD:
57561 case X86ISD::HSUB:
57562 case X86ISD::FHADD:
57563 case X86ISD::FHSUB: return combineVectorHADDSUB(N, DAG, DCI, Subtarget);
57564 case X86ISD::VSHL:
57565 case X86ISD::VSRA:
57566 case X86ISD::VSRL:
57567 return combineVectorShiftVar(N, DAG, DCI, Subtarget);
57568 case X86ISD::VSHLI:
57569 case X86ISD::VSRAI:
57570 case X86ISD::VSRLI:
57571 return combineVectorShiftImm(N, DAG, DCI, Subtarget);
57572 case ISD::INSERT_VECTOR_ELT:
57573 case X86ISD::PINSRB:
57574 case X86ISD::PINSRW: return combineVectorInsert(N, DAG, DCI, Subtarget);
57575 case X86ISD::SHUFP: // Handle all target specific shuffles
57576 case X86ISD::INSERTPS:
57577 case X86ISD::EXTRQI:
57578 case X86ISD::INSERTQI:
57579 case X86ISD::VALIGN:
57580 case X86ISD::PALIGNR:
57581 case X86ISD::VSHLDQ:
57582 case X86ISD::VSRLDQ:
57583 case X86ISD::BLENDI:
57584 case X86ISD::UNPCKH:
57585 case X86ISD::UNPCKL:
57586 case X86ISD::MOVHLPS:
57587 case X86ISD::MOVLHPS:
57588 case X86ISD::PSHUFB:
57589 case X86ISD::PSHUFD:
57590 case X86ISD::PSHUFHW:
57591 case X86ISD::PSHUFLW:
57592 case X86ISD::MOVSHDUP:
57593 case X86ISD::MOVSLDUP:
57594 case X86ISD::MOVDDUP:
57595 case X86ISD::MOVSS:
57596 case X86ISD::MOVSD:
57597 case X86ISD::MOVSH:
57598 case X86ISD::VBROADCAST:
57599 case X86ISD::VPPERM:
57600 case X86ISD::VPERMI:
57601 case X86ISD::VPERMV:
57602 case X86ISD::VPERMV3:
57603 case X86ISD::VPERMIL2:
57604 case X86ISD::VPERMILPI:
57605 case X86ISD::VPERMILPV:
57606 case X86ISD::VPERM2X128:
57607 case X86ISD::SHUF128:
57608 case X86ISD::VZEXT_MOVL:
57609    case ISD::VECTOR_SHUFFLE: return combineShuffle(N, DAG, DCI, Subtarget);
57610 case X86ISD::FMADD_RND:
57611 case X86ISD::FMSUB:
57612 case X86ISD::STRICT_FMSUB:
57613 case X86ISD::FMSUB_RND:
57614 case X86ISD::FNMADD:
57615 case X86ISD::STRICT_FNMADD:
57616 case X86ISD::FNMADD_RND:
57617 case X86ISD::FNMSUB:
57618 case X86ISD::STRICT_FNMSUB:
57619 case X86ISD::FNMSUB_RND:
57620 case ISD::FMA:
57621 case ISD::STRICT_FMA: return combineFMA(N, DAG, DCI, Subtarget);
57622 case X86ISD::FMADDSUB_RND:
57623 case X86ISD::FMSUBADD_RND:
57624 case X86ISD::FMADDSUB:
57625 case X86ISD::FMSUBADD: return combineFMADDSUB(N, DAG, DCI);
57626 case X86ISD::MOVMSK: return combineMOVMSK(N, DAG, DCI, Subtarget);
57627 case X86ISD::TESTP: return combineTESTP(N, DAG, DCI, Subtarget);
57628 case X86ISD::MGATHER:
57629 case X86ISD::MSCATTER:
57630 return combineX86GatherScatter(N, DAG, DCI, Subtarget);
57631 case ISD::MGATHER:
57632 case ISD::MSCATTER: return combineGatherScatter(N, DAG, DCI);
57633 case X86ISD::PCMPEQ:
57634 case X86ISD::PCMPGT: return combineVectorCompare(N, DAG, Subtarget);
57635 case X86ISD::PMULDQ:
57636 case X86ISD::PMULUDQ: return combinePMULDQ(N, DAG, DCI, Subtarget);
57637 case X86ISD::VPMADDUBSW:
57638 case X86ISD::VPMADDWD: return combineVPMADD(N, DAG, DCI);
57639 case X86ISD::KSHIFTL:
57640 case X86ISD::KSHIFTR: return combineKSHIFT(N, DAG, DCI);
57641 case ISD::FP16_TO_FP: return combineFP16_TO_FP(N, DAG, Subtarget);
57642 case ISD::STRICT_FP_EXTEND:
57643 case ISD::FP_EXTEND: return combineFP_EXTEND(N, DAG, Subtarget);
57644 case ISD::STRICT_FP_ROUND:
57645 case ISD::FP_ROUND: return combineFP_ROUND(N, DAG, Subtarget);
57646 case X86ISD::VBROADCAST_LOAD:
57647 case X86ISD::SUBV_BROADCAST_LOAD: return combineBROADCAST_LOAD(N, DAG, DCI);
57648 case X86ISD::MOVDQ2Q: return combineMOVDQ2Q(N, DAG);
57649 case X86ISD::PDEP: return combinePDEP(N, DAG, DCI);
57650 }
57651
57652 return SDValue();
57653}
57654
57655bool X86TargetLowering::preferABDSToABSWithNSW(EVT VT) const {
57656 return false;
57657}
57658
57659bool X86TargetLowering::isTypeDesirableForOp(unsigned Opc, EVT VT) const {
57660 if (!isTypeLegal(VT))
57661 return false;
57662
57663 // There are no vXi8 shifts.
57664 if (Opc == ISD::SHL && VT.isVector() && VT.getVectorElementType() == MVT::i8)
57665 return false;
57666
57667 // TODO: Almost no 8-bit ops are desirable because they have no actual
57668 // size/speed advantages vs. 32-bit ops, but they do have a major
57669 // potential disadvantage by causing partial register stalls.
57670 //
57671 // 8-bit multiply/shl is probably not cheaper than 32-bit multiply/shl, and
57672 // we have specializations to turn 32-bit multiply/shl into LEA or other ops.
57673 // Also, see the comment in "IsDesirableToPromoteOp" - where we additionally
57674 // check for a constant operand to the multiply.
57675 if ((Opc == ISD::MUL || Opc == ISD::SHL) && VT == MVT::i8)
57676 return false;
57677
57678 // i16 instruction encodings are longer and some i16 instructions are slow,
57679 // so those are not desirable.
57680 if (VT == MVT::i16) {
57681 switch (Opc) {
57682 default:
57683 break;
57684 case ISD::LOAD:
57685 case ISD::SIGN_EXTEND:
57686 case ISD::ZERO_EXTEND:
57687 case ISD::ANY_EXTEND:
57688 case ISD::SHL:
57689 case ISD::SRA:
57690 case ISD::SRL:
57691 case ISD::SUB:
57692 case ISD::ADD:
57693 case ISD::MUL:
57694 case ISD::AND:
57695 case ISD::OR:
57696 case ISD::XOR:
57697 return false;
57698 }
57699 }
57700
57701 // Any legal type not explicitly accounted for above here is desirable.
57702 return true;
57703}
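// Illustrative effect of the hook above, assuming the queried type is legal:
//   isTypeDesirableForOp(ISD::ADD, MVT::i16) -> false  (prefer 32-bit ALU ops)
//   isTypeDesirableForOp(ISD::MUL, MVT::i8)  -> false
//   isTypeDesirableForOp(ISD::ADD, MVT::i32) -> true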
57704
57705SDValue X86TargetLowering::expandIndirectJTBranch(const SDLoc& dl,
57706 SDValue Value, SDValue Addr,
57707 SelectionDAG &DAG) const {
57708 const Module *M = DAG.getMachineFunction().getMMI().getModule();
57709 Metadata *IsCFProtectionSupported = M->getModuleFlag("cf-protection-branch");
57710 if (IsCFProtectionSupported) {
57711      // When control-flow branch protection is enabled, we need to add a
57712      // notrack prefix to the indirect branch. To do that we create an
57713      // NT_BRIND SDNode; during ISel, the pattern will convert it to a jmp
57714      // with the NoTrack prefix.
57715 return DAG.getNode(X86ISD::NT_BRIND, dl, MVT::Other, Value, Addr);
57716 }
57717
57718 return TargetLowering::expandIndirectJTBranch(dl, Value, Addr, DAG);
57719}
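// Sketch of the path above: when the module carries the "cf-protection-branch"
// flag, the indirect jump-table branch is emitted as X86ISD::NT_BRIND instead
// of the generic BRIND lowering, and ISel selects a jmp carrying the notrack
// prefix for it.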
57720
57721TargetLowering::AndOrSETCCFoldKind
57722X86TargetLowering::isDesirableToCombineLogicOpOfSETCC(
57723 const SDNode *LogicOp, const SDNode *SETCC0, const SDNode *SETCC1) const {
57724 using AndOrSETCCFoldKind = TargetLowering::AndOrSETCCFoldKind;
57725 EVT VT = LogicOp->getValueType(0);
57726 EVT OpVT = SETCC0->getOperand(0).getValueType();
57727 if (!VT.isInteger())
57728 return AndOrSETCCFoldKind::None;
57729
57730 if (VT.isVector())
57731 return AndOrSETCCFoldKind(AndOrSETCCFoldKind::NotAnd |
57732 (isOperationLegal(ISD::ABS, OpVT)
57733 ? AndOrSETCCFoldKind::ABS
57734 : AndOrSETCCFoldKind::None));
57735
57736    // Don't use `NotAnd`: even though `not` is generally shorter code size than
57737    // `add`, `add` can lower to LEA, which can save moves / spills. In any case
57738    // where `NotAnd` applies, `AddAnd` does as well.
57739    // TODO: Currently we lower (icmp eq/ne (and ~X, Y), 0) -> `test (not X), Y`;
57740    // if we change that to `andn Y, X` it may be worth preferring `NotAnd` here.
57741 return AndOrSETCCFoldKind::AddAnd;
57742}
57743
57744bool X86TargetLowering::IsDesirableToPromoteOp(SDValue Op, EVT &PVT) const {
57745 EVT VT = Op.getValueType();
57746 bool Is8BitMulByConstant = VT == MVT::i8 && Op.getOpcode() == ISD::MUL &&
57747 isa<ConstantSDNode>(Op.getOperand(1));
57748
57749 // i16 is legal, but undesirable since i16 instruction encodings are longer
57750 // and some i16 instructions are slow.
57751 // 8-bit multiply-by-constant can usually be expanded to something cheaper
57752 // using LEA and/or other ALU ops.
57753 if (VT != MVT::i16 && !Is8BitMulByConstant)
57754 return false;
57755
57756 auto IsFoldableRMW = [](SDValue Load, SDValue Op) {
57757 if (!Op.hasOneUse())
57758 return false;
57759 SDNode *User = *Op->use_begin();
57760 if (!ISD::isNormalStore(User))
57761 return false;
57762 auto *Ld = cast<LoadSDNode>(Load);
57763 auto *St = cast<StoreSDNode>(User);
57764 return Ld->getBasePtr() == St->getBasePtr();
57765 };
57766
57767 auto IsFoldableAtomicRMW = [](SDValue Load, SDValue Op) {
57768 if (!Load.hasOneUse() || Load.getOpcode() != ISD::ATOMIC_LOAD)
57769 return false;
57770 if (!Op.hasOneUse())
57771 return false;
57772 SDNode *User = *Op->use_begin();
57773 if (User->getOpcode() != ISD::ATOMIC_STORE)
57774 return false;
57775 auto *Ld = cast<AtomicSDNode>(Load);
57776 auto *St = cast<AtomicSDNode>(User);
57777 return Ld->getBasePtr() == St->getBasePtr();
57778 };
57779
57780 bool Commute = false;
57781 switch (Op.getOpcode()) {
57782 default: return false;
57783 case ISD::SIGN_EXTEND:
57784 case ISD::ZERO_EXTEND:
57785 case ISD::ANY_EXTEND:
57786 break;
57787 case ISD::SHL:
57788 case ISD::SRA:
57789 case ISD::SRL: {
57790 SDValue N0 = Op.getOperand(0);
57791 // Look out for (store (shl (load), x)).
57792 if (X86::mayFoldLoad(N0, Subtarget) && IsFoldableRMW(N0, Op))
57793 return false;
57794 break;
57795 }
57796 case ISD::ADD:
57797 case ISD::MUL:
57798 case ISD::AND:
57799 case ISD::OR:
57800 case ISD::XOR:
57801 Commute = true;
57802 [[fallthrough]];
57803 case ISD::SUB: {
57804 SDValue N0 = Op.getOperand(0);
57805 SDValue N1 = Op.getOperand(1);
57806 // Avoid disabling potential load folding opportunities.
57807 if (X86::mayFoldLoad(N1, Subtarget) &&
57808 (!Commute || !isa<ConstantSDNode>(N0) ||
57809 (Op.getOpcode() != ISD::MUL && IsFoldableRMW(N1, Op))))
57810 return false;
57811 if (X86::mayFoldLoad(N0, Subtarget) &&
57812 ((Commute && !isa<ConstantSDNode>(N1)) ||
57813 (Op.getOpcode() != ISD::MUL && IsFoldableRMW(N0, Op))))
57814 return false;
57815 if (IsFoldableAtomicRMW(N0, Op) ||
57816 (Commute && IsFoldableAtomicRMW(N1, Op)))
57817 return false;
57818 }
57819 }
57820
57821 PVT = MVT::i32;
57822 return true;
57823}
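// Example of the decision above: a plain 16-bit add of two register values
// sets PVT = MVT::i32 and returns true, so it gets promoted and avoids the
// longer 16-bit encoding; something like (add (load p), x) where the load
// could be folded into the instruction (or an RMW pattern storing back to p)
// returns false and stays i16.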
57824
57825//===----------------------------------------------------------------------===//
57826// X86 Inline Assembly Support
57827//===----------------------------------------------------------------------===//
57828
57829// Helper to match a string separated by whitespace.
57830static bool matchAsm(StringRef S, ArrayRef<const char *> Pieces) {
57831 S = S.substr(S.find_first_not_of(" \t")); // Skip leading whitespace.
57832
57833 for (StringRef Piece : Pieces) {
57834 if (!S.startswith(Piece)) // Check if the piece matches.
57835 return false;
57836
57837 S = S.substr(Piece.size());
57838 StringRef::size_type Pos = S.find_first_not_of(" \t");
57839 if (Pos == 0) // We matched a prefix.
57840 return false;
57841
57842 S = S.substr(Pos);
57843 }
57844
57845 return S.empty();
57846}
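// matchAsm above skips leading whitespace and requires each piece to be
// followed by whitespace or end of string, e.g.
//   matchAsm("  bswap   $0", {"bswap", "$0"}) -> true
//   matchAsm("bswapl $0",    {"bswap", "$0"}) -> false (only a prefix matched)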
57847
57848static bool clobbersFlagRegisters(const SmallVector<StringRef, 4> &AsmPieces) {
57849
57850 if (AsmPieces.size() == 3 || AsmPieces.size() == 4) {
57851 if (llvm::is_contained(AsmPieces, "~{cc}") &&
57852 llvm::is_contained(AsmPieces, "~{flags}") &&
57853 llvm::is_contained(AsmPieces, "~{fpsr}")) {
57854
57855 if (AsmPieces.size() == 3)
57856 return true;
57857 else if (llvm::is_contained(AsmPieces, "~{dirflag}"))
57858 return true;
57859 }
57860 }
57861 return false;
57862}
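// clobbersFlagRegisters above accepts exactly the flag-only clobber sets, e.g.
//   {"~{cc}", "~{flags}", "~{fpsr}"}                 -> true
//   {"~{cc}", "~{dirflag}", "~{flags}", "~{fpsr}"}   -> true
// and any other combination -> false.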
57863
57864bool X86TargetLowering::ExpandInlineAsm(CallInst *CI) const {
57865 InlineAsm *IA = cast<InlineAsm>(CI->getCalledOperand());
57866
57867 const std::string &AsmStr = IA->getAsmString();
57868
57869 IntegerType *Ty = dyn_cast<IntegerType>(CI->getType());
57870 if (!Ty || Ty->getBitWidth() % 16 != 0)
57871 return false;
57872
57873 // TODO: should remove alternatives from the asmstring: "foo {a|b}" -> "foo a"
57874 SmallVector<StringRef, 4> AsmPieces;
57875 SplitString(AsmStr, AsmPieces, ";\n");
57876
57877 switch (AsmPieces.size()) {
57878 default: return false;
57879 case 1:
57880 // FIXME: this should verify that we are targeting a 486 or better. If not,
57881 // we will turn this bswap into something that will be lowered to logical
57882 // ops instead of emitting the bswap asm. For now, we don't support 486 or
57883 // lower so don't worry about this.
57884 // bswap $0
57885 if (matchAsm(AsmPieces[0], {"bswap", "$0"}) ||
57886 matchAsm(AsmPieces[0], {"bswapl", "$0"}) ||
57887 matchAsm(AsmPieces[0], {"bswapq", "$0"}) ||
57888 matchAsm(AsmPieces[0], {"bswap", "${0:q}"}) ||
57889 matchAsm(AsmPieces[0], {"bswapl", "${0:q}"}) ||
57890 matchAsm(AsmPieces[0], {"bswapq", "${0:q}"})) {
57891 // No need to check constraints, nothing other than the equivalent of
57892 // "=r,0" would be valid here.
57893 return IntrinsicLowering::LowerToByteSwap(CI);
57894 }
57895
57896 // rorw $$8, ${0:w} --> llvm.bswap.i16
57897 if (CI->getType()->isIntegerTy(16) &&
57898 IA->getConstraintString().compare(0, 5, "=r,0,") == 0 &&
57899 (matchAsm(AsmPieces[0], {"rorw", "$$8,", "${0:w}"}) ||
57900 matchAsm(AsmPieces[0], {"rolw", "$$8,", "${0:w}"}))) {
57901 AsmPieces.clear();
57902 StringRef ConstraintsStr = IA->getConstraintString();
57903 SplitString(StringRef(ConstraintsStr).substr(5), AsmPieces, ",");
57904 array_pod_sort(AsmPieces.begin(), AsmPieces.end());
57905 if (clobbersFlagRegisters(AsmPieces))
57906 return IntrinsicLowering::LowerToByteSwap(CI);
57907 }
57908 break;
57909 case 3:
57910 if (CI->getType()->isIntegerTy(32) &&
57911 IA->getConstraintString().compare(0, 5, "=r,0,") == 0 &&
57912 matchAsm(AsmPieces[0], {"rorw", "$$8,", "${0:w}"}) &&
57913 matchAsm(AsmPieces[1], {"rorl", "$$16,", "$0"}) &&
57914 matchAsm(AsmPieces[2], {"rorw", "$$8,", "${0:w}"})) {
57915 AsmPieces.clear();
57916 StringRef ConstraintsStr = IA->getConstraintString();
57917 SplitString(StringRef(ConstraintsStr).substr(5), AsmPieces, ",");
57918 array_pod_sort(AsmPieces.begin(), AsmPieces.end());
57919 if (clobbersFlagRegisters(AsmPieces))
57920 return IntrinsicLowering::LowerToByteSwap(CI);
57921 }
57922
57923 if (CI->getType()->isIntegerTy(64)) {
57924 InlineAsm::ConstraintInfoVector Constraints = IA->ParseConstraints();
57925 if (Constraints.size() >= 2 &&
57926 Constraints[0].Codes.size() == 1 && Constraints[0].Codes[0] == "A" &&
57927 Constraints[1].Codes.size() == 1 && Constraints[1].Codes[0] == "0") {
57928 // bswap %eax / bswap %edx / xchgl %eax, %edx -> llvm.bswap.i64
57929 if (matchAsm(AsmPieces[0], {"bswap", "%eax"}) &&
57930 matchAsm(AsmPieces[1], {"bswap", "%edx"}) &&
57931 matchAsm(AsmPieces[2], {"xchgl", "%eax,", "%edx"}))
57932 return IntrinsicLowering::LowerToByteSwap(CI);
57933 }
57934 }
57935 break;
57936 }
57937 return false;
57938}
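// Illustrative IR inputs accepted above (both become a bswap intrinsic):
//   call i32 asm "bswap $0", "=r,0"(i32 %x)
//   call i16 asm "rorw $$8, ${0:w}",
//        "=r,0,~{cc},~{dirflag},~{fpsr},~{flags}"(i16 %x)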
57939
57940static X86::CondCode parseConstraintCode(llvm::StringRef Constraint) {
57941 X86::CondCode Cond = StringSwitch<X86::CondCode>(Constraint)
57942 .Case("{@cca}", X86::COND_A)
57943 .Case("{@ccae}", X86::COND_AE)
57944 .Case("{@ccb}", X86::COND_B)
57945 .Case("{@ccbe}", X86::COND_BE)
57946 .Case("{@ccc}", X86::COND_B)
57947 .Case("{@cce}", X86::COND_E)
57948 .Case("{@ccz}", X86::COND_E)
57949 .Case("{@ccg}", X86::COND_G)
57950 .Case("{@ccge}", X86::COND_GE)
57951 .Case("{@ccl}", X86::COND_L)
57952 .Case("{@ccle}", X86::COND_LE)
57953 .Case("{@ccna}", X86::COND_BE)
57954 .Case("{@ccnae}", X86::COND_B)
57955 .Case("{@ccnb}", X86::COND_AE)
57956 .Case("{@ccnbe}", X86::COND_A)
57957 .Case("{@ccnc}", X86::COND_AE)
57958 .Case("{@ccne}", X86::COND_NE)
57959 .Case("{@ccnz}", X86::COND_NE)
57960 .Case("{@ccng}", X86::COND_LE)
57961 .Case("{@ccnge}", X86::COND_L)
57962 .Case("{@ccnl}", X86::COND_GE)
57963 .Case("{@ccnle}", X86::COND_G)
57964 .Case("{@ccno}", X86::COND_NO)
57965 .Case("{@ccnp}", X86::COND_NP)
57966 .Case("{@ccns}", X86::COND_NS)
57967 .Case("{@cco}", X86::COND_O)
57968 .Case("{@ccp}", X86::COND_P)
57969 .Case("{@ccs}", X86::COND_S)
57970 .Default(X86::COND_INVALID);
57971 return Cond;
57972}
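// parseConstraintCode above maps GCC-style flag-output constraints, e.g.
//   parseConstraintCode("{@ccz}")  -> X86::COND_E
//   parseConstraintCode("{@ccnc}") -> X86::COND_AE
//   parseConstraintCode("r")       -> X86::COND_INVALID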
57973
57974/// Given a constraint letter, return the type of constraint for this target.
57975X86TargetLowering::ConstraintType
57976X86TargetLowering::getConstraintType(StringRef Constraint) const {
57977 if (Constraint.size() == 1) {
57978 switch (Constraint[0]) {
57979 case 'R':
57980 case 'q':
57981 case 'Q':
57982 case 'f':
57983 case 't':
57984 case 'u':
57985 case 'y':
57986 case 'x':
57987 case 'v':
57988 case 'l':
57989 case 'k': // AVX512 masking registers.
57990 return C_RegisterClass;
57991 case 'a':
57992 case 'b':
57993 case 'c':
57994 case 'd':
57995 case 'S':
57996 case 'D':
57997 case 'A':
57998 return C_Register;
57999 case 'I':
58000 case 'J':
58001 case 'K':
58002 case 'N':
58003 case 'G':
58004 case 'L':
58005 case 'M':
58006 return C_Immediate;
58007 case 'C':
58008 case 'e':
58009 case 'Z':
58010 return C_Other;
58011 default:
58012 break;
58013 }
58014 }
58015 else if (Constraint.size() == 2) {
58016 switch (Constraint[0]) {
58017 default:
58018 break;
58019 case 'Y':
58020 switch (Constraint[1]) {
58021 default:
58022 break;
58023 case 'z':
58024 return C_Register;
58025 case 'i':
58026 case 'm':
58027 case 'k':
58028 case 't':
58029 case '2':
58030 return C_RegisterClass;
58031 }
58032 }
58033 } else if (parseConstraintCode(Constraint) != X86::COND_INVALID)
58034 return C_Other;
58035 return TargetLowering::getConstraintType(Constraint);
58036}
58037
58038/// Examine constraint type and operand type and determine a weight value.
58039/// This object must already have been set up with the operand type
58040/// and the current alternative constraint selected.
58041TargetLowering::ConstraintWeight
58042 X86TargetLowering::getSingleConstraintMatchWeight(
58043 AsmOperandInfo &info, const char *constraint) const {
58044 ConstraintWeight weight = CW_Invalid;
58045 Value *CallOperandVal = info.CallOperandVal;
58046 // If we don't have a value, we can't do a match,
58047 // but allow it at the lowest weight.
58048 if (!CallOperandVal)
58049 return CW_Default;
58050 Type *type = CallOperandVal->getType();
58051 // Look at the constraint type.
58052 switch (*constraint) {
58053 default:
58054 weight = TargetLowering::getSingleConstraintMatchWeight(info, constraint);
58055 [[fallthrough]];
58056 case 'R':
58057 case 'q':
58058 case 'Q':
58059 case 'a':
58060 case 'b':
58061 case 'c':
58062 case 'd':
58063 case 'S':
58064 case 'D':
58065 case 'A':
58066 if (CallOperandVal->getType()->isIntegerTy())
58067 weight = CW_SpecificReg;
58068 break;
58069 case 'f':
58070 case 't':
58071 case 'u':
58072 if (type->isFloatingPointTy())
58073 weight = CW_SpecificReg;
58074 break;
58075 case 'y':
58076 if (type->isX86_MMXTy() && Subtarget.hasMMX())
58077 weight = CW_SpecificReg;
58078 break;
58079 case 'Y':
58080 if (StringRef(constraint).size() != 2)
58081 break;
58082 switch (constraint[1]) {
58083 default:
58084 return CW_Invalid;
58085 // XMM0
58086 case 'z':
58087 if (((type->getPrimitiveSizeInBits() == 128) && Subtarget.hasSSE1()) ||
58088 ((type->getPrimitiveSizeInBits() == 256) && Subtarget.hasAVX()) ||
58089 ((type->getPrimitiveSizeInBits() == 512) && Subtarget.hasAVX512()))
58090 return CW_SpecificReg;
58091 return CW_Invalid;
58092 // Conditional OpMask regs (AVX512)
58093 case 'k':
58094 if ((type->getPrimitiveSizeInBits() == 64) && Subtarget.hasAVX512())
58095 return CW_Register;
58096 return CW_Invalid;
58097 // Any MMX reg
58098 case 'm':
58099 if (type->isX86_MMXTy() && Subtarget.hasMMX())
58100 return weight;
58101 return CW_Invalid;
58102 // Any SSE reg when ISA >= SSE2, same as 'x'
58103 case 'i':
58104 case 't':
58105 case '2':
58106 if (!Subtarget.hasSSE2())
58107 return CW_Invalid;
58108 break;
58109 }
58110 break;
58111 case 'v':
58112 if ((type->getPrimitiveSizeInBits() == 512) && Subtarget.hasAVX512())
58113 weight = CW_Register;
58114 [[fallthrough]];
58115 case 'x':
58116 if (((type->getPrimitiveSizeInBits() == 128) && Subtarget.hasSSE1()) ||
58117 ((type->getPrimitiveSizeInBits() == 256) && Subtarget.hasAVX()))
58118 weight = CW_Register;
58119 break;
58120 case 'k':
58121 // Enable conditional vector operations using %k<#> registers.
58122 if ((type->getPrimitiveSizeInBits() == 64) && Subtarget.hasAVX512())
58123 weight = CW_Register;
58124 break;
58125 case 'I':
58126 if (auto *C = dyn_cast<ConstantInt>(info.CallOperandVal)) {
58127 if (C->getZExtValue() <= 31)
58128 weight = CW_Constant;
58129 }
58130 break;
58131 case 'J':
58132 if (auto *C = dyn_cast<ConstantInt>(CallOperandVal)) {
58133 if (C->getZExtValue() <= 63)
58134 weight = CW_Constant;
58135 }
58136 break;
58137 case 'K':
58138 if (auto *C = dyn_cast<ConstantInt>(CallOperandVal)) {
58139 if ((C->getSExtValue() >= -0x80) && (C->getSExtValue() <= 0x7f))
58140 weight = CW_Constant;
58141 }
58142 break;
58143 case 'L':
58144 if (auto *C = dyn_cast<ConstantInt>(CallOperandVal)) {
58145 if ((C->getZExtValue() == 0xff) || (C->getZExtValue() == 0xffff))
58146 weight = CW_Constant;
58147 }
58148 break;
58149 case 'M':
58150 if (auto *C = dyn_cast<ConstantInt>(CallOperandVal)) {
58151 if (C->getZExtValue() <= 3)
58152 weight = CW_Constant;
58153 }
58154 break;
58155 case 'N':
58156 if (auto *C = dyn_cast<ConstantInt>(CallOperandVal)) {
58157 if (C->getZExtValue() <= 0xff)
58158 weight = CW_Constant;
58159 }
58160 break;
58161 case 'G':
58162 case 'C':
58163 if (isa<ConstantFP>(CallOperandVal)) {
58164 weight = CW_Constant;
58165 }
58166 break;
58167 case 'e':
58168 if (auto *C = dyn_cast<ConstantInt>(CallOperandVal)) {
58169 if ((C->getSExtValue() >= -0x80000000LL) &&
58170 (C->getSExtValue() <= 0x7fffffffLL))
58171 weight = CW_Constant;
58172 }
58173 break;
58174 case 'Z':
58175 if (auto *C = dyn_cast<ConstantInt>(CallOperandVal)) {
58176 if (C->getZExtValue() <= 0xffffffff)
58177 weight = CW_Constant;
58178 }
58179 break;
58180 }
58181 return weight;
58182}
58183
58184/// Try to replace an X constraint, which matches anything, with another that
58185/// has more specific requirements based on the type of the corresponding
58186/// operand.
58187const char *X86TargetLowering::
58188LowerXConstraint(EVT ConstraintVT) const {
58189 // FP X constraints get lowered to SSE1/2 registers if available, otherwise
58190 // 'f' like normal targets.
58191 if (ConstraintVT.isFloatingPoint()) {
58192 if (Subtarget.hasSSE1())
58193 return "x";
58194 }
58195
58196 return TargetLowering::LowerXConstraint(ConstraintVT);
58197}
58198
58199// Lower @cc targets via setcc.
58200SDValue X86TargetLowering::LowerAsmOutputForConstraint(
58201 SDValue &Chain, SDValue &Glue, const SDLoc &DL,
58202 const AsmOperandInfo &OpInfo, SelectionDAG &DAG) const {
58203 X86::CondCode Cond = parseConstraintCode(OpInfo.ConstraintCode);
58204 if (Cond == X86::COND_INVALID)
58205 return SDValue();
58206 // Check that return type is valid.
58207 if (OpInfo.ConstraintVT.isVector() || !OpInfo.ConstraintVT.isInteger() ||
58208 OpInfo.ConstraintVT.getSizeInBits() < 8)
58209 report_fatal_error("Glue output operand is of invalid type");
58210
58211 // Get EFLAGS register. Only update chain when copyfrom is glued.
58212 if (Glue.getNode()) {
58213 Glue = DAG.getCopyFromReg(Chain, DL, X86::EFLAGS, MVT::i32, Glue);
58214 Chain = Glue.getValue(1);
58215 } else
58216 Glue = DAG.getCopyFromReg(Chain, DL, X86::EFLAGS, MVT::i32);
58217 // Extract CC code.
58218 SDValue CC = getSETCC(Cond, Glue, DL, DAG);
58219 // Extend to 32-bits
58220 SDValue Result = DAG.getNode(ISD::ZERO_EXTEND, DL, OpInfo.ConstraintVT, CC);
58221
58222 return Result;
58223}
58224
58225/// Lower the specified operand into the Ops vector.
58226/// If it is invalid, don't add anything to Ops.
58227void X86TargetLowering::LowerAsmOperandForConstraint(SDValue Op,
58228 std::string &Constraint,
58229 std::vector<SDValue>&Ops,
58230 SelectionDAG &DAG) const {
58231 SDValue Result;
58232
58233 // Only support length 1 constraints for now.
58234 if (Constraint.length() > 1) return;
58235
58236 char ConstraintLetter = Constraint[0];
58237 switch (ConstraintLetter) {
58238 default: break;
58239 case 'I':
58240 if (auto *C = dyn_cast<ConstantSDNode>(Op)) {
58241 if (C->getZExtValue() <= 31) {
58242 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
58243 Op.getValueType());
58244 break;
58245 }
58246 }
58247 return;
58248 case 'J':
58249 if (auto *C = dyn_cast<ConstantSDNode>(Op)) {
58250 if (C->getZExtValue() <= 63) {
58251 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
58252 Op.getValueType());
58253 break;
58254 }
58255 }
58256 return;
58257 case 'K':
58258 if (auto *C = dyn_cast<ConstantSDNode>(Op)) {
58259 if (isInt<8>(C->getSExtValue())) {
58260 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
58261 Op.getValueType());
58262 break;
58263 }
58264 }
58265 return;
58266 case 'L':
58267 if (auto *C = dyn_cast<ConstantSDNode>(Op)) {
58268 if (C->getZExtValue() == 0xff || C->getZExtValue() == 0xffff ||
58269 (Subtarget.is64Bit() && C->getZExtValue() == 0xffffffff)) {
58270 Result = DAG.getTargetConstant(C->getSExtValue(), SDLoc(Op),
58271 Op.getValueType());
58272 break;
58273 }
58274 }
58275 return;
58276 case 'M':
58277 if (auto *C = dyn_cast<ConstantSDNode>(Op)) {
58278 if (C->getZExtValue() <= 3) {
58279 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
58280 Op.getValueType());
58281 break;
58282 }
58283 }
58284 return;
58285 case 'N':
58286 if (auto *C = dyn_cast<ConstantSDNode>(Op)) {
58287 if (C->getZExtValue() <= 255) {
58288 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
58289 Op.getValueType());
58290 break;
58291 }
58292 }
58293 return;
58294 case 'O':
58295 if (auto *C = dyn_cast<ConstantSDNode>(Op)) {
58296 if (C->getZExtValue() <= 127) {
58297 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
58298 Op.getValueType());
58299 break;
58300 }
58301 }
58302 return;
58303 case 'e': {
58304 // 32-bit signed value
58305 if (auto *C = dyn_cast<ConstantSDNode>(Op)) {
58306 if (ConstantInt::isValueValidForType(Type::getInt32Ty(*DAG.getContext()),
58307 C->getSExtValue())) {
58308 // Widen to 64 bits here to get it sign extended.
58309 Result = DAG.getTargetConstant(C->getSExtValue(), SDLoc(Op), MVT::i64);
58310 break;
58311 }
58312 // FIXME gcc accepts some relocatable values here too, but only in certain
58313 // memory models; it's complicated.
58314 }
58315 return;
58316 }
58317 case 'Z': {
58318 // 32-bit unsigned value
58319 if (auto *C = dyn_cast<ConstantSDNode>(Op)) {
58320 if (ConstantInt::isValueValidForType(Type::getInt32Ty(*DAG.getContext()),
58321 C->getZExtValue())) {
58322 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
58323 Op.getValueType());
58324 break;
58325 }
58326 }
58327 // FIXME gcc accepts some relocatable values here too, but only in certain
58328 // memory models; it's complicated.
58329 return;
58330 }
58331 case 'i': {
58332 // Literal immediates are always ok.
58333 if (auto *CST = dyn_cast<ConstantSDNode>(Op)) {
58334 bool IsBool = CST->getConstantIntValue()->getBitWidth() == 1;
58335 BooleanContent BCont = getBooleanContents(MVT::i64);
58336 ISD::NodeType ExtOpc = IsBool ? getExtendForContent(BCont)
58337 : ISD::SIGN_EXTEND;
58338 int64_t ExtVal = ExtOpc == ISD::ZERO_EXTEND ? CST->getZExtValue()
58339 : CST->getSExtValue();
58340 Result = DAG.getTargetConstant(ExtVal, SDLoc(Op), MVT::i64);
58341 break;
58342 }
58343
58344 // In any sort of PIC mode addresses need to be computed at runtime by
58345 // adding in a register or some sort of table lookup. These can't
58346 // be used as immediates. BlockAddresses and BasicBlocks are fine though.
58347 if ((Subtarget.isPICStyleGOT() || Subtarget.isPICStyleStubPIC()) &&
58348 !(isa<BlockAddressSDNode>(Op) || isa<BasicBlockSDNode>(Op)))
58349 return;
58350
58351 // If we are in non-pic codegen mode, we allow the address of a global (with
58352 // an optional displacement) to be used with 'i'.
58353 if (auto *GA = dyn_cast<GlobalAddressSDNode>(Op))
58354 // If we require an extra load to get this address, as in PIC mode, we
58355 // can't accept it.
58356 if (isGlobalStubReference(
58357 Subtarget.classifyGlobalReference(GA->getGlobal())))
58358 return;
58359 break;
58360 }
58361 }
58362
58363 if (Result.getNode()) {
58364 Ops.push_back(Result);
58365 return;
58366 }
58367 return TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
58368}
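// Rough summary of the immediate constraints handled above: 'I' 0..31,
// 'J' 0..63, 'K' any signed 8-bit value, 'L' 0xff/0xffff (plus 0xffffffff in
// 64-bit mode), 'M' 0..3, 'N' 0..255, 'O' 0..127, 'e'/'Z' 32-bit
// signed/unsigned values, and 'i' literal immediates or (non-PIC) global
// addresses; everything else defers to the TargetLowering default.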
58369
58370/// Check if \p RC is a general purpose register class.
58371/// I.e., GR* or one of their variant.
58372static bool isGRClass(const TargetRegisterClass &RC) {
58373 return RC.hasSuperClassEq(&X86::GR8RegClass) ||
58374 RC.hasSuperClassEq(&X86::GR16RegClass) ||
58375 RC.hasSuperClassEq(&X86::GR32RegClass) ||
58376 RC.hasSuperClassEq(&X86::GR64RegClass) ||
58377 RC.hasSuperClassEq(&X86::LOW32_ADDR_ACCESS_RBPRegClass);
58378}
58379
58380/// Check if \p RC is a vector register class.
58381/// I.e., FR* / VR* or one of their variant.
58382static bool isFRClass(const TargetRegisterClass &RC) {
58383 return RC.hasSuperClassEq(&X86::FR16XRegClass) ||
58384 RC.hasSuperClassEq(&X86::FR32XRegClass) ||
58385 RC.hasSuperClassEq(&X86::FR64XRegClass) ||
58386 RC.hasSuperClassEq(&X86::VR128XRegClass) ||
58387 RC.hasSuperClassEq(&X86::VR256XRegClass) ||
58388 RC.hasSuperClassEq(&X86::VR512RegClass);
58389}
58390
58391/// Check if \p RC is a mask register class.
58392/// I.e., VK* or one of their variant.
58393static bool isVKClass(const TargetRegisterClass &RC) {
58394 return RC.hasSuperClassEq(&X86::VK1RegClass) ||
58395 RC.hasSuperClassEq(&X86::VK2RegClass) ||
58396 RC.hasSuperClassEq(&X86::VK4RegClass) ||
58397 RC.hasSuperClassEq(&X86::VK8RegClass) ||
58398 RC.hasSuperClassEq(&X86::VK16RegClass) ||
58399 RC.hasSuperClassEq(&X86::VK32RegClass) ||
58400 RC.hasSuperClassEq(&X86::VK64RegClass);
58401}
58402
58403std::pair<unsigned, const TargetRegisterClass *>
58404X86TargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
58405 StringRef Constraint,
58406 MVT VT) const {
58407 // First, see if this is a constraint that directly corresponds to an LLVM
58408 // register class.
58409 if (Constraint.size() == 1) {
58410 // GCC Constraint Letters
58411 switch (Constraint[0]) {
58412 default: break;
58413 // 'A' means [ER]AX + [ER]DX.
58414 case 'A':
58415 if (Subtarget.is64Bit())
58416 return std::make_pair(X86::RAX, &X86::GR64_ADRegClass);
58417      assert((Subtarget.is32Bit() || Subtarget.is16Bit()) &&
58418             "Expecting 64, 32 or 16 bit subtarget");
58419 return std::make_pair(X86::EAX, &X86::GR32_ADRegClass);
58420
58421 // TODO: Slight differences here in allocation order and leaving
58422 // RIP in the class. Do they matter any more here than they do
58423 // in the normal allocation?
58424 case 'k':
58425 if (Subtarget.hasAVX512()) {
58426 if (VT == MVT::i1)
58427 return std::make_pair(0U, &X86::VK1RegClass);
58428 if (VT == MVT::i8)
58429 return std::make_pair(0U, &X86::VK8RegClass);
58430 if (VT == MVT::i16)
58431 return std::make_pair(0U, &X86::VK16RegClass);
58432 }
58433 if (Subtarget.hasBWI()) {
58434 if (VT == MVT::i32)
58435 return std::make_pair(0U, &X86::VK32RegClass);
58436 if (VT == MVT::i64)
58437 return std::make_pair(0U, &X86::VK64RegClass);
58438 }
58439 break;
58440 case 'q': // GENERAL_REGS in 64-bit mode, Q_REGS in 32-bit mode.
58441 if (Subtarget.is64Bit()) {
58442 if (VT == MVT::i8 || VT == MVT::i1)
58443 return std::make_pair(0U, &X86::GR8RegClass);
58444 if (VT == MVT::i16)
58445 return std::make_pair(0U, &X86::GR16RegClass);
58446 if (VT == MVT::i32 || VT == MVT::f32)
58447 return std::make_pair(0U, &X86::GR32RegClass);
58448 if (VT != MVT::f80 && !VT.isVector())
58449 return std::make_pair(0U, &X86::GR64RegClass);
58450 break;
58451 }
58452 [[fallthrough]];
58453 // 32-bit fallthrough
58454 case 'Q': // Q_REGS
58455 if (VT == MVT::i8 || VT == MVT::i1)
58456 return std::make_pair(0U, &X86::GR8_ABCD_LRegClass);
58457 if (VT == MVT::i16)
58458 return std::make_pair(0U, &X86::GR16_ABCDRegClass);
58459 if (VT == MVT::i32 || VT == MVT::f32 ||
58460 (!VT.isVector() && !Subtarget.is64Bit()))
58461 return std::make_pair(0U, &X86::GR32_ABCDRegClass);
58462 if (VT != MVT::f80 && !VT.isVector())
58463 return std::make_pair(0U, &X86::GR64_ABCDRegClass);
58464 break;
58465 case 'r': // GENERAL_REGS
58466 case 'l': // INDEX_REGS
58467 if (VT == MVT::i8 || VT == MVT::i1)
58468 return std::make_pair(0U, &X86::GR8RegClass);
58469 if (VT == MVT::i16)
58470 return std::make_pair(0U, &X86::GR16RegClass);
58471 if (VT == MVT::i32 || VT == MVT::f32 ||
58472 (!VT.isVector() && !Subtarget.is64Bit()))
58473 return std::make_pair(0U, &X86::GR32RegClass);
58474 if (VT != MVT::f80 && !VT.isVector())
58475 return std::make_pair(0U, &X86::GR64RegClass);
58476 break;
58477 case 'R': // LEGACY_REGS
58478 if (VT == MVT::i8 || VT == MVT::i1)
58479 return std::make_pair(0U, &X86::GR8_NOREXRegClass);
58480 if (VT == MVT::i16)
58481 return std::make_pair(0U, &X86::GR16_NOREXRegClass);
58482 if (VT == MVT::i32 || VT == MVT::f32 ||
58483 (!VT.isVector() && !Subtarget.is64Bit()))
58484 return std::make_pair(0U, &X86::GR32_NOREXRegClass);
58485 if (VT != MVT::f80 && !VT.isVector())
58486 return std::make_pair(0U, &X86::GR64_NOREXRegClass);
58487 break;
58488 case 'f': // FP Stack registers.
58489 // If SSE is enabled for this VT, use f80 to ensure the isel moves the
58490 // value to the correct fpstack register class.
58491 if (VT == MVT::f32 && !isScalarFPTypeInSSEReg(VT))
58492 return std::make_pair(0U, &X86::RFP32RegClass);
58493 if (VT == MVT::f64 && !isScalarFPTypeInSSEReg(VT))
58494 return std::make_pair(0U, &X86::RFP64RegClass);
58495 if (VT == MVT::f32 || VT == MVT::f64 || VT == MVT::f80)
58496 return std::make_pair(0U, &X86::RFP80RegClass);
58497 break;
58498 case 'y': // MMX_REGS if MMX allowed.
58499 if (!Subtarget.hasMMX()) break;
58500 return std::make_pair(0U, &X86::VR64RegClass);
58501 case 'v':
58502 case 'x': // SSE_REGS if SSE1 allowed or AVX_REGS if AVX allowed
58503 if (!Subtarget.hasSSE1()) break;
58504 bool VConstraint = (Constraint[0] == 'v');
58505
58506 switch (VT.SimpleTy) {
58507 default: break;
58508 // Scalar SSE types.
58509 case MVT::f16:
58510 if (VConstraint && Subtarget.hasFP16())
58511 return std::make_pair(0U, &X86::FR16XRegClass);
58512 break;
58513 case MVT::f32:
58514 case MVT::i32:
58515 if (VConstraint && Subtarget.hasVLX())
58516 return std::make_pair(0U, &X86::FR32XRegClass);
58517 return std::make_pair(0U, &X86::FR32RegClass);
58518 case MVT::f64:
58519 case MVT::i64:
58520 if (VConstraint && Subtarget.hasVLX())
58521 return std::make_pair(0U, &X86::FR64XRegClass);
58522 return std::make_pair(0U, &X86::FR64RegClass);
58523 case MVT::i128:
58524 if (Subtarget.is64Bit()) {
58525 if (VConstraint && Subtarget.hasVLX())
58526 return std::make_pair(0U, &X86::VR128XRegClass);
58527 return std::make_pair(0U, &X86::VR128RegClass);
58528 }
58529 break;
58530 // Vector types and fp128.
58531 case MVT::v8f16:
58532 if (!Subtarget.hasFP16())
58533 break;
58534 [[fallthrough]];
58535 case MVT::f128:
58536 case MVT::v16i8:
58537 case MVT::v8i16:
58538 case MVT::v4i32:
58539 case MVT::v2i64:
58540 case MVT::v4f32:
58541 case MVT::v2f64:
58542 if (VConstraint && Subtarget.hasVLX())
58543 return std::make_pair(0U, &X86::VR128XRegClass);
58544 return std::make_pair(0U, &X86::VR128RegClass);
58545 // AVX types.
58546 case MVT::v16f16:
58547 if (!Subtarget.hasFP16())
58548 break;
58549 [[fallthrough]];
58550 case MVT::v32i8:
58551 case MVT::v16i16:
58552 case MVT::v8i32:
58553 case MVT::v4i64:
58554 case MVT::v8f32:
58555 case MVT::v4f64:
58556 if (VConstraint && Subtarget.hasVLX())
58557 return std::make_pair(0U, &X86::VR256XRegClass);
58558 if (Subtarget.hasAVX())
58559 return std::make_pair(0U, &X86::VR256RegClass);
58560 break;
58561 case MVT::v32f16:
58562 if (!Subtarget.hasFP16())
58563 break;
58564 [[fallthrough]];
58565 case MVT::v64i8:
58566 case MVT::v32i16:
58567 case MVT::v8f64:
58568 case MVT::v16f32:
58569 case MVT::v16i32:
58570 case MVT::v8i64:
58571 if (!Subtarget.hasAVX512()) break;
58572 if (VConstraint)
58573 return std::make_pair(0U, &X86::VR512RegClass);
58574 return std::make_pair(0U, &X86::VR512_0_15RegClass);
58575 }
58576 break;
58577 }
58578 } else if (Constraint.size() == 2 && Constraint[0] == 'Y') {
58579 switch (Constraint[1]) {
58580 default:
58581 break;
58582 case 'i':
58583 case 't':
58584 case '2':
58585 return getRegForInlineAsmConstraint(TRI, "x", VT);
58586 case 'm':
58587 if (!Subtarget.hasMMX()) break;
58588 return std::make_pair(0U, &X86::VR64RegClass);
58589 case 'z':
58590 if (!Subtarget.hasSSE1()) break;
58591 switch (VT.SimpleTy) {
58592 default: break;
58593 // Scalar SSE types.
58594 case MVT::f16:
58595 if (!Subtarget.hasFP16())
58596 break;
58597 return std::make_pair(X86::XMM0, &X86::FR16XRegClass);
58598 case MVT::f32:
58599 case MVT::i32:
58600 return std::make_pair(X86::XMM0, &X86::FR32RegClass);
58601 case MVT::f64:
58602 case MVT::i64:
58603 return std::make_pair(X86::XMM0, &X86::FR64RegClass);
58604 case MVT::v8f16:
58605 if (!Subtarget.hasFP16())
58606 break;
58607 [[fallthrough]];
58608 case MVT::f128:
58609 case MVT::v16i8:
58610 case MVT::v8i16:
58611 case MVT::v4i32:
58612 case MVT::v2i64:
58613 case MVT::v4f32:
58614 case MVT::v2f64:
58615 return std::make_pair(X86::XMM0, &X86::VR128RegClass);
58616 // AVX types.
58617 case MVT::v16f16:
58618 if (!Subtarget.hasFP16())
58619 break;
58620 [[fallthrough]];
58621 case MVT::v32i8:
58622 case MVT::v16i16:
58623 case MVT::v8i32:
58624 case MVT::v4i64:
58625 case MVT::v8f32:
58626 case MVT::v4f64:
58627 if (Subtarget.hasAVX())
58628 return std::make_pair(X86::YMM0, &X86::VR256RegClass);
58629 break;
58630 case MVT::v32f16:
58631 if (!Subtarget.hasFP16())
58632 break;
58633 [[fallthrough]];
58634 case MVT::v64i8:
58635 case MVT::v32i16:
58636 case MVT::v8f64:
58637 case MVT::v16f32:
58638 case MVT::v16i32:
58639 case MVT::v8i64:
58640 if (Subtarget.hasAVX512())
58641 return std::make_pair(X86::ZMM0, &X86::VR512_0_15RegClass);
58642 break;
58643 }
58644 break;
58645 case 'k':
58646      // This register class doesn't allocate k0 for masked vector operations.
58647 if (Subtarget.hasAVX512()) {
58648 if (VT == MVT::i1)
58649 return std::make_pair(0U, &X86::VK1WMRegClass);
58650 if (VT == MVT::i8)
58651 return std::make_pair(0U, &X86::VK8WMRegClass);
58652 if (VT == MVT::i16)
58653 return std::make_pair(0U, &X86::VK16WMRegClass);
58654 }
58655 if (Subtarget.hasBWI()) {
58656 if (VT == MVT::i32)
58657 return std::make_pair(0U, &X86::VK32WMRegClass);
58658 if (VT == MVT::i64)
58659 return std::make_pair(0U, &X86::VK64WMRegClass);
58660 }
58661 break;
58662 }
58663 }
58664
58665 if (parseConstraintCode(Constraint) != X86::COND_INVALID)
58666 return std::make_pair(0U, &X86::GR32RegClass);
58667
58668 // Use the default implementation in TargetLowering to convert the register
58669 // constraint into a member of a register class.
58670 std::pair<Register, const TargetRegisterClass*> Res;
58671 Res = TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
58672
58673 // Not found as a standard register?
58674 if (!Res.second) {
58675 // Only match x87 registers if the VT is one SelectionDAGBuilder can convert
58676 // to/from f80.
58677 if (VT == MVT::Other || VT == MVT::f32 || VT == MVT::f64 || VT == MVT::f80) {
58678 // Map st(0) -> st(7) -> ST0
58679 if (Constraint.size() == 7 && Constraint[0] == '{' &&
58680 tolower(Constraint[1]) == 's' && tolower(Constraint[2]) == 't' &&
58681 Constraint[3] == '(' &&
58682 (Constraint[4] >= '0' && Constraint[4] <= '7') &&
58683 Constraint[5] == ')' && Constraint[6] == '}') {
58684 // st(7) is not allocatable and thus not a member of RFP80. Return
58685 // singleton class in cases where we have a reference to it.
58686 if (Constraint[4] == '7')
58687 return std::make_pair(X86::FP7, &X86::RFP80_7RegClass);
58688 return std::make_pair(X86::FP0 + Constraint[4] - '0',
58689 &X86::RFP80RegClass);
58690 }
58691
58692 // GCC allows "st(0)" to be called just plain "st".
58693 if (StringRef("{st}").equals_insensitive(Constraint))
58694 return std::make_pair(X86::FP0, &X86::RFP80RegClass);
58695 }
58696
58697 // flags -> EFLAGS
58698 if (StringRef("{flags}").equals_insensitive(Constraint))
58699 return std::make_pair(X86::EFLAGS, &X86::CCRRegClass);
58700
58701 // dirflag -> DF
58702 // Only allow for clobber.
58703 if (StringRef("{dirflag}").equals_insensitive(Constraint) &&
58704 VT == MVT::Other)
58705 return std::make_pair(X86::DF, &X86::DFCCRRegClass);
58706
58707 // fpsr -> FPSW
58708 if (StringRef("{fpsr}").equals_insensitive(Constraint))
58709 return std::make_pair(X86::FPSW, &X86::FPCCRRegClass);
58710
58711 return Res;
58712 }
58713
58714 // Make sure it isn't a register that requires 64-bit mode.
58715 if (!Subtarget.is64Bit() &&
58716 (isFRClass(*Res.second) || isGRClass(*Res.second)) &&
58717 TRI->getEncodingValue(Res.first) >= 8) {
58718 // Register requires REX prefix, but we're in 32-bit mode.
58719 return std::make_pair(0, nullptr);
58720 }
58721
58722 // Make sure it isn't a register that requires AVX512.
58723 if (!Subtarget.hasAVX512() && isFRClass(*Res.second) &&
58724 TRI->getEncodingValue(Res.first) & 0x10) {
58725 // Register requires EVEX prefix.
58726 return std::make_pair(0, nullptr);
58727 }
58728
58729 // Otherwise, check to see if this is a register class of the wrong value
58730 // type. For example, we want to map "{ax},i32" -> {eax}, we don't want it to
58731 // turn into {ax},{dx}.
58732 // MVT::Other is used to specify clobber names.
58733 if (TRI->isTypeLegalForClass(*Res.second, VT) || VT == MVT::Other)
58734 return Res; // Correct type already, nothing to do.
58735
58736  // Get a matching integer of the correct size, e.g. "ax" with MVT::i32 should
58737  // return "eax". This should even work for things like getting 64-bit integer
58738  // registers when given an f64 type.
58739 const TargetRegisterClass *Class = Res.second;
58740 // The generic code will match the first register class that contains the
58741 // given register. Thus, based on the ordering of the tablegened file,
58742 // the "plain" GR classes might not come first.
58743 // Therefore, use a helper method.
58744 if (isGRClass(*Class)) {
58745 unsigned Size = VT.getSizeInBits();
58746 if (Size == 1) Size = 8;
58747 if (Size != 8 && Size != 16 && Size != 32 && Size != 64)
58748 return std::make_pair(0, nullptr);
58749 Register DestReg = getX86SubSuperRegister(Res.first, Size);
58750 if (DestReg.isValid()) {
58751 bool is64Bit = Subtarget.is64Bit();
58752 const TargetRegisterClass *RC =
58753 Size == 8 ? (is64Bit ? &X86::GR8RegClass : &X86::GR8_NOREXRegClass)
58754 : Size == 16 ? (is64Bit ? &X86::GR16RegClass : &X86::GR16_NOREXRegClass)
58755 : Size == 32 ? (is64Bit ? &X86::GR32RegClass : &X86::GR32_NOREXRegClass)
58756 : /*Size == 64*/ (is64Bit ? &X86::GR64RegClass : nullptr);
58757 if (Size == 64 && !is64Bit) {
58758 // Model GCC's behavior here and select a fixed pair of 32-bit
58759 // registers.
58760 switch (DestReg) {
58761 case X86::RAX:
58762 return std::make_pair(X86::EAX, &X86::GR32_ADRegClass);
58763 case X86::RDX:
58764 return std::make_pair(X86::EDX, &X86::GR32_DCRegClass);
58765 case X86::RCX:
58766 return std::make_pair(X86::ECX, &X86::GR32_CBRegClass);
58767 case X86::RBX:
58768 return std::make_pair(X86::EBX, &X86::GR32_BSIRegClass);
58769 case X86::RSI:
58770 return std::make_pair(X86::ESI, &X86::GR32_SIDIRegClass);
58771 case X86::RDI:
58772 return std::make_pair(X86::EDI, &X86::GR32_DIBPRegClass);
58773 case X86::RBP:
58774 return std::make_pair(X86::EBP, &X86::GR32_BPSPRegClass);
58775 default:
58776 return std::make_pair(0, nullptr);
58777 }
58778 }
58779 if (RC && RC->contains(DestReg))
58780 return std::make_pair(DestReg, RC);
58781 return Res;
58782 }
58783 // No register found/type mismatch.
58784 return std::make_pair(0, nullptr);
58785 } else if (isFRClass(*Class)) {
58786 // Handle references to XMM physical registers that got mapped into the
58787 // wrong class. This can happen with constraints like {xmm0} where the
58788 // target independent register mapper will just pick the first match it can
58789 // find, ignoring the required type.
58790
58791 // TODO: Handle f128 and i128 in FR128RegClass after it is tested well.
58792 if (VT == MVT::f16)
58793 Res.second = &X86::FR16XRegClass;
58794 else if (VT == MVT::f32 || VT == MVT::i32)
58795 Res.second = &X86::FR32XRegClass;
58796 else if (VT == MVT::f64 || VT == MVT::i64)
58797 Res.second = &X86::FR64XRegClass;
58798 else if (TRI->isTypeLegalForClass(X86::VR128XRegClass, VT))
58799 Res.second = &X86::VR128XRegClass;
58800 else if (TRI->isTypeLegalForClass(X86::VR256XRegClass, VT))
58801 Res.second = &X86::VR256XRegClass;
58802 else if (TRI->isTypeLegalForClass(X86::VR512RegClass, VT))
58803 Res.second = &X86::VR512RegClass;
58804 else {
58805      // Type mismatch and not a clobber: return an error.
58806 Res.first = 0;
58807 Res.second = nullptr;
58808 }
58809 } else if (isVKClass(*Class)) {
58810 if (VT == MVT::i1)
58811 Res.second = &X86::VK1RegClass;
58812 else if (VT == MVT::i8)
58813 Res.second = &X86::VK8RegClass;
58814 else if (VT == MVT::i16)
58815 Res.second = &X86::VK16RegClass;
58816 else if (VT == MVT::i32)
58817 Res.second = &X86::VK32RegClass;
58818 else if (VT == MVT::i64)
58819 Res.second = &X86::VK64RegClass;
58820 else {
58821      // Type mismatch and not a clobber: return an error.
58822 Res.first = 0;
58823 Res.second = nullptr;
58824 }
58825 }
58826
58827 return Res;
58828}
58829
58830bool X86TargetLowering::isIntDivCheap(EVT VT, AttributeList Attr) const {
58831 // Integer division on x86 is expensive. However, when aggressively optimizing
58832 // for code size, we prefer to use a div instruction, as it is usually smaller
58833 // than the alternative sequence.
58834 // The exception to this is vector division. Since x86 doesn't have vector
58835 // integer division, leaving the division as-is is a loss even in terms of
58836 // size, because it will have to be scalarized, while the alternative code
58837 // sequence can be performed in vector form.
58838 bool OptSize = Attr.hasFnAttr(Attribute::MinSize);
58839 return OptSize && !VT.isVector();
58840}
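// Example: with the minsize attribute a scalar i32 udiv/sdiv by a constant is
// kept as a real div instruction (smaller encoding); without minsize, or for
// any vector type, the usual multiply/shift expansion is preferred.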
58841
58842void X86TargetLowering::initializeSplitCSR(MachineBasicBlock *Entry) const {
58843 if (!Subtarget.is64Bit())
58844 return;
58845
58846 // Update IsSplitCSR in X86MachineFunctionInfo.
58847 X86MachineFunctionInfo *AFI =
58848 Entry->getParent()->getInfo<X86MachineFunctionInfo>();
58849 AFI->setIsSplitCSR(true);
58850}
58851
58852void X86TargetLowering::insertCopiesSplitCSR(
58853 MachineBasicBlock *Entry,
58854 const SmallVectorImpl<MachineBasicBlock *> &Exits) const {
58855 const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
58856 const MCPhysReg *IStart = TRI->getCalleeSavedRegsViaCopy(Entry->getParent());
58857 if (!IStart)
58858 return;
58859
58860 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
58861 MachineRegisterInfo *MRI = &Entry->getParent()->getRegInfo();
58862 MachineBasicBlock::iterator MBBI = Entry->begin();
58863 for (const MCPhysReg *I = IStart; *I; ++I) {
58864 const TargetRegisterClass *RC = nullptr;
58865 if (X86::GR64RegClass.contains(*I))
58866 RC = &X86::GR64RegClass;
58867 else
58868      llvm_unreachable("Unexpected register class in CSRsViaCopy!");
58869
58870 Register NewVR = MRI->createVirtualRegister(RC);
58871 // Create copy from CSR to a virtual register.
58872 // FIXME: this currently does not emit CFI pseudo-instructions, it works
58873 // fine for CXX_FAST_TLS since the C++-style TLS access functions should be
58874 // nounwind. If we want to generalize this later, we may need to emit
58875 // CFI pseudo-instructions.
58876    assert(
58877        Entry->getParent()->getFunction().hasFnAttribute(Attribute::NoUnwind) &&
58878        "Function should be nounwind in insertCopiesSplitCSR!");
58879 Entry->addLiveIn(*I);
58880 BuildMI(*Entry, MBBI, DebugLoc(), TII->get(TargetOpcode::COPY), NewVR)
58881 .addReg(*I);
58882
58883 // Insert the copy-back instructions right before the terminator.
58884 for (auto *Exit : Exits)
58885 BuildMI(*Exit, Exit->getFirstTerminator(), DebugLoc(),
58886 TII->get(TargetOpcode::COPY), *I)
58887 .addReg(NewVR);
58888 }
58889}
58890
58891bool X86TargetLowering::supportSwiftError() const {
58892 return Subtarget.is64Bit();
58893}
58894
58895/// Returns true if stack probing through a function call is requested.
58896bool X86TargetLowering::hasStackProbeSymbol(const MachineFunction &MF) const {
58897 return !getStackProbeSymbolName(MF).empty();
58898}
58899
58900/// Returns true if stack probing through inline assembly is requested.
58901bool X86TargetLowering::hasInlineStackProbe(const MachineFunction &MF) const {
58902
58903 // No inline stack probe for Windows, they have their own mechanism.
58904 if (Subtarget.isOSWindows() ||
58905 MF.getFunction().hasFnAttribute("no-stack-arg-probe"))
58906 return false;
58907
58908 // If the function specifically requests inline stack probes, emit them.
58909 if (MF.getFunction().hasFnAttribute("probe-stack"))
58910 return MF.getFunction().getFnAttribute("probe-stack").getValueAsString() ==
58911 "inline-asm";
58912
58913 return false;
58914}
58915
58916/// Returns the name of the symbol used to emit stack probes or the empty
58917/// string if not applicable.
58918StringRef
58919X86TargetLowering::getStackProbeSymbolName(const MachineFunction &MF) const {
58920  // Inline stack probes disable the stack probe call.
58921 if (hasInlineStackProbe(MF))
58922 return "";
58923
58924 // If the function specifically requests stack probes, emit them.
58925 if (MF.getFunction().hasFnAttribute("probe-stack"))
58926 return MF.getFunction().getFnAttribute("probe-stack").getValueAsString();
58927
58928 // Generally, if we aren't on Windows, the platform ABI does not include
58929 // support for stack probes, so don't emit them.
58930 if (!Subtarget.isOSWindows() || Subtarget.isTargetMachO() ||
58931 MF.getFunction().hasFnAttribute("no-stack-arg-probe"))
58932 return "";
58933
58934 // We need a stack probe to conform to the Windows ABI. Choose the right
58935 // symbol.
58936 if (Subtarget.is64Bit())
58937 return Subtarget.isTargetCygMing() ? "___chkstk_ms" : "__chkstk";
58938 return Subtarget.isTargetCygMing() ? "_alloca" : "_chkstk";
58939}
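// Symbols chosen above when no "probe-stack" attribute overrides them:
//   64-bit MinGW -> "___chkstk_ms"   64-bit MSVC-style -> "__chkstk"
//   32-bit MinGW -> "_alloca"        32-bit MSVC-style -> "_chkstk"
// Non-Windows (and MachO) targets get "" and therefore no probe call.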
58940
58941unsigned
58942X86TargetLowering::getStackProbeSize(const MachineFunction &MF) const {
58943  // The default stack probe size is 4096 if the function has no
58944  // "stack-probe-size" attribute.
58945 return MF.getFunction().getFnAttributeAsParsedInteger("stack-probe-size",
58946 4096);
58947}
58948
58949Align X86TargetLowering::getPrefLoopAlignment(MachineLoop *ML) const {
58950 if (ML->isInnermost() &&
58951 ExperimentalPrefInnermostLoopAlignment.getNumOccurrences())
58952 return Align(1ULL << ExperimentalPrefInnermostLoopAlignment);
58953 return TargetLowering::getPrefLoopAlignment();
58954}
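// E.g. if the experimental innermost-loop-alignment option is given the value
// 5 on the command line, innermost loops are aligned to 1 << 5 = 32 bytes;
// otherwise the generic TargetLowering preference applies.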