Bug Summary

File: build/source/llvm/lib/Target/X86/X86ISelLowering.cpp
Warning: line 17442, column 31
Division by zero
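
The expression flagged at line 17442, column 31 lies outside the excerpt reproduced below. For context, this diagnostic is produced by the core checkers enabled in the analyzer invocation that follows (core.DivideZero), which report an integer division or remainder whose divisor can be zero along some path the analyzer explores. The sketch below is a minimal, hypothetical reduction of that pattern together with the usual guard; the function names and the modulo operation are illustrative assumptions, not code taken from X86ISelLowering.cpp.

// Hypothetical reduction of a "Division by zero" finding; not the actual
// code at X86ISelLowering.cpp:17442.
#include <cassert>
#include <cstdio>

// If a caller can pass NumElts == 0, the modulo below divides by zero,
// which is the condition the analyzer reports.
static unsigned pickLane(unsigned Index, unsigned NumElts) {
  return Index % NumElts; // warning fires here when NumElts == 0
}

// Typical remediation: assert the invariant and bail out before dividing,
// so no path reaches the operation with a zero divisor.
static unsigned pickLaneGuarded(unsigned Index, unsigned NumElts) {
  assert(NumElts != 0 && "divisor must be non-zero");
  if (NumElts == 0)
    return 0; // defensive path for NDEBUG builds
  return Index % NumElts;
}

int main() {
  std::printf("%u %u\n", pickLane(5, 4), pickLaneGuarded(5, 4)); // prints "1 1"
  return 0;
}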

Annotated Source Code

clang -cc1 -cc1 -triple x86_64-pc-linux-gnu -analyze -disable-free -clear-ast-before-backend -disable-llvm-verifier -discard-value-names -main-file-name X86ISelLowering.cpp -analyzer-checker=core -analyzer-checker=apiModeling -analyzer-checker=unix -analyzer-checker=deadcode -analyzer-checker=cplusplus -analyzer-checker=security.insecureAPI.UncheckedReturn -analyzer-checker=security.insecureAPI.getpw -analyzer-checker=security.insecureAPI.gets -analyzer-checker=security.insecureAPI.mktemp -analyzer-checker=security.insecureAPI.mkstemp -analyzer-checker=security.insecureAPI.vfork -analyzer-checker=nullability.NullPassedToNonnull -analyzer-checker=nullability.NullReturnedFromNonnull -analyzer-output plist -w -setup-static-analyzer -analyzer-config-compatibility-mode=true -mrelocation-model pic -pic-level 2 -mframe-pointer=none -fmath-errno -ffp-contract=on -fno-rounding-math -mconstructor-aliases -funwind-tables=2 -target-cpu x86-64 -tune-cpu generic -debugger-tuning=gdb -ffunction-sections -fdata-sections -fcoverage-compilation-dir=/build/source/build-llvm -resource-dir /usr/lib/llvm-16/lib/clang/16 -I lib/Target/X86 -I /build/source/llvm/lib/Target/X86 -I include -I /build/source/llvm/include -D _DEBUG -D _GNU_SOURCE -D __STDC_CONSTANT_MACROS -D __STDC_FORMAT_MACROS -D __STDC_LIMIT_MACROS -D _FORTIFY_SOURCE=2 -D NDEBUG -U NDEBUG -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/10/../../../../include/c++/10 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/10/../../../../include/x86_64-linux-gnu/c++/10 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/10/../../../../include/c++/10/backward -internal-isystem /usr/lib/llvm-16/lib/clang/16/include -internal-isystem /usr/local/include -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/10/../../../../x86_64-linux-gnu/include -internal-externc-isystem /usr/include/x86_64-linux-gnu -internal-externc-isystem /include -internal-externc-isystem /usr/include -fmacro-prefix-map=/build/source/build-llvm=build-llvm -fmacro-prefix-map=/build/source/= -fcoverage-prefix-map=/build/source/build-llvm=build-llvm -fcoverage-prefix-map=/build/source/= -source-date-epoch 1674602410 -O3 -Wno-unused-command-line-argument -Wno-unused-parameter -Wwrite-strings -Wno-missing-field-initializers -Wno-long-long -Wno-maybe-uninitialized -Wno-class-memaccess -Wno-redundant-move -Wno-pessimizing-move -Wno-noexcept-type -Wno-comment -Wno-misleading-indentation -std=c++17 -fdeprecated-macro -fdebug-compilation-dir=/build/source/build-llvm -fdebug-prefix-map=/build/source/build-llvm=build-llvm -fdebug-prefix-map=/build/source/= -fdebug-prefix-map=/build/source/build-llvm=build-llvm -fdebug-prefix-map=/build/source/= -ferror-limit 19 -fvisibility=hidden -fvisibility-inlines-hidden -stack-protector 2 -fgnuc-version=4.2.1 -fcolor-diagnostics -vectorize-loops -vectorize-slp -analyzer-output=html -analyzer-config stable-report-filename=true -faddrsig -D__GCC_HAVE_DWARF2_CFI_ASM=1 -o /tmp/scan-build-2023-01-25-024556-16494-1 -x c++ /build/source/llvm/lib/Target/X86/X86ISelLowering.cpp
1//===-- X86ISelLowering.cpp - X86 DAG Lowering Implementation -------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file defines the interfaces that X86 uses to lower LLVM code into a
10// selection DAG.
11//
12//===----------------------------------------------------------------------===//
13
14#include "X86ISelLowering.h"
15#include "MCTargetDesc/X86ShuffleDecode.h"
16#include "X86.h"
17#include "X86CallingConv.h"
18#include "X86FrameLowering.h"
19#include "X86InstrBuilder.h"
20#include "X86IntrinsicsInfo.h"
21#include "X86MachineFunctionInfo.h"
22#include "X86TargetMachine.h"
23#include "X86TargetObjectFile.h"
24#include "llvm/ADT/SmallBitVector.h"
25#include "llvm/ADT/SmallSet.h"
26#include "llvm/ADT/Statistic.h"
27#include "llvm/ADT/StringExtras.h"
28#include "llvm/ADT/StringSwitch.h"
29#include "llvm/Analysis/BlockFrequencyInfo.h"
30#include "llvm/Analysis/EHPersonalities.h"
31#include "llvm/Analysis/ObjCARCUtil.h"
32#include "llvm/Analysis/ProfileSummaryInfo.h"
33#include "llvm/Analysis/VectorUtils.h"
34#include "llvm/CodeGen/IntrinsicLowering.h"
35#include "llvm/CodeGen/MachineFrameInfo.h"
36#include "llvm/CodeGen/MachineFunction.h"
37#include "llvm/CodeGen/MachineInstrBuilder.h"
38#include "llvm/CodeGen/MachineJumpTableInfo.h"
39#include "llvm/CodeGen/MachineLoopInfo.h"
40#include "llvm/CodeGen/MachineModuleInfo.h"
41#include "llvm/CodeGen/MachineRegisterInfo.h"
42#include "llvm/CodeGen/TargetLowering.h"
43#include "llvm/CodeGen/WinEHFuncInfo.h"
44#include "llvm/IR/CallingConv.h"
45#include "llvm/IR/Constants.h"
46#include "llvm/IR/DerivedTypes.h"
47#include "llvm/IR/DiagnosticInfo.h"
48#include "llvm/IR/Function.h"
49#include "llvm/IR/GlobalAlias.h"
50#include "llvm/IR/GlobalVariable.h"
51#include "llvm/IR/IRBuilder.h"
52#include "llvm/IR/Instructions.h"
53#include "llvm/IR/Intrinsics.h"
54#include "llvm/IR/PatternMatch.h"
55#include "llvm/MC/MCAsmInfo.h"
56#include "llvm/MC/MCContext.h"
57#include "llvm/MC/MCExpr.h"
58#include "llvm/MC/MCSymbol.h"
59#include "llvm/Support/CommandLine.h"
60#include "llvm/Support/Debug.h"
61#include "llvm/Support/ErrorHandling.h"
62#include "llvm/Support/KnownBits.h"
63#include "llvm/Support/MathExtras.h"
64#include "llvm/Target/TargetOptions.h"
65#include <algorithm>
66#include <bitset>
67#include <cctype>
68#include <numeric>
69using namespace llvm;
70
71#define DEBUG_TYPE "x86-isel"
72
73STATISTIC(NumTailCalls, "Number of tail calls");
74
75static cl::opt<int> ExperimentalPrefInnermostLoopAlignment(
76 "x86-experimental-pref-innermost-loop-alignment", cl::init(4),
77 cl::desc(
78 "Sets the preferable loop alignment for experiments (as log2 bytes) "
79 "for innermost loops only. If specified, this option overrides "
80 "alignment set by x86-experimental-pref-loop-alignment."),
81 cl::Hidden);
82
83static cl::opt<bool> MulConstantOptimization(
84 "mul-constant-optimization", cl::init(true),
85 cl::desc("Replace 'mul x, Const' with more effective instructions like "
86 "SHIFT, LEA, etc."),
87 cl::Hidden);
88
89static cl::opt<bool> ExperimentalUnorderedISEL(
90 "x86-experimental-unordered-atomic-isel", cl::init(false),
91 cl::desc("Use LoadSDNode and StoreSDNode instead of "
92 "AtomicSDNode for unordered atomic loads and "
93 "stores respectively."),
94 cl::Hidden);
95
96/// Call this when the user attempts to do something unsupported, like
97/// returning a double without SSE2 enabled on x86_64. This is not fatal, unlike
98/// report_fatal_error, so calling code should attempt to recover without
99/// crashing.
100static void errorUnsupported(SelectionDAG &DAG, const SDLoc &dl,
101 const char *Msg) {
102 MachineFunction &MF = DAG.getMachineFunction();
103 DAG.getContext()->diagnose(
104 DiagnosticInfoUnsupported(MF.getFunction(), Msg, dl.getDebugLoc()));
105}
106
107/// Returns true if a CC can dynamically exclude a register from the list of
108/// callee-saved-registers (TargetRegisterInfo::getCalleeSavedRegs()) based on
109/// params/returns.
110static bool shouldDisableCalleeSavedRegisterCC(CallingConv::ID CC) {
111 switch (CC) {
112 default:
113 return false;
114 case CallingConv::X86_RegCall:
115 case CallingConv::PreserveMost:
116 case CallingConv::PreserveAll:
117 return true;
118 }
119}
120
121X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
122 const X86Subtarget &STI)
123 : TargetLowering(TM), Subtarget(STI) {
124 bool UseX87 = !Subtarget.useSoftFloat() && Subtarget.hasX87();
125 MVT PtrVT = MVT::getIntegerVT(TM.getPointerSizeInBits(0));
126
127 // Set up the TargetLowering object.
128
129 // X86 is weird. It always uses i8 for shift amounts and setcc results.
130 setBooleanContents(ZeroOrOneBooleanContent);
131 // X86-SSE is even stranger. It uses -1 or 0 for vector masks.
132 setBooleanVectorContents(ZeroOrNegativeOneBooleanContent);
133
134 // For 64-bit, since we have so many registers, use the ILP scheduler.
135 // For 32-bit, use the register pressure specific scheduling.
136 // For Atom, always use ILP scheduling.
137 if (Subtarget.isAtom())
138 setSchedulingPreference(Sched::ILP);
139 else if (Subtarget.is64Bit())
140 setSchedulingPreference(Sched::ILP);
141 else
142 setSchedulingPreference(Sched::RegPressure);
143 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
144 setStackPointerRegisterToSaveRestore(RegInfo->getStackRegister());
145
146 // Bypass expensive divides and use cheaper ones.
147 if (TM.getOptLevel() >= CodeGenOpt::Default) {
148 if (Subtarget.hasSlowDivide32())
149 addBypassSlowDiv(32, 8);
150 if (Subtarget.hasSlowDivide64() && Subtarget.is64Bit())
151 addBypassSlowDiv(64, 32);
152 }
153
154 // Setup Windows compiler runtime calls.
155 if (Subtarget.isTargetWindowsMSVC() || Subtarget.isTargetWindowsItanium()) {
156 static const struct {
157 const RTLIB::Libcall Op;
158 const char * const Name;
159 const CallingConv::ID CC;
160 } LibraryCalls[] = {
161 { RTLIB::SDIV_I64, "_alldiv", CallingConv::X86_StdCall },
162 { RTLIB::UDIV_I64, "_aulldiv", CallingConv::X86_StdCall },
163 { RTLIB::SREM_I64, "_allrem", CallingConv::X86_StdCall },
164 { RTLIB::UREM_I64, "_aullrem", CallingConv::X86_StdCall },
165 { RTLIB::MUL_I64, "_allmul", CallingConv::X86_StdCall },
166 };
167
168 for (const auto &LC : LibraryCalls) {
169 setLibcallName(LC.Op, LC.Name);
170 setLibcallCallingConv(LC.Op, LC.CC);
171 }
172 }
173
174 if (Subtarget.getTargetTriple().isOSMSVCRT()) {
175 // MSVCRT doesn't have powi; fall back to pow
176 setLibcallName(RTLIB::POWI_F32, nullptr);
177 setLibcallName(RTLIB::POWI_F64, nullptr);
178 }
179
180 // If we don't have cmpxchg8b (meaning this is a 386/486), limit atomic size to
181 // 32 bits so the AtomicExpandPass will expand it so we don't need cmpxchg8b.
182 // FIXME: Should we be limiting the atomic size on other configs? Default is
183 // 1024.
184 if (!Subtarget.canUseCMPXCHG8B())
185 setMaxAtomicSizeInBitsSupported(32);
186
187 setMaxDivRemBitWidthSupported(Subtarget.is64Bit() ? 128 : 64);
188
189 setMaxLargeFPConvertBitWidthSupported(128);
190
191 // Set up the register classes.
192 addRegisterClass(MVT::i8, &X86::GR8RegClass);
193 addRegisterClass(MVT::i16, &X86::GR16RegClass);
194 addRegisterClass(MVT::i32, &X86::GR32RegClass);
195 if (Subtarget.is64Bit())
196 addRegisterClass(MVT::i64, &X86::GR64RegClass);
197
198 for (MVT VT : MVT::integer_valuetypes())
199 setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote);
200
201 // We don't accept any truncstore of integer registers.
202 setTruncStoreAction(MVT::i64, MVT::i32, Expand);
203 setTruncStoreAction(MVT::i64, MVT::i16, Expand);
204 setTruncStoreAction(MVT::i64, MVT::i8 , Expand);
205 setTruncStoreAction(MVT::i32, MVT::i16, Expand);
206 setTruncStoreAction(MVT::i32, MVT::i8 , Expand);
207 setTruncStoreAction(MVT::i16, MVT::i8, Expand);
208
209 setTruncStoreAction(MVT::f64, MVT::f32, Expand);
210
211 // SETOEQ and SETUNE require checking two conditions.
212 for (auto VT : {MVT::f32, MVT::f64, MVT::f80}) {
213 setCondCodeAction(ISD::SETOEQ, VT, Expand);
214 setCondCodeAction(ISD::SETUNE, VT, Expand);
215 }
216
217 // Integer absolute.
218 if (Subtarget.canUseCMOV()) {
219 setOperationAction(ISD::ABS , MVT::i16 , Custom);
220 setOperationAction(ISD::ABS , MVT::i32 , Custom);
221 if (Subtarget.is64Bit())
222 setOperationAction(ISD::ABS , MVT::i64 , Custom);
223 }
224
225 // Signed saturation subtraction.
226 setOperationAction(ISD::SSUBSAT , MVT::i8 , Custom);
227 setOperationAction(ISD::SSUBSAT , MVT::i16 , Custom);
228 setOperationAction(ISD::SSUBSAT , MVT::i32 , Custom);
229 if (Subtarget.is64Bit())
230 setOperationAction(ISD::SSUBSAT , MVT::i64 , Custom);
231
232 // Funnel shifts.
233 for (auto ShiftOp : {ISD::FSHL, ISD::FSHR}) {
234 // For slow shld targets we only lower for code size.
235 LegalizeAction ShiftDoubleAction = Subtarget.isSHLDSlow() ? Custom : Legal;
236
237 setOperationAction(ShiftOp , MVT::i8 , Custom);
238 setOperationAction(ShiftOp , MVT::i16 , Custom);
239 setOperationAction(ShiftOp , MVT::i32 , ShiftDoubleAction);
240 if (Subtarget.is64Bit())
241 setOperationAction(ShiftOp , MVT::i64 , ShiftDoubleAction);
242 }
243
244 if (!Subtarget.useSoftFloat()) {
245 // Promote all UINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have this
246 // operation.
247 setOperationAction(ISD::UINT_TO_FP, MVT::i8, Promote);
248 setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::i8, Promote);
249 setOperationAction(ISD::UINT_TO_FP, MVT::i16, Promote);
250 setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::i16, Promote);
251 // We have an algorithm for SSE2, and we turn this into a 64-bit
252 // FILD or VCVTUSI2SS/SD for other targets.
253 setOperationAction(ISD::UINT_TO_FP, MVT::i32, Custom);
254 setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::i32, Custom);
255 // We have an algorithm for SSE2->double, and we turn this into a
256 // 64-bit FILD followed by conditional FADD for other targets.
257 setOperationAction(ISD::UINT_TO_FP, MVT::i64, Custom);
258 setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::i64, Custom);
259
260 // Promote i8 SINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have
261 // this operation.
262 setOperationAction(ISD::SINT_TO_FP, MVT::i8, Promote);
263 setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::i8, Promote);
264 // SSE has no i16 to fp conversion, only i32. We promote in the handler
265 // to allow f80 to use i16 and f64 to use i16 with sse1 only
266 setOperationAction(ISD::SINT_TO_FP, MVT::i16, Custom);
267 setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::i16, Custom);
268 // f32 and f64 cases are Legal with SSE1/SSE2, f80 case is not
269 setOperationAction(ISD::SINT_TO_FP, MVT::i32, Custom);
270 setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::i32, Custom);
271 // In 32-bit mode these are custom lowered. In 64-bit mode F32 and F64
272 // are Legal, f80 is custom lowered.
273 setOperationAction(ISD::SINT_TO_FP, MVT::i64, Custom);
274 setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::i64, Custom);
275
276 // Promote i8 FP_TO_SINT to larger FP_TO_SINTS's, as X86 doesn't have
277 // this operation.
278 setOperationAction(ISD::FP_TO_SINT, MVT::i8, Promote);
279 // FIXME: This doesn't generate invalid exception when it should. PR44019.
280 setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::i8, Promote);
281 setOperationAction(ISD::FP_TO_SINT, MVT::i16, Custom);
282 setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::i16, Custom);
283 setOperationAction(ISD::FP_TO_SINT, MVT::i32, Custom);
284 setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::i32, Custom);
285 // In 32-bit mode these are custom lowered. In 64-bit mode F32 and F64
286 // are Legal, f80 is custom lowered.
287 setOperationAction(ISD::FP_TO_SINT, MVT::i64, Custom);
288 setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::i64, Custom);
289
290 // Handle FP_TO_UINT by promoting the destination to a larger signed
291 // conversion.
292 setOperationAction(ISD::FP_TO_UINT, MVT::i8, Promote);
293 // FIXME: This doesn't generate invalid exception when it should. PR44019.
294 setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::i8, Promote);
295 setOperationAction(ISD::FP_TO_UINT, MVT::i16, Promote);
296 // FIXME: This doesn't generate invalid exception when it should. PR44019.
297 setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::i16, Promote);
298 setOperationAction(ISD::FP_TO_UINT, MVT::i32, Custom);
299 setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::i32, Custom);
300 setOperationAction(ISD::FP_TO_UINT, MVT::i64, Custom);
301 setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::i64, Custom);
302
303 setOperationAction(ISD::LRINT, MVT::f32, Custom);
304 setOperationAction(ISD::LRINT, MVT::f64, Custom);
305 setOperationAction(ISD::LLRINT, MVT::f32, Custom);
306 setOperationAction(ISD::LLRINT, MVT::f64, Custom);
307
308 if (!Subtarget.is64Bit()) {
309 setOperationAction(ISD::LRINT, MVT::i64, Custom);
310 setOperationAction(ISD::LLRINT, MVT::i64, Custom);
311 }
312 }
313
314 if (Subtarget.hasSSE2()) {
315 // Custom lowering for saturating float to int conversions.
316 // We handle promotion to larger result types manually.
317 for (MVT VT : { MVT::i8, MVT::i16, MVT::i32 }) {
318 setOperationAction(ISD::FP_TO_UINT_SAT, VT, Custom);
319 setOperationAction(ISD::FP_TO_SINT_SAT, VT, Custom);
320 }
321 if (Subtarget.is64Bit()) {
322 setOperationAction(ISD::FP_TO_UINT_SAT, MVT::i64, Custom);
323 setOperationAction(ISD::FP_TO_SINT_SAT, MVT::i64, Custom);
324 }
325 }
326
327 // Handle address space casts between mixed sized pointers.
328 setOperationAction(ISD::ADDRSPACECAST, MVT::i32, Custom);
329 setOperationAction(ISD::ADDRSPACECAST, MVT::i64, Custom);
330
331 // TODO: when we have SSE, these could be more efficient, by using movd/movq.
332 if (!Subtarget.hasSSE2()) {
333 setOperationAction(ISD::BITCAST , MVT::f32 , Expand);
334 setOperationAction(ISD::BITCAST , MVT::i32 , Expand);
335 if (Subtarget.is64Bit()) {
336 setOperationAction(ISD::BITCAST , MVT::f64 , Expand);
337 // Without SSE, i64->f64 goes through memory.
338 setOperationAction(ISD::BITCAST , MVT::i64 , Expand);
339 }
340 } else if (!Subtarget.is64Bit())
341 setOperationAction(ISD::BITCAST , MVT::i64 , Custom);
342
343 // Scalar integer divide and remainder are lowered to use operations that
344 // produce two results, to match the available instructions. This exposes
345 // the two-result form to trivial CSE, which is able to combine x/y and x%y
346 // into a single instruction.
347 //
348 // Scalar integer multiply-high is also lowered to use two-result
349 // operations, to match the available instructions. However, plain multiply
350 // (low) operations are left as Legal, as there are single-result
351 // instructions for this in x86. Using the two-result multiply instructions
352 // when both high and low results are needed must be arranged by dagcombine.
353 for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
354 setOperationAction(ISD::MULHS, VT, Expand);
355 setOperationAction(ISD::MULHU, VT, Expand);
356 setOperationAction(ISD::SDIV, VT, Expand);
357 setOperationAction(ISD::UDIV, VT, Expand);
358 setOperationAction(ISD::SREM, VT, Expand);
359 setOperationAction(ISD::UREM, VT, Expand);
360 }
361
362 setOperationAction(ISD::BR_JT , MVT::Other, Expand);
363 setOperationAction(ISD::BRCOND , MVT::Other, Custom);
364 for (auto VT : { MVT::f32, MVT::f64, MVT::f80, MVT::f128,
365 MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
366 setOperationAction(ISD::BR_CC, VT, Expand);
367 setOperationAction(ISD::SELECT_CC, VT, Expand);
368 }
369 if (Subtarget.is64Bit())
370 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i32, Legal);
371 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16 , Legal);
372 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8 , Legal);
373 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1 , Expand);
374
375 setOperationAction(ISD::FREM , MVT::f32 , Expand);
376 setOperationAction(ISD::FREM , MVT::f64 , Expand);
377 setOperationAction(ISD::FREM , MVT::f80 , Expand);
378 setOperationAction(ISD::FREM , MVT::f128 , Expand);
379
380 if (!Subtarget.useSoftFloat() && Subtarget.hasX87()) {
381 setOperationAction(ISD::GET_ROUNDING , MVT::i32 , Custom);
382 setOperationAction(ISD::SET_ROUNDING , MVT::Other, Custom);
383 }
384
385 // Promote the i8 variants and force them on up to i32 which has a shorter
386 // encoding.
387 setOperationPromotedToType(ISD::CTTZ , MVT::i8 , MVT::i32);
388 setOperationPromotedToType(ISD::CTTZ_ZERO_UNDEF, MVT::i8 , MVT::i32);
389 // Promoted i16. tzcntw has a false dependency on Intel CPUs. For BSF, we emit
390 // a REP prefix to encode it as TZCNT for modern CPUs so it makes sense to
391 // promote that too.
392 setOperationPromotedToType(ISD::CTTZ , MVT::i16 , MVT::i32);
393 setOperationPromotedToType(ISD::CTTZ_ZERO_UNDEF, MVT::i16 , MVT::i32);
394
395 if (!Subtarget.hasBMI()) {
396 setOperationAction(ISD::CTTZ , MVT::i32 , Custom);
397 setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i32 , Legal);
398 if (Subtarget.is64Bit()) {
399 setOperationAction(ISD::CTTZ , MVT::i64 , Custom);
400 setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i64, Legal);
401 }
402 }
403
404 if (Subtarget.hasLZCNT()) {
405 // When promoting the i8 variants, force them to i32 for a shorter
406 // encoding.
407 setOperationPromotedToType(ISD::CTLZ , MVT::i8 , MVT::i32);
408 setOperationPromotedToType(ISD::CTLZ_ZERO_UNDEF, MVT::i8 , MVT::i32);
409 } else {
410 for (auto VT : {MVT::i8, MVT::i16, MVT::i32, MVT::i64}) {
411 if (VT == MVT::i64 && !Subtarget.is64Bit())
412 continue;
413 setOperationAction(ISD::CTLZ , VT, Custom);
414 setOperationAction(ISD::CTLZ_ZERO_UNDEF, VT, Custom);
415 }
416 }
417
418 for (auto Op : {ISD::FP16_TO_FP, ISD::STRICT_FP16_TO_FP, ISD::FP_TO_FP16,
419 ISD::STRICT_FP_TO_FP16}) {
420 // Special handling for half-precision floating point conversions.
421 // If we don't have F16C support, then lower half float conversions
422 // into library calls.
423 setOperationAction(
424 Op, MVT::f32,
425 (!Subtarget.useSoftFloat() && Subtarget.hasF16C()) ? Custom : Expand);
426 // There's never any support for operations beyond MVT::f32.
427 setOperationAction(Op, MVT::f64, Expand);
428 setOperationAction(Op, MVT::f80, Expand);
429 setOperationAction(Op, MVT::f128, Expand);
430 }
431
432 for (MVT VT : {MVT::f32, MVT::f64, MVT::f80, MVT::f128}) {
433 setLoadExtAction(ISD::EXTLOAD, VT, MVT::f16, Expand);
434 setLoadExtAction(ISD::EXTLOAD, VT, MVT::bf16, Expand);
435 setTruncStoreAction(VT, MVT::f16, Expand);
436 setTruncStoreAction(VT, MVT::bf16, Expand);
437
438 setOperationAction(ISD::BF16_TO_FP, VT, Expand);
439 setOperationAction(ISD::FP_TO_BF16, VT, Custom);
440 }
441
442 setOperationAction(ISD::PARITY, MVT::i8, Custom);
443 setOperationAction(ISD::PARITY, MVT::i16, Custom);
444 setOperationAction(ISD::PARITY, MVT::i32, Custom);
445 if (Subtarget.is64Bit())
446 setOperationAction(ISD::PARITY, MVT::i64, Custom);
447 if (Subtarget.hasPOPCNT()) {
448 setOperationPromotedToType(ISD::CTPOP, MVT::i8, MVT::i32);
449 // popcntw is longer to encode than popcntl and also has a false dependency
450 // on the dest that popcntl hasn't had since Cannon Lake.
451 setOperationPromotedToType(ISD::CTPOP, MVT::i16, MVT::i32);
452 } else {
453 setOperationAction(ISD::CTPOP , MVT::i8 , Expand);
454 setOperationAction(ISD::CTPOP , MVT::i16 , Expand);
455 setOperationAction(ISD::CTPOP , MVT::i32 , Expand);
456 if (Subtarget.is64Bit())
457 setOperationAction(ISD::CTPOP , MVT::i64 , Expand);
458 else
459 setOperationAction(ISD::CTPOP , MVT::i64 , Custom);
460 }
461
462 setOperationAction(ISD::READCYCLECOUNTER , MVT::i64 , Custom);
463
464 if (!Subtarget.hasMOVBE())
465 setOperationAction(ISD::BSWAP , MVT::i16 , Expand);
466
467 // X86 wants to expand cmov itself.
468 for (auto VT : { MVT::f32, MVT::f64, MVT::f80, MVT::f128 }) {
469 setOperationAction(ISD::SELECT, VT, Custom);
470 setOperationAction(ISD::SETCC, VT, Custom);
471 setOperationAction(ISD::STRICT_FSETCC, VT, Custom);
472 setOperationAction(ISD::STRICT_FSETCCS, VT, Custom);
473 }
474 for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
475 if (VT == MVT::i64 && !Subtarget.is64Bit())
476 continue;
477 setOperationAction(ISD::SELECT, VT, Custom);
478 setOperationAction(ISD::SETCC, VT, Custom);
479 }
480
481 // Custom action for SELECT MMX and expand action for SELECT_CC MMX
482 setOperationAction(ISD::SELECT, MVT::x86mmx, Custom);
483 setOperationAction(ISD::SELECT_CC, MVT::x86mmx, Expand);
484
485 setOperationAction(ISD::EH_RETURN , MVT::Other, Custom);
486 // NOTE: EH_SJLJ_SETJMP/_LONGJMP are not recommended, since
487 // LLVM/Clang supports zero-cost DWARF and SEH exception handling.
488 setOperationAction(ISD::EH_SJLJ_SETJMP, MVT::i32, Custom);
489 setOperationAction(ISD::EH_SJLJ_LONGJMP, MVT::Other, Custom);
490 setOperationAction(ISD::EH_SJLJ_SETUP_DISPATCH, MVT::Other, Custom);
491 if (TM.Options.ExceptionModel == ExceptionHandling::SjLj)
492 setLibcallName(RTLIB::UNWIND_RESUME, "_Unwind_SjLj_Resume");
493
494 // Darwin ABI issue.
495 for (auto VT : { MVT::i32, MVT::i64 }) {
496 if (VT == MVT::i64 && !Subtarget.is64Bit())
497 continue;
498 setOperationAction(ISD::ConstantPool , VT, Custom);
499 setOperationAction(ISD::JumpTable , VT, Custom);
500 setOperationAction(ISD::GlobalAddress , VT, Custom);
501 setOperationAction(ISD::GlobalTLSAddress, VT, Custom);
502 setOperationAction(ISD::ExternalSymbol , VT, Custom);
503 setOperationAction(ISD::BlockAddress , VT, Custom);
504 }
505
506 // 64-bit shl, sra, srl (iff 32-bit x86)
507 for (auto VT : { MVT::i32, MVT::i64 }) {
508 if (VT == MVT::i64 && !Subtarget.is64Bit())
509 continue;
510 setOperationAction(ISD::SHL_PARTS, VT, Custom);
511 setOperationAction(ISD::SRA_PARTS, VT, Custom);
512 setOperationAction(ISD::SRL_PARTS, VT, Custom);
513 }
514
515 if (Subtarget.hasSSEPrefetch() || Subtarget.hasThreeDNow())
516 setOperationAction(ISD::PREFETCH , MVT::Other, Legal);
517
518 setOperationAction(ISD::ATOMIC_FENCE , MVT::Other, Custom);
519
520 // Expand certain atomics
521 for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
522 setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, VT, Custom);
523 setOperationAction(ISD::ATOMIC_LOAD_SUB, VT, Custom);
524 setOperationAction(ISD::ATOMIC_LOAD_ADD, VT, Custom);
525 setOperationAction(ISD::ATOMIC_LOAD_OR, VT, Custom);
526 setOperationAction(ISD::ATOMIC_LOAD_XOR, VT, Custom);
527 setOperationAction(ISD::ATOMIC_LOAD_AND, VT, Custom);
528 setOperationAction(ISD::ATOMIC_STORE, VT, Custom);
529 }
530
531 if (!Subtarget.is64Bit())
532 setOperationAction(ISD::ATOMIC_LOAD, MVT::i64, Custom);
533
534 if (Subtarget.canUseCMPXCHG16B())
535 setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, MVT::i128, Custom);
536
537 // FIXME - use subtarget debug flags
538 if (!Subtarget.isTargetDarwin() && !Subtarget.isTargetELF() &&
539 !Subtarget.isTargetCygMing() && !Subtarget.isTargetWin64() &&
540 TM.Options.ExceptionModel != ExceptionHandling::SjLj) {
541 setOperationAction(ISD::EH_LABEL, MVT::Other, Expand);
542 }
543
544 setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i32, Custom);
545 setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i64, Custom);
546
547 setOperationAction(ISD::INIT_TRAMPOLINE, MVT::Other, Custom);
548 setOperationAction(ISD::ADJUST_TRAMPOLINE, MVT::Other, Custom);
549
550 setOperationAction(ISD::TRAP, MVT::Other, Legal);
551 setOperationAction(ISD::DEBUGTRAP, MVT::Other, Legal);
552 if (Subtarget.isTargetPS())
553 setOperationAction(ISD::UBSANTRAP, MVT::Other, Expand);
554 else
555 setOperationAction(ISD::UBSANTRAP, MVT::Other, Legal);
556
557 // VASTART needs to be custom lowered to use the VarArgsFrameIndex
558 setOperationAction(ISD::VASTART , MVT::Other, Custom);
559 setOperationAction(ISD::VAEND , MVT::Other, Expand);
560 bool Is64Bit = Subtarget.is64Bit();
561 setOperationAction(ISD::VAARG, MVT::Other, Is64Bit ? Custom : Expand);
562 setOperationAction(ISD::VACOPY, MVT::Other, Is64Bit ? Custom : Expand);
563
564 setOperationAction(ISD::STACKSAVE, MVT::Other, Expand);
565 setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand);
566
567 setOperationAction(ISD::DYNAMIC_STACKALLOC, PtrVT, Custom);
568
569 // GC_TRANSITION_START and GC_TRANSITION_END need custom lowering.
570 setOperationAction(ISD::GC_TRANSITION_START, MVT::Other, Custom);
571 setOperationAction(ISD::GC_TRANSITION_END, MVT::Other, Custom);
572
573 setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f64, Legal);
574
575 auto setF16Action = [&] (MVT VT, LegalizeAction Action) {
576 setOperationAction(ISD::FABS, VT, Action);
577 setOperationAction(ISD::FNEG, VT, Action);
578 setOperationAction(ISD::FCOPYSIGN, VT, Expand);
579 setOperationAction(ISD::FREM, VT, Action);
580 setOperationAction(ISD::FMA, VT, Action);
581 setOperationAction(ISD::FMINNUM, VT, Action);
582 setOperationAction(ISD::FMAXNUM, VT, Action);
583 setOperationAction(ISD::FMINIMUM, VT, Action);
584 setOperationAction(ISD::FMAXIMUM, VT, Action);
585 setOperationAction(ISD::FSIN, VT, Action);
586 setOperationAction(ISD::FCOS, VT, Action);
587 setOperationAction(ISD::FSINCOS, VT, Action);
588 setOperationAction(ISD::FSQRT, VT, Action);
589 setOperationAction(ISD::FPOW, VT, Action);
590 setOperationAction(ISD::FLOG, VT, Action);
591 setOperationAction(ISD::FLOG2, VT, Action);
592 setOperationAction(ISD::FLOG10, VT, Action);
593 setOperationAction(ISD::FEXP, VT, Action);
594 setOperationAction(ISD::FEXP2, VT, Action);
595 setOperationAction(ISD::FCEIL, VT, Action);
596 setOperationAction(ISD::FFLOOR, VT, Action);
597 setOperationAction(ISD::FNEARBYINT, VT, Action);
598 setOperationAction(ISD::FRINT, VT, Action);
599 setOperationAction(ISD::BR_CC, VT, Action);
600 setOperationAction(ISD::SETCC, VT, Action);
601 setOperationAction(ISD::SELECT, VT, Custom);
602 setOperationAction(ISD::SELECT_CC, VT, Action);
603 setOperationAction(ISD::FROUND, VT, Action);
604 setOperationAction(ISD::FROUNDEVEN, VT, Action);
605 setOperationAction(ISD::FTRUNC, VT, Action);
606 };
607
608 if (!Subtarget.useSoftFloat() && Subtarget.hasSSE2()) {
609 // f16, f32 and f64 use SSE.
610 // Set up the FP register classes.
611 addRegisterClass(MVT::f16, Subtarget.hasAVX512() ? &X86::FR16XRegClass
612 : &X86::FR16RegClass);
613 addRegisterClass(MVT::f32, Subtarget.hasAVX512() ? &X86::FR32XRegClass
614 : &X86::FR32RegClass);
615 addRegisterClass(MVT::f64, Subtarget.hasAVX512() ? &X86::FR64XRegClass
616 : &X86::FR64RegClass);
617
618 // Disable f32->f64 extload as we can only generate this in one instruction
619 // under optsize. So it's easier to pattern match (fpext (load)) for that
620 // case instead of needing to emit 2 instructions for extload in the
621 // non-optsize case.
622 setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f32, Expand);
623
624 for (auto VT : { MVT::f32, MVT::f64 }) {
625 // Use ANDPD to simulate FABS.
626 setOperationAction(ISD::FABS, VT, Custom);
627
628 // Use XORP to simulate FNEG.
629 setOperationAction(ISD::FNEG, VT, Custom);
630
631 // Use ANDPD and ORPD to simulate FCOPYSIGN.
632 setOperationAction(ISD::FCOPYSIGN, VT, Custom);
633
634 // These might be better off as horizontal vector ops.
635 setOperationAction(ISD::FADD, VT, Custom);
636 setOperationAction(ISD::FSUB, VT, Custom);
637
638 // We don't support sin/cos/fmod
639 setOperationAction(ISD::FSIN , VT, Expand);
640 setOperationAction(ISD::FCOS , VT, Expand);
641 setOperationAction(ISD::FSINCOS, VT, Expand);
642 }
643
644 // Half type will be promoted by default.
645 setF16Action(MVT::f16, Promote);
646 setOperationAction(ISD::FADD, MVT::f16, Promote);
647 setOperationAction(ISD::FSUB, MVT::f16, Promote);
648 setOperationAction(ISD::FMUL, MVT::f16, Promote);
649 setOperationAction(ISD::FDIV, MVT::f16, Promote);
650 setOperationAction(ISD::FP_ROUND, MVT::f16, Custom);
651 setOperationAction(ISD::FP_EXTEND, MVT::f32, Custom);
652 setOperationAction(ISD::FP_EXTEND, MVT::f64, Custom);
653
654 setOperationAction(ISD::STRICT_FADD, MVT::f16, Promote);
655 setOperationAction(ISD::STRICT_FSUB, MVT::f16, Promote);
656 setOperationAction(ISD::STRICT_FMUL, MVT::f16, Promote);
657 setOperationAction(ISD::STRICT_FDIV, MVT::f16, Promote);
658 setOperationAction(ISD::STRICT_FMA, MVT::f16, Promote);
659 setOperationAction(ISD::STRICT_FMINNUM, MVT::f16, Promote);
660 setOperationAction(ISD::STRICT_FMAXNUM, MVT::f16, Promote);
661 setOperationAction(ISD::STRICT_FMINIMUM, MVT::f16, Promote);
662 setOperationAction(ISD::STRICT_FMAXIMUM, MVT::f16, Promote);
663 setOperationAction(ISD::STRICT_FSQRT, MVT::f16, Promote);
664 setOperationAction(ISD::STRICT_FPOW, MVT::f16, Promote);
665 setOperationAction(ISD::STRICT_FLOG, MVT::f16, Promote);
666 setOperationAction(ISD::STRICT_FLOG2, MVT::f16, Promote);
667 setOperationAction(ISD::STRICT_FLOG10, MVT::f16, Promote);
668 setOperationAction(ISD::STRICT_FEXP, MVT::f16, Promote);
669 setOperationAction(ISD::STRICT_FEXP2, MVT::f16, Promote);
670 setOperationAction(ISD::STRICT_FCEIL, MVT::f16, Promote);
671 setOperationAction(ISD::STRICT_FFLOOR, MVT::f16, Promote);
672 setOperationAction(ISD::STRICT_FNEARBYINT, MVT::f16, Promote);
673 setOperationAction(ISD::STRICT_FRINT, MVT::f16, Promote);
674 setOperationAction(ISD::STRICT_FSETCC, MVT::f16, Promote);
675 setOperationAction(ISD::STRICT_FSETCCS, MVT::f16, Promote);
676 setOperationAction(ISD::STRICT_FROUND, MVT::f16, Promote);
677 setOperationAction(ISD::STRICT_FROUNDEVEN, MVT::f16, Promote);
678 setOperationAction(ISD::STRICT_FTRUNC, MVT::f16, Promote);
679 setOperationAction(ISD::STRICT_FP_ROUND, MVT::f16, Custom);
680 setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f32, Custom);
681 setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f64, Custom);
682
683 setLibcallName(RTLIB::FPROUND_F32_F16, "__truncsfhf2");
684 setLibcallName(RTLIB::FPEXT_F16_F32, "__extendhfsf2");
685
686 // Lower this to MOVMSK plus an AND.
687 setOperationAction(ISD::FGETSIGN, MVT::i64, Custom);
688 setOperationAction(ISD::FGETSIGN, MVT::i32, Custom);
689
690 } else if (!Subtarget.useSoftFloat() && Subtarget.hasSSE1() &&
691 (UseX87 || Is64Bit)) {
692 // Use SSE for f32, x87 for f64.
693 // Set up the FP register classes.
694 addRegisterClass(MVT::f32, &X86::FR32RegClass);
695 if (UseX87)
696 addRegisterClass(MVT::f64, &X86::RFP64RegClass);
697
698 // Use ANDPS to simulate FABS.
699 setOperationAction(ISD::FABS , MVT::f32, Custom);
700
701 // Use XORP to simulate FNEG.
702 setOperationAction(ISD::FNEG , MVT::f32, Custom);
703
704 if (UseX87)
705 setOperationAction(ISD::UNDEF, MVT::f64, Expand);
706
707 // Use ANDPS and ORPS to simulate FCOPYSIGN.
708 if (UseX87)
709 setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
710 setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom);
711
712 // We don't support sin/cos/fmod
713 setOperationAction(ISD::FSIN , MVT::f32, Expand);
714 setOperationAction(ISD::FCOS , MVT::f32, Expand);
715 setOperationAction(ISD::FSINCOS, MVT::f32, Expand);
716
717 if (UseX87) {
718 // Always expand sin/cos functions even though x87 has an instruction.
719 setOperationAction(ISD::FSIN, MVT::f64, Expand);
720 setOperationAction(ISD::FCOS, MVT::f64, Expand);
721 setOperationAction(ISD::FSINCOS, MVT::f64, Expand);
722 }
723 } else if (UseX87) {
724 // f32 and f64 in x87.
725 // Set up the FP register classes.
726 addRegisterClass(MVT::f64, &X86::RFP64RegClass);
727 addRegisterClass(MVT::f32, &X86::RFP32RegClass);
728
729 for (auto VT : { MVT::f32, MVT::f64 }) {
730 setOperationAction(ISD::UNDEF, VT, Expand);
731 setOperationAction(ISD::FCOPYSIGN, VT, Expand);
732
733 // Always expand sin/cos functions even though x87 has an instruction.
734 setOperationAction(ISD::FSIN , VT, Expand);
735 setOperationAction(ISD::FCOS , VT, Expand);
736 setOperationAction(ISD::FSINCOS, VT, Expand);
737 }
738 }
739
740 // Expand FP32 immediates into loads from the stack, save special cases.
741 if (isTypeLegal(MVT::f32)) {
742 if (UseX87 && (getRegClassFor(MVT::f32) == &X86::RFP32RegClass)) {
743 addLegalFPImmediate(APFloat(+0.0f)); // FLD0
744 addLegalFPImmediate(APFloat(+1.0f)); // FLD1
745 addLegalFPImmediate(APFloat(-0.0f)); // FLD0/FCHS
746 addLegalFPImmediate(APFloat(-1.0f)); // FLD1/FCHS
747 } else // SSE immediates.
748 addLegalFPImmediate(APFloat(+0.0f)); // xorps
749 }
750 // Expand FP64 immediates into loads from the stack, save special cases.
751 if (isTypeLegal(MVT::f64)) {
752 if (UseX87 && getRegClassFor(MVT::f64) == &X86::RFP64RegClass) {
753 addLegalFPImmediate(APFloat(+0.0)); // FLD0
754 addLegalFPImmediate(APFloat(+1.0)); // FLD1
755 addLegalFPImmediate(APFloat(-0.0)); // FLD0/FCHS
756 addLegalFPImmediate(APFloat(-1.0)); // FLD1/FCHS
757 } else // SSE immediates.
758 addLegalFPImmediate(APFloat(+0.0)); // xorpd
759 }
760 // Support fp16 0 immediate.
761 if (isTypeLegal(MVT::f16))
762 addLegalFPImmediate(APFloat::getZero(APFloat::IEEEhalf()));
763
764 // Handle constrained floating-point operations of scalar.
765 setOperationAction(ISD::STRICT_FADD, MVT::f32, Legal);
766 setOperationAction(ISD::STRICT_FADD, MVT::f64, Legal);
767 setOperationAction(ISD::STRICT_FSUB, MVT::f32, Legal);
768 setOperationAction(ISD::STRICT_FSUB, MVT::f64, Legal);
769 setOperationAction(ISD::STRICT_FMUL, MVT::f32, Legal);
770 setOperationAction(ISD::STRICT_FMUL, MVT::f64, Legal);
771 setOperationAction(ISD::STRICT_FDIV, MVT::f32, Legal);
772 setOperationAction(ISD::STRICT_FDIV, MVT::f64, Legal);
773 setOperationAction(ISD::STRICT_FP_ROUND, MVT::f32, Legal);
774 setOperationAction(ISD::STRICT_FP_ROUND, MVT::f64, Legal);
775 setOperationAction(ISD::STRICT_FSQRT, MVT::f32, Legal);
776 setOperationAction(ISD::STRICT_FSQRT, MVT::f64, Legal);
777
778 // We don't support FMA.
779 setOperationAction(ISD::FMA, MVT::f64, Expand);
780 setOperationAction(ISD::FMA, MVT::f32, Expand);
781
782 // f80 always uses X87.
783 if (UseX87) {
784 addRegisterClass(MVT::f80, &X86::RFP80RegClass);
785 setOperationAction(ISD::UNDEF, MVT::f80, Expand);
786 setOperationAction(ISD::FCOPYSIGN, MVT::f80, Expand);
787 {
788 APFloat TmpFlt = APFloat::getZero(APFloat::x87DoubleExtended());
789 addLegalFPImmediate(TmpFlt); // FLD0
790 TmpFlt.changeSign();
791 addLegalFPImmediate(TmpFlt); // FLD0/FCHS
792
793 bool ignored;
794 APFloat TmpFlt2(+1.0);
795 TmpFlt2.convert(APFloat::x87DoubleExtended(), APFloat::rmNearestTiesToEven,
796 &ignored);
797 addLegalFPImmediate(TmpFlt2); // FLD1
798 TmpFlt2.changeSign();
799 addLegalFPImmediate(TmpFlt2); // FLD1/FCHS
800 }
801
802 // Always expand sin/cos functions even though x87 has an instruction.
803 setOperationAction(ISD::FSIN , MVT::f80, Expand);
804 setOperationAction(ISD::FCOS , MVT::f80, Expand);
805 setOperationAction(ISD::FSINCOS, MVT::f80, Expand);
806
807 setOperationAction(ISD::FFLOOR, MVT::f80, Expand);
808 setOperationAction(ISD::FCEIL, MVT::f80, Expand);
809 setOperationAction(ISD::FTRUNC, MVT::f80, Expand);
810 setOperationAction(ISD::FRINT, MVT::f80, Expand);
811 setOperationAction(ISD::FNEARBYINT, MVT::f80, Expand);
812 setOperationAction(ISD::FMA, MVT::f80, Expand);
813 setOperationAction(ISD::LROUND, MVT::f80, Expand);
814 setOperationAction(ISD::LLROUND, MVT::f80, Expand);
815 setOperationAction(ISD::LRINT, MVT::f80, Custom);
816 setOperationAction(ISD::LLRINT, MVT::f80, Custom);
817
818 // Handle constrained floating-point operations of scalar.
819 setOperationAction(ISD::STRICT_FADD , MVT::f80, Legal);
820 setOperationAction(ISD::STRICT_FSUB , MVT::f80, Legal);
821 setOperationAction(ISD::STRICT_FMUL , MVT::f80, Legal);
822 setOperationAction(ISD::STRICT_FDIV , MVT::f80, Legal);
823 setOperationAction(ISD::STRICT_FSQRT , MVT::f80, Legal);
824 if (isTypeLegal(MVT::f16)) {
825 setOperationAction(ISD::FP_EXTEND, MVT::f80, Custom);
826 setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f80, Custom);
827 } else {
828 setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f80, Legal);
829 }
830 // FIXME: When the target is 64-bit, STRICT_FP_ROUND will be overwritten
831 // as Custom.
832 setOperationAction(ISD::STRICT_FP_ROUND, MVT::f80, Legal);
833 }
834
835 // f128 uses xmm registers, but most operations require libcalls.
836 if (!Subtarget.useSoftFloat() && Subtarget.is64Bit() && Subtarget.hasSSE1()) {
837 addRegisterClass(MVT::f128, Subtarget.hasVLX() ? &X86::VR128XRegClass
838 : &X86::VR128RegClass);
839
840 addLegalFPImmediate(APFloat::getZero(APFloat::IEEEquad())); // xorps
841
842 setOperationAction(ISD::FADD, MVT::f128, LibCall);
843 setOperationAction(ISD::STRICT_FADD, MVT::f128, LibCall);
844 setOperationAction(ISD::FSUB, MVT::f128, LibCall);
845 setOperationAction(ISD::STRICT_FSUB, MVT::f128, LibCall);
846 setOperationAction(ISD::FDIV, MVT::f128, LibCall);
847 setOperationAction(ISD::STRICT_FDIV, MVT::f128, LibCall);
848 setOperationAction(ISD::FMUL, MVT::f128, LibCall);
849 setOperationAction(ISD::STRICT_FMUL, MVT::f128, LibCall);
850 setOperationAction(ISD::FMA, MVT::f128, LibCall);
851 setOperationAction(ISD::STRICT_FMA, MVT::f128, LibCall);
852
853 setOperationAction(ISD::FABS, MVT::f128, Custom);
854 setOperationAction(ISD::FNEG, MVT::f128, Custom);
855 setOperationAction(ISD::FCOPYSIGN, MVT::f128, Custom);
856
857 setOperationAction(ISD::FSIN, MVT::f128, LibCall);
858 setOperationAction(ISD::STRICT_FSIN, MVT::f128, LibCall);
859 setOperationAction(ISD::FCOS, MVT::f128, LibCall);
860 setOperationAction(ISD::STRICT_FCOS, MVT::f128, LibCall);
861 setOperationAction(ISD::FSINCOS, MVT::f128, LibCall);
862 // No STRICT_FSINCOS
863 setOperationAction(ISD::FSQRT, MVT::f128, LibCall);
864 setOperationAction(ISD::STRICT_FSQRT, MVT::f128, LibCall);
865
866 setOperationAction(ISD::FP_EXTEND, MVT::f128, Custom);
867 setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f128, Custom);
868 // We need to custom handle any FP_ROUND with an f128 input, but
869 // LegalizeDAG uses the result type to know when to run a custom handler.
870 // So we have to list all legal floating point result types here.
871 if (isTypeLegal(MVT::f32)) {
872 setOperationAction(ISD::FP_ROUND, MVT::f32, Custom);
873 setOperationAction(ISD::STRICT_FP_ROUND, MVT::f32, Custom);
874 }
875 if (isTypeLegal(MVT::f64)) {
876 setOperationAction(ISD::FP_ROUND, MVT::f64, Custom);
877 setOperationAction(ISD::STRICT_FP_ROUND, MVT::f64, Custom);
878 }
879 if (isTypeLegal(MVT::f80)) {
880 setOperationAction(ISD::FP_ROUND, MVT::f80, Custom);
881 setOperationAction(ISD::STRICT_FP_ROUND, MVT::f80, Custom);
882 }
883
884 setOperationAction(ISD::SETCC, MVT::f128, Custom);
885
886 setLoadExtAction(ISD::EXTLOAD, MVT::f128, MVT::f32, Expand);
887 setLoadExtAction(ISD::EXTLOAD, MVT::f128, MVT::f64, Expand);
888 setLoadExtAction(ISD::EXTLOAD, MVT::f128, MVT::f80, Expand);
889 setTruncStoreAction(MVT::f128, MVT::f32, Expand);
890 setTruncStoreAction(MVT::f128, MVT::f64, Expand);
891 setTruncStoreAction(MVT::f128, MVT::f80, Expand);
892 }
893
894 // Always use a library call for pow.
895 setOperationAction(ISD::FPOW , MVT::f32 , Expand);
896 setOperationAction(ISD::FPOW , MVT::f64 , Expand);
897 setOperationAction(ISD::FPOW , MVT::f80 , Expand);
898 setOperationAction(ISD::FPOW , MVT::f128 , Expand);
899
900 setOperationAction(ISD::FLOG, MVT::f80, Expand);
901 setOperationAction(ISD::FLOG2, MVT::f80, Expand);
902 setOperationAction(ISD::FLOG10, MVT::f80, Expand);
903 setOperationAction(ISD::FEXP, MVT::f80, Expand);
904 setOperationAction(ISD::FEXP2, MVT::f80, Expand);
905 setOperationAction(ISD::FMINNUM, MVT::f80, Expand);
906 setOperationAction(ISD::FMAXNUM, MVT::f80, Expand);
907
908 // Some FP actions are always expanded for vector types.
909 for (auto VT : { MVT::v8f16, MVT::v16f16, MVT::v32f16,
910 MVT::v4f32, MVT::v8f32, MVT::v16f32,
911 MVT::v2f64, MVT::v4f64, MVT::v8f64 }) {
912 setOperationAction(ISD::FSIN, VT, Expand);
913 setOperationAction(ISD::FSINCOS, VT, Expand);
914 setOperationAction(ISD::FCOS, VT, Expand);
915 setOperationAction(ISD::FREM, VT, Expand);
916 setOperationAction(ISD::FCOPYSIGN, VT, Expand);
917 setOperationAction(ISD::FPOW, VT, Expand);
918 setOperationAction(ISD::FLOG, VT, Expand);
919 setOperationAction(ISD::FLOG2, VT, Expand);
920 setOperationAction(ISD::FLOG10, VT, Expand);
921 setOperationAction(ISD::FEXP, VT, Expand);
922 setOperationAction(ISD::FEXP2, VT, Expand);
923 }
924
925 // First set operation action for all vector types to either promote
926 // (for widening) or expand (for scalarization). Then we will selectively
927 // turn on ones that can be effectively codegen'd.
928 for (MVT VT : MVT::fixedlen_vector_valuetypes()) {
929 setOperationAction(ISD::SDIV, VT, Expand);
930 setOperationAction(ISD::UDIV, VT, Expand);
931 setOperationAction(ISD::SREM, VT, Expand);
932 setOperationAction(ISD::UREM, VT, Expand);
933 setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT,Expand);
934 setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Expand);
935 setOperationAction(ISD::EXTRACT_SUBVECTOR, VT,Expand);
936 setOperationAction(ISD::INSERT_SUBVECTOR, VT,Expand);
937 setOperationAction(ISD::FMA, VT, Expand);
938 setOperationAction(ISD::FFLOOR, VT, Expand);
939 setOperationAction(ISD::FCEIL, VT, Expand);
940 setOperationAction(ISD::FTRUNC, VT, Expand);
941 setOperationAction(ISD::FRINT, VT, Expand);
942 setOperationAction(ISD::FNEARBYINT, VT, Expand);
943 setOperationAction(ISD::SMUL_LOHI, VT, Expand);
944 setOperationAction(ISD::MULHS, VT, Expand);
945 setOperationAction(ISD::UMUL_LOHI, VT, Expand);
946 setOperationAction(ISD::MULHU, VT, Expand);
947 setOperationAction(ISD::SDIVREM, VT, Expand);
948 setOperationAction(ISD::UDIVREM, VT, Expand);
949 setOperationAction(ISD::CTPOP, VT, Expand);
950 setOperationAction(ISD::CTTZ, VT, Expand);
951 setOperationAction(ISD::CTLZ, VT, Expand);
952 setOperationAction(ISD::ROTL, VT, Expand);
953 setOperationAction(ISD::ROTR, VT, Expand);
954 setOperationAction(ISD::BSWAP, VT, Expand);
955 setOperationAction(ISD::SETCC, VT, Expand);
956 setOperationAction(ISD::FP_TO_UINT, VT, Expand);
957 setOperationAction(ISD::FP_TO_SINT, VT, Expand);
958 setOperationAction(ISD::UINT_TO_FP, VT, Expand);
959 setOperationAction(ISD::SINT_TO_FP, VT, Expand);
960 setOperationAction(ISD::SIGN_EXTEND_INREG, VT,Expand);
961 setOperationAction(ISD::TRUNCATE, VT, Expand);
962 setOperationAction(ISD::SIGN_EXTEND, VT, Expand);
963 setOperationAction(ISD::ZERO_EXTEND, VT, Expand);
964 setOperationAction(ISD::ANY_EXTEND, VT, Expand);
965 setOperationAction(ISD::SELECT_CC, VT, Expand);
966 for (MVT InnerVT : MVT::fixedlen_vector_valuetypes()) {
967 setTruncStoreAction(InnerVT, VT, Expand);
968
969 setLoadExtAction(ISD::SEXTLOAD, InnerVT, VT, Expand);
970 setLoadExtAction(ISD::ZEXTLOAD, InnerVT, VT, Expand);
971
972 // N.b. ISD::EXTLOAD legality is basically ignored except for i1-like
973 // types, we have to deal with them whether we ask for Expansion or not.
974 // Setting Expand causes its own optimisation problems though, so leave
975 // them legal.
976 if (VT.getVectorElementType() == MVT::i1)
977 setLoadExtAction(ISD::EXTLOAD, InnerVT, VT, Expand);
978
979 // EXTLOAD for MVT::f16 vectors is not legal because f16 vectors are
980 // split/scalarized right now.
981 if (VT.getVectorElementType() == MVT::f16 ||
982 VT.getVectorElementType() == MVT::bf16)
983 setLoadExtAction(ISD::EXTLOAD, InnerVT, VT, Expand);
984 }
985 }
986
987 // FIXME: In order to prevent SSE instructions being expanded to MMX ones
988 // with -msoft-float, disable use of MMX as well.
989 if (!Subtarget.useSoftFloat() && Subtarget.hasMMX()) {
990 addRegisterClass(MVT::x86mmx, &X86::VR64RegClass);
991 // No operations on x86mmx supported, everything uses intrinsics.
992 }
993
994 if (!Subtarget.useSoftFloat() && Subtarget.hasSSE1()) {
995 addRegisterClass(MVT::v4f32, Subtarget.hasVLX() ? &X86::VR128XRegClass
996 : &X86::VR128RegClass);
997
998 setOperationAction(ISD::FNEG, MVT::v4f32, Custom);
999 setOperationAction(ISD::FABS, MVT::v4f32, Custom);
1000 setOperationAction(ISD::FCOPYSIGN, MVT::v4f32, Custom);
1001 setOperationAction(ISD::BUILD_VECTOR, MVT::v4f32, Custom);
1002 setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4f32, Custom);
1003 setOperationAction(ISD::VSELECT, MVT::v4f32, Custom);
1004 setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom);
1005 setOperationAction(ISD::SELECT, MVT::v4f32, Custom);
1006
1007 setOperationAction(ISD::LOAD, MVT::v2f32, Custom);
1008 setOperationAction(ISD::STORE, MVT::v2f32, Custom);
1009
1010 setOperationAction(ISD::STRICT_FADD, MVT::v4f32, Legal);
1011 setOperationAction(ISD::STRICT_FSUB, MVT::v4f32, Legal);
1012 setOperationAction(ISD::STRICT_FMUL, MVT::v4f32, Legal);
1013 setOperationAction(ISD::STRICT_FDIV, MVT::v4f32, Legal);
1014 setOperationAction(ISD::STRICT_FSQRT, MVT::v4f32, Legal);
1015 }
1016
1017 if (!Subtarget.useSoftFloat() && Subtarget.hasSSE2()) {
1018 addRegisterClass(MVT::v2f64, Subtarget.hasVLX() ? &X86::VR128XRegClass
1019 : &X86::VR128RegClass);
1020
1021 // FIXME: Unfortunately, -soft-float and -no-implicit-float mean XMM
1022 // registers cannot be used even for integer operations.
1023 addRegisterClass(MVT::v16i8, Subtarget.hasVLX() ? &X86::VR128XRegClass
1024 : &X86::VR128RegClass);
1025 addRegisterClass(MVT::v8i16, Subtarget.hasVLX() ? &X86::VR128XRegClass
1026 : &X86::VR128RegClass);
1027 addRegisterClass(MVT::v8f16, Subtarget.hasVLX() ? &X86::VR128XRegClass
1028 : &X86::VR128RegClass);
1029 addRegisterClass(MVT::v4i32, Subtarget.hasVLX() ? &X86::VR128XRegClass
1030 : &X86::VR128RegClass);
1031 addRegisterClass(MVT::v2i64, Subtarget.hasVLX() ? &X86::VR128XRegClass
1032 : &X86::VR128RegClass);
1033
1034 for (auto VT : { MVT::v2i8, MVT::v4i8, MVT::v8i8,
1035 MVT::v2i16, MVT::v4i16, MVT::v2i32 }) {
1036 setOperationAction(ISD::SDIV, VT, Custom);
1037 setOperationAction(ISD::SREM, VT, Custom);
1038 setOperationAction(ISD::UDIV, VT, Custom);
1039 setOperationAction(ISD::UREM, VT, Custom);
1040 }
1041
1042 setOperationAction(ISD::MUL, MVT::v2i8, Custom);
1043 setOperationAction(ISD::MUL, MVT::v4i8, Custom);
1044 setOperationAction(ISD::MUL, MVT::v8i8, Custom);
1045
1046 setOperationAction(ISD::MUL, MVT::v16i8, Custom);
1047 setOperationAction(ISD::MUL, MVT::v4i32, Custom);
1048 setOperationAction(ISD::MUL, MVT::v2i64, Custom);
1049 setOperationAction(ISD::MULHU, MVT::v4i32, Custom);
1050 setOperationAction(ISD::MULHS, MVT::v4i32, Custom);
1051 setOperationAction(ISD::MULHU, MVT::v16i8, Custom);
1052 setOperationAction(ISD::MULHS, MVT::v16i8, Custom);
1053 setOperationAction(ISD::MULHU, MVT::v8i16, Legal);
1054 setOperationAction(ISD::MULHS, MVT::v8i16, Legal);
1055 setOperationAction(ISD::MUL, MVT::v8i16, Legal);
1056 setOperationAction(ISD::AVGCEILU, MVT::v16i8, Legal);
1057 setOperationAction(ISD::AVGCEILU, MVT::v8i16, Legal);
1058
1059 setOperationAction(ISD::SMULO, MVT::v16i8, Custom);
1060 setOperationAction(ISD::UMULO, MVT::v16i8, Custom);
1061 setOperationAction(ISD::UMULO, MVT::v2i32, Custom);
1062
1063 setOperationAction(ISD::FNEG, MVT::v2f64, Custom);
1064 setOperationAction(ISD::FABS, MVT::v2f64, Custom);
1065 setOperationAction(ISD::FCOPYSIGN, MVT::v2f64, Custom);
1066
1067 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
1068 setOperationAction(ISD::SMAX, VT, VT == MVT::v8i16 ? Legal : Custom);
1069 setOperationAction(ISD::SMIN, VT, VT == MVT::v8i16 ? Legal : Custom);
1070 setOperationAction(ISD::UMAX, VT, VT == MVT::v16i8 ? Legal : Custom);
1071 setOperationAction(ISD::UMIN, VT, VT == MVT::v16i8 ? Legal : Custom);
1072 }
1073
1074 setOperationAction(ISD::UADDSAT, MVT::v16i8, Legal);
1075 setOperationAction(ISD::SADDSAT, MVT::v16i8, Legal);
1076 setOperationAction(ISD::USUBSAT, MVT::v16i8, Legal);
1077 setOperationAction(ISD::SSUBSAT, MVT::v16i8, Legal);
1078 setOperationAction(ISD::UADDSAT, MVT::v8i16, Legal);
1079 setOperationAction(ISD::SADDSAT, MVT::v8i16, Legal);
1080 setOperationAction(ISD::USUBSAT, MVT::v8i16, Legal);
1081 setOperationAction(ISD::SSUBSAT, MVT::v8i16, Legal);
1082 setOperationAction(ISD::USUBSAT, MVT::v4i32, Custom);
1083 setOperationAction(ISD::USUBSAT, MVT::v2i64, Custom);
1084
1085 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v16i8, Custom);
1086 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v8i16, Custom);
1087 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i32, Custom);
1088 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f32, Custom);
1089
1090 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
1091 setOperationAction(ISD::SETCC, VT, Custom);
1092 setOperationAction(ISD::STRICT_FSETCC, VT, Custom);
1093 setOperationAction(ISD::STRICT_FSETCCS, VT, Custom);
1094 setOperationAction(ISD::CTPOP, VT, Custom);
1095 setOperationAction(ISD::ABS, VT, Custom);
1096
1097 // The condition codes aren't legal in SSE/AVX and under AVX512 we use
1098 // setcc all the way to isel and prefer SETGT in some isel patterns.
1099 setCondCodeAction(ISD::SETLT, VT, Custom);
1100 setCondCodeAction(ISD::SETLE, VT, Custom);
1101 }
1102
1103 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32 }) {
1104 setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Custom);
1105 setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
1106 setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
1107 setOperationAction(ISD::VSELECT, VT, Custom);
1108 setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
1109 }
1110
1111 for (auto VT : { MVT::v8f16, MVT::v2f64, MVT::v2i64 }) {
1112 setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
1113 setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
1114 setOperationAction(ISD::VSELECT, VT, Custom);
1115
1116 if (VT == MVT::v2i64 && !Subtarget.is64Bit())
1117 continue;
1118
1119 setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
1120 setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
1121 }
1122 setF16Action(MVT::v8f16, Expand);
1123 setOperationAction(ISD::FADD, MVT::v8f16, Expand);
1124 setOperationAction(ISD::FSUB, MVT::v8f16, Expand);
1125 setOperationAction(ISD::FMUL, MVT::v8f16, Expand);
1126 setOperationAction(ISD::FDIV, MVT::v8f16, Expand);
1127
1128 // Custom lower v2i64 and v2f64 selects.
1129 setOperationAction(ISD::SELECT, MVT::v2f64, Custom);
1130 setOperationAction(ISD::SELECT, MVT::v2i64, Custom);
1131 setOperationAction(ISD::SELECT, MVT::v4i32, Custom);
1132 setOperationAction(ISD::SELECT, MVT::v8i16, Custom);
1133 setOperationAction(ISD::SELECT, MVT::v8f16, Custom);
1134 setOperationAction(ISD::SELECT, MVT::v16i8, Custom);
1135
1136 setOperationAction(ISD::FP_TO_SINT, MVT::v4i32, Custom);
1137 setOperationAction(ISD::FP_TO_UINT, MVT::v4i32, Custom);
1138 setOperationAction(ISD::FP_TO_SINT, MVT::v2i32, Custom);
1139 setOperationAction(ISD::FP_TO_UINT, MVT::v2i32, Custom);
1140 setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v4i32, Custom);
1141 setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v2i32, Custom);
1142
1143 // Custom legalize these to avoid over promotion or custom promotion.
1144 for (auto VT : {MVT::v2i8, MVT::v4i8, MVT::v8i8, MVT::v2i16, MVT::v4i16}) {
1145 setOperationAction(ISD::FP_TO_SINT, VT, Custom);
1146 setOperationAction(ISD::FP_TO_UINT, VT, Custom);
1147 setOperationAction(ISD::STRICT_FP_TO_SINT, VT, Custom);
1148 setOperationAction(ISD::STRICT_FP_TO_UINT, VT, Custom);
1149 }
1150
1151 setOperationAction(ISD::SINT_TO_FP, MVT::v4i32, Custom);
1152 setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v4i32, Custom);
1153 setOperationAction(ISD::SINT_TO_FP, MVT::v2i32, Custom);
1154 setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v2i32, Custom);
1155
1156 setOperationAction(ISD::UINT_TO_FP, MVT::v2i32, Custom);
1157 setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v2i32, Custom);
1158
1159 setOperationAction(ISD::UINT_TO_FP, MVT::v4i32, Custom);
1160 setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v4i32, Custom);
1161
1162 // Fast v2f32 UINT_TO_FP( v2i32 ) custom conversion.
1163 setOperationAction(ISD::SINT_TO_FP, MVT::v2f32, Custom);
1164 setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v2f32, Custom);
1165 setOperationAction(ISD::UINT_TO_FP, MVT::v2f32, Custom);
1166 setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v2f32, Custom);
1167
1168 setOperationAction(ISD::FP_EXTEND, MVT::v2f32, Custom);
1169 setOperationAction(ISD::STRICT_FP_EXTEND, MVT::v2f32, Custom);
1170 setOperationAction(ISD::FP_ROUND, MVT::v2f32, Custom);
1171 setOperationAction(ISD::STRICT_FP_ROUND, MVT::v2f32, Custom);
1172
1173 // We want to legalize this to an f64 load rather than an i64 load on
1174 // 64-bit targets and two 32-bit loads on a 32-bit target. Similar for
1175 // store.
1176 setOperationAction(ISD::LOAD, MVT::v2i32, Custom);
1177 setOperationAction(ISD::LOAD, MVT::v4i16, Custom);
1178 setOperationAction(ISD::LOAD, MVT::v8i8, Custom);
1179 setOperationAction(ISD::STORE, MVT::v2i32, Custom);
1180 setOperationAction(ISD::STORE, MVT::v4i16, Custom);
1181 setOperationAction(ISD::STORE, MVT::v8i8, Custom);
1182
1183 // Add 32-bit vector stores to help vectorization opportunities.
1184 setOperationAction(ISD::STORE, MVT::v2i16, Custom);
1185 setOperationAction(ISD::STORE, MVT::v4i8, Custom);
1186
1187 setOperationAction(ISD::BITCAST, MVT::v2i32, Custom);
1188 setOperationAction(ISD::BITCAST, MVT::v4i16, Custom);
1189 setOperationAction(ISD::BITCAST, MVT::v8i8, Custom);
1190 if (!Subtarget.hasAVX512())
1191 setOperationAction(ISD::BITCAST, MVT::v16i1, Custom);
1192
1193 setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v2i64, Custom);
1194 setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v4i32, Custom);
1195 setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v8i16, Custom);
1196
1197 setOperationAction(ISD::SIGN_EXTEND, MVT::v4i64, Custom);
1198
1199 setOperationAction(ISD::TRUNCATE, MVT::v2i8, Custom);
1200 setOperationAction(ISD::TRUNCATE, MVT::v2i16, Custom);
1201 setOperationAction(ISD::TRUNCATE, MVT::v2i32, Custom);
1202 setOperationAction(ISD::TRUNCATE, MVT::v4i8, Custom);
1203 setOperationAction(ISD::TRUNCATE, MVT::v4i16, Custom);
1204 setOperationAction(ISD::TRUNCATE, MVT::v8i8, Custom);
1205
1206 // In the customized shift lowering, the legal v4i32/v2i64 cases
1207 // in AVX2 will be recognized.
1208 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
1209 setOperationAction(ISD::SRL, VT, Custom);
1210 setOperationAction(ISD::SHL, VT, Custom);
1211 setOperationAction(ISD::SRA, VT, Custom);
1212 if (VT == MVT::v2i64) continue;
1213 setOperationAction(ISD::ROTL, VT, Custom);
1214 setOperationAction(ISD::ROTR, VT, Custom);
1215 setOperationAction(ISD::FSHL, VT, Custom);
1216 setOperationAction(ISD::FSHR, VT, Custom);
1217 }
1218
1219 setOperationAction(ISD::STRICT_FSQRT, MVT::v2f64, Legal);
1220 setOperationAction(ISD::STRICT_FADD, MVT::v2f64, Legal);
1221 setOperationAction(ISD::STRICT_FSUB, MVT::v2f64, Legal);
1222 setOperationAction(ISD::STRICT_FMUL, MVT::v2f64, Legal);
1223 setOperationAction(ISD::STRICT_FDIV, MVT::v2f64, Legal);
1224 }
1225
1226 if (!Subtarget.useSoftFloat() && Subtarget.hasSSSE3()) {
1227 setOperationAction(ISD::ABS, MVT::v16i8, Legal);
1228 setOperationAction(ISD::ABS, MVT::v8i16, Legal);
1229 setOperationAction(ISD::ABS, MVT::v4i32, Legal);
1230 setOperationAction(ISD::BITREVERSE, MVT::v16i8, Custom);
1231 setOperationAction(ISD::CTLZ, MVT::v16i8, Custom);
1232 setOperationAction(ISD::CTLZ, MVT::v8i16, Custom);
1233 setOperationAction(ISD::CTLZ, MVT::v4i32, Custom);
1234 setOperationAction(ISD::CTLZ, MVT::v2i64, Custom);
1235
1236 // These might be better off as horizontal vector ops.
1237 setOperationAction(ISD::ADD, MVT::i16, Custom);
1238 setOperationAction(ISD::ADD, MVT::i32, Custom);
1239 setOperationAction(ISD::SUB, MVT::i16, Custom);
1240 setOperationAction(ISD::SUB, MVT::i32, Custom);
1241 }
1242
1243 if (!Subtarget.useSoftFloat() && Subtarget.hasSSE41()) {
1244 for (MVT RoundedTy : {MVT::f32, MVT::f64, MVT::v4f32, MVT::v2f64}) {
1245 setOperationAction(ISD::FFLOOR, RoundedTy, Legal);
1246 setOperationAction(ISD::STRICT_FFLOOR, RoundedTy, Legal);
1247 setOperationAction(ISD::FCEIL, RoundedTy, Legal);
1248 setOperationAction(ISD::STRICT_FCEIL, RoundedTy, Legal);
1249 setOperationAction(ISD::FTRUNC, RoundedTy, Legal);
1250 setOperationAction(ISD::STRICT_FTRUNC, RoundedTy, Legal);
1251 setOperationAction(ISD::FRINT, RoundedTy, Legal);
1252 setOperationAction(ISD::STRICT_FRINT, RoundedTy, Legal);
1253 setOperationAction(ISD::FNEARBYINT, RoundedTy, Legal);
1254 setOperationAction(ISD::STRICT_FNEARBYINT, RoundedTy, Legal);
1255 setOperationAction(ISD::FROUNDEVEN, RoundedTy, Legal);
1256 setOperationAction(ISD::STRICT_FROUNDEVEN, RoundedTy, Legal);
1257
1258 setOperationAction(ISD::FROUND, RoundedTy, Custom);
1259 }
1260
1261 setOperationAction(ISD::SMAX, MVT::v16i8, Legal);
1262 setOperationAction(ISD::SMAX, MVT::v4i32, Legal);
1263 setOperationAction(ISD::UMAX, MVT::v8i16, Legal);
1264 setOperationAction(ISD::UMAX, MVT::v4i32, Legal);
1265 setOperationAction(ISD::SMIN, MVT::v16i8, Legal);
1266 setOperationAction(ISD::SMIN, MVT::v4i32, Legal);
1267 setOperationAction(ISD::UMIN, MVT::v8i16, Legal);
1268 setOperationAction(ISD::UMIN, MVT::v4i32, Legal);
1269
1270 setOperationAction(ISD::UADDSAT, MVT::v4i32, Custom);
1271 setOperationAction(ISD::SADDSAT, MVT::v2i64, Custom);
1272 setOperationAction(ISD::SSUBSAT, MVT::v2i64, Custom);
1273
1274 // FIXME: Do we need to handle scalar-to-vector here?
1275 setOperationAction(ISD::MUL, MVT::v4i32, Legal);
1276 setOperationAction(ISD::SMULO, MVT::v2i32, Custom);
1277
1278 // We directly match byte blends in the backend as they match the VSELECT
1279 // condition form.
1280 setOperationAction(ISD::VSELECT, MVT::v16i8, Legal);
1281
1282 // SSE41 brings specific instructions for doing vector sign extend even in
1283 // cases where we don't have SRA.
1284 for (auto VT : { MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
1285 setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, VT, Legal);
1286 setOperationAction(ISD::ZERO_EXTEND_VECTOR_INREG, VT, Legal);
1287 }
1288
1289 // SSE41 also has vector sign/zero extending loads, PMOV[SZ]X
1290 for (auto LoadExtOp : { ISD::SEXTLOAD, ISD::ZEXTLOAD }) {
1291 setLoadExtAction(LoadExtOp, MVT::v8i16, MVT::v8i8, Legal);
1292 setLoadExtAction(LoadExtOp, MVT::v4i32, MVT::v4i8, Legal);
1293 setLoadExtAction(LoadExtOp, MVT::v2i64, MVT::v2i8, Legal);
1294 setLoadExtAction(LoadExtOp, MVT::v4i32, MVT::v4i16, Legal);
1295 setLoadExtAction(LoadExtOp, MVT::v2i64, MVT::v2i16, Legal);
1296 setLoadExtAction(LoadExtOp, MVT::v2i64, MVT::v2i32, Legal);
1297 }
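    // Marking these extending loads Legal lets instruction selection fold the
    // load into the PMOV[SZ]X instruction itself, so that, roughly, a
    // sign-extending v8i8 -> v8i16 load becomes a single pmovsxbw with a
    // memory operand rather than a separate load plus a register-form extend.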
1298
1299 if (Subtarget.is64Bit() && !Subtarget.hasAVX512()) {
1300      // We need to scalarize v4i64->v4f32 uint_to_fp using cvtsi2ss, but we can
1301 // do the pre and post work in the vector domain.
1302 setOperationAction(ISD::UINT_TO_FP, MVT::v4i64, Custom);
1303 setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v4i64, Custom);
1304 // We need to mark SINT_TO_FP as Custom even though we want to expand it
1305 // so that DAG combine doesn't try to turn it into uint_to_fp.
1306 setOperationAction(ISD::SINT_TO_FP, MVT::v4i64, Custom);
1307 setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v4i64, Custom);
1308 }
1309 }
1310
1311 if (!Subtarget.useSoftFloat() && Subtarget.hasSSE42()) {
1312 setOperationAction(ISD::UADDSAT, MVT::v2i64, Custom);
1313 }
1314
1315 if (!Subtarget.useSoftFloat() && Subtarget.hasXOP()) {
1316 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64,
1317 MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
1318 setOperationAction(ISD::ROTL, VT, Custom);
1319 setOperationAction(ISD::ROTR, VT, Custom);
1320 }
1321
1322 // XOP can efficiently perform BITREVERSE with VPPERM.
1323 for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 })
1324 setOperationAction(ISD::BITREVERSE, VT, Custom);
1325
1326 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64,
1327 MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 })
1328 setOperationAction(ISD::BITREVERSE, VT, Custom);
1329 }
1330
1331 if (!Subtarget.useSoftFloat() && Subtarget.hasAVX()) {
1332 bool HasInt256 = Subtarget.hasInt256();
1333
1334 addRegisterClass(MVT::v32i8, Subtarget.hasVLX() ? &X86::VR256XRegClass
1335 : &X86::VR256RegClass);
1336 addRegisterClass(MVT::v16i16, Subtarget.hasVLX() ? &X86::VR256XRegClass
1337 : &X86::VR256RegClass);
1338 addRegisterClass(MVT::v16f16, Subtarget.hasVLX() ? &X86::VR256XRegClass
1339 : &X86::VR256RegClass);
1340 addRegisterClass(MVT::v8i32, Subtarget.hasVLX() ? &X86::VR256XRegClass
1341 : &X86::VR256RegClass);
1342 addRegisterClass(MVT::v8f32, Subtarget.hasVLX() ? &X86::VR256XRegClass
1343 : &X86::VR256RegClass);
1344 addRegisterClass(MVT::v4i64, Subtarget.hasVLX() ? &X86::VR256XRegClass
1345 : &X86::VR256RegClass);
1346 addRegisterClass(MVT::v4f64, Subtarget.hasVLX() ? &X86::VR256XRegClass
1347 : &X86::VR256RegClass);
1348
1349 for (auto VT : { MVT::v8f32, MVT::v4f64 }) {
1350 setOperationAction(ISD::FFLOOR, VT, Legal);
1351 setOperationAction(ISD::STRICT_FFLOOR, VT, Legal);
1352 setOperationAction(ISD::FCEIL, VT, Legal);
1353 setOperationAction(ISD::STRICT_FCEIL, VT, Legal);
1354 setOperationAction(ISD::FTRUNC, VT, Legal);
1355 setOperationAction(ISD::STRICT_FTRUNC, VT, Legal);
1356 setOperationAction(ISD::FRINT, VT, Legal);
1357 setOperationAction(ISD::STRICT_FRINT, VT, Legal);
1358 setOperationAction(ISD::FNEARBYINT, VT, Legal);
1359 setOperationAction(ISD::STRICT_FNEARBYINT, VT, Legal);
1360 setOperationAction(ISD::FROUNDEVEN, VT, Legal);
1361 setOperationAction(ISD::STRICT_FROUNDEVEN, VT, Legal);
1362
1363 setOperationAction(ISD::FROUND, VT, Custom);
1364
1365 setOperationAction(ISD::FNEG, VT, Custom);
1366 setOperationAction(ISD::FABS, VT, Custom);
1367 setOperationAction(ISD::FCOPYSIGN, VT, Custom);
1368 }
1369
1370 // (fp_to_int:v8i16 (v8f32 ..)) requires the result type to be promoted
1371 // even though v8i16 is a legal type.
1372 setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v8i16, MVT::v8i32);
1373 setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v8i16, MVT::v8i32);
1374 setOperationPromotedToType(ISD::STRICT_FP_TO_SINT, MVT::v8i16, MVT::v8i32);
1375 setOperationPromotedToType(ISD::STRICT_FP_TO_UINT, MVT::v8i16, MVT::v8i32);
1376 setOperationAction(ISD::FP_TO_SINT, MVT::v8i32, Custom);
1377 setOperationAction(ISD::FP_TO_UINT, MVT::v8i32, Custom);
1378 setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v8i32, Custom);
1379
1380 setOperationAction(ISD::SINT_TO_FP, MVT::v8i32, Custom);
1381 setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v8i32, Custom);
1382 setOperationAction(ISD::FP_EXTEND, MVT::v8f32, Expand);
1383 setOperationAction(ISD::FP_ROUND, MVT::v8f16, Expand);
1384 setOperationAction(ISD::FP_EXTEND, MVT::v4f64, Custom);
1385 setOperationAction(ISD::STRICT_FP_EXTEND, MVT::v4f64, Custom);
1386
1387 setOperationAction(ISD::STRICT_FP_ROUND, MVT::v4f32, Legal);
1388 setOperationAction(ISD::STRICT_FADD, MVT::v8f32, Legal);
1389 setOperationAction(ISD::STRICT_FADD, MVT::v4f64, Legal);
1390 setOperationAction(ISD::STRICT_FSUB, MVT::v8f32, Legal);
1391 setOperationAction(ISD::STRICT_FSUB, MVT::v4f64, Legal);
1392 setOperationAction(ISD::STRICT_FMUL, MVT::v8f32, Legal);
1393 setOperationAction(ISD::STRICT_FMUL, MVT::v4f64, Legal);
1394 setOperationAction(ISD::STRICT_FDIV, MVT::v8f32, Legal);
1395 setOperationAction(ISD::STRICT_FDIV, MVT::v4f64, Legal);
1396 setOperationAction(ISD::STRICT_FSQRT, MVT::v8f32, Legal);
1397 setOperationAction(ISD::STRICT_FSQRT, MVT::v4f64, Legal);
1398
1399 if (!Subtarget.hasAVX512())
1400 setOperationAction(ISD::BITCAST, MVT::v32i1, Custom);
1401
1402 // In the customized shift lowering, the legal v8i32/v4i64 cases
1403 // in AVX2 will be recognized.
1404 for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
1405 setOperationAction(ISD::SRL, VT, Custom);
1406 setOperationAction(ISD::SHL, VT, Custom);
1407 setOperationAction(ISD::SRA, VT, Custom);
1408 if (VT == MVT::v4i64) continue;
1409 setOperationAction(ISD::ROTL, VT, Custom);
1410 setOperationAction(ISD::ROTR, VT, Custom);
1411 setOperationAction(ISD::FSHL, VT, Custom);
1412 setOperationAction(ISD::FSHR, VT, Custom);
1413 }
1414
1415 // These types need custom splitting if their input is a 128-bit vector.
1416 setOperationAction(ISD::SIGN_EXTEND, MVT::v8i64, Custom);
1417 setOperationAction(ISD::SIGN_EXTEND, MVT::v16i32, Custom);
1418 setOperationAction(ISD::ZERO_EXTEND, MVT::v8i64, Custom);
1419 setOperationAction(ISD::ZERO_EXTEND, MVT::v16i32, Custom);
1420
1421 setOperationAction(ISD::SELECT, MVT::v4f64, Custom);
1422 setOperationAction(ISD::SELECT, MVT::v4i64, Custom);
1423 setOperationAction(ISD::SELECT, MVT::v8i32, Custom);
1424 setOperationAction(ISD::SELECT, MVT::v16i16, Custom);
1425 setOperationAction(ISD::SELECT, MVT::v16f16, Custom);
1426 setOperationAction(ISD::SELECT, MVT::v32i8, Custom);
1427 setOperationAction(ISD::SELECT, MVT::v8f32, Custom);
1428
1429 for (auto VT : { MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
1430 setOperationAction(ISD::SIGN_EXTEND, VT, Custom);
1431 setOperationAction(ISD::ZERO_EXTEND, VT, Custom);
1432 setOperationAction(ISD::ANY_EXTEND, VT, Custom);
1433 }
1434
1435 setOperationAction(ISD::TRUNCATE, MVT::v16i8, Custom);
1436 setOperationAction(ISD::TRUNCATE, MVT::v8i16, Custom);
1437 setOperationAction(ISD::TRUNCATE, MVT::v4i32, Custom);
1438 setOperationAction(ISD::BITREVERSE, MVT::v32i8, Custom);
1439
1440 for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
1441 setOperationAction(ISD::SETCC, VT, Custom);
1442 setOperationAction(ISD::STRICT_FSETCC, VT, Custom);
1443 setOperationAction(ISD::STRICT_FSETCCS, VT, Custom);
1444 setOperationAction(ISD::CTPOP, VT, Custom);
1445 setOperationAction(ISD::CTLZ, VT, Custom);
1446
1447 // The condition codes aren't legal in SSE/AVX and under AVX512 we use
1448 // setcc all the way to isel and prefer SETGT in some isel patterns.
1449 setCondCodeAction(ISD::SETLT, VT, Custom);
1450 setCondCodeAction(ISD::SETLE, VT, Custom);
1451 }
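    // The Custom condition-code entries mean these predicates are rewritten
    // during lowering, e.g. SETLT(a, b) can be handled as SETGT(b, a), because
    // the SSE/AVX packed integer compares only provide equality and signed
    // greater-than directly.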
1452
1453 if (Subtarget.hasAnyFMA()) {
1454 for (auto VT : { MVT::f32, MVT::f64, MVT::v4f32, MVT::v8f32,
1455 MVT::v2f64, MVT::v4f64 }) {
1456 setOperationAction(ISD::FMA, VT, Legal);
1457 setOperationAction(ISD::STRICT_FMA, VT, Legal);
1458 }
1459 }
1460
1461 for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
1462 setOperationAction(ISD::ADD, VT, HasInt256 ? Legal : Custom);
1463 setOperationAction(ISD::SUB, VT, HasInt256 ? Legal : Custom);
1464 }
1465
1466 setOperationAction(ISD::MUL, MVT::v4i64, Custom);
1467 setOperationAction(ISD::MUL, MVT::v8i32, HasInt256 ? Legal : Custom);
1468 setOperationAction(ISD::MUL, MVT::v16i16, HasInt256 ? Legal : Custom);
1469 setOperationAction(ISD::MUL, MVT::v32i8, Custom);
1470
1471 setOperationAction(ISD::MULHU, MVT::v8i32, Custom);
1472 setOperationAction(ISD::MULHS, MVT::v8i32, Custom);
1473 setOperationAction(ISD::MULHU, MVT::v16i16, HasInt256 ? Legal : Custom);
1474 setOperationAction(ISD::MULHS, MVT::v16i16, HasInt256 ? Legal : Custom);
1475 setOperationAction(ISD::MULHU, MVT::v32i8, Custom);
1476 setOperationAction(ISD::MULHS, MVT::v32i8, Custom);
1477 setOperationAction(ISD::AVGCEILU, MVT::v16i16, HasInt256 ? Legal : Custom);
1478 setOperationAction(ISD::AVGCEILU, MVT::v32i8, HasInt256 ? Legal : Custom);
1479
1480 setOperationAction(ISD::SMULO, MVT::v32i8, Custom);
1481 setOperationAction(ISD::UMULO, MVT::v32i8, Custom);
1482
1483 setOperationAction(ISD::ABS, MVT::v4i64, Custom);
1484 setOperationAction(ISD::SMAX, MVT::v4i64, Custom);
1485 setOperationAction(ISD::UMAX, MVT::v4i64, Custom);
1486 setOperationAction(ISD::SMIN, MVT::v4i64, Custom);
1487 setOperationAction(ISD::UMIN, MVT::v4i64, Custom);
1488
1489 setOperationAction(ISD::UADDSAT, MVT::v32i8, HasInt256 ? Legal : Custom);
1490 setOperationAction(ISD::SADDSAT, MVT::v32i8, HasInt256 ? Legal : Custom);
1491 setOperationAction(ISD::USUBSAT, MVT::v32i8, HasInt256 ? Legal : Custom);
1492 setOperationAction(ISD::SSUBSAT, MVT::v32i8, HasInt256 ? Legal : Custom);
1493 setOperationAction(ISD::UADDSAT, MVT::v16i16, HasInt256 ? Legal : Custom);
1494 setOperationAction(ISD::SADDSAT, MVT::v16i16, HasInt256 ? Legal : Custom);
1495 setOperationAction(ISD::USUBSAT, MVT::v16i16, HasInt256 ? Legal : Custom);
1496 setOperationAction(ISD::SSUBSAT, MVT::v16i16, HasInt256 ? Legal : Custom);
1497 setOperationAction(ISD::UADDSAT, MVT::v8i32, Custom);
1498 setOperationAction(ISD::USUBSAT, MVT::v8i32, Custom);
1499 setOperationAction(ISD::UADDSAT, MVT::v4i64, Custom);
1500 setOperationAction(ISD::USUBSAT, MVT::v4i64, Custom);
1501
1502 for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32 }) {
1503 setOperationAction(ISD::ABS, VT, HasInt256 ? Legal : Custom);
1504 setOperationAction(ISD::SMAX, VT, HasInt256 ? Legal : Custom);
1505 setOperationAction(ISD::UMAX, VT, HasInt256 ? Legal : Custom);
1506 setOperationAction(ISD::SMIN, VT, HasInt256 ? Legal : Custom);
1507 setOperationAction(ISD::UMIN, VT, HasInt256 ? Legal : Custom);
1508 }
1509
1510 for (auto VT : {MVT::v16i16, MVT::v8i32, MVT::v4i64}) {
1511 setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, VT, Custom);
1512 setOperationAction(ISD::ZERO_EXTEND_VECTOR_INREG, VT, Custom);
1513 }
1514
1515 if (HasInt256) {
1516 // The custom lowering for UINT_TO_FP for v8i32 becomes interesting
1517      // when we have a 256-bit wide blend with immediate.
1518 setOperationAction(ISD::UINT_TO_FP, MVT::v8i32, Custom);
1519 setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v8i32, Custom);
1520
1521 // AVX2 also has wider vector sign/zero extending loads, VPMOV[SZ]X
1522 for (auto LoadExtOp : { ISD::SEXTLOAD, ISD::ZEXTLOAD }) {
1523 setLoadExtAction(LoadExtOp, MVT::v16i16, MVT::v16i8, Legal);
1524 setLoadExtAction(LoadExtOp, MVT::v8i32, MVT::v8i8, Legal);
1525 setLoadExtAction(LoadExtOp, MVT::v4i64, MVT::v4i8, Legal);
1526 setLoadExtAction(LoadExtOp, MVT::v8i32, MVT::v8i16, Legal);
1527 setLoadExtAction(LoadExtOp, MVT::v4i64, MVT::v4i16, Legal);
1528 setLoadExtAction(LoadExtOp, MVT::v4i64, MVT::v4i32, Legal);
1529 }
1530 }
1531
1532 for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
1533 MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64 }) {
1534 setOperationAction(ISD::MLOAD, VT, Subtarget.hasVLX() ? Legal : Custom);
1535 setOperationAction(ISD::MSTORE, VT, Legal);
1536 }
1537
1538 // Extract subvector is special because the value type
1539 // (result) is 128-bit but the source is 256-bit wide.
1540 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64,
1541 MVT::v8f16, MVT::v4f32, MVT::v2f64 }) {
1542 setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Legal);
1543 }
1544
1545 // Custom lower several nodes for 256-bit types.
1546 for (MVT VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64,
1547 MVT::v16f16, MVT::v8f32, MVT::v4f64 }) {
1548 setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
1549 setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
1550 setOperationAction(ISD::VSELECT, VT, Custom);
1551 setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
1552 setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
1553 setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Custom);
1554 setOperationAction(ISD::INSERT_SUBVECTOR, VT, Legal);
1555 setOperationAction(ISD::CONCAT_VECTORS, VT, Custom);
1556 setOperationAction(ISD::STORE, VT, Custom);
1557 }
1558 setF16Action(MVT::v16f16, Expand);
1559 setOperationAction(ISD::FADD, MVT::v16f16, Expand);
1560 setOperationAction(ISD::FSUB, MVT::v16f16, Expand);
1561 setOperationAction(ISD::FMUL, MVT::v16f16, Expand);
1562 setOperationAction(ISD::FDIV, MVT::v16f16, Expand);
1563
1564 if (HasInt256) {
1565 setOperationAction(ISD::VSELECT, MVT::v32i8, Legal);
1566
1567 // Custom legalize 2x32 to get a little better code.
1568 setOperationAction(ISD::MGATHER, MVT::v2f32, Custom);
1569 setOperationAction(ISD::MGATHER, MVT::v2i32, Custom);
1570
1571 for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
1572 MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64 })
1573 setOperationAction(ISD::MGATHER, VT, Custom);
1574 }
1575 }
1576
1577 if (!Subtarget.useSoftFloat() && !Subtarget.hasFP16() &&
1578 Subtarget.hasF16C()) {
1579 for (MVT VT : { MVT::f16, MVT::v2f16, MVT::v4f16, MVT::v8f16 }) {
1580 setOperationAction(ISD::FP_ROUND, VT, Custom);
1581 setOperationAction(ISD::STRICT_FP_ROUND, VT, Custom);
1582 }
1583 for (MVT VT : { MVT::f32, MVT::v2f32, MVT::v4f32 }) {
1584 setOperationAction(ISD::FP_EXTEND, VT, Custom);
1585 setOperationAction(ISD::STRICT_FP_EXTEND, VT, Custom);
1586 }
1587 for (unsigned Opc : {ISD::FADD, ISD::FSUB, ISD::FMUL, ISD::FDIV}) {
1588 setOperationPromotedToType(Opc, MVT::v8f16, MVT::v8f32);
1589 setOperationPromotedToType(Opc, MVT::v16f16, MVT::v16f32);
1590 }
1591
1592 setOperationAction(ISD::FP_EXTEND, MVT::v8f32, Legal);
1593 setOperationAction(ISD::STRICT_FP_EXTEND, MVT::v8f32, Legal);
1594 }
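  // F16C only provides half<->single conversion instructions
  // (VCVTPH2PS/VCVTPS2PH) and no f16 arithmetic, which is why the conversions
  // above are custom-lowered while FADD/FSUB/FMUL/FDIV are promoted to the
  // corresponding f32 vector types.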
1595
1596 // This block controls legalization of the mask vector sizes that are
1597 // available with AVX512. 512-bit vectors are in a separate block controlled
1598 // by useAVX512Regs.
1599 if (!Subtarget.useSoftFloat() && Subtarget.hasAVX512()) {
1600 addRegisterClass(MVT::v1i1, &X86::VK1RegClass);
1601 addRegisterClass(MVT::v2i1, &X86::VK2RegClass);
1602 addRegisterClass(MVT::v4i1, &X86::VK4RegClass);
1603 addRegisterClass(MVT::v8i1, &X86::VK8RegClass);
1604 addRegisterClass(MVT::v16i1, &X86::VK16RegClass);
1605
1606 setOperationAction(ISD::SELECT, MVT::v1i1, Custom);
1607 setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v1i1, Custom);
1608 setOperationAction(ISD::BUILD_VECTOR, MVT::v1i1, Custom);
1609
1610 setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v8i1, MVT::v8i32);
1611 setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v8i1, MVT::v8i32);
1612 setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v4i1, MVT::v4i32);
1613 setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v4i1, MVT::v4i32);
1614 setOperationPromotedToType(ISD::STRICT_FP_TO_SINT, MVT::v8i1, MVT::v8i32);
1615 setOperationPromotedToType(ISD::STRICT_FP_TO_UINT, MVT::v8i1, MVT::v8i32);
1616 setOperationPromotedToType(ISD::STRICT_FP_TO_SINT, MVT::v4i1, MVT::v4i32);
1617 setOperationPromotedToType(ISD::STRICT_FP_TO_UINT, MVT::v4i1, MVT::v4i32);
1618 setOperationAction(ISD::FP_TO_SINT, MVT::v2i1, Custom);
1619 setOperationAction(ISD::FP_TO_UINT, MVT::v2i1, Custom);
1620 setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v2i1, Custom);
1621 setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v2i1, Custom);
1622
1623 // There is no byte sized k-register load or store without AVX512DQ.
1624 if (!Subtarget.hasDQI()) {
1625 setOperationAction(ISD::LOAD, MVT::v1i1, Custom);
1626 setOperationAction(ISD::LOAD, MVT::v2i1, Custom);
1627 setOperationAction(ISD::LOAD, MVT::v4i1, Custom);
1628 setOperationAction(ISD::LOAD, MVT::v8i1, Custom);
1629
1630 setOperationAction(ISD::STORE, MVT::v1i1, Custom);
1631 setOperationAction(ISD::STORE, MVT::v2i1, Custom);
1632 setOperationAction(ISD::STORE, MVT::v4i1, Custom);
1633 setOperationAction(ISD::STORE, MVT::v8i1, Custom);
1634 }
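    // Background for the block above: AVX512F only has KMOVW (16-bit mask
    // moves); the byte-sized KMOVB needs AVX512DQ, so without DQI the mask
    // vectors of 8 or fewer elements are custom-lowered (presumably via a
    // wider mask or a GPR) instead of using a 1-byte k-register memory access.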
1635
1636 // Extends of v16i1/v8i1/v4i1/v2i1 to 128-bit vectors.
1637 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
1638 setOperationAction(ISD::SIGN_EXTEND, VT, Custom);
1639 setOperationAction(ISD::ZERO_EXTEND, VT, Custom);
1640 setOperationAction(ISD::ANY_EXTEND, VT, Custom);
1641 }
1642
1643 for (auto VT : { MVT::v1i1, MVT::v2i1, MVT::v4i1, MVT::v8i1, MVT::v16i1 })
1644 setOperationAction(ISD::VSELECT, VT, Expand);
1645
1646 for (auto VT : { MVT::v2i1, MVT::v4i1, MVT::v8i1, MVT::v16i1 }) {
1647 setOperationAction(ISD::SETCC, VT, Custom);
1648 setOperationAction(ISD::STRICT_FSETCC, VT, Custom);
1649 setOperationAction(ISD::STRICT_FSETCCS, VT, Custom);
1650 setOperationAction(ISD::SELECT, VT, Custom);
1651 setOperationAction(ISD::TRUNCATE, VT, Custom);
1652
1653 setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
1654 setOperationAction(ISD::CONCAT_VECTORS, VT, Custom);
1655 setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
1656 setOperationAction(ISD::INSERT_SUBVECTOR, VT, Custom);
1657 setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
1658 setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
1659 }
1660
1661 for (auto VT : { MVT::v1i1, MVT::v2i1, MVT::v4i1, MVT::v8i1 })
1662 setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);
1663 }
1664
1665  // This block controls legalization for 512-bit operations with 32/64-bit
1666  // elements. 512-bit use can be disabled based on the prefer-vector-width
1667  // and required-vector-width function attributes.
1668 if (!Subtarget.useSoftFloat() && Subtarget.useAVX512Regs()) {
1669 bool HasBWI = Subtarget.hasBWI();
1670
1671 addRegisterClass(MVT::v16i32, &X86::VR512RegClass);
1672 addRegisterClass(MVT::v16f32, &X86::VR512RegClass);
1673 addRegisterClass(MVT::v8i64, &X86::VR512RegClass);
1674 addRegisterClass(MVT::v8f64, &X86::VR512RegClass);
1675 addRegisterClass(MVT::v32i16, &X86::VR512RegClass);
1676 addRegisterClass(MVT::v32f16, &X86::VR512RegClass);
1677 addRegisterClass(MVT::v64i8, &X86::VR512RegClass);
1678
1679 for (auto ExtType : {ISD::ZEXTLOAD, ISD::SEXTLOAD}) {
1680 setLoadExtAction(ExtType, MVT::v16i32, MVT::v16i8, Legal);
1681 setLoadExtAction(ExtType, MVT::v16i32, MVT::v16i16, Legal);
1682 setLoadExtAction(ExtType, MVT::v8i64, MVT::v8i8, Legal);
1683 setLoadExtAction(ExtType, MVT::v8i64, MVT::v8i16, Legal);
1684 setLoadExtAction(ExtType, MVT::v8i64, MVT::v8i32, Legal);
1685 if (HasBWI)
1686 setLoadExtAction(ExtType, MVT::v32i16, MVT::v32i8, Legal);
1687 }
1688
1689 for (MVT VT : { MVT::v16f32, MVT::v8f64 }) {
1690 setOperationAction(ISD::FNEG, VT, Custom);
1691 setOperationAction(ISD::FABS, VT, Custom);
1692 setOperationAction(ISD::FMA, VT, Legal);
1693 setOperationAction(ISD::STRICT_FMA, VT, Legal);
1694 setOperationAction(ISD::FCOPYSIGN, VT, Custom);
1695 }
1696
1697 for (MVT VT : { MVT::v16i1, MVT::v16i8, MVT::v16i16 }) {
1698 setOperationPromotedToType(ISD::FP_TO_SINT , VT, MVT::v16i32);
1699 setOperationPromotedToType(ISD::FP_TO_UINT , VT, MVT::v16i32);
1700 setOperationPromotedToType(ISD::STRICT_FP_TO_SINT, VT, MVT::v16i32);
1701 setOperationPromotedToType(ISD::STRICT_FP_TO_UINT, VT, MVT::v16i32);
1702 }
1703 setOperationAction(ISD::FP_TO_SINT, MVT::v16i32, Custom);
1704 setOperationAction(ISD::FP_TO_UINT, MVT::v16i32, Custom);
1705 setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v16i32, Custom);
1706 setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v16i32, Custom);
1707 setOperationAction(ISD::SINT_TO_FP, MVT::v16i32, Custom);
1708 setOperationAction(ISD::UINT_TO_FP, MVT::v16i32, Custom);
1709 setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v16i32, Custom);
1710 setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v16i32, Custom);
1711 setOperationAction(ISD::FP_EXTEND, MVT::v8f64, Custom);
1712 setOperationAction(ISD::STRICT_FP_EXTEND, MVT::v8f64, Custom);
1713
1714 setOperationAction(ISD::STRICT_FADD, MVT::v16f32, Legal);
1715 setOperationAction(ISD::STRICT_FADD, MVT::v8f64, Legal);
1716 setOperationAction(ISD::STRICT_FSUB, MVT::v16f32, Legal);
1717 setOperationAction(ISD::STRICT_FSUB, MVT::v8f64, Legal);
1718 setOperationAction(ISD::STRICT_FMUL, MVT::v16f32, Legal);
1719 setOperationAction(ISD::STRICT_FMUL, MVT::v8f64, Legal);
1720 setOperationAction(ISD::STRICT_FDIV, MVT::v16f32, Legal);
1721 setOperationAction(ISD::STRICT_FDIV, MVT::v8f64, Legal);
1722 setOperationAction(ISD::STRICT_FSQRT, MVT::v16f32, Legal);
1723 setOperationAction(ISD::STRICT_FSQRT, MVT::v8f64, Legal);
1724 setOperationAction(ISD::STRICT_FP_ROUND, MVT::v8f32, Legal);
1725
1726 setTruncStoreAction(MVT::v8i64, MVT::v8i8, Legal);
1727 setTruncStoreAction(MVT::v8i64, MVT::v8i16, Legal);
1728 setTruncStoreAction(MVT::v8i64, MVT::v8i32, Legal);
1729 setTruncStoreAction(MVT::v16i32, MVT::v16i8, Legal);
1730 setTruncStoreAction(MVT::v16i32, MVT::v16i16, Legal);
1731 if (HasBWI)
1732 setTruncStoreAction(MVT::v32i16, MVT::v32i8, Legal);
1733
1734 // With 512-bit vectors and no VLX, we prefer to widen MLOAD/MSTORE
1735 // to 512-bit rather than use the AVX2 instructions so that we can use
1736 // k-masks.
1737 if (!Subtarget.hasVLX()) {
1738 for (auto VT : {MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
1739 MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64}) {
1740 setOperationAction(ISD::MLOAD, VT, Custom);
1741 setOperationAction(ISD::MSTORE, VT, Custom);
1742 }
1743 }
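    // Without VLX there is no EVEX k-register masking for 128/256-bit memory
    // ops, so the alternatives are widening to a 512-bit masked load/store
    // (keeping the mask in a k-register) or falling back to the AVX/AVX2
    // VMASKMOV-style instructions, which take the mask in a vector register;
    // the comment above explains why widening is preferred here.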
1744
1745 setOperationAction(ISD::TRUNCATE, MVT::v8i32, Legal);
1746 setOperationAction(ISD::TRUNCATE, MVT::v16i16, Legal);
1747 setOperationAction(ISD::TRUNCATE, MVT::v32i8, HasBWI ? Legal : Custom);
1748 setOperationAction(ISD::TRUNCATE, MVT::v16i64, Custom);
1749 setOperationAction(ISD::ZERO_EXTEND, MVT::v32i16, Custom);
1750 setOperationAction(ISD::ZERO_EXTEND, MVT::v16i32, Custom);
1751 setOperationAction(ISD::ZERO_EXTEND, MVT::v8i64, Custom);
1752 setOperationAction(ISD::ANY_EXTEND, MVT::v32i16, Custom);
1753 setOperationAction(ISD::ANY_EXTEND, MVT::v16i32, Custom);
1754 setOperationAction(ISD::ANY_EXTEND, MVT::v8i64, Custom);
1755 setOperationAction(ISD::SIGN_EXTEND, MVT::v32i16, Custom);
1756 setOperationAction(ISD::SIGN_EXTEND, MVT::v16i32, Custom);
1757 setOperationAction(ISD::SIGN_EXTEND, MVT::v8i64, Custom);
1758
1759 if (HasBWI) {
1760 // Extends from v64i1 masks to 512-bit vectors.
1761 setOperationAction(ISD::SIGN_EXTEND, MVT::v64i8, Custom);
1762 setOperationAction(ISD::ZERO_EXTEND, MVT::v64i8, Custom);
1763 setOperationAction(ISD::ANY_EXTEND, MVT::v64i8, Custom);
1764 }
1765
1766 for (auto VT : { MVT::v16f32, MVT::v8f64 }) {
1767 setOperationAction(ISD::FFLOOR, VT, Legal);
1768 setOperationAction(ISD::STRICT_FFLOOR, VT, Legal);
1769 setOperationAction(ISD::FCEIL, VT, Legal);
1770 setOperationAction(ISD::STRICT_FCEIL, VT, Legal);
1771 setOperationAction(ISD::FTRUNC, VT, Legal);
1772 setOperationAction(ISD::STRICT_FTRUNC, VT, Legal);
1773 setOperationAction(ISD::FRINT, VT, Legal);
1774 setOperationAction(ISD::STRICT_FRINT, VT, Legal);
1775 setOperationAction(ISD::FNEARBYINT, VT, Legal);
1776 setOperationAction(ISD::STRICT_FNEARBYINT, VT, Legal);
1777 setOperationAction(ISD::FROUNDEVEN, VT, Legal);
1778 setOperationAction(ISD::STRICT_FROUNDEVEN, VT, Legal);
1779
1780 setOperationAction(ISD::FROUND, VT, Custom);
1781 }
1782
1783 for (auto VT : {MVT::v32i16, MVT::v16i32, MVT::v8i64}) {
1784 setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, VT, Custom);
1785 setOperationAction(ISD::ZERO_EXTEND_VECTOR_INREG, VT, Custom);
1786 }
1787
1788 setOperationAction(ISD::ADD, MVT::v32i16, HasBWI ? Legal : Custom);
1789 setOperationAction(ISD::SUB, MVT::v32i16, HasBWI ? Legal : Custom);
1790 setOperationAction(ISD::ADD, MVT::v64i8, HasBWI ? Legal : Custom);
1791 setOperationAction(ISD::SUB, MVT::v64i8, HasBWI ? Legal : Custom);
1792
1793 setOperationAction(ISD::MUL, MVT::v8i64, Custom);
1794 setOperationAction(ISD::MUL, MVT::v16i32, Legal);
1795 setOperationAction(ISD::MUL, MVT::v32i16, HasBWI ? Legal : Custom);
1796 setOperationAction(ISD::MUL, MVT::v64i8, Custom);
1797
1798 setOperationAction(ISD::MULHU, MVT::v16i32, Custom);
1799 setOperationAction(ISD::MULHS, MVT::v16i32, Custom);
1800 setOperationAction(ISD::MULHS, MVT::v32i16, HasBWI ? Legal : Custom);
1801 setOperationAction(ISD::MULHU, MVT::v32i16, HasBWI ? Legal : Custom);
1802 setOperationAction(ISD::MULHS, MVT::v64i8, Custom);
1803 setOperationAction(ISD::MULHU, MVT::v64i8, Custom);
1804 setOperationAction(ISD::AVGCEILU, MVT::v32i16, HasBWI ? Legal : Custom);
1805 setOperationAction(ISD::AVGCEILU, MVT::v64i8, HasBWI ? Legal : Custom);
1806
1807 setOperationAction(ISD::SMULO, MVT::v64i8, Custom);
1808 setOperationAction(ISD::UMULO, MVT::v64i8, Custom);
1809
1810 setOperationAction(ISD::BITREVERSE, MVT::v64i8, Custom);
1811
1812 for (auto VT : { MVT::v64i8, MVT::v32i16, MVT::v16i32, MVT::v8i64 }) {
1813 setOperationAction(ISD::SRL, VT, Custom);
1814 setOperationAction(ISD::SHL, VT, Custom);
1815 setOperationAction(ISD::SRA, VT, Custom);
1816 setOperationAction(ISD::ROTL, VT, Custom);
1817 setOperationAction(ISD::ROTR, VT, Custom);
1818 setOperationAction(ISD::SETCC, VT, Custom);
1819
1820 // The condition codes aren't legal in SSE/AVX and under AVX512 we use
1821 // setcc all the way to isel and prefer SETGT in some isel patterns.
1822 setCondCodeAction(ISD::SETLT, VT, Custom);
1823 setCondCodeAction(ISD::SETLE, VT, Custom);
1824 }
1825 for (auto VT : { MVT::v16i32, MVT::v8i64 }) {
1826 setOperationAction(ISD::SMAX, VT, Legal);
1827 setOperationAction(ISD::UMAX, VT, Legal);
1828 setOperationAction(ISD::SMIN, VT, Legal);
1829 setOperationAction(ISD::UMIN, VT, Legal);
1830 setOperationAction(ISD::ABS, VT, Legal);
1831 setOperationAction(ISD::CTPOP, VT, Custom);
1832 setOperationAction(ISD::STRICT_FSETCC, VT, Custom);
1833 setOperationAction(ISD::STRICT_FSETCCS, VT, Custom);
1834 }
1835
1836 for (auto VT : { MVT::v64i8, MVT::v32i16 }) {
1837 setOperationAction(ISD::ABS, VT, HasBWI ? Legal : Custom);
1838 setOperationAction(ISD::CTPOP, VT, Subtarget.hasBITALG() ? Legal : Custom);
1839 setOperationAction(ISD::CTLZ, VT, Custom);
1840 setOperationAction(ISD::SMAX, VT, HasBWI ? Legal : Custom);
1841 setOperationAction(ISD::UMAX, VT, HasBWI ? Legal : Custom);
1842 setOperationAction(ISD::SMIN, VT, HasBWI ? Legal : Custom);
1843 setOperationAction(ISD::UMIN, VT, HasBWI ? Legal : Custom);
1844 setOperationAction(ISD::UADDSAT, VT, HasBWI ? Legal : Custom);
1845 setOperationAction(ISD::SADDSAT, VT, HasBWI ? Legal : Custom);
1846 setOperationAction(ISD::USUBSAT, VT, HasBWI ? Legal : Custom);
1847 setOperationAction(ISD::SSUBSAT, VT, HasBWI ? Legal : Custom);
1848 }
1849
1850 setOperationAction(ISD::FSHL, MVT::v64i8, Custom);
1851 setOperationAction(ISD::FSHR, MVT::v64i8, Custom);
1852 setOperationAction(ISD::FSHL, MVT::v32i16, Custom);
1853 setOperationAction(ISD::FSHR, MVT::v32i16, Custom);
1854 setOperationAction(ISD::FSHL, MVT::v16i32, Custom);
1855 setOperationAction(ISD::FSHR, MVT::v16i32, Custom);
1856
1857 if (Subtarget.hasDQI()) {
1858 for (auto Opc : {ISD::SINT_TO_FP, ISD::UINT_TO_FP, ISD::STRICT_SINT_TO_FP,
1859 ISD::STRICT_UINT_TO_FP, ISD::FP_TO_SINT, ISD::FP_TO_UINT,
1860 ISD::STRICT_FP_TO_SINT, ISD::STRICT_FP_TO_UINT})
1861 setOperationAction(Opc, MVT::v8i64, Custom);
1862 setOperationAction(ISD::MUL, MVT::v8i64, Legal);
1863 }
1864
1865 if (Subtarget.hasCDI()) {
1866      // Non-VLX subtargets extend 128/256-bit vectors to use the 512-bit version.
1867 for (auto VT : { MVT::v16i32, MVT::v8i64} ) {
1868 setOperationAction(ISD::CTLZ, VT, Legal);
1869 }
1870 } // Subtarget.hasCDI()
1871
1872 if (Subtarget.hasVPOPCNTDQ()) {
1873 for (auto VT : { MVT::v16i32, MVT::v8i64 })
1874 setOperationAction(ISD::CTPOP, VT, Legal);
1875 }
1876
1877 // Extract subvector is special because the value type
1878 // (result) is 256-bit but the source is 512-bit wide.
1879 // 128-bit was made Legal under AVX1.
1880 for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64,
1881 MVT::v16f16, MVT::v8f32, MVT::v4f64 })
1882 setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Legal);
1883
1884 for (auto VT : { MVT::v64i8, MVT::v32i16, MVT::v16i32, MVT::v8i64,
1885 MVT::v32f16, MVT::v16f32, MVT::v8f64 }) {
1886 setOperationAction(ISD::CONCAT_VECTORS, VT, Custom);
1887 setOperationAction(ISD::INSERT_SUBVECTOR, VT, Legal);
1888 setOperationAction(ISD::SELECT, VT, Custom);
1889 setOperationAction(ISD::VSELECT, VT, Custom);
1890 setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
1891 setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
1892 setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
1893 setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Custom);
1894 setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
1895 }
1896 setF16Action(MVT::v32f16, Expand);
1897 setOperationAction(ISD::FP_ROUND, MVT::v16f16, Custom);
1898 setOperationAction(ISD::STRICT_FP_ROUND, MVT::v16f16, Custom);
1899 setOperationAction(ISD::FP_EXTEND, MVT::v16f32, Legal);
1900 setOperationAction(ISD::STRICT_FP_EXTEND, MVT::v16f32, Legal);
1901 for (unsigned Opc : {ISD::FADD, ISD::FSUB, ISD::FMUL, ISD::FDIV}) {
1902 setOperationPromotedToType(Opc, MVT::v16f16, MVT::v16f32);
1903 setOperationPromotedToType(Opc, MVT::v32f16, MVT::v32f32);
1904 }
1905
1906 for (auto VT : { MVT::v16i32, MVT::v8i64, MVT::v16f32, MVT::v8f64 }) {
1907 setOperationAction(ISD::MLOAD, VT, Legal);
1908 setOperationAction(ISD::MSTORE, VT, Legal);
1909 setOperationAction(ISD::MGATHER, VT, Custom);
1910 setOperationAction(ISD::MSCATTER, VT, Custom);
1911 }
1912 if (HasBWI) {
1913 for (auto VT : { MVT::v64i8, MVT::v32i16 }) {
1914 setOperationAction(ISD::MLOAD, VT, Legal);
1915 setOperationAction(ISD::MSTORE, VT, Legal);
1916 }
1917 } else {
1918 setOperationAction(ISD::STORE, MVT::v32i16, Custom);
1919 setOperationAction(ISD::STORE, MVT::v64i8, Custom);
1920 }
1921
1922 if (Subtarget.hasVBMI2()) {
1923 for (auto VT : { MVT::v8i16, MVT::v4i32, MVT::v2i64,
1924 MVT::v16i16, MVT::v8i32, MVT::v4i64,
1925 MVT::v32i16, MVT::v16i32, MVT::v8i64 }) {
1926 setOperationAction(ISD::FSHL, VT, Custom);
1927 setOperationAction(ISD::FSHR, VT, Custom);
1928 }
1929
1930 setOperationAction(ISD::ROTL, MVT::v32i16, Custom);
1931 setOperationAction(ISD::ROTR, MVT::v8i16, Custom);
1932 setOperationAction(ISD::ROTR, MVT::v16i16, Custom);
1933 setOperationAction(ISD::ROTR, MVT::v32i16, Custom);
1934 }
1935 }// useAVX512Regs
1936
1937 // This block controls legalization for operations that don't have
1938 // pre-AVX512 equivalents. Without VLX we use 512-bit operations for
1939 // narrower widths.
1940 if (!Subtarget.useSoftFloat() && Subtarget.hasAVX512()) {
1941 // These operations are handled on non-VLX by artificially widening in
1942 // isel patterns.
1943
1944 setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v8i32, Custom);
1945 setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v4i32, Custom);
1946 setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v2i32, Custom);
1947
1948 if (Subtarget.hasDQI()) {
1949 // Fast v2f32 SINT_TO_FP( v2i64 ) custom conversion.
1950 // v2f32 UINT_TO_FP is already custom under SSE2.
1951      assert(isOperationCustom(ISD::UINT_TO_FP, MVT::v2f32) &&
1952             isOperationCustom(ISD::STRICT_UINT_TO_FP, MVT::v2f32) &&
1953             "Unexpected operation action!");
1954 // v2i64 FP_TO_S/UINT(v2f32) custom conversion.
1955 setOperationAction(ISD::FP_TO_SINT, MVT::v2f32, Custom);
1956 setOperationAction(ISD::FP_TO_UINT, MVT::v2f32, Custom);
1957 setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v2f32, Custom);
1958 setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v2f32, Custom);
1959 }
1960
1961 for (auto VT : { MVT::v2i64, MVT::v4i64 }) {
1962 setOperationAction(ISD::SMAX, VT, Legal);
1963 setOperationAction(ISD::UMAX, VT, Legal);
1964 setOperationAction(ISD::SMIN, VT, Legal);
1965 setOperationAction(ISD::UMIN, VT, Legal);
1966 setOperationAction(ISD::ABS, VT, Legal);
1967 }
1968
1969 for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64 }) {
1970 setOperationAction(ISD::ROTL, VT, Custom);
1971 setOperationAction(ISD::ROTR, VT, Custom);
1972 }
1973
1974 // Custom legalize 2x32 to get a little better code.
1975 setOperationAction(ISD::MSCATTER, MVT::v2f32, Custom);
1976 setOperationAction(ISD::MSCATTER, MVT::v2i32, Custom);
1977
1978 for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
1979 MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64 })
1980 setOperationAction(ISD::MSCATTER, VT, Custom);
1981
1982 if (Subtarget.hasDQI()) {
1983 for (auto Opc : {ISD::SINT_TO_FP, ISD::UINT_TO_FP, ISD::STRICT_SINT_TO_FP,
1984 ISD::STRICT_UINT_TO_FP, ISD::FP_TO_SINT, ISD::FP_TO_UINT,
1985 ISD::STRICT_FP_TO_SINT, ISD::STRICT_FP_TO_UINT}) {
1986 setOperationAction(Opc, MVT::v2i64, Custom);
1987 setOperationAction(Opc, MVT::v4i64, Custom);
1988 }
1989 setOperationAction(ISD::MUL, MVT::v2i64, Legal);
1990 setOperationAction(ISD::MUL, MVT::v4i64, Legal);
1991 }
1992
1993 if (Subtarget.hasCDI()) {
1994 for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64 }) {
1995 setOperationAction(ISD::CTLZ, VT, Legal);
1996 }
1997 } // Subtarget.hasCDI()
1998
1999 if (Subtarget.hasVPOPCNTDQ()) {
2000 for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64 })
2001 setOperationAction(ISD::CTPOP, VT, Legal);
2002 }
2003 }
2004
2005  // This block controls legalization of v32i1/v64i1, which are available with
2006 // AVX512BW. 512-bit v32i16 and v64i8 vector legalization is controlled with
2007 // useBWIRegs.
2008 if (!Subtarget.useSoftFloat() && Subtarget.hasBWI()) {
2009 addRegisterClass(MVT::v32i1, &X86::VK32RegClass);
2010 addRegisterClass(MVT::v64i1, &X86::VK64RegClass);
2011
2012 for (auto VT : { MVT::v32i1, MVT::v64i1 }) {
2013 setOperationAction(ISD::VSELECT, VT, Expand);
2014 setOperationAction(ISD::TRUNCATE, VT, Custom);
2015 setOperationAction(ISD::SETCC, VT, Custom);
2016 setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
2017 setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
2018 setOperationAction(ISD::SELECT, VT, Custom);
2019 setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
2020 setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
2021 setOperationAction(ISD::CONCAT_VECTORS, VT, Custom);
2022 setOperationAction(ISD::INSERT_SUBVECTOR, VT, Custom);
2023 }
2024
2025 for (auto VT : { MVT::v16i1, MVT::v32i1 })
2026 setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);
2027
2028 // Extends from v32i1 masks to 256-bit vectors.
2029 setOperationAction(ISD::SIGN_EXTEND, MVT::v32i8, Custom);
2030 setOperationAction(ISD::ZERO_EXTEND, MVT::v32i8, Custom);
2031 setOperationAction(ISD::ANY_EXTEND, MVT::v32i8, Custom);
2032
2033 for (auto VT : { MVT::v32i8, MVT::v16i8, MVT::v16i16, MVT::v8i16 }) {
2034 setOperationAction(ISD::MLOAD, VT, Subtarget.hasVLX() ? Legal : Custom);
2035 setOperationAction(ISD::MSTORE, VT, Subtarget.hasVLX() ? Legal : Custom);
2036 }
2037
2038 // These operations are handled on non-VLX by artificially widening in
2039 // isel patterns.
2040 // TODO: Custom widen in lowering on non-VLX and drop the isel patterns?
2041
2042 if (Subtarget.hasBITALG()) {
2043 for (auto VT : { MVT::v16i8, MVT::v32i8, MVT::v8i16, MVT::v16i16 })
2044 setOperationAction(ISD::CTPOP, VT, Legal);
2045 }
2046 }
2047
2048 if (!Subtarget.useSoftFloat() && Subtarget.hasFP16()) {
2049 auto setGroup = [&] (MVT VT) {
2050 setOperationAction(ISD::FADD, VT, Legal);
2051 setOperationAction(ISD::STRICT_FADD, VT, Legal);
2052 setOperationAction(ISD::FSUB, VT, Legal);
2053 setOperationAction(ISD::STRICT_FSUB, VT, Legal);
2054 setOperationAction(ISD::FMUL, VT, Legal);
2055 setOperationAction(ISD::STRICT_FMUL, VT, Legal);
2056 setOperationAction(ISD::FDIV, VT, Legal);
2057 setOperationAction(ISD::STRICT_FDIV, VT, Legal);
2058 setOperationAction(ISD::FSQRT, VT, Legal);
2059 setOperationAction(ISD::STRICT_FSQRT, VT, Legal);
2060
2061 setOperationAction(ISD::FFLOOR, VT, Legal);
2062 setOperationAction(ISD::STRICT_FFLOOR, VT, Legal);
2063 setOperationAction(ISD::FCEIL, VT, Legal);
2064 setOperationAction(ISD::STRICT_FCEIL, VT, Legal);
2065 setOperationAction(ISD::FTRUNC, VT, Legal);
2066 setOperationAction(ISD::STRICT_FTRUNC, VT, Legal);
2067 setOperationAction(ISD::FRINT, VT, Legal);
2068 setOperationAction(ISD::STRICT_FRINT, VT, Legal);
2069 setOperationAction(ISD::FNEARBYINT, VT, Legal);
2070 setOperationAction(ISD::STRICT_FNEARBYINT, VT, Legal);
2071
2072 setOperationAction(ISD::LOAD, VT, Legal);
2073 setOperationAction(ISD::STORE, VT, Legal);
2074
2075 setOperationAction(ISD::FMA, VT, Legal);
2076 setOperationAction(ISD::STRICT_FMA, VT, Legal);
2077 setOperationAction(ISD::VSELECT, VT, Legal);
2078 setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
2079 setOperationAction(ISD::SELECT, VT, Custom);
2080
2081 setOperationAction(ISD::FNEG, VT, Custom);
2082 setOperationAction(ISD::FABS, VT, Custom);
2083 setOperationAction(ISD::FCOPYSIGN, VT, Custom);
2084 setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
2085 setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
2086 };
2087
2088 // AVX512_FP16 scalar operations
2089 setGroup(MVT::f16);
2090 setOperationAction(ISD::FREM, MVT::f16, Promote);
2091 setOperationAction(ISD::STRICT_FREM, MVT::f16, Promote);
2092 setOperationAction(ISD::SELECT_CC, MVT::f16, Expand);
2093 setOperationAction(ISD::BR_CC, MVT::f16, Expand);
2094 setOperationAction(ISD::SETCC, MVT::f16, Custom);
2095 setOperationAction(ISD::STRICT_FSETCC, MVT::f16, Custom);
2096 setOperationAction(ISD::STRICT_FSETCCS, MVT::f16, Custom);
2097 setOperationAction(ISD::FROUND, MVT::f16, Custom);
2098 setOperationAction(ISD::STRICT_FROUND, MVT::f16, Promote);
2099 setOperationAction(ISD::FROUNDEVEN, MVT::f16, Legal);
2100 setOperationAction(ISD::STRICT_FROUNDEVEN, MVT::f16, Legal);
2101 setOperationAction(ISD::FP_ROUND, MVT::f16, Custom);
2102 setOperationAction(ISD::STRICT_FP_ROUND, MVT::f16, Custom);
2103 setOperationAction(ISD::FP_EXTEND, MVT::f32, Legal);
2104 setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f32, Legal);
2105
2106 setCondCodeAction(ISD::SETOEQ, MVT::f16, Expand);
2107 setCondCodeAction(ISD::SETUNE, MVT::f16, Expand);
2108
2109 if (Subtarget.useAVX512Regs()) {
2110 setGroup(MVT::v32f16);
2111 setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v32f16, Custom);
2112 setOperationAction(ISD::SINT_TO_FP, MVT::v32i16, Legal);
2113 setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v32i16, Legal);
2114 setOperationAction(ISD::UINT_TO_FP, MVT::v32i16, Legal);
2115 setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v32i16, Legal);
2116 setOperationAction(ISD::FP_ROUND, MVT::v16f16, Legal);
2117 setOperationAction(ISD::STRICT_FP_ROUND, MVT::v16f16, Legal);
2118 setOperationAction(ISD::FP_EXTEND, MVT::v16f32, Legal);
2119 setOperationAction(ISD::STRICT_FP_EXTEND, MVT::v16f32, Legal);
2120 setOperationAction(ISD::FP_EXTEND, MVT::v8f64, Legal);
2121 setOperationAction(ISD::STRICT_FP_EXTEND, MVT::v8f64, Legal);
2122 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v32f16, Custom);
2123
2124 setOperationAction(ISD::FP_TO_SINT, MVT::v32i16, Custom);
2125 setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v32i16, Custom);
2126 setOperationAction(ISD::FP_TO_UINT, MVT::v32i16, Custom);
2127 setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v32i16, Custom);
2128 setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v32i8, MVT::v32i16);
2129 setOperationPromotedToType(ISD::STRICT_FP_TO_SINT, MVT::v32i8,
2130 MVT::v32i16);
2131 setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v32i8, MVT::v32i16);
2132 setOperationPromotedToType(ISD::STRICT_FP_TO_UINT, MVT::v32i8,
2133 MVT::v32i16);
2134 setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v32i1, MVT::v32i16);
2135 setOperationPromotedToType(ISD::STRICT_FP_TO_SINT, MVT::v32i1,
2136 MVT::v32i16);
2137 setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v32i1, MVT::v32i16);
2138 setOperationPromotedToType(ISD::STRICT_FP_TO_UINT, MVT::v32i1,
2139 MVT::v32i16);
2140
2141 setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v16f16, Legal);
2142 setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v32f16, Legal);
2143 setOperationAction(ISD::CONCAT_VECTORS, MVT::v32f16, Custom);
2144
2145 setLoadExtAction(ISD::EXTLOAD, MVT::v8f64, MVT::v8f16, Legal);
2146 setLoadExtAction(ISD::EXTLOAD, MVT::v16f32, MVT::v16f16, Legal);
2147
2148 setOperationAction(ISD::STRICT_FSETCC, MVT::v32i1, Custom);
2149 setOperationAction(ISD::STRICT_FSETCCS, MVT::v32i1, Custom);
2150 }
2151
2152 if (Subtarget.hasVLX()) {
2153 setGroup(MVT::v8f16);
2154 setGroup(MVT::v16f16);
2155
2156 setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v8f16, Legal);
2157 setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v16f16, Custom);
2158 setOperationAction(ISD::SINT_TO_FP, MVT::v16i16, Legal);
2159 setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v16i16, Legal);
2160 setOperationAction(ISD::SINT_TO_FP, MVT::v8i16, Legal);
2161 setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v8i16, Legal);
2162 setOperationAction(ISD::UINT_TO_FP, MVT::v16i16, Legal);
2163 setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v16i16, Legal);
2164 setOperationAction(ISD::UINT_TO_FP, MVT::v8i16, Legal);
2165 setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v8i16, Legal);
2166
2167 setOperationAction(ISD::FP_TO_SINT, MVT::v8i16, Custom);
2168 setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v8i16, Custom);
2169 setOperationAction(ISD::FP_TO_UINT, MVT::v8i16, Custom);
2170 setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v8i16, Custom);
2171 setOperationAction(ISD::FP_ROUND, MVT::v8f16, Legal);
2172 setOperationAction(ISD::STRICT_FP_ROUND, MVT::v8f16, Legal);
2173 setOperationAction(ISD::FP_EXTEND, MVT::v8f32, Legal);
2174 setOperationAction(ISD::STRICT_FP_EXTEND, MVT::v8f32, Legal);
2175 setOperationAction(ISD::FP_EXTEND, MVT::v4f64, Legal);
2176 setOperationAction(ISD::STRICT_FP_EXTEND, MVT::v4f64, Legal);
2177
2178 // INSERT_VECTOR_ELT v8f16 extended to VECTOR_SHUFFLE
2179 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v8f16, Custom);
2180 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v16f16, Custom);
2181
2182 setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v8f16, Legal);
2183 setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v16f16, Legal);
2184 setOperationAction(ISD::CONCAT_VECTORS, MVT::v16f16, Custom);
2185
2186 setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4f16, Legal);
2187 setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2f16, Legal);
2188 setLoadExtAction(ISD::EXTLOAD, MVT::v8f32, MVT::v8f16, Legal);
2189 setLoadExtAction(ISD::EXTLOAD, MVT::v4f32, MVT::v4f16, Legal);
2190
2191 // Need to custom widen these to prevent scalarization.
2192 setOperationAction(ISD::LOAD, MVT::v4f16, Custom);
2193 setOperationAction(ISD::STORE, MVT::v4f16, Custom);
2194 }
2195 }
2196
2197 if (!Subtarget.useSoftFloat() &&
2198 (Subtarget.hasAVXNECONVERT() || Subtarget.hasBF16())) {
2199 addRegisterClass(MVT::v8bf16, &X86::VR128XRegClass);
2200 addRegisterClass(MVT::v16bf16, &X86::VR256XRegClass);
2201 // We set the type action of bf16 to TypeSoftPromoteHalf, but we don't
2202    // provide a method to promote BUILD_VECTOR, so set its operation action
2203    // to Custom and do the customization later.
2204 setOperationAction(ISD::BUILD_VECTOR, MVT::bf16, Custom);
2205 for (auto VT : {MVT::v8bf16, MVT::v16bf16}) {
2206 setF16Action(VT, Expand);
2207 setOperationAction(ISD::FADD, VT, Expand);
2208 setOperationAction(ISD::FSUB, VT, Expand);
2209 setOperationAction(ISD::FMUL, VT, Expand);
2210 setOperationAction(ISD::FDIV, VT, Expand);
2211 setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
2212 }
2213 addLegalFPImmediate(APFloat::getZero(APFloat::BFloat()));
2214 }
2215
2216 if (!Subtarget.useSoftFloat() && Subtarget.hasBF16()) {
2217 addRegisterClass(MVT::v32bf16, &X86::VR512RegClass);
2218 setF16Action(MVT::v32bf16, Expand);
2219 setOperationAction(ISD::FADD, MVT::v32bf16, Expand);
2220 setOperationAction(ISD::FSUB, MVT::v32bf16, Expand);
2221 setOperationAction(ISD::FMUL, MVT::v32bf16, Expand);
2222 setOperationAction(ISD::FDIV, MVT::v32bf16, Expand);
2223 setOperationAction(ISD::BUILD_VECTOR, MVT::v32bf16, Custom);
2224 }
2225
2226 if (!Subtarget.useSoftFloat() && Subtarget.hasVLX()) {
2227 setTruncStoreAction(MVT::v4i64, MVT::v4i8, Legal);
2228 setTruncStoreAction(MVT::v4i64, MVT::v4i16, Legal);
2229 setTruncStoreAction(MVT::v4i64, MVT::v4i32, Legal);
2230 setTruncStoreAction(MVT::v8i32, MVT::v8i8, Legal);
2231 setTruncStoreAction(MVT::v8i32, MVT::v8i16, Legal);
2232
2233 setTruncStoreAction(MVT::v2i64, MVT::v2i8, Legal);
2234 setTruncStoreAction(MVT::v2i64, MVT::v2i16, Legal);
2235 setTruncStoreAction(MVT::v2i64, MVT::v2i32, Legal);
2236 setTruncStoreAction(MVT::v4i32, MVT::v4i8, Legal);
2237 setTruncStoreAction(MVT::v4i32, MVT::v4i16, Legal);
2238
2239 if (Subtarget.hasBWI()) {
2240 setTruncStoreAction(MVT::v16i16, MVT::v16i8, Legal);
2241 setTruncStoreAction(MVT::v8i16, MVT::v8i8, Legal);
2242 }
2243
2244 if (Subtarget.hasFP16()) {
2245 // vcvttph2[u]dq v4f16 -> v4i32/64, v2f16 -> v2i32/64
2246 setOperationAction(ISD::FP_TO_SINT, MVT::v2f16, Custom);
2247 setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v2f16, Custom);
2248 setOperationAction(ISD::FP_TO_UINT, MVT::v2f16, Custom);
2249 setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v2f16, Custom);
2250 setOperationAction(ISD::FP_TO_SINT, MVT::v4f16, Custom);
2251 setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v4f16, Custom);
2252 setOperationAction(ISD::FP_TO_UINT, MVT::v4f16, Custom);
2253 setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v4f16, Custom);
2254 // vcvt[u]dq2ph v4i32/64 -> v4f16, v2i32/64 -> v2f16
2255 setOperationAction(ISD::SINT_TO_FP, MVT::v2f16, Custom);
2256 setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v2f16, Custom);
2257 setOperationAction(ISD::UINT_TO_FP, MVT::v2f16, Custom);
2258 setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v2f16, Custom);
2259 setOperationAction(ISD::SINT_TO_FP, MVT::v4f16, Custom);
2260 setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v4f16, Custom);
2261 setOperationAction(ISD::UINT_TO_FP, MVT::v4f16, Custom);
2262 setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v4f16, Custom);
2263 // vcvtps2phx v4f32 -> v4f16, v2f32 -> v2f16
2264 setOperationAction(ISD::FP_ROUND, MVT::v2f16, Custom);
2265 setOperationAction(ISD::STRICT_FP_ROUND, MVT::v2f16, Custom);
2266 setOperationAction(ISD::FP_ROUND, MVT::v4f16, Custom);
2267 setOperationAction(ISD::STRICT_FP_ROUND, MVT::v4f16, Custom);
2268 // vcvtph2psx v4f16 -> v4f32, v2f16 -> v2f32
2269 setOperationAction(ISD::FP_EXTEND, MVT::v2f16, Custom);
2270 setOperationAction(ISD::STRICT_FP_EXTEND, MVT::v2f16, Custom);
2271 setOperationAction(ISD::FP_EXTEND, MVT::v4f16, Custom);
2272 setOperationAction(ISD::STRICT_FP_EXTEND, MVT::v4f16, Custom);
2273 }
2274
2275 setOperationAction(ISD::TRUNCATE, MVT::v16i32, Custom);
2276 setOperationAction(ISD::TRUNCATE, MVT::v8i64, Custom);
2277 setOperationAction(ISD::TRUNCATE, MVT::v16i64, Custom);
2278 }
2279
2280 if (Subtarget.hasAMXTILE()) {
2281 addRegisterClass(MVT::x86amx, &X86::TILERegClass);
2282 }
2283
2284 // We want to custom lower some of our intrinsics.
2285 setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
2286 setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom);
2287 setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom);
2288 if (!Subtarget.is64Bit()) {
2289 setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::i64, Custom);
2290 }
2291
2292  // Only custom-lower i64 SADDO and friends on 64-bit targets because we don't
2293 // handle type legalization for these operations here.
2294 //
2295 // FIXME: We really should do custom legalization for addition and
2296 // subtraction on x86-32 once PR3203 is fixed. We really can't do much better
2297 // than generic legalization for 64-bit multiplication-with-overflow, though.
2298 for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
2299 if (VT == MVT::i64 && !Subtarget.is64Bit())
2300 continue;
2301 // Add/Sub/Mul with overflow operations are custom lowered.
2302 setOperationAction(ISD::SADDO, VT, Custom);
2303 setOperationAction(ISD::UADDO, VT, Custom);
2304 setOperationAction(ISD::SSUBO, VT, Custom);
2305 setOperationAction(ISD::USUBO, VT, Custom);
2306 setOperationAction(ISD::SMULO, VT, Custom);
2307 setOperationAction(ISD::UMULO, VT, Custom);
2308
2309 // Support carry in as value rather than glue.
2310 setOperationAction(ISD::ADDCARRY, VT, Custom);
2311 setOperationAction(ISD::SUBCARRY, VT, Custom);
2312 setOperationAction(ISD::SETCCCARRY, VT, Custom);
2313 setOperationAction(ISD::SADDO_CARRY, VT, Custom);
2314 setOperationAction(ISD::SSUBO_CARRY, VT, Custom);
2315 }
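  // These Custom entries back the overflow intrinsics; a rough IR-level view
  // of what reaches this lowering:
  //   %pair = call { i32, i1 } @llvm.sadd.with.overflow.i32(i32 %a, i32 %b)
  // which X86 can turn into a flag-producing add plus a use of the overflow
  // flag (e.g. SETO) rather than a compare-based expansion.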
2316
2317 if (!Subtarget.is64Bit()) {
2318 // These libcalls are not available in 32-bit.
2319 setLibcallName(RTLIB::SHL_I128, nullptr);
2320 setLibcallName(RTLIB::SRL_I128, nullptr);
2321 setLibcallName(RTLIB::SRA_I128, nullptr);
2322 setLibcallName(RTLIB::MUL_I128, nullptr);
2323 // The MULO libcall is not part of libgcc, only compiler-rt.
2324 setLibcallName(RTLIB::MULO_I64, nullptr);
2325 }
2326 // The MULO libcall is not part of libgcc, only compiler-rt.
2327 setLibcallName(RTLIB::MULO_I128, nullptr);
2328
2329 // Combine sin / cos into _sincos_stret if it is available.
2330 if (getLibcallName(RTLIB::SINCOS_STRET_F32) != nullptr &&
2331 getLibcallName(RTLIB::SINCOS_STRET_F64) != nullptr) {
2332 setOperationAction(ISD::FSINCOS, MVT::f64, Custom);
2333 setOperationAction(ISD::FSINCOS, MVT::f32, Custom);
2334 }
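  // __sincos_stret is an Apple libm entry point that returns sin and cos
  // together; its libcall name is only set up for targets that actually
  // provide it, so the guard above effectively limits this to Darwin.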
2335
2336 if (Subtarget.isTargetWin64()) {
2337 setOperationAction(ISD::SDIV, MVT::i128, Custom);
2338 setOperationAction(ISD::UDIV, MVT::i128, Custom);
2339 setOperationAction(ISD::SREM, MVT::i128, Custom);
2340 setOperationAction(ISD::UREM, MVT::i128, Custom);
2341 setOperationAction(ISD::FP_TO_SINT, MVT::i128, Custom);
2342 setOperationAction(ISD::FP_TO_UINT, MVT::i128, Custom);
2343 setOperationAction(ISD::SINT_TO_FP, MVT::i128, Custom);
2344 setOperationAction(ISD::UINT_TO_FP, MVT::i128, Custom);
2345 setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::i128, Custom);
2346 setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::i128, Custom);
2347 setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::i128, Custom);
2348 setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::i128, Custom);
2349 }
2350
2351  // On 32-bit MSVC, `fmodf(f32)` is not defined - only `fmod(f64)`
2352 // is. We should promote the value to 64-bits to solve this.
2353 // This is what the CRT headers do - `fmodf` is an inline header
2354 // function casting to f64 and calling `fmod`.
2355 if (Subtarget.is32Bit() &&
2356 (Subtarget.isTargetWindowsMSVC() || Subtarget.isTargetWindowsItanium()))
2357 for (ISD::NodeType Op :
2358 {ISD::FCEIL, ISD::STRICT_FCEIL,
2359 ISD::FCOS, ISD::STRICT_FCOS,
2360 ISD::FEXP, ISD::STRICT_FEXP,
2361 ISD::FFLOOR, ISD::STRICT_FFLOOR,
2362 ISD::FREM, ISD::STRICT_FREM,
2363 ISD::FLOG, ISD::STRICT_FLOG,
2364 ISD::FLOG10, ISD::STRICT_FLOG10,
2365 ISD::FPOW, ISD::STRICT_FPOW,
2366 ISD::FSIN, ISD::STRICT_FSIN})
2367 if (isOperationExpand(Op, MVT::f32))
2368 setOperationAction(Op, MVT::f32, Promote);
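// Illustrative note (not part of the upstream source): with the Promote
// action set above, an f32 FREM on 32-bit MSVC is widened to f64, lowered to
// the fmod libcall, and the result is truncated back to f32, which matches
// what the CRT's inline fmodf wrapper does.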
2369
2370 // We have target-specific dag combine patterns for the following nodes:
2371 setTargetDAGCombine({ISD::VECTOR_SHUFFLE,
2372 ISD::SCALAR_TO_VECTOR,
2373 ISD::INSERT_VECTOR_ELT,
2374 ISD::EXTRACT_VECTOR_ELT,
2375 ISD::CONCAT_VECTORS,
2376 ISD::INSERT_SUBVECTOR,
2377 ISD::EXTRACT_SUBVECTOR,
2378 ISD::BITCAST,
2379 ISD::VSELECT,
2380 ISD::SELECT,
2381 ISD::SHL,
2382 ISD::SRA,
2383 ISD::SRL,
2384 ISD::OR,
2385 ISD::AND,
2386 ISD::ADD,
2387 ISD::FADD,
2388 ISD::FSUB,
2389 ISD::FNEG,
2390 ISD::FMA,
2391 ISD::STRICT_FMA,
2392 ISD::FMINNUM,
2393 ISD::FMAXNUM,
2394 ISD::SUB,
2395 ISD::LOAD,
2396 ISD::MLOAD,
2397 ISD::STORE,
2398 ISD::MSTORE,
2399 ISD::TRUNCATE,
2400 ISD::ZERO_EXTEND,
2401 ISD::ANY_EXTEND,
2402 ISD::SIGN_EXTEND,
2403 ISD::SIGN_EXTEND_INREG,
2404 ISD::ANY_EXTEND_VECTOR_INREG,
2405 ISD::SIGN_EXTEND_VECTOR_INREG,
2406 ISD::ZERO_EXTEND_VECTOR_INREG,
2407 ISD::SINT_TO_FP,
2408 ISD::UINT_TO_FP,
2409 ISD::STRICT_SINT_TO_FP,
2410 ISD::STRICT_UINT_TO_FP,
2411 ISD::SETCC,
2412 ISD::MUL,
2413 ISD::XOR,
2414 ISD::MSCATTER,
2415 ISD::MGATHER,
2416 ISD::FP16_TO_FP,
2417 ISD::FP_EXTEND,
2418 ISD::STRICT_FP_EXTEND,
2419 ISD::FP_ROUND,
2420 ISD::STRICT_FP_ROUND});
2421
2422 computeRegisterProperties(Subtarget.getRegisterInfo());
2423
2424 MaxStoresPerMemset = 16; // For @llvm.memset -> sequence of stores
2425 MaxStoresPerMemsetOptSize = 8;
2426 MaxStoresPerMemcpy = 8; // For @llvm.memcpy -> sequence of stores
2427 MaxStoresPerMemcpyOptSize = 4;
2428 MaxStoresPerMemmove = 8; // For @llvm.memmove -> sequence of stores
2429 MaxStoresPerMemmoveOptSize = 4;
2430
2431 // TODO: These control memcmp expansion in CGP and could be raised higher, but
2432 // that needs to be benchmarked and balanced with the potential use of vector
2433 // load/store types (PR33329, PR33914).
2434 MaxLoadsPerMemcmp = 2;
2435 MaxLoadsPerMemcmpOptSize = 2;
2436
2437 // Default loop alignment, which can be overridden by -align-loops.
2438 setPrefLoopAlignment(Align(16));
2439
2440 // An out-of-order CPU can speculatively execute past a predictable branch,
2441 // but a conditional move could be stalled by an expensive earlier operation.
2442 PredictableSelectIsExpensive = Subtarget.getSchedModel().isOutOfOrder();
2443 EnableExtLdPromotion = true;
2444 setPrefFunctionAlignment(Align(16));
2445
2446 verifyIntrinsicTables();
2447
2448 // Default to having -disable-strictnode-mutation on
2449 IsStrictFPEnabled = true;
2450}
2451
2452// This has so far only been implemented for 64-bit MachO.
2453bool X86TargetLowering::useLoadStackGuardNode() const {
2454 return Subtarget.isTargetMachO() && Subtarget.is64Bit();
2455}
2456
2457bool X86TargetLowering::useStackGuardXorFP() const {
2458 // Currently only MSVC CRTs XOR the frame pointer into the stack guard value.
2459 return Subtarget.getTargetTriple().isOSMSVCRT() && !Subtarget.isTargetMachO();
2460}
2461
2462SDValue X86TargetLowering::emitStackGuardXorFP(SelectionDAG &DAG, SDValue Val,
2463 const SDLoc &DL) const {
2464 EVT PtrTy = getPointerTy(DAG.getDataLayout());
2465 unsigned XorOp = Subtarget.is64Bit() ? X86::XOR64_FP : X86::XOR32_FP;
2466 MachineSDNode *Node = DAG.getMachineNode(XorOp, DL, PtrTy, Val);
2467 return SDValue(Node, 0);
2468}
2469
2470TargetLoweringBase::LegalizeTypeAction
2471X86TargetLowering::getPreferredVectorAction(MVT VT) const {
2472 if ((VT == MVT::v32i1 || VT == MVT::v64i1) && Subtarget.hasAVX512() &&
2473 !Subtarget.hasBWI())
2474 return TypeSplitVector;
2475
2476 if (!VT.isScalableVector() && VT.getVectorNumElements() != 1 &&
2477 !Subtarget.hasF16C() && VT.getVectorElementType() == MVT::f16)
2478 return TypeSplitVector;
2479
2480 if (!VT.isScalableVector() && VT.getVectorNumElements() != 1 &&
2481 VT.getVectorElementType() != MVT::i1)
2482 return TypeWidenVector;
2483
2484 return TargetLoweringBase::getPreferredVectorAction(VT);
2485}
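// Illustrative note (not part of the upstream source): under this hook,
// v32i1/v64i1 are split on AVX-512 targets without BWI (no wide k-register
// support), f16 vectors are split when F16C is unavailable, and most other
// short vectors with non-i1 elements are widened to the next legal width.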
2486
2487static std::pair<MVT, unsigned>
2488handleMaskRegisterForCallingConv(unsigned NumElts, CallingConv::ID CC,
2489 const X86Subtarget &Subtarget) {
2490 // v2i1/v4i1/v8i1/v16i1 all pass in xmm registers unless the calling
2491 // convention is one that uses k registers.
2492 if (NumElts == 2)
2493 return {MVT::v2i64, 1};
2494 if (NumElts == 4)
2495 return {MVT::v4i32, 1};
2496 if (NumElts == 8 && CC != CallingConv::X86_RegCall &&
2497 CC != CallingConv::Intel_OCL_BI)
2498 return {MVT::v8i16, 1};
2499 if (NumElts == 16 && CC != CallingConv::X86_RegCall &&
2500 CC != CallingConv::Intel_OCL_BI)
2501 return {MVT::v16i8, 1};
2502 // v32i1 passes in ymm unless we have BWI and the calling convention is
2503 // regcall.
2504 if (NumElts == 32 && (!Subtarget.hasBWI() || CC != CallingConv::X86_RegCall))
2505 return {MVT::v32i8, 1};
2506 // Split v64i1 vectors if we don't have v64i8 available.
2507 if (NumElts == 64 && Subtarget.hasBWI() && CC != CallingConv::X86_RegCall) {
2508 if (Subtarget.useAVX512Regs())
2509 return {MVT::v64i8, 1};
2510 return {MVT::v32i8, 2};
2511 }
2512
2513 // Break wide or odd vXi1 vectors into scalars to match avx2 behavior.
2514 if (!isPowerOf2_32(NumElts) || (NumElts == 64 && !Subtarget.hasBWI()) ||
2515 NumElts > 64)
2516 return {MVT::i8, NumElts};
2517
2518 return {MVT::INVALID_SIMPLE_VALUE_TYPE, 0};
2519}
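// Illustrative note (not part of the upstream source): a hedged usage sketch
// of the helper above, assuming the C calling convention on an AVX-512
// subtarget without BWI:
//   MVT RegVT; unsigned NumRegs;
//   std::tie(RegVT, NumRegs) =
//       handleMaskRegisterForCallingConv(32, CallingConv::C, Subtarget);
//   // RegVT == MVT::v32i8, NumRegs == 1; with NumElts == 64 the same call
//   // would instead return {MVT::i8, 64}, since v64i8 needs BWI.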
2520
2521MVT X86TargetLowering::getRegisterTypeForCallingConv(LLVMContext &Context,
2522 CallingConv::ID CC,
2523 EVT VT) const {
2524 if (VT.isVector()) {
2525 if (VT.getVectorElementType() == MVT::i1 && Subtarget.hasAVX512()) {
2526 unsigned NumElts = VT.getVectorNumElements();
2527
2528 MVT RegisterVT;
2529 unsigned NumRegisters;
2530 std::tie(RegisterVT, NumRegisters) =
2531 handleMaskRegisterForCallingConv(NumElts, CC, Subtarget);
2532 if (RegisterVT != MVT::INVALID_SIMPLE_VALUE_TYPE)
2533 return RegisterVT;
2534 }
2535
2536 if (VT.getVectorElementType() == MVT::f16 && VT.getVectorNumElements() < 8)
2537 return MVT::v8f16;
2538 }
2539
2540 // We will use more GPRs for f64 and f80 on 32 bits when x87 is disabled.
2541 if ((VT == MVT::f64 || VT == MVT::f80) && !Subtarget.is64Bit() &&
2542 !Subtarget.hasX87())
2543 return MVT::i32;
2544
2545 if (VT.isVector() && VT.getVectorElementType() == MVT::bf16)
2546 return getRegisterTypeForCallingConv(Context, CC,
2547 VT.changeVectorElementTypeToInteger());
2548
2549 return TargetLowering::getRegisterTypeForCallingConv(Context, CC, VT);
2550}
2551
2552unsigned X86TargetLowering::getNumRegistersForCallingConv(LLVMContext &Context,
2553 CallingConv::ID CC,
2554 EVT VT) const {
2555 if (VT.isVector()) {
2556 if (VT.getVectorElementType() == MVT::i1 && Subtarget.hasAVX512()) {
2557 unsigned NumElts = VT.getVectorNumElements();
2558
2559 MVT RegisterVT;
2560 unsigned NumRegisters;
2561 std::tie(RegisterVT, NumRegisters) =
2562 handleMaskRegisterForCallingConv(NumElts, CC, Subtarget);
2563 if (RegisterVT != MVT::INVALID_SIMPLE_VALUE_TYPE)
2564 return NumRegisters;
2565 }
2566
2567 if (VT.getVectorElementType() == MVT::f16 && VT.getVectorNumElements() < 8)
2568 return 1;
2569 }
2570
2571 // We have to split f64 to 2 registers and f80 to 3 registers on 32 bits if
2572 // x87 is disabled.
2573 if (!Subtarget.is64Bit() && !Subtarget.hasX87()) {
2574 if (VT == MVT::f64)
2575 return 2;
2576 if (VT == MVT::f80)
2577 return 3;
2578 }
2579
2580 if (VT.isVector() && VT.getVectorElementType() == MVT::bf16)
2581 return getNumRegistersForCallingConv(Context, CC,
2582 VT.changeVectorElementTypeToInteger());
2583
2584 return TargetLowering::getNumRegistersForCallingConv(Context, CC, VT);
2585}
2586
2587unsigned X86TargetLowering::getVectorTypeBreakdownForCallingConv(
2588 LLVMContext &Context, CallingConv::ID CC, EVT VT, EVT &IntermediateVT,
2589 unsigned &NumIntermediates, MVT &RegisterVT) const {
2590 // Break wide or odd vXi1 vectors into scalars to match avx2 behavior.
2591 if (VT.isVector() && VT.getVectorElementType() == MVT::i1 &&
2592 Subtarget.hasAVX512() &&
2593 (!isPowerOf2_32(VT.getVectorNumElements()) ||
2594 (VT.getVectorNumElements() == 64 && !Subtarget.hasBWI()) ||
2595 VT.getVectorNumElements() > 64)) {
2596 RegisterVT = MVT::i8;
2597 IntermediateVT = MVT::i1;
2598 NumIntermediates = VT.getVectorNumElements();
2599 return NumIntermediates;
2600 }
2601
2602 // Split v64i1 vectors if we don't have v64i8 available.
2603 if (VT == MVT::v64i1 && Subtarget.hasBWI() && !Subtarget.useAVX512Regs() &&
2604 CC != CallingConv::X86_RegCall) {
2605 RegisterVT = MVT::v32i8;
2606 IntermediateVT = MVT::v32i1;
2607 NumIntermediates = 2;
2608 return 2;
2609 }
2610
2611 return TargetLowering::getVectorTypeBreakdownForCallingConv(Context, CC, VT, IntermediateVT,
2612 NumIntermediates, RegisterVT);
2613}
2614
2615EVT X86TargetLowering::getSetCCResultType(const DataLayout &DL,
2616 LLVMContext& Context,
2617 EVT VT) const {
2618 if (!VT.isVector())
2619 return MVT::i8;
2620
2621 if (Subtarget.hasAVX512()) {
2622 // Figure out what this type will be legalized to.
2623 EVT LegalVT = VT;
2624 while (getTypeAction(Context, LegalVT) != TypeLegal)
2625 LegalVT = getTypeToTransformTo(Context, LegalVT);
2626
2627 // If we got a 512-bit vector then we'll definitely have a vXi1 compare.
2628 if (LegalVT.getSimpleVT().is512BitVector())
2629 return EVT::getVectorVT(Context, MVT::i1, VT.getVectorElementCount());
2630
2631 if (LegalVT.getSimpleVT().isVector() && Subtarget.hasVLX()) {
2632 // If we legalized to less than a 512-bit vector, then we will use a vXi1
2633 // compare for vXi32/vXi64 for sure. If we have BWI we will also support
2634 // vXi16/vXi8.
2635 MVT EltVT = LegalVT.getSimpleVT().getVectorElementType();
2636 if (Subtarget.hasBWI() || EltVT.getSizeInBits() >= 32)
2637 return EVT::getVectorVT(Context, MVT::i1, VT.getVectorElementCount());
2638 }
2639 }
2640
2641 return VT.changeVectorElementTypeToInteger();
2642}
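// Illustrative note (not part of the upstream source): for example, a setcc
// on v8i32 yields v8i1 with AVX512VL (the compare lands in a k register) but
// v8i32 on plain AVX2, where vector compares produce full-width element masks.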
2643
2644/// Helper for getByValTypeAlignment to determine
2645/// the desired ByVal argument alignment.
2646static void getMaxByValAlign(Type *Ty, Align &MaxAlign) {
2647 if (MaxAlign == 16)
2648 return;
2649 if (VectorType *VTy = dyn_cast<VectorType>(Ty)) {
2650 if (VTy->getPrimitiveSizeInBits().getFixedValue() == 128)
2651 MaxAlign = Align(16);
2652 } else if (ArrayType *ATy = dyn_cast<ArrayType>(Ty)) {
2653 Align EltAlign;
2654 getMaxByValAlign(ATy->getElementType(), EltAlign);
2655 if (EltAlign > MaxAlign)
2656 MaxAlign = EltAlign;
2657 } else if (StructType *STy = dyn_cast<StructType>(Ty)) {
2658 for (auto *EltTy : STy->elements()) {
2659 Align EltAlign;
2660 getMaxByValAlign(EltTy, EltAlign);
2661 if (EltAlign > MaxAlign)
2662 MaxAlign = EltAlign;
2663 if (MaxAlign == 16)
2664 break;
2665 }
2666 }
2667}
2668
2669/// Return the desired alignment for ByVal aggregate
2670/// function arguments in the caller parameter area. For X86, aggregates
2671/// that contain SSE vectors are placed at 16-byte boundaries while the rest
2672/// are at 4-byte boundaries.
2673uint64_t X86TargetLowering::getByValTypeAlignment(Type *Ty,
2674 const DataLayout &DL) const {
2675 if (Subtarget.is64Bit()) {
2676 // Max of 8 and alignment of type.
2677 Align TyAlign = DL.getABITypeAlign(Ty);
2678 if (TyAlign > 8)
2679 return TyAlign.value();
2680 return 8;
2681 }
2682
2683 Align Alignment(4);
2684 if (Subtarget.hasSSE1())
2685 getMaxByValAlign(Ty, Alignment);
2686 return Alignment.value();
2687}
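// Illustrative note (not part of the upstream source): on 32-bit x86 with
// SSE, a byval struct containing a 128-bit vector member is bumped to 16-byte
// alignment by getMaxByValAlign, while a struct of plain integers keeps the
// default 4; on x86-64 the result is max(8, ABI alignment of the type).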
2688
2689/// It returns EVT::Other if the type should be determined using generic
2690/// target-independent logic.
2691/// For vector ops we check that the overall size isn't larger than our
2692/// preferred vector width.
2693EVT X86TargetLowering::getOptimalMemOpType(
2694 const MemOp &Op, const AttributeList &FuncAttributes) const {
2695 if (!FuncAttributes.hasFnAttr(Attribute::NoImplicitFloat)) {
2696 if (Op.size() >= 16 &&
2697 (!Subtarget.isUnalignedMem16Slow() || Op.isAligned(Align(16)))) {
2698 // FIXME: Check if unaligned 64-byte accesses are slow.
2699 if (Op.size() >= 64 && Subtarget.hasAVX512() &&
2700 (Subtarget.getPreferVectorWidth() >= 512)) {
2701 return Subtarget.hasBWI() ? MVT::v64i8 : MVT::v16i32;
2702 }
2703 // FIXME: Check if unaligned 32-byte accesses are slow.
2704 if (Op.size() >= 32 && Subtarget.hasAVX() &&
2705 Subtarget.useLight256BitInstructions()) {
2706 // Although this isn't a well-supported type for AVX1, we'll let
2707 // legalization and shuffle lowering produce the optimal codegen. If we
2708 // choose an optimal type with a vector element larger than a byte,
2709 // getMemsetStores() may create an intermediate splat (using an integer
2710 // multiply) before we splat as a vector.
2711 return MVT::v32i8;
2712 }
2713 if (Subtarget.hasSSE2() && (Subtarget.getPreferVectorWidth() >= 128))
2714 return MVT::v16i8;
2715 // TODO: Can SSE1 handle a byte vector?
2716 // If we have SSE1 registers we should be able to use them.
2717 if (Subtarget.hasSSE1() && (Subtarget.is64Bit() || Subtarget.hasX87()) &&
2718 (Subtarget.getPreferVectorWidth() >= 128))
2719 return MVT::v4f32;
2720 } else if (((Op.isMemcpy() && !Op.isMemcpyStrSrc()) || Op.isZeroMemset()) &&
2721 Op.size() >= 8 && !Subtarget.is64Bit() && Subtarget.hasSSE2()) {
2722 // Do not use f64 to lower memcpy if source is string constant. It's
2723 // better to use i32 to avoid the loads.
2724 // Also, do not use f64 to lower memset unless this is a memset of zeros.
2725 // The gymnastics of splatting a byte value into an XMM register and then
2726 // only using 8-byte stores (because this is a CPU with slow unaligned
2727 // 16-byte accesses) makes that a loser.
2728 return MVT::f64;
2729 }
2730 }
2731 // This is a compromise. If we reach here, unaligned accesses may be slow on
2732 // this target. However, creating smaller, aligned accesses could be even
2733 // slower and would certainly be a lot more code.
2734 if (Subtarget.is64Bit() && Op.size() >= 8)
2735 return MVT::i64;
2736 return MVT::i32;
2737}
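// Illustrative note (not part of the upstream source): e.g. a 64-byte memset
// is emitted with v64i8 stores on an AVX-512BW target that prefers 512-bit
// vectors, with v16i8 on a baseline SSE2 target, and with i64/i32 stores when
// vector registers are unavailable (e.g. under NoImplicitFloat).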
2738
2739bool X86TargetLowering::isSafeMemOpType(MVT VT) const {
2740 if (VT == MVT::f32)
2741 return Subtarget.hasSSE1();
2742 if (VT == MVT::f64)
2743 return Subtarget.hasSSE2();
2744 return true;
2745}
2746
2747static bool isBitAligned(Align Alignment, uint64_t SizeInBits) {
2748 return (8 * Alignment.value()) % SizeInBits == 0;
2749}
2750
2751bool X86TargetLowering::isMemoryAccessFast(EVT VT, Align Alignment) const {
2752 if (isBitAligned(Alignment, VT.getSizeInBits()))
2753 return true;
2754 switch (VT.getSizeInBits()) {
2755 default:
2756 // 8-byte and under are always assumed to be fast.
2757 return true;
2758 case 128:
2759 return !Subtarget.isUnalignedMem16Slow();
2760 case 256:
2761 return !Subtarget.isUnalignedMem32Slow();
2762 // TODO: What about AVX-512 (512-bit) accesses?
2763 }
2764}
2765
2766bool X86TargetLowering::allowsMisalignedMemoryAccesses(
2767 EVT VT, unsigned, Align Alignment, MachineMemOperand::Flags Flags,
2768 unsigned *Fast) const {
2769 if (Fast)
2770 *Fast = isMemoryAccessFast(VT, Alignment);
2771 // NonTemporal vector memory ops must be aligned.
2772 if (!!(Flags & MachineMemOperand::MONonTemporal) && VT.isVector()) {
2773 // NT loads can only be vector aligned, so if it's less aligned than the
2774 // minimum vector size (which we can split the vector down to), we might as
2775 // well use a regular unaligned vector load.
2776 // We don't have any NT loads pre-SSE41.
2777 if (!!(Flags & MachineMemOperand::MOLoad))
2778 return (Alignment < 16 || !Subtarget.hasSSE41());
2779 return false;
2780 }
2781 // Misaligned accesses of any size are always allowed.
2782 return true;
2783}
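// Illustrative note (not part of the upstream source): the net effect above
// is that misaligned non-temporal vector stores are never allowed, and
// misaligned NT loads are "allowed" only when they will be lowered as
// ordinary unaligned loads anyway (alignment below 16 bytes or no SSE4.1).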
2784
2785bool X86TargetLowering::allowsMemoryAccess(LLVMContext &Context,
2786 const DataLayout &DL, EVT VT,
2787 unsigned AddrSpace, Align Alignment,
2788 MachineMemOperand::Flags Flags,
2789 unsigned *Fast) const {
2790 if (Fast)
2791 *Fast = isMemoryAccessFast(VT, Alignment);
2792 if (!!(Flags & MachineMemOperand::MONonTemporal) && VT.isVector()) {
2793 if (allowsMisalignedMemoryAccesses(VT, AddrSpace, Alignment, Flags,
2794 /*Fast=*/nullptr))
2795 return true;
2796 // NonTemporal vector memory ops are special, and must be aligned.
2797 if (!isBitAligned(Alignment, VT.getSizeInBits()))
2798 return false;
2799 switch (VT.getSizeInBits()) {
2800 case 128:
2801 if (!!(Flags & MachineMemOperand::MOLoad) && Subtarget.hasSSE41())
2802 return true;
2803 if (!!(Flags & MachineMemOperand::MOStore) && Subtarget.hasSSE2())
2804 return true;
2805 return false;
2806 case 256:
2807 if (!!(Flags & MachineMemOperand::MOLoad) && Subtarget.hasAVX2())
2808 return true;
2809 if (!!(Flags & MachineMemOperand::MOStore) && Subtarget.hasAVX())
2810 return true;
2811 return false;
2812 case 512:
2813 if (Subtarget.hasAVX512())
2814 return true;
2815 return false;
2816 default:
2817 return false; // Don't have NonTemporal vector memory ops of this size.
2818 }
2819 }
2820 return true;
2821}
2822
2823/// Return the entry encoding for a jump table in the
2824/// current function. The returned value is a member of the
2825/// MachineJumpTableInfo::JTEntryKind enum.
2826unsigned X86TargetLowering::getJumpTableEncoding() const {
2827 // In GOT pic mode, each entry in the jump table is emitted as a @GOTOFF
2828 // symbol.
2829 if (isPositionIndependent() && Subtarget.isPICStyleGOT())
2830 return MachineJumpTableInfo::EK_Custom32;
2831
2832 // Otherwise, use the normal jump table encoding heuristics.
2833 return TargetLowering::getJumpTableEncoding();
2834}
2835
2836bool X86TargetLowering::splitValueIntoRegisterParts(
2837 SelectionDAG &DAG, const SDLoc &DL, SDValue Val, SDValue *Parts,
2838 unsigned NumParts, MVT PartVT, std::optional<CallingConv::ID> CC) const {
2839 bool IsABIRegCopy = CC.has_value();
2840 EVT ValueVT = Val.getValueType();
2841 if (IsABIRegCopy && ValueVT == MVT::bf16 && PartVT == MVT::f32) {
2842 unsigned ValueBits = ValueVT.getSizeInBits();
2843 unsigned PartBits = PartVT.getSizeInBits();
2844 Val = DAG.getNode(ISD::BITCAST, DL, MVT::getIntegerVT(ValueBits), Val);
2845 Val = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::getIntegerVT(PartBits), Val);
2846 Val = DAG.getNode(ISD::BITCAST, DL, PartVT, Val);
2847 Parts[0] = Val;
2848 return true;
2849 }
2850 return false;
2851}
2852
2853SDValue X86TargetLowering::joinRegisterPartsIntoValue(
2854 SelectionDAG &DAG, const SDLoc &DL, const SDValue *Parts, unsigned NumParts,
2855 MVT PartVT, EVT ValueVT, std::optional<CallingConv::ID> CC) const {
2856 bool IsABIRegCopy = CC.has_value();
2857 if (IsABIRegCopy && ValueVT == MVT::bf16 && PartVT == MVT::f32) {
2858 unsigned ValueBits = ValueVT.getSizeInBits();
2859 unsigned PartBits = PartVT.getSizeInBits();
2860 SDValue Val = Parts[0];
2861
2862 Val = DAG.getNode(ISD::BITCAST, DL, MVT::getIntegerVT(PartBits), Val);
2863 Val = DAG.getNode(ISD::TRUNCATE, DL, MVT::getIntegerVT(ValueBits), Val);
2864 Val = DAG.getNode(ISD::BITCAST, DL, ValueVT, Val);
2865 return Val;
2866 }
2867 return SDValue();
2868}
2869
2870bool X86TargetLowering::useSoftFloat() const {
2871 return Subtarget.useSoftFloat();
2872}
2873
2874void X86TargetLowering::markLibCallAttributes(MachineFunction *MF, unsigned CC,
2875 ArgListTy &Args) const {
2876
2877 // Only relabel X86-32 for C / Stdcall CCs.
2878 if (Subtarget.is64Bit())
2879 return;
2880 if (CC != CallingConv::C && CC != CallingConv::X86_StdCall)
2881 return;
2882 unsigned ParamRegs = 0;
2883 if (auto *M = MF->getFunction().getParent())
2884 ParamRegs = M->getNumberRegisterParameters();
2885
2886 // Mark the first N integer arguments as being passed in registers.
2887 for (auto &Arg : Args) {
2888 Type *T = Arg.Ty;
2889 if (T->isIntOrPtrTy())
2890 if (MF->getDataLayout().getTypeAllocSize(T) <= 8) {
2891 unsigned numRegs = 1;
2892 if (MF->getDataLayout().getTypeAllocSize(T) > 4)
2893 numRegs = 2;
2894 if (ParamRegs < numRegs)
2895 return;
2896 ParamRegs -= numRegs;
2897 Arg.IsInReg = true;
2898 }
2899 }
2900}
2901
2902const MCExpr *
2903X86TargetLowering::LowerCustomJumpTableEntry(const MachineJumpTableInfo *MJTI,
2904 const MachineBasicBlock *MBB,
2905 unsigned uid,MCContext &Ctx) const{
2906 assert(isPositionIndependent() && Subtarget.isPICStyleGOT());
2907 // In 32-bit ELF systems, our jump table entries are formed with @GOTOFF
2908 // entries.
2909 return MCSymbolRefExpr::create(MBB->getSymbol(),
2910 MCSymbolRefExpr::VK_GOTOFF, Ctx);
2911}
2912
2913/// Returns relocation base for the given PIC jumptable.
2914SDValue X86TargetLowering::getPICJumpTableRelocBase(SDValue Table,
2915 SelectionDAG &DAG) const {
2916 if (!Subtarget.is64Bit())
2917 // This doesn't have SDLoc associated with it, but is not really the
2918 // same as a Register.
2919 return DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(),
2920 getPointerTy(DAG.getDataLayout()));
2921 return Table;
2922}
2923
2924/// This returns the relocation base for the given PIC jumptable,
2925/// the same as getPICJumpTableRelocBase, but as an MCExpr.
2926const MCExpr *X86TargetLowering::
2927getPICJumpTableRelocBaseExpr(const MachineFunction *MF, unsigned JTI,
2928 MCContext &Ctx) const {
2929 // X86-64 uses RIP relative addressing based on the jump table label.
2930 if (Subtarget.isPICStyleRIPRel())
2931 return TargetLowering::getPICJumpTableRelocBaseExpr(MF, JTI, Ctx);
2932
2933 // Otherwise, the reference is relative to the PIC base.
2934 return MCSymbolRefExpr::create(MF->getPICBaseSymbol(), Ctx);
2935}
2936
2937std::pair<const TargetRegisterClass *, uint8_t>
2938X86TargetLowering::findRepresentativeClass(const TargetRegisterInfo *TRI,
2939 MVT VT) const {
2940 const TargetRegisterClass *RRC = nullptr;
2941 uint8_t Cost = 1;
2942 switch (VT.SimpleTy) {
2943 default:
2944 return TargetLowering::findRepresentativeClass(TRI, VT);
2945 case MVT::i8: case MVT::i16: case MVT::i32: case MVT::i64:
2946 RRC = Subtarget.is64Bit() ? &X86::GR64RegClass : &X86::GR32RegClass;
2947 break;
2948 case MVT::x86mmx:
2949 RRC = &X86::VR64RegClass;
2950 break;
2951 case MVT::f32: case MVT::f64:
2952 case MVT::v16i8: case MVT::v8i16: case MVT::v4i32: case MVT::v2i64:
2953 case MVT::v4f32: case MVT::v2f64:
2954 case MVT::v32i8: case MVT::v16i16: case MVT::v8i32: case MVT::v4i64:
2955 case MVT::v8f32: case MVT::v4f64:
2956 case MVT::v64i8: case MVT::v32i16: case MVT::v16i32: case MVT::v8i64:
2957 case MVT::v16f32: case MVT::v8f64:
2958 RRC = &X86::VR128XRegClass;
2959 break;
2960 }
2961 return std::make_pair(RRC, Cost);
2962}
2963
2964unsigned X86TargetLowering::getAddressSpace() const {
2965 if (Subtarget.is64Bit())
2966 return (getTargetMachine().getCodeModel() == CodeModel::Kernel) ? 256 : 257;
2967 return 256;
2968}
2969
2970static bool hasStackGuardSlotTLS(const Triple &TargetTriple) {
2971 return TargetTriple.isOSGlibc() || TargetTriple.isOSFuchsia() ||
2972 (TargetTriple.isAndroid() && !TargetTriple.isAndroidVersionLT(17));
2973}
2974
2975static Constant* SegmentOffset(IRBuilderBase &IRB,
2976 int Offset, unsigned AddressSpace) {
2977 return ConstantExpr::getIntToPtr(
2978 ConstantInt::get(Type::getInt32Ty(IRB.getContext()), Offset),
2979 Type::getInt8PtrTy(IRB.getContext())->getPointerTo(AddressSpace));
2980}
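// Illustrative note (not part of the upstream source): SegmentOffset(IRB,
// 0x28, 257) produces the constant "inttoptr (i32 40 to i8* addrspace(257)*)",
// i.e. a pointer-sized slot at %fs:0x28, since address spaces 256 and 257
// correspond to %gs and %fs on x86.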
2981
2982Value *X86TargetLowering::getIRStackGuard(IRBuilderBase &IRB) const {
2983 // glibc, bionic, and Fuchsia have a special slot for the stack guard in
2984 // tcbhead_t; use it instead of the usual global variable (see
2985 // sysdeps/{i386,x86_64}/nptl/tls.h)
2986 if (hasStackGuardSlotTLS(Subtarget.getTargetTriple())) {
2987 if (Subtarget.isTargetFuchsia()) {
2988 // <zircon/tls.h> defines ZX_TLS_STACK_GUARD_OFFSET with this value.
2989 return SegmentOffset(IRB, 0x10, getAddressSpace());
2990 } else {
2991 unsigned AddressSpace = getAddressSpace();
2992 Module *M = IRB.GetInsertBlock()->getParent()->getParent();
2993 // Note that some users may customize the base register and offset.
2994 int Offset = M->getStackProtectorGuardOffset();
2995 // If -stack-protector-guard-offset was not specified, the default is
2996 // %fs:0x28, unless we're using a Kernel code model, in which case
2997 // it's %gs:0x28. On i386 it's %gs:0x14.
2998 if (Offset == INT_MAX)
2999 Offset = (Subtarget.is64Bit()) ? 0x28 : 0x14;
3000
3001 StringRef GuardReg = M->getStackProtectorGuardReg();
3002 if (GuardReg == "fs")
3003 AddressSpace = X86AS::FS;
3004 else if (GuardReg == "gs")
3005 AddressSpace = X86AS::GS;
3006
3007 // Use the guard symbol if the user specified one.
3008 StringRef GuardSymb = M->getStackProtectorGuardSymbol();
3009 if (!GuardSymb.empty()) {
3010 GlobalVariable *GV = M->getGlobalVariable(GuardSymb);
3011 if (!GV) {
3012 Type *Ty = Subtarget.is64Bit() ? Type::getInt64Ty(M->getContext())
3013 : Type::getInt32Ty(M->getContext());
3014 GV = new GlobalVariable(*M, Ty, false, GlobalValue::ExternalLinkage,
3015 nullptr, GuardSymb, nullptr,
3016 GlobalValue::NotThreadLocal, AddressSpace);
3017 }
3018 return GV;
3019 }
3020
3021 return SegmentOffset(IRB, Offset, AddressSpace);
3022 }
3023 }
3024 return TargetLowering::getIRStackGuard(IRB);
3025}
3026
3027void X86TargetLowering::insertSSPDeclarations(Module &M) const {
3028 // MSVC CRT provides functionalities for stack protection.
3029 if (Subtarget.getTargetTriple().isWindowsMSVCEnvironment() ||
3030 Subtarget.getTargetTriple().isWindowsItaniumEnvironment()) {
3031 // MSVC CRT has a global variable holding security cookie.
3032 M.getOrInsertGlobal("__security_cookie",
3033 Type::getInt8PtrTy(M.getContext()));
3034
3035 // MSVC CRT has a function to validate security cookie.
3036 FunctionCallee SecurityCheckCookie = M.getOrInsertFunction(
3037 "__security_check_cookie", Type::getVoidTy(M.getContext()),
3038 Type::getInt8PtrTy(M.getContext()));
3039 if (Function *F = dyn_cast<Function>(SecurityCheckCookie.getCallee())) {
3040 F->setCallingConv(CallingConv::X86_FastCall);
3041 F->addParamAttr(0, Attribute::AttrKind::InReg);
3042 }
3043 return;
3044 }
3045
3046 StringRef GuardMode = M.getStackProtectorGuard();
3047
3048 // glibc, bionic, and Fuchsia have a special slot for the stack guard.
3049 if ((GuardMode == "tls" || GuardMode.empty()) &&
3050 hasStackGuardSlotTLS(Subtarget.getTargetTriple()))
3051 return;
3052 TargetLowering::insertSSPDeclarations(M);
3053}
3054
3055Value *X86TargetLowering::getSDagStackGuard(const Module &M) const {
3056 // MSVC CRT has a global variable holding security cookie.
3057 if (Subtarget.getTargetTriple().isWindowsMSVCEnvironment() ||
3058 Subtarget.getTargetTriple().isWindowsItaniumEnvironment()) {
3059 return M.getGlobalVariable("__security_cookie");
3060 }
3061 return TargetLowering::getSDagStackGuard(M);
3062}
3063
3064Function *X86TargetLowering::getSSPStackGuardCheck(const Module &M) const {
3065 // MSVC CRT has a function to validate security cookie.
3066 if (Subtarget.getTargetTriple().isWindowsMSVCEnvironment() ||
3067 Subtarget.getTargetTriple().isWindowsItaniumEnvironment()) {
3068 return M.getFunction("__security_check_cookie");
3069 }
3070 return TargetLowering::getSSPStackGuardCheck(M);
3071}
3072
3073Value *
3074X86TargetLowering::getSafeStackPointerLocation(IRBuilderBase &IRB) const {
3075 if (Subtarget.getTargetTriple().isOSContiki())
3076 return getDefaultSafeStackPointerLocation(IRB, false);
3077
3078 // Android provides a fixed TLS slot for the SafeStack pointer. See the
3079 // definition of TLS_SLOT_SAFESTACK in
3080 // https://android.googlesource.com/platform/bionic/+/master/libc/private/bionic_tls.h
3081 if (Subtarget.isTargetAndroid()) {
3082 // %fs:0x48, unless we're using a Kernel code model, in which case it's
3083 // %gs:0x48. On i386 it's %gs:0x24.
3084 int Offset = (Subtarget.is64Bit()) ? 0x48 : 0x24;
3085 return SegmentOffset(IRB, Offset, getAddressSpace());
3086 }
3087
3088 // Fuchsia is similar.
3089 if (Subtarget.isTargetFuchsia()) {
3090 // <zircon/tls.h> defines ZX_TLS_UNSAFE_SP_OFFSET with this value.
3091 return SegmentOffset(IRB, 0x18, getAddressSpace());
3092 }
3093
3094 return TargetLowering::getSafeStackPointerLocation(IRB);
3095}
3096
3097//===----------------------------------------------------------------------===//
3098// Return Value Calling Convention Implementation
3099//===----------------------------------------------------------------------===//
3100
3101bool X86TargetLowering::CanLowerReturn(
3102 CallingConv::ID CallConv, MachineFunction &MF, bool isVarArg,
3103 const SmallVectorImpl<ISD::OutputArg> &Outs, LLVMContext &Context) const {
3104 SmallVector<CCValAssign, 16> RVLocs;
3105 CCState CCInfo(CallConv, isVarArg, MF, RVLocs, Context);
3106 return CCInfo.CheckReturn(Outs, RetCC_X86);
3107}
3108
3109const MCPhysReg *X86TargetLowering::getScratchRegisters(CallingConv::ID) const {
3110 static const MCPhysReg ScratchRegs[] = { X86::R11, 0 };
3111 return ScratchRegs;
3112}
3113
3114/// Lowers mask values (v*i1) to the local register values.
3115/// \returns the DAG node after lowering to the register type
3116static SDValue lowerMasksToReg(const SDValue &ValArg, const EVT &ValLoc,
3117 const SDLoc &Dl, SelectionDAG &DAG) {
3118 EVT ValVT = ValArg.getValueType();
3119
3120 if (ValVT == MVT::v1i1)
3121 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, Dl, ValLoc, ValArg,
3122 DAG.getIntPtrConstant(0, Dl));
3123
3124 if ((ValVT == MVT::v8i1 && (ValLoc == MVT::i8 || ValLoc == MVT::i32)) ||
3125 (ValVT == MVT::v16i1 && (ValLoc == MVT::i16 || ValLoc == MVT::i32))) {
3126 // Two stage lowering might be required
3127 // bitcast: v8i1 -> i8 / v16i1 -> i16
3128 // anyextend: i8 -> i32 / i16 -> i32
3129 EVT TempValLoc = ValVT == MVT::v8i1 ? MVT::i8 : MVT::i16;
3130 SDValue ValToCopy = DAG.getBitcast(TempValLoc, ValArg);
3131 if (ValLoc == MVT::i32)
3132 ValToCopy = DAG.getNode(ISD::ANY_EXTEND, Dl, ValLoc, ValToCopy);
3133 return ValToCopy;
3134 }
3135
3136 if ((ValVT == MVT::v32i1 && ValLoc == MVT::i32) ||
3137 (ValVT == MVT::v64i1 && ValLoc == MVT::i64)) {
3138 // One stage lowering is required
3139 // bitcast: v32i1 -> i32 / v64i1 -> i64
3140 return DAG.getBitcast(ValLoc, ValArg);
3141 }
3142
3143 return DAG.getNode(ISD::ANY_EXTEND, Dl, ValLoc, ValArg);
3144}
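// Illustrative note (not part of the upstream source): e.g. a v16i1 mask
// returned in an i32 location goes through bitcast v16i1 -> i16 followed by
// any_extend i16 -> i32, while a v32i1 mask in an i32 location is a single
// bitcast.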
3145
3146/// Breaks v64i1 value into two registers and adds the new node to the DAG
3147static void Passv64i1ArgInRegs(
3148 const SDLoc &Dl, SelectionDAG &DAG, SDValue &Arg,
3149 SmallVectorImpl<std::pair<Register, SDValue>> &RegsToPass, CCValAssign &VA,
3150 CCValAssign &NextVA, const X86Subtarget &Subtarget) {
3151 assert(Subtarget.hasBWI() && "Expected AVX512BW target!");
3152 assert(Subtarget.is32Bit() && "Expecting 32 bit target");
3153 assert(Arg.getValueType() == MVT::i64 && "Expecting 64 bit value");
3154 assert(VA.isRegLoc() && NextVA.isRegLoc() &&
3155 "The value should reside in two registers");
3156
3157 // Before splitting the value we cast it to i64
3158 Arg = DAG.getBitcast(MVT::i64, Arg);
3159
3160 // Splitting the value into two i32 types
3161 SDValue Lo, Hi;
3162 Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, Dl, MVT::i32, Arg,
3163 DAG.getConstant(0, Dl, MVT::i32));
3164 Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, Dl, MVT::i32, Arg,
3165 DAG.getConstant(1, Dl, MVT::i32));
3166
3167 // Attach the two i32 values to the corresponding registers
3168 RegsToPass.push_back(std::make_pair(VA.getLocReg(), Lo));
3169 RegsToPass.push_back(std::make_pair(NextVA.getLocReg(), Hi));
3170}
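// Illustrative note (not part of the upstream source): on a 32-bit AVX-512BW
// target this bitcasts the v64i1 argument to i64, extracts the lo/hi i32
// halves with EXTRACT_ELEMENT, and pairs them with the two consecutive
// registers assigned by the calling convention.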
3171
3172SDValue
3173X86TargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
3174 bool isVarArg,
3175 const SmallVectorImpl<ISD::OutputArg> &Outs,
3176 const SmallVectorImpl<SDValue> &OutVals,
3177 const SDLoc &dl, SelectionDAG &DAG) const {
3178 MachineFunction &MF = DAG.getMachineFunction();
3179 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
3180
3181 // In some cases we need to disable registers from the default CSR list.
3182 // For example, when they are used for argument passing.
3183 bool ShouldDisableCalleeSavedRegister =
3184 shouldDisableCalleeSavedRegisterCC(CallConv) ||
3185 MF.getFunction().hasFnAttribute("no_caller_saved_registers");
3186
3187 if (CallConv == CallingConv::X86_INTR && !Outs.empty())
3188 report_fatal_error("X86 interrupts may not return any value");
3189
3190 SmallVector<CCValAssign, 16> RVLocs;
3191 CCState CCInfo(CallConv, isVarArg, MF, RVLocs, *DAG.getContext());
3192 CCInfo.AnalyzeReturn(Outs, RetCC_X86);
3193
3194 SmallVector<std::pair<Register, SDValue>, 4> RetVals;
3195 for (unsigned I = 0, OutsIndex = 0, E = RVLocs.size(); I != E;
3196 ++I, ++OutsIndex) {
3197 CCValAssign &VA = RVLocs[I];
3198 assert(VA.isRegLoc() && "Can only return in registers!");
3199
3200 // Add the register to the CalleeSaveDisableRegs list.
3201 if (ShouldDisableCalleeSavedRegister)
3202 MF.getRegInfo().disableCalleeSavedRegister(VA.getLocReg());
3203
3204 SDValue ValToCopy = OutVals[OutsIndex];
3205 EVT ValVT = ValToCopy.getValueType();
3206
3207 // Promote values to the appropriate types.
3208 if (VA.getLocInfo() == CCValAssign::SExt)
3209 ValToCopy = DAG.getNode(ISD::SIGN_EXTEND, dl, VA.getLocVT(), ValToCopy);
3210 else if (VA.getLocInfo() == CCValAssign::ZExt)
3211 ValToCopy = DAG.getNode(ISD::ZERO_EXTEND, dl, VA.getLocVT(), ValToCopy);
3212 else if (VA.getLocInfo() == CCValAssign::AExt) {
3213 if (ValVT.isVector() && ValVT.getVectorElementType() == MVT::i1)
3214 ValToCopy = lowerMasksToReg(ValToCopy, VA.getLocVT(), dl, DAG);
3215 else
3216 ValToCopy = DAG.getNode(ISD::ANY_EXTEND, dl, VA.getLocVT(), ValToCopy);
3217 }
3218 else if (VA.getLocInfo() == CCValAssign::BCvt)
3219 ValToCopy = DAG.getBitcast(VA.getLocVT(), ValToCopy);
3220
3221 assert(VA.getLocInfo() != CCValAssign::FPExt &&
3222 "Unexpected FP-extend for return value.");
3223
3224 // Report an error if we have attempted to return a value via an XMM
3225 // register and SSE was disabled.
3226 if (!Subtarget.hasSSE1() && X86::FR32XRegClass.contains(VA.getLocReg())) {
3227 errorUnsupported(DAG, dl, "SSE register return with SSE disabled");
3228 VA.convertToReg(X86::FP0); // Set reg to FP0, avoid hitting asserts.
3229 } else if (!Subtarget.hasSSE2() &&
3230 X86::FR64XRegClass.contains(VA.getLocReg()) &&
3231 ValVT == MVT::f64) {
3232 // When returning a double via an XMM register, report an error if SSE2 is
3233 // not enabled.
3234 errorUnsupported(DAG, dl, "SSE2 register return with SSE2 disabled");
3235 VA.convertToReg(X86::FP0); // Set reg to FP0, avoid hitting asserts.
3236 }
3237
3238 // Returns in ST0/ST1 are handled specially: these are pushed as operands to
3239 // the RET instruction and handled by the FP Stackifier.
3240 if (VA.getLocReg() == X86::FP0 ||
3241 VA.getLocReg() == X86::FP1) {
3242 // If this is a copy from an xmm register to ST(0), use an FPExtend to
3243 // change the value to the FP stack register class.
3244 if (isScalarFPTypeInSSEReg(VA.getValVT()))
3245 ValToCopy = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f80, ValToCopy);
3246 RetVals.push_back(std::make_pair(VA.getLocReg(), ValToCopy));
3247 // Don't emit a copytoreg.
3248 continue;
3249 }
3250
3251 // 64-bit vector (MMX) values are returned in XMM0 / XMM1 except for v1i64
3252 // which is returned in RAX / RDX.
3253 if (Subtarget.is64Bit()) {
3254 if (ValVT == MVT::x86mmx) {
3255 if (VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1) {
3256 ValToCopy = DAG.getBitcast(MVT::i64, ValToCopy);
3257 ValToCopy = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64,
3258 ValToCopy);
3259 // If we don't have SSE2 available, convert to v4f32 so the generated
3260 // register is legal.
3261 if (!Subtarget.hasSSE2())
3262 ValToCopy = DAG.getBitcast(MVT::v4f32, ValToCopy);
3263 }
3264 }
3265 }
3266
3267 if (VA.needsCustom()) {
3268 assert(VA.getValVT() == MVT::v64i1 &&
3269 "Currently the only custom case is when we split v64i1 to 2 regs");
3270
3271 Passv64i1ArgInRegs(dl, DAG, ValToCopy, RetVals, VA, RVLocs[++I],
3272 Subtarget);
3273
3274 // Add the second register to the CalleeSaveDisableRegs list.
3275 if (ShouldDisableCalleeSavedRegister)
3276 MF.getRegInfo().disableCalleeSavedRegister(RVLocs[I].getLocReg());
3277 } else {
3278 RetVals.push_back(std::make_pair(VA.getLocReg(), ValToCopy));
3279 }
3280 }
3281
3282 SDValue Flag;
3283 SmallVector<SDValue, 6> RetOps;
3284 RetOps.push_back(Chain); // Operand #0 = Chain (updated below)
3285 // Operand #1 = Bytes To Pop
3286 RetOps.push_back(DAG.getTargetConstant(FuncInfo->getBytesToPopOnReturn(), dl,
3287 MVT::i32));
3288
3289 // Copy the result values into the output registers.
3290 for (auto &RetVal : RetVals) {
3291 if (RetVal.first == X86::FP0 || RetVal.first == X86::FP1) {
3292 RetOps.push_back(RetVal.second);
3293 continue; // Don't emit a copytoreg.
3294 }
3295
3296 Chain = DAG.getCopyToReg(Chain, dl, RetVal.first, RetVal.second, Flag);
3297 Flag = Chain.getValue(1);
3298 RetOps.push_back(
3299 DAG.getRegister(RetVal.first, RetVal.second.getValueType()));
3300 }
3301
3302 // The Swift calling convention does not require that we copy the sret argument
3303 // into %rax/%eax for the return, and SRetReturnReg is not set for Swift.
3304
3305 // All x86 ABIs require that for returning structs by value we copy
3306 // the sret argument into %rax/%eax (depending on ABI) for the return.
3307 // We saved the argument into a virtual register in the entry block,
3308 // so now we copy the value out and into %rax/%eax.
3309 //
3310 // Checking Function.hasStructRetAttr() here is insufficient because the IR
3311 // may not have an explicit sret argument. If FuncInfo.CanLowerReturn is
3312 // false, then an sret argument may be implicitly inserted in the SelDAG. In
3313 // either case FuncInfo->setSRetReturnReg() will have been called.
3314 if (Register SRetReg = FuncInfo->getSRetReturnReg()) {
3315 // When we have both sret and another return value, we should use the
3316 // original Chain stored in RetOps[0], instead of the current Chain updated
3317 // in the above loop. If we only have sret, RetOps[0] equals to Chain.
3318
3319 // For the case of sret and another return value, we have
3320 // Chain_0 at the function entry
3321 // Chain_1 = getCopyToReg(Chain_0) in the above loop
3322 // If we use Chain_1 in getCopyFromReg, we will have
3323 // Val = getCopyFromReg(Chain_1)
3324 // Chain_2 = getCopyToReg(Chain_1, Val) from below
3325
3326 // getCopyToReg(Chain_0) will be glued together with
3327 // getCopyToReg(Chain_1, Val) into Unit A, getCopyFromReg(Chain_1) will be
3328 // in Unit B, and we will have cyclic dependency between Unit A and Unit B:
3329 // Data dependency from Unit B to Unit A due to usage of Val in
3330 // getCopyToReg(Chain_1, Val)
3331 // Chain dependency from Unit A to Unit B
3332
3333 // So here, we use RetOps[0] (i.e Chain_0) for getCopyFromReg.
3334 SDValue Val = DAG.getCopyFromReg(RetOps[0], dl, SRetReg,
3335 getPointerTy(MF.getDataLayout()));
3336
3337 Register RetValReg
3338 = (Subtarget.is64Bit() && !Subtarget.isTarget64BitILP32()) ?
3339 X86::RAX : X86::EAX;
3340 Chain = DAG.getCopyToReg(Chain, dl, RetValReg, Val, Flag);
3341 Flag = Chain.getValue(1);
3342
3343 // RAX/EAX now acts like a return value.
3344 RetOps.push_back(
3345 DAG.getRegister(RetValReg, getPointerTy(DAG.getDataLayout())));
3346
3347 // Add the returned register to the CalleeSaveDisableRegs list.
3348 if (ShouldDisableCalleeSavedRegister)
3349 MF.getRegInfo().disableCalleeSavedRegister(RetValReg);
3350 }
3351
3352 const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
3353 const MCPhysReg *I =
3354 TRI->getCalleeSavedRegsViaCopy(&DAG.getMachineFunction());
3355 if (I) {
3356 for (; *I; ++I) {
3357 if (X86::GR64RegClass.contains(*I))
3358 RetOps.push_back(DAG.getRegister(*I, MVT::i64));
3359 else
3360 llvm_unreachable("Unexpected register class in CSRsViaCopy!");
3361 }
3362 }
3363
3364 RetOps[0] = Chain; // Update chain.
3365
3366 // Add the flag if we have it.
3367 if (Flag.getNode())
3368 RetOps.push_back(Flag);
3369
3370 X86ISD::NodeType opcode = X86ISD::RET_FLAG;
3371 if (CallConv == CallingConv::X86_INTR)
3372 opcode = X86ISD::IRET;
3373 return DAG.getNode(opcode, dl, MVT::Other, RetOps);
3374}
3375
3376bool X86TargetLowering::isUsedByReturnOnly(SDNode *N, SDValue &Chain) const {
3377 if (N->getNumValues() != 1 || !N->hasNUsesOfValue(1, 0))
3378 return false;
3379
3380 SDValue TCChain = Chain;
3381 SDNode *Copy = *N->use_begin();
3382 if (Copy->getOpcode() == ISD::CopyToReg) {
3383 // If the copy has a glue operand, we conservatively assume it isn't safe to
3384 // perform a tail call.
3385 if (Copy->getOperand(Copy->getNumOperands()-1).getValueType() == MVT::Glue)
3386 return false;
3387 TCChain = Copy->getOperand(0);
3388 } else if (Copy->getOpcode() != ISD::FP_EXTEND)
3389 return false;
3390
3391 bool HasRet = false;
3392 for (const SDNode *U : Copy->uses()) {
3393 if (U->getOpcode() != X86ISD::RET_FLAG)
3394 return false;
3395 // If we are returning more than one value, we can definitely
3396 // not make a tail call; see PR19530.
3397 if (U->getNumOperands() > 4)
3398 return false;
3399 if (U->getNumOperands() == 4 &&
3400 U->getOperand(U->getNumOperands() - 1).getValueType() != MVT::Glue)
3401 return false;
3402 HasRet = true;
3403 }
3404
3405 if (!HasRet)
3406 return false;
3407
3408 Chain = TCChain;
3409 return true;
3410}
3411
3412EVT X86TargetLowering::getTypeForExtReturn(LLVMContext &Context, EVT VT,
3413 ISD::NodeType ExtendKind) const {
3414 MVT ReturnMVT = MVT::i32;
3415
3416 bool Darwin = Subtarget.getTargetTriple().isOSDarwin();
3417 if (VT == MVT::i1 || (!Darwin && (VT == MVT::i8 || VT == MVT::i16))) {
3418 // The ABI does not require i1, i8 or i16 to be extended.
3419 //
3420 // On Darwin, there is code in the wild relying on Clang's old behaviour of
3421 // always extending i8/i16 return values, so keep doing that for now.
3422 // (PR26665).
3423 ReturnMVT = MVT::i8;
3424 }
3425
3426 EVT MinVT = getRegisterType(Context, ReturnMVT);
3427 return VT.bitsLT(MinVT) ? MinVT : VT;
3428}
3429
3430/// Reads two 32 bit registers and creates a 64 bit mask value.
3431/// \param VA The current 32 bit value that needs to be assigned.
3432/// \param NextVA The next 32 bit value that needs to be assigned.
3433/// \param Root The parent DAG node.
3434/// \param [in,out] InFlag Represents the SDValue in the parent DAG node for
3435/// glue purposes. If the DAG already uses a physical
3436/// register instead of a virtual one, we should glue our
3437/// new SDValue to the InFlag SDValue.
3438/// \return a new SDValue of 64-bit width.
3439static SDValue getv64i1Argument(CCValAssign &VA, CCValAssign &NextVA,
3440 SDValue &Root, SelectionDAG &DAG,
3441 const SDLoc &Dl, const X86Subtarget &Subtarget,
3442 SDValue *InFlag = nullptr) {
3443 assert((Subtarget.hasBWI()) && "Expected AVX512BW target!");
3444 assert(Subtarget.is32Bit() && "Expecting 32 bit target");
3445 assert(VA.getValVT() == MVT::v64i1 &&
3446 "Expecting first location of 64 bit width type");
3447 assert(NextVA.getValVT() == VA.getValVT() &&
3448 "The locations should have the same type");
3449 assert(VA.isRegLoc() && NextVA.isRegLoc() &&
3450 "The values should reside in two registers");
3451
3452 SDValue Lo, Hi;
3453 SDValue ArgValueLo, ArgValueHi;
3454
3455 MachineFunction &MF = DAG.getMachineFunction();
3456 const TargetRegisterClass *RC = &X86::GR32RegClass;
3457
3458 // Read a 32 bit value from the registers.
3459 if (nullptr == InFlag) {
3460 // When no physical register is present,
3461 // create an intermediate virtual register.
3462 Register Reg = MF.addLiveIn(VA.getLocReg(), RC);
3463 ArgValueLo = DAG.getCopyFromReg(Root, Dl, Reg, MVT::i32);
3464 Reg = MF.addLiveIn(NextVA.getLocReg(), RC);
3465 ArgValueHi = DAG.getCopyFromReg(Root, Dl, Reg, MVT::i32);
3466 } else {
3467 // When a physical register is available read the value from it and glue
3468 // the reads together.
3469 ArgValueLo =
3470 DAG.getCopyFromReg(Root, Dl, VA.getLocReg(), MVT::i32, *InFlag);
3471 *InFlag = ArgValueLo.getValue(2);
3472 ArgValueHi =
3473 DAG.getCopyFromReg(Root, Dl, NextVA.getLocReg(), MVT::i32, *InFlag);
3474 *InFlag = ArgValueHi.getValue(2);
3475 }
3476
3477 // Convert the i32 type into v32i1 type.
3478 Lo = DAG.getBitcast(MVT::v32i1, ArgValueLo);
3479
3480 // Convert the i32 type into v32i1 type.
3481 Hi = DAG.getBitcast(MVT::v32i1, ArgValueHi);
3482
3483 // Concatenate the two values together.
3484 return DAG.getNode(ISD::CONCAT_VECTORS, Dl, MVT::v64i1, Lo, Hi);
3485}
3486
3487/// The function will lower a register of various sizes (8/16/32/64)
3488/// to a mask value of the expected size (v8i1/v16i1/v32i1/v64i1).
3489/// \returns a DAG node that contains the operand after lowering to the mask type.
3490static SDValue lowerRegToMasks(const SDValue &ValArg, const EVT &ValVT,
3491 const EVT &ValLoc, const SDLoc &Dl,
3492 SelectionDAG &DAG) {
3493 SDValue ValReturned = ValArg;
3494
3495 if (ValVT == MVT::v1i1)
3496 return DAG.getNode(ISD::SCALAR_TO_VECTOR, Dl, MVT::v1i1, ValReturned);
3497
3498 if (ValVT == MVT::v64i1) {
3499 // On a 32-bit machine this case is handled by getv64i1Argument.
3500 assert(ValLoc == MVT::i64 && "Expecting only i64 locations");
3501 // On a 64-bit machine there is no need to truncate the value, only bitcast it.
3502 } else {
3503 MVT maskLen;
3504 switch (ValVT.getSimpleVT().SimpleTy) {
3505 case MVT::v8i1:
3506 maskLen = MVT::i8;
3507 break;
3508 case MVT::v16i1:
3509 maskLen = MVT::i16;
3510 break;
3511 case MVT::v32i1:
3512 maskLen = MVT::i32;
3513 break;
3514 default:
3515 llvm_unreachable("Expecting a vector of i1 types");
3516 }
3517
3518 ValReturned = DAG.getNode(ISD::TRUNCATE, Dl, maskLen, ValReturned);
3519 }
3520 return DAG.getBitcast(ValVT, ValReturned);
3521}
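// Illustrative note (not part of the upstream source): this is the inverse of
// lowerMasksToReg, e.g. an i32 holding a v16i1 result is truncated to i16 and
// bitcast back to v16i1, while a v64i1 result arriving in i64 is just bitcast.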
3522
3523/// Lower the result values of a call into the
3524/// appropriate copies out of appropriate physical registers.
3525///
3526SDValue X86TargetLowering::LowerCallResult(
3527 SDValue Chain, SDValue InFlag, CallingConv::ID CallConv, bool isVarArg,
3528 const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
3529 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals,
3530 uint32_t *RegMask) const {
3531
3532 const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
3533 // Assign locations to each value returned by this call.
3534 SmallVector<CCValAssign, 16> RVLocs;
3535 CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
3536 *DAG.getContext());
3537 CCInfo.AnalyzeCallResult(Ins, RetCC_X86);
3538
3539 // Copy all of the result registers out of their specified physreg.
3540 for (unsigned I = 0, InsIndex = 0, E = RVLocs.size(); I != E;
3541 ++I, ++InsIndex) {
3542 CCValAssign &VA = RVLocs[I];
3543 EVT CopyVT = VA.getLocVT();
3544
3545 // In some calling conventions we need to remove the used registers
3546 // from the register mask.
3547 if (RegMask) {
3548 for (MCSubRegIterator SubRegs(VA.getLocReg(), TRI, /*IncludeSelf=*/true);
3549 SubRegs.isValid(); ++SubRegs)
3550 RegMask[*SubRegs / 32] &= ~(1u << (*SubRegs % 32));
3551 }
3552
3553 // Report an error if there was an attempt to return FP values via XMM
3554 // registers.
3555 if (!Subtarget.hasSSE1() && X86::FR32XRegClass.contains(VA.getLocReg())) {
3556 errorUnsupported(DAG, dl, "SSE register return with SSE disabled");
3557 if (VA.getLocReg() == X86::XMM1)
3558 VA.convertToReg(X86::FP1); // Set reg to FP1, avoid hitting asserts.
3559 else
3560 VA.convertToReg(X86::FP0); // Set reg to FP0, avoid hitting asserts.
3561 } else if (!Subtarget.hasSSE2() &&
3562 X86::FR64XRegClass.contains(VA.getLocReg()) &&
3563 CopyVT == MVT::f64) {
3564 errorUnsupported(DAG, dl, "SSE2 register return with SSE2 disabled");
3565 if (VA.getLocReg() == X86::XMM1)
3566 VA.convertToReg(X86::FP1); // Set reg to FP1, avoid hitting asserts.
3567 else
3568 VA.convertToReg(X86::FP0); // Set reg to FP0, avoid hitting asserts.
3569 }
3570
3571 // If we prefer to use the value in xmm registers, copy it out as f80 and
3572 // use a truncate to move it from fp stack reg to xmm reg.
3573 bool RoundAfterCopy = false;
3574 if ((VA.getLocReg() == X86::FP0 || VA.getLocReg() == X86::FP1) &&
3575 isScalarFPTypeInSSEReg(VA.getValVT())) {
3576 if (!Subtarget.hasX87())
3577 report_fatal_error("X87 register return with X87 disabled");
3578 CopyVT = MVT::f80;
3579 RoundAfterCopy = (CopyVT != VA.getLocVT());
3580 }
3581
3582 SDValue Val;
3583 if (VA.needsCustom()) {
3584 assert(VA.getValVT() == MVT::v64i1 &&
3585 "Currently the only custom case is when we split v64i1 to 2 regs");
3586 Val =
3587 getv64i1Argument(VA, RVLocs[++I], Chain, DAG, dl, Subtarget, &InFlag);
3588 } else {
3589 Chain = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), CopyVT, InFlag)
3590 .getValue(1);
3591 Val = Chain.getValue(0);
3592 InFlag = Chain.getValue(2);
3593 }
3594
3595 if (RoundAfterCopy)
3596 Val = DAG.getNode(ISD::FP_ROUND, dl, VA.getValVT(), Val,
3597 // This truncation won't change the value.
3598 DAG.getIntPtrConstant(1, dl, /*isTarget=*/true));
3599
3600 if (VA.isExtInLoc()) {
3601 if (VA.getValVT().isVector() &&
3602 VA.getValVT().getScalarType() == MVT::i1 &&
3603 ((VA.getLocVT() == MVT::i64) || (VA.getLocVT() == MVT::i32) ||
3604 (VA.getLocVT() == MVT::i16) || (VA.getLocVT() == MVT::i8))) {
3605 // promoting a mask type (v*i1) into a register of type i64/i32/i16/i8
3606 Val = lowerRegToMasks(Val, VA.getValVT(), VA.getLocVT(), dl, DAG);
3607 } else
3608 Val = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), Val);
3609 }
3610
3611 if (VA.getLocInfo() == CCValAssign::BCvt)
3612 Val = DAG.getBitcast(VA.getValVT(), Val);
3613
3614 InVals.push_back(Val);
3615 }
3616
3617 return Chain;
3618}
3619
3620//===----------------------------------------------------------------------===//
3621// C & StdCall & Fast Calling Convention implementation
3622//===----------------------------------------------------------------------===//
3623// The StdCall calling convention is standard for many Windows API
3624// routines. It differs from the C calling convention only slightly: the
3625// callee cleans up the stack rather than the caller, and symbols are
3626// decorated in a specific way. It doesn't support any vector arguments.
3627// For info on fast calling convention see Fast Calling Convention (tail call)
3628// implementation LowerX86_32FastCCCallTo.
3629
3630/// Determines whether Args, either a set of outgoing arguments to a call, or a
3631/// set of incoming args of a call, contains an sret pointer that the callee
3632/// pops
3633template <typename T>
3634static bool hasCalleePopSRet(const SmallVectorImpl<T> &Args,
3635 const X86Subtarget &Subtarget) {
3636 // Not C++20 (yet), so no concepts available.
3637 static_assert(std::is_same_v<T, ISD::OutputArg> ||
3638 std::is_same_v<T, ISD::InputArg>,
3639 "requires ISD::OutputArg or ISD::InputArg");
3640
3641 // Only 32-bit targets pop the sret. It's a 64-bit world these days, so early-out
3642 // for most compilations.
3643 if (!Subtarget.is32Bit())
3644 return false;
3645
3646 if (Args.empty())
3647 return false;
3648
3649 // Most calls do not have an sret argument, check the arg next.
3650 const ISD::ArgFlagsTy &Flags = Args[0].Flags;
3651 if (!Flags.isSRet() || Flags.isInReg())
3652 return false;
3653
3654 // The MSVC ABI does not pop the sret.
3655 if (Subtarget.getTargetTriple().isOSMSVCRT())
3656 return false;
3657
3658 // MCUs don't pop the sret
3659 if (Subtarget.isTargetMCU())
3660 return false;
3661
3662 // Callee pops argument
3663 return true;
3664}
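// Illustrative sketch (not part of this file): the same callee-pops-sret rule
// restated over plain booleans; the parameter names are made up and stand in
// for the Subtarget and argument-flag queries above.
static bool calleePopsSRetSketch(bool Is32Bit, bool FirstArgIsSRet,
                                 bool SRetInReg, bool IsMSVCRT, bool IsMCU) {
  if (!Is32Bit)
    return false;                  // only 32-bit targets pop the sret
  if (!FirstArgIsSRet || SRetInReg)
    return false;                  // no sret, or sret passed in a register
  if (IsMSVCRT || IsMCU)
    return false;                  // MSVC runtime and MCU targets don't pop it
  return true;                     // otherwise the callee pops the pointer
}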
3665
3666/// Make a copy of an aggregate at address specified by "Src" to address
3667/// "Dst" with size and alignment information specified by the specific
3668/// parameter attribute. The copy will be passed as a byval function parameter.
3669static SDValue CreateCopyOfByValArgument(SDValue Src, SDValue Dst,
3670 SDValue Chain, ISD::ArgFlagsTy Flags,
3671 SelectionDAG &DAG, const SDLoc &dl) {
3672 SDValue SizeNode = DAG.getIntPtrConstant(Flags.getByValSize(), dl);
3673
3674 return DAG.getMemcpy(
3675 Chain, dl, Dst, Src, SizeNode, Flags.getNonZeroByValAlign(),
3676 /*isVolatile*/ false, /*AlwaysInline=*/true,
3677 /*isTailCall*/ false, MachinePointerInfo(), MachinePointerInfo());
3678}
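// Illustrative sketch (not part of this file): at the C++ level, the memcpy
// created above is what gives a 'byval' pointer argument its pass-by-value
// semantics. Blob, Consume and Produce are made-up names for the example.
struct Blob { char Bytes[64]; };
int Consume(Blob B) { return B.Bytes[0]; } // B is typically lowered as a byval pointer
int Produce() {
  Blob B{};
  return Consume(B);                       // the callee sees its own copy of B
}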
3679
3680/// Return true if the calling convention is one that we can guarantee TCO for.
3681static bool canGuaranteeTCO(CallingConv::ID CC) {
3682 return (CC == CallingConv::Fast || CC == CallingConv::GHC ||
3683 CC == CallingConv::X86_RegCall || CC == CallingConv::HiPE ||
3684 CC == CallingConv::HHVM || CC == CallingConv::Tail ||
3685 CC == CallingConv::SwiftTail);
3686}
3687
3688/// Return true if we might ever do TCO for calls with this calling convention.
3689static bool mayTailCallThisCC(CallingConv::ID CC) {
3690 switch (CC) {
3691 // C calling conventions:
3692 case CallingConv::C:
3693 case CallingConv::Win64:
3694 case CallingConv::X86_64_SysV:
3695 // Callee pop conventions:
3696 case CallingConv::X86_ThisCall:
3697 case CallingConv::X86_StdCall:
3698 case CallingConv::X86_VectorCall:
3699 case CallingConv::X86_FastCall:
3700 // Swift:
3701 case CallingConv::Swift:
3702 return true;
3703 default:
3704 return canGuaranteeTCO(CC);
3705 }
3706}
3707
3708/// Return true if the function is being made into a tailcall target by
3709/// changing its ABI.
3710static bool shouldGuaranteeTCO(CallingConv::ID CC, bool GuaranteedTailCallOpt) {
3711 return (GuaranteedTailCallOpt && canGuaranteeTCO(CC)) ||
3712 CC == CallingConv::Tail || CC == CallingConv::SwiftTail;
3713}
3714
3715bool X86TargetLowering::mayBeEmittedAsTailCall(const CallInst *CI) const {
3716 if (!CI->isTailCall())
3717 return false;
3718
3719 CallingConv::ID CalleeCC = CI->getCallingConv();
3720 if (!mayTailCallThisCC(CalleeCC))
3721 return false;
3722
3723 return true;
3724}
3725
3726SDValue
3727X86TargetLowering::LowerMemArgument(SDValue Chain, CallingConv::ID CallConv,
3728 const SmallVectorImpl<ISD::InputArg> &Ins,
3729 const SDLoc &dl, SelectionDAG &DAG,
3730 const CCValAssign &VA,
3731 MachineFrameInfo &MFI, unsigned i) const {
3732 // Create the nodes corresponding to a load from this parameter slot.
3733 ISD::ArgFlagsTy Flags = Ins[i].Flags;
3734 bool AlwaysUseMutable = shouldGuaranteeTCO(
3735 CallConv, DAG.getTarget().Options.GuaranteedTailCallOpt);
3736 bool isImmutable = !AlwaysUseMutable && !Flags.isByVal();
3737 EVT ValVT;
3738 MVT PtrVT = getPointerTy(DAG.getDataLayout());
3739
3740 // If the value is passed by pointer, the address is passed instead of the
3741 // value itself. No need to extend if the mask value and location share the
3742 // same absolute size.
3743 bool ExtendedInMem =
3744 VA.isExtInLoc() && VA.getValVT().getScalarType() == MVT::i1 &&
3745 VA.getValVT().getSizeInBits() != VA.getLocVT().getSizeInBits();
3746
3747 if (VA.getLocInfo() == CCValAssign::Indirect || ExtendedInMem)
3748 ValVT = VA.getLocVT();
3749 else
3750 ValVT = VA.getValVT();
3751
3752 // FIXME: For now, all byval parameter objects are marked mutable. This can be
3753 // changed with more analysis.
3754 // In case of tail call optimization, mark all arguments mutable, since they
3755 // could be overwritten by the lowering of arguments for a tail call.
3756 if (Flags.isByVal()) {
3757 unsigned Bytes = Flags.getByValSize();
3758 if (Bytes == 0) Bytes = 1; // Don't create zero-sized stack objects.
3759
3760 // FIXME: For now, all byval parameter objects are marked as aliasing. This
3761 // can be improved with deeper analysis.
3762 int FI = MFI.CreateFixedObject(Bytes, VA.getLocMemOffset(), isImmutable,
3763 /*isAliased=*/true);
3764 return DAG.getFrameIndex(FI, PtrVT);
3765 }
3766
3767 EVT ArgVT = Ins[i].ArgVT;
3768
3769 // If this is a vector that has been split into multiple parts, and the
3770 // scalar size of the parts doesn't match the vector element size, then we can't
3771 // elide the copy. The parts will have padding between them instead of being
3772 // packed like a vector.
3773 bool ScalarizedAndExtendedVector =
3774 ArgVT.isVector() && !VA.getLocVT().isVector() &&
3775 VA.getLocVT().getSizeInBits() != ArgVT.getScalarSizeInBits();
3776
3777 // This is an argument in memory. We might be able to perform copy elision.
3778 // If the argument is passed directly in memory without any extension, then we
3779 // can perform copy elision. Large vector types, for example, may be passed
3780 // indirectly by pointer.
3781 if (Flags.isCopyElisionCandidate() &&
3782 VA.getLocInfo() != CCValAssign::Indirect && !ExtendedInMem &&
3783 !ScalarizedAndExtendedVector) {
3784 SDValue PartAddr;
3785 if (Ins[i].PartOffset == 0) {
3786 // If this is a one-part value or the first part of a multi-part value,
3787 // create a stack object for the entire argument value type and return a
3788 // load from our portion of it. This assumes that if the first part of an
3789 // argument is in memory, the rest will also be in memory.
3790 int FI = MFI.CreateFixedObject(ArgVT.getStoreSize(), VA.getLocMemOffset(),
3791 /*IsImmutable=*/false);
3792 PartAddr = DAG.getFrameIndex(FI, PtrVT);
3793 return DAG.getLoad(
3794 ValVT, dl, Chain, PartAddr,
3795 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI));
3796 } else {
3797 // This is not the first piece of an argument in memory. See if there is
3798 // already a fixed stack object including this offset. If so, assume it
3799 // was created by the PartOffset == 0 branch above and create a load from
3800 // the appropriate offset into it.
3801 int64_t PartBegin = VA.getLocMemOffset();
3802 int64_t PartEnd = PartBegin + ValVT.getSizeInBits() / 8;
3803 int FI = MFI.getObjectIndexBegin();
3804 for (; MFI.isFixedObjectIndex(FI); ++FI) {
3805 int64_t ObjBegin = MFI.getObjectOffset(FI);
3806 int64_t ObjEnd = ObjBegin + MFI.getObjectSize(FI);
3807 if (ObjBegin <= PartBegin && PartEnd <= ObjEnd)
3808 break;
3809 }
3810 if (MFI.isFixedObjectIndex(FI)) {
3811 SDValue Addr =
3812 DAG.getNode(ISD::ADD, dl, PtrVT, DAG.getFrameIndex(FI, PtrVT),
3813 DAG.getIntPtrConstant(Ins[i].PartOffset, dl));
3814 return DAG.getLoad(
3815 ValVT, dl, Chain, Addr,
3816 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI,
3817 Ins[i].PartOffset));
3818 }
3819 }
3820 }
3821
3822 int FI = MFI.CreateFixedObject(ValVT.getSizeInBits() / 8,
3823 VA.getLocMemOffset(), isImmutable);
3824
3825 // Set SExt or ZExt flag.
3826 if (VA.getLocInfo() == CCValAssign::ZExt) {
3827 MFI.setObjectZExt(FI, true);
3828 } else if (VA.getLocInfo() == CCValAssign::SExt) {
3829 MFI.setObjectSExt(FI, true);
3830 }
3831
3832 MaybeAlign Alignment;
3833 if (Subtarget.isTargetWindowsMSVC() && !Subtarget.is64Bit() &&
3834 ValVT != MVT::f80)
3835 Alignment = MaybeAlign(4);
3836 SDValue FIN = DAG.getFrameIndex(FI, PtrVT);
3837 SDValue Val = DAG.getLoad(
3838 ValVT, dl, Chain, FIN,
3839 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI),
3840 Alignment);
3841 return ExtendedInMem
3842 ? (VA.getValVT().isVector()
3843 ? DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VA.getValVT(), Val)
3844 : DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), Val))
3845 : Val;
3846}
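// Illustrative sketch (not part of this file): the PartOffset != 0 path above
// scans the fixed stack objects for one whose byte range already covers the
// part being loaded. The same containment search over plain (offset, size)
// pairs, with made-up names:
#include <cstdint>
#include <utility>
#include <vector>

static int findCoveringObject(
    const std::vector<std::pair<int64_t, int64_t>> &Objects, // (offset, size)
    int64_t PartBegin, int64_t PartEnd) {
  for (size_t I = 0; I != Objects.size(); ++I) {
    int64_t ObjBegin = Objects[I].first;
    int64_t ObjEnd = ObjBegin + Objects[I].second;
    if (ObjBegin <= PartBegin && PartEnd <= ObjEnd)
      return static_cast<int>(I); // found an object that contains the part
  }
  return -1;                      // none found; a new object must be created
}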
3847
3848// FIXME: Get this from tablegen.
3849static ArrayRef<MCPhysReg> get64BitArgumentGPRs(CallingConv::ID CallConv,
3850 const X86Subtarget &Subtarget) {
3851 assert(Subtarget.is64Bit());
3852
3853 if (Subtarget.isCallingConvWin64(CallConv)) {
3854 static const MCPhysReg GPR64ArgRegsWin64[] = {
3855 X86::RCX, X86::RDX, X86::R8, X86::R9
3856 };
3857 return ArrayRef(std::begin(GPR64ArgRegsWin64), std::end(GPR64ArgRegsWin64));
3858 }
3859
3860 static const MCPhysReg GPR64ArgRegs64Bit[] = {
3861 X86::RDI, X86::RSI, X86::RDX, X86::RCX, X86::R8, X86::R9
3862 };
3863 return ArrayRef(std::begin(GPR64ArgRegs64Bit), std::end(GPR64ArgRegs64Bit));
3864}
3865
3866// FIXME: Get this from tablegen.
3867static ArrayRef<MCPhysReg> get64BitArgumentXMMs(MachineFunction &MF,
3868 CallingConv::ID CallConv,
3869 const X86Subtarget &Subtarget) {
3870 assert(Subtarget.is64Bit());
3871 if (Subtarget.isCallingConvWin64(CallConv)) {
3872 // The XMM registers which might contain var arg parameters are shadowed
3873 // in their paired GPR. So we only need to save the GPR to their home
3874 // slots.
3875 // TODO: __vectorcall will change this.
3876 return std::nullopt;
3877 }
3878
3879 bool isSoftFloat = Subtarget.useSoftFloat();
3880 if (isSoftFloat || !Subtarget.hasSSE1())
3881 // Kernel mode asks for SSE to be disabled, so there are no XMM argument
3882 // registers.
3883 return std::nullopt;
3884
3885 static const MCPhysReg XMMArgRegs64Bit[] = {
3886 X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3,
3887 X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7
3888 };
3889 return ArrayRef(std::begin(XMMArgRegs64Bit), std::end(XMMArgRegs64Bit));
3890}
3891
3892#ifndef NDEBUG
3893static bool isSortedByValueNo(ArrayRef<CCValAssign> ArgLocs) {
3894 return llvm::is_sorted(
3895 ArgLocs, [](const CCValAssign &A, const CCValAssign &B) -> bool {
3896 return A.getValNo() < B.getValNo();
3897 });
3898}
3899#endif
3900
3901namespace {
3902 /// This is a helper class for lowering variable argument parameters.
3903class VarArgsLoweringHelper {
3904public:
3905 VarArgsLoweringHelper(X86MachineFunctionInfo *FuncInfo, const SDLoc &Loc,
3906 SelectionDAG &DAG, const X86Subtarget &Subtarget,
3907 CallingConv::ID CallConv, CCState &CCInfo)
3908 : FuncInfo(FuncInfo), DL(Loc), DAG(DAG), Subtarget(Subtarget),
3909 TheMachineFunction(DAG.getMachineFunction()),
3910 TheFunction(TheMachineFunction.getFunction()),
3911 FrameInfo(TheMachineFunction.getFrameInfo()),
3912 FrameLowering(*Subtarget.getFrameLowering()),
3913 TargLowering(DAG.getTargetLoweringInfo()), CallConv(CallConv),
3914 CCInfo(CCInfo) {}
3915
3916 // Lower variable argument parameters.
3917 void lowerVarArgsParameters(SDValue &Chain, unsigned StackSize);
3918
3919private:
3920 void createVarArgAreaAndStoreRegisters(SDValue &Chain, unsigned StackSize);
3921
3922 void forwardMustTailParameters(SDValue &Chain);
3923
3924 bool is64Bit() const { return Subtarget.is64Bit(); }
3925 bool isWin64() const { return Subtarget.isCallingConvWin64(CallConv); }
3926
3927 X86MachineFunctionInfo *FuncInfo;
3928 const SDLoc &DL;
3929 SelectionDAG &DAG;
3930 const X86Subtarget &Subtarget;
3931 MachineFunction &TheMachineFunction;
3932 const Function &TheFunction;
3933 MachineFrameInfo &FrameInfo;
3934 const TargetFrameLowering &FrameLowering;
3935 const TargetLowering &TargLowering;
3936 CallingConv::ID CallConv;
3937 CCState &CCInfo;
3938};
3939} // namespace
3940
3941void VarArgsLoweringHelper::createVarArgAreaAndStoreRegisters(
3942 SDValue &Chain, unsigned StackSize) {
3943 // If the function takes a variable number of arguments, make a frame index for
3944 // the start of the first vararg value... for expansion of llvm.va_start. We
3945 // can skip this if there are no va_start calls.
3946 if (is64Bit() || (CallConv != CallingConv::X86_FastCall &&
3947 CallConv != CallingConv::X86_ThisCall)) {
3948 FuncInfo->setVarArgsFrameIndex(
3949 FrameInfo.CreateFixedObject(1, StackSize, true));
3950 }
3951
3952 // 64-bit calling conventions support varargs and register parameters, so we
3953 // have to do extra work to spill them in the prologue.
3954 if (is64Bit()) {
3955 // Find the first unallocated argument registers.
3956 ArrayRef<MCPhysReg> ArgGPRs = get64BitArgumentGPRs(CallConv, Subtarget);
3957 ArrayRef<MCPhysReg> ArgXMMs =
3958 get64BitArgumentXMMs(TheMachineFunction, CallConv, Subtarget);
3959 unsigned NumIntRegs = CCInfo.getFirstUnallocated(ArgGPRs);
3960 unsigned NumXMMRegs = CCInfo.getFirstUnallocated(ArgXMMs);
3961
3962 assert(!(NumXMMRegs && !Subtarget.hasSSE1()) &&
3963        "SSE register cannot be used when SSE is disabled!");
3964
3965 if (isWin64()) {
3966 // Get to the caller-allocated home save location. Add 8 to account
3967 // for the return address.
3968 int HomeOffset = FrameLowering.getOffsetOfLocalArea() + 8;
3969 FuncInfo->setRegSaveFrameIndex(
3970 FrameInfo.CreateFixedObject(1, NumIntRegs * 8 + HomeOffset, false));
3971 // Fixup to set vararg frame on shadow area (4 x i64).
3972 if (NumIntRegs < 4)
3973 FuncInfo->setVarArgsFrameIndex(FuncInfo->getRegSaveFrameIndex());
3974 } else {
3975 // For X86-64, if there are vararg parameters that are passed via
3976 // registers, then we must store them to their spots on the stack so
3977 // they may be loaded by dereferencing the result of va_next.
3978 FuncInfo->setVarArgsGPOffset(NumIntRegs * 8);
3979 FuncInfo->setVarArgsFPOffset(ArgGPRs.size() * 8 + NumXMMRegs * 16);
3980 FuncInfo->setRegSaveFrameIndex(FrameInfo.CreateStackObject(
3981 ArgGPRs.size() * 8 + ArgXMMs.size() * 16, Align(16), false));
3982 }
3983
3984 SmallVector<SDValue, 6>
3985 LiveGPRs; // list of SDValue for GPR registers keeping live input value
3986 SmallVector<SDValue, 8> LiveXMMRegs; // list of SDValue for XMM registers
3987 // keeping live input value
3988 SDValue ALVal; // if applicable keeps SDValue for %al register
3989
3990 // Gather all the live in physical registers.
3991 for (MCPhysReg Reg : ArgGPRs.slice(NumIntRegs)) {
3992 Register GPR = TheMachineFunction.addLiveIn(Reg, &X86::GR64RegClass);
3993 LiveGPRs.push_back(DAG.getCopyFromReg(Chain, DL, GPR, MVT::i64));
3994 }
3995 const auto &AvailableXmms = ArgXMMs.slice(NumXMMRegs);
3996 if (!AvailableXmms.empty()) {
3997 Register AL = TheMachineFunction.addLiveIn(X86::AL, &X86::GR8RegClass);
3998 ALVal = DAG.getCopyFromReg(Chain, DL, AL, MVT::i8);
3999 for (MCPhysReg Reg : AvailableXmms) {
4000 // FastRegisterAllocator spills virtual registers at basic
4001 // block boundaries. That leads to uses of xmm registers
4002 // outside of the check for %al. Pass physical registers to
4003 // VASTART_SAVE_XMM_REGS to avoid unnecessary spilling.
4004 TheMachineFunction.getRegInfo().addLiveIn(Reg);
4005 LiveXMMRegs.push_back(DAG.getRegister(Reg, MVT::v4f32));
4006 }
4007 }
4008
4009 // Store the integer parameter registers.
4010 SmallVector<SDValue, 8> MemOps;
4011 SDValue RSFIN =
4012 DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(),
4013 TargLowering.getPointerTy(DAG.getDataLayout()));
4014 unsigned Offset = FuncInfo->getVarArgsGPOffset();
4015 for (SDValue Val : LiveGPRs) {
4016 SDValue FIN = DAG.getNode(ISD::ADD, DL,
4017 TargLowering.getPointerTy(DAG.getDataLayout()),
4018 RSFIN, DAG.getIntPtrConstant(Offset, DL));
4019 SDValue Store =
4020 DAG.getStore(Val.getValue(1), DL, Val, FIN,
4021 MachinePointerInfo::getFixedStack(
4022 DAG.getMachineFunction(),
4023 FuncInfo->getRegSaveFrameIndex(), Offset));
4024 MemOps.push_back(Store);
4025 Offset += 8;
4026 }
4027
4028 // Now store the XMM (fp + vector) parameter registers.
4029 if (!LiveXMMRegs.empty()) {
4030 SmallVector<SDValue, 12> SaveXMMOps;
4031 SaveXMMOps.push_back(Chain);
4032 SaveXMMOps.push_back(ALVal);
4033 SaveXMMOps.push_back(RSFIN);
4034 SaveXMMOps.push_back(
4035 DAG.getTargetConstant(FuncInfo->getVarArgsFPOffset(), DL, MVT::i32));
4036 llvm::append_range(SaveXMMOps, LiveXMMRegs);
4037 MachineMemOperand *StoreMMO =
4038 DAG.getMachineFunction().getMachineMemOperand(
4039 MachinePointerInfo::getFixedStack(
4040 DAG.getMachineFunction(), FuncInfo->getRegSaveFrameIndex(),
4041 Offset),
4042 MachineMemOperand::MOStore, 128, Align(16));
4043 MemOps.push_back(DAG.getMemIntrinsicNode(X86ISD::VASTART_SAVE_XMM_REGS,
4044 DL, DAG.getVTList(MVT::Other),
4045 SaveXMMOps, MVT::i8, StoreMMO));
4046 }
4047
4048 if (!MemOps.empty())
4049 Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOps);
4050 }
4051}
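// Illustrative sketch (not part of this file): the SysV x86-64 bookkeeping set
// up above, worked through with concrete numbers. For a call like
// printf(fmt, i, d) the named parameters consume 2 GPRs and 1 XMM register,
// so the va_list offsets and the register save area come out as follows
// (constants only, no LLVM types):
static void regSaveAreaExample() {
  const unsigned NumArgGPRs = 6, NumArgXMMs = 8; // RDI..R9, XMM0..XMM7
  const unsigned NumIntRegsUsed = 2, NumXMMRegsUsed = 1;

  unsigned GPOffset = NumIntRegsUsed * 8;                   // 16
  unsigned FPOffset = NumArgGPRs * 8 + NumXMMRegsUsed * 16; // 64
  unsigned SaveAreaSize = NumArgGPRs * 8 + NumArgXMMs * 16; // 176 bytes
  (void)GPOffset; (void)FPOffset; (void)SaveAreaSize;
}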
4052
4053void VarArgsLoweringHelper::forwardMustTailParameters(SDValue &Chain) {
4054 // Find the largest legal vector type.
4055 MVT VecVT = MVT::Other;
4056 // FIXME: Only some x86_32 calling conventions support AVX512.
4057 if (Subtarget.useAVX512Regs() &&
4058 (is64Bit() || (CallConv == CallingConv::X86_VectorCall ||
4059 CallConv == CallingConv::Intel_OCL_BI)))
4060 VecVT = MVT::v16f32;
4061 else if (Subtarget.hasAVX())
4062 VecVT = MVT::v8f32;
4063 else if (Subtarget.hasSSE2())
4064 VecVT = MVT::v4f32;
4065
4066 // We forward some GPRs and some vector types.
4067 SmallVector<MVT, 2> RegParmTypes;
4068 MVT IntVT = is64Bit() ? MVT::i64 : MVT::i32;
4069 RegParmTypes.push_back(IntVT);
4070 if (VecVT != MVT::Other)
4071 RegParmTypes.push_back(VecVT);
4072
4073 // Compute the set of forwarded registers. The rest are scratch.
4074 SmallVectorImpl<ForwardedRegister> &Forwards =
4075 FuncInfo->getForwardedMustTailRegParms();
4076 CCInfo.analyzeMustTailForwardedRegisters(Forwards, RegParmTypes, CC_X86);
4077
4078 // Forward AL for SysV x86_64 targets, since it is used for varargs.
4079 if (is64Bit() && !isWin64() && !CCInfo.isAllocated(X86::AL)) {
4080 Register ALVReg = TheMachineFunction.addLiveIn(X86::AL, &X86::GR8RegClass);
4081 Forwards.push_back(ForwardedRegister(ALVReg, X86::AL, MVT::i8));
4082 }
4083
4084 // Copy all forwards from physical to virtual registers.
4085 for (ForwardedRegister &FR : Forwards) {
4086 // FIXME: Can we use a less constrained schedule?
4087 SDValue RegVal = DAG.getCopyFromReg(Chain, DL, FR.VReg, FR.VT);
4088 FR.VReg = TheMachineFunction.getRegInfo().createVirtualRegister(
4089 TargLowering.getRegClassFor(FR.VT));
4090 Chain = DAG.getCopyToReg(Chain, DL, FR.VReg, RegVal);
4091 }
4092}
4093
4094void VarArgsLoweringHelper::lowerVarArgsParameters(SDValue &Chain,
4095 unsigned StackSize) {
4096 // Set FrameIndex to the 0xAAAAAAA value to mark the unset state.
4097 // If necessary, it will be set to the correct value later.
4098 FuncInfo->setVarArgsFrameIndex(0xAAAAAAA);
4099 FuncInfo->setRegSaveFrameIndex(0xAAAAAAA);
4100
4101 if (FrameInfo.hasVAStart())
4102 createVarArgAreaAndStoreRegisters(Chain, StackSize);
4103
4104 if (FrameInfo.hasMustTailInVarArgFunc())
4105 forwardMustTailParameters(Chain);
4106}
4107
4108SDValue X86TargetLowering::LowerFormalArguments(
4109 SDValue Chain, CallingConv::ID CallConv, bool IsVarArg,
4110 const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
4111 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
4112 MachineFunction &MF = DAG.getMachineFunction();
4113 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
4114
4115 const Function &F = MF.getFunction();
4116 if (F.hasExternalLinkage() && Subtarget.isTargetCygMing() &&
4117 F.getName() == "main")
4118 FuncInfo->setForceFramePointer(true);
4119
4120 MachineFrameInfo &MFI = MF.getFrameInfo();
4121 bool Is64Bit = Subtarget.is64Bit();
4122 bool IsWin64 = Subtarget.isCallingConvWin64(CallConv);
4123
4124 assert(
4125     !(IsVarArg && canGuaranteeTCO(CallConv)) &&
4126     "Var args not supported with calling conv' regcall, fastcc, ghc or hipe");
4127
4128 // Assign locations to all of the incoming arguments.
4129 SmallVector<CCValAssign, 16> ArgLocs;
4130 CCState CCInfo(CallConv, IsVarArg, MF, ArgLocs, *DAG.getContext());
4131
4132 // Allocate shadow area for Win64.
4133 if (IsWin64)
4134 CCInfo.AllocateStack(32, Align(8));
4135
4136 CCInfo.AnalyzeArguments(Ins, CC_X86);
4137
4138 // In vectorcall calling convention a second pass is required for the HVA
4139 // types.
4140 if (CallingConv::X86_VectorCall == CallConv) {
4141 CCInfo.AnalyzeArgumentsSecondPass(Ins, CC_X86);
4142 }
4143
4144 // The next loop assumes that the locations are in the same order as the
4145 // input arguments.
4146 assert(isSortedByValueNo(ArgLocs) &&
4147        "Argument Location list must be sorted before lowering");
4148
4149 SDValue ArgValue;
4150 for (unsigned I = 0, InsIndex = 0, E = ArgLocs.size(); I != E;
4151 ++I, ++InsIndex) {
4152 assert(InsIndex < Ins.size() && "Invalid Ins index");
4153 CCValAssign &VA = ArgLocs[I];
4154
4155 if (VA.isRegLoc()) {
4156 EVT RegVT = VA.getLocVT();
4157 if (VA.needsCustom()) {
4158 assert(
4159     VA.getValVT() == MVT::v64i1 &&
4160     "Currently the only custom case is when we split v64i1 to 2 regs");
4161
4162 // v64i1 values, in the regcall calling convention, that are
4163 // compiled for a 32-bit arch are split up into two registers.
4164 ArgValue =
4165 getv64i1Argument(VA, ArgLocs[++I], Chain, DAG, dl, Subtarget);
4166 } else {
4167 const TargetRegisterClass *RC;
4168 if (RegVT == MVT::i8)
4169 RC = &X86::GR8RegClass;
4170 else if (RegVT == MVT::i16)
4171 RC = &X86::GR16RegClass;
4172 else if (RegVT == MVT::i32)
4173 RC = &X86::GR32RegClass;
4174 else if (Is64Bit && RegVT == MVT::i64)
4175 RC = &X86::GR64RegClass;
4176 else if (RegVT == MVT::f16)
4177 RC = Subtarget.hasAVX512() ? &X86::FR16XRegClass : &X86::FR16RegClass;
4178 else if (RegVT == MVT::f32)
4179 RC = Subtarget.hasAVX512() ? &X86::FR32XRegClass : &X86::FR32RegClass;
4180 else if (RegVT == MVT::f64)
4181 RC = Subtarget.hasAVX512() ? &X86::FR64XRegClass : &X86::FR64RegClass;
4182 else if (RegVT == MVT::f80)
4183 RC = &X86::RFP80RegClass;
4184 else if (RegVT == MVT::f128)
4185 RC = &X86::VR128RegClass;
4186 else if (RegVT.is512BitVector())
4187 RC = &X86::VR512RegClass;
4188 else if (RegVT.is256BitVector())
4189 RC = Subtarget.hasVLX() ? &X86::VR256XRegClass : &X86::VR256RegClass;
4190 else if (RegVT.is128BitVector())
4191 RC = Subtarget.hasVLX() ? &X86::VR128XRegClass : &X86::VR128RegClass;
4192 else if (RegVT == MVT::x86mmx)
4193 RC = &X86::VR64RegClass;
4194 else if (RegVT == MVT::v1i1)
4195 RC = &X86::VK1RegClass;
4196 else if (RegVT == MVT::v8i1)
4197 RC = &X86::VK8RegClass;
4198 else if (RegVT == MVT::v16i1)
4199 RC = &X86::VK16RegClass;
4200 else if (RegVT == MVT::v32i1)
4201 RC = &X86::VK32RegClass;
4202 else if (RegVT == MVT::v64i1)
4203 RC = &X86::VK64RegClass;
4204 else
4205 llvm_unreachable("Unknown argument type!");
4206
4207 Register Reg = MF.addLiveIn(VA.getLocReg(), RC);
4208 ArgValue = DAG.getCopyFromReg(Chain, dl, Reg, RegVT);
4209 }
4210
4211 // If this is an 8 or 16-bit value, it is really passed promoted to 32
4212 // bits. Insert an assert[sz]ext to capture this, then truncate to the
4213 // right size.
4214 if (VA.getLocInfo() == CCValAssign::SExt)
4215 ArgValue = DAG.getNode(ISD::AssertSext, dl, RegVT, ArgValue,
4216 DAG.getValueType(VA.getValVT()));
4217 else if (VA.getLocInfo() == CCValAssign::ZExt)
4218 ArgValue = DAG.getNode(ISD::AssertZext, dl, RegVT, ArgValue,
4219 DAG.getValueType(VA.getValVT()));
4220 else if (VA.getLocInfo() == CCValAssign::BCvt)
4221 ArgValue = DAG.getBitcast(VA.getValVT(), ArgValue);
4222
4223 if (VA.isExtInLoc()) {
4224 // Handle MMX values passed in XMM regs.
4225 if (RegVT.isVector() && VA.getValVT().getScalarType() != MVT::i1)
4226 ArgValue = DAG.getNode(X86ISD::MOVDQ2Q, dl, VA.getValVT(), ArgValue);
4227 else if (VA.getValVT().isVector() &&
4228 VA.getValVT().getScalarType() == MVT::i1 &&
4229 ((VA.getLocVT() == MVT::i64) || (VA.getLocVT() == MVT::i32) ||
4230 (VA.getLocVT() == MVT::i16) || (VA.getLocVT() == MVT::i8))) {
4231 // Promoting a mask type (v*i1) into a register of type i64/i32/i16/i8
4232 ArgValue = lowerRegToMasks(ArgValue, VA.getValVT(), RegVT, dl, DAG);
4233 } else
4234 ArgValue = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), ArgValue);
4235 }
4236 } else {
4237 assert(VA.isMemLoc());
4238 ArgValue =
4239 LowerMemArgument(Chain, CallConv, Ins, dl, DAG, VA, MFI, InsIndex);
4240 }
4241
4242 // If the value is passed via a pointer, do a load.
4243 if (VA.getLocInfo() == CCValAssign::Indirect && !Ins[I].Flags.isByVal())
4244 ArgValue =
4245 DAG.getLoad(VA.getValVT(), dl, Chain, ArgValue, MachinePointerInfo());
4246
4247 InVals.push_back(ArgValue);
4248 }
4249
4250 for (unsigned I = 0, E = Ins.size(); I != E; ++I) {
4251 if (Ins[I].Flags.isSwiftAsync()) {
4252 auto X86FI = MF.getInfo<X86MachineFunctionInfo>();
4253 if (Subtarget.is64Bit())
4254 X86FI->setHasSwiftAsyncContext(true);
4255 else {
4256 int FI = MF.getFrameInfo().CreateStackObject(4, Align(4), false);
4257 X86FI->setSwiftAsyncContextFrameIdx(FI);
4258 SDValue St = DAG.getStore(DAG.getEntryNode(), dl, InVals[I],
4259 DAG.getFrameIndex(FI, MVT::i32),
4260 MachinePointerInfo::getFixedStack(MF, FI));
4261 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, St, Chain);
4262 }
4263 }
4264
4265 // The Swift calling convention does not require us to copy the sret argument
4266 // into %rax/%eax for the return. We don't set SRetReturnReg for Swift.
4267 if (CallConv == CallingConv::Swift || CallConv == CallingConv::SwiftTail)
4268 continue;
4269
4270 // All x86 ABIs require that for returning structs by value we copy the
4271 // sret argument into %rax/%eax (depending on ABI) for the return. Save
4272 // the argument into a virtual register so that we can access it from the
4273 // return points.
4274 if (Ins[I].Flags.isSRet()) {
4275 assert(!FuncInfo->getSRetReturnReg() &&
4276        "SRet return has already been set");
4277 MVT PtrTy = getPointerTy(DAG.getDataLayout());
4278 Register Reg =
4279 MF.getRegInfo().createVirtualRegister(getRegClassFor(PtrTy));
4280 FuncInfo->setSRetReturnReg(Reg);
4281 SDValue Copy = DAG.getCopyToReg(DAG.getEntryNode(), dl, Reg, InVals[I]);
4282 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Copy, Chain);
4283 break;
4284 }
4285 }
4286
4287 unsigned StackSize = CCInfo.getNextStackOffset();
4288 // Align stack specially for tail calls.
4289 if (shouldGuaranteeTCO(CallConv,
4290 MF.getTarget().Options.GuaranteedTailCallOpt))
4291 StackSize = GetAlignedArgumentStackSize(StackSize, DAG);
4292
4293 if (IsVarArg)
4294 VarArgsLoweringHelper(FuncInfo, dl, DAG, Subtarget, CallConv, CCInfo)
4295 .lowerVarArgsParameters(Chain, StackSize);
4296
4297 // Some CCs need callee pop.
4298 if (X86::isCalleePop(CallConv, Is64Bit, IsVarArg,
4299 MF.getTarget().Options.GuaranteedTailCallOpt)) {
4300 FuncInfo->setBytesToPopOnReturn(StackSize); // Callee pops everything.
4301 } else if (CallConv == CallingConv::X86_INTR && Ins.size() == 2) {
4302 // X86 interrupts must pop the error code (and the alignment padding) if
4303 // present.
4304 FuncInfo->setBytesToPopOnReturn(Is64Bit ? 16 : 4);
4305 } else {
4306 FuncInfo->setBytesToPopOnReturn(0); // Callee pops nothing.
4307 // If this is an sret function, the return should pop the hidden pointer.
4308 if (!canGuaranteeTCO(CallConv) && hasCalleePopSRet(Ins, Subtarget))
4309 FuncInfo->setBytesToPopOnReturn(4);
4310 }
4311
4312 if (!Is64Bit) {
4313 // RegSaveFrameIndex is X86-64 only.
4314 FuncInfo->setRegSaveFrameIndex(0xAAAAAAA);
4315 }
4316
4317 FuncInfo->setArgumentStackSize(StackSize);
4318
4319 if (WinEHFuncInfo *EHInfo = MF.getWinEHFuncInfo()) {
4320 EHPersonality Personality = classifyEHPersonality(F.getPersonalityFn());
4321 if (Personality == EHPersonality::CoreCLR) {
4322 assert(Is64Bit);
4323 // TODO: Add a mechanism to frame lowering that will allow us to indicate
4324 // that we'd prefer this slot be allocated towards the bottom of the frame
4325 // (i.e. near the stack pointer after allocating the frame). Every
4326 // funclet needs a copy of this slot in its (mostly empty) frame, and the
4327 // offset from the bottom of this and each funclet's frame must be the
4328 // same, so the size of funclets' (mostly empty) frames is dictated by
4329 // how far this slot is from the bottom (since they allocate just enough
4330 // space to accommodate holding this slot at the correct offset).
4331 int PSPSymFI = MFI.CreateStackObject(8, Align(8), /*isSpillSlot=*/false);
4332 EHInfo->PSPSymFrameIdx = PSPSymFI;
4333 }
4334 }
4335
4336 if (shouldDisableCalleeSavedRegisterCC(CallConv) ||
4337 F.hasFnAttribute("no_caller_saved_registers")) {
4338 MachineRegisterInfo &MRI = MF.getRegInfo();
4339 for (std::pair<Register, Register> Pair : MRI.liveins())
4340 MRI.disableCalleeSavedRegister(Pair.first);
4341 }
4342
4343 return Chain;
4344}
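// Illustrative sketch (not part of this file): the SRetReturnReg handling above
// exists because returning an aggregate by value goes through a hidden sret
// pointer, and the x86 ABIs require that pointer to be handed back in %rax
// (or %eax on 32-bit). Big and MakeBig are made-up names.
struct Big { long Payload[8]; };
Big MakeBig() {
  // Lowered as: the caller passes a hidden sret pointer, the callee fills it
  // in and returns that same pointer in %rax/%eax.
  return Big{};
}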
4345
4346SDValue X86TargetLowering::LowerMemOpCallTo(SDValue Chain, SDValue StackPtr,
4347 SDValue Arg, const SDLoc &dl,
4348 SelectionDAG &DAG,
4349 const CCValAssign &VA,
4350 ISD::ArgFlagsTy Flags,
4351 bool isByVal) const {
4352 unsigned LocMemOffset = VA.getLocMemOffset();
4353 SDValue PtrOff = DAG.getIntPtrConstant(LocMemOffset, dl);
4354 PtrOff = DAG.getNode(ISD::ADD, dl, getPointerTy(DAG.getDataLayout()),
4355 StackPtr, PtrOff);
4356 if (isByVal)
4357 return CreateCopyOfByValArgument(Arg, PtrOff, Chain, Flags, DAG, dl);
4358
4359 MaybeAlign Alignment;
4360 if (Subtarget.isTargetWindowsMSVC() && !Subtarget.is64Bit() &&
4361 Arg.getSimpleValueType() != MVT::f80)
4362 Alignment = MaybeAlign(4);
4363 return DAG.getStore(
4364 Chain, dl, Arg, PtrOff,
4365 MachinePointerInfo::getStack(DAG.getMachineFunction(), LocMemOffset),
4366 Alignment);
4367}
4368
4369/// Emit a load of return address if tail call
4370/// optimization is performed and it is required.
4371SDValue X86TargetLowering::EmitTailCallLoadRetAddr(
4372 SelectionDAG &DAG, SDValue &OutRetAddr, SDValue Chain, bool IsTailCall,
4373 bool Is64Bit, int FPDiff, const SDLoc &dl) const {
4374 // Adjust the Return address stack slot.
4375 EVT VT = getPointerTy(DAG.getDataLayout());
4376 OutRetAddr = getReturnAddressFrameIndex(DAG);
4377
4378 // Load the "old" Return address.
4379 OutRetAddr = DAG.getLoad(VT, dl, Chain, OutRetAddr, MachinePointerInfo());
4380 return SDValue(OutRetAddr.getNode(), 1);
4381}
4382
4383/// Emit a store of the return address if tail call
4384/// optimization is performed and it is required (FPDiff!=0).
4385static SDValue EmitTailCallStoreRetAddr(SelectionDAG &DAG, MachineFunction &MF,
4386 SDValue Chain, SDValue RetAddrFrIdx,
4387 EVT PtrVT, unsigned SlotSize,
4388 int FPDiff, const SDLoc &dl) {
4389 // Store the return address to the appropriate stack slot.
4390 if (!FPDiff) return Chain;
4391 // Calculate the new stack slot for the return address.
4392 int NewReturnAddrFI =
4393 MF.getFrameInfo().CreateFixedObject(SlotSize, (int64_t)FPDiff - SlotSize,
4394 false);
4395 SDValue NewRetAddrFrIdx = DAG.getFrameIndex(NewReturnAddrFI, PtrVT);
4396 Chain = DAG.getStore(Chain, dl, RetAddrFrIdx, NewRetAddrFrIdx,
4397 MachinePointerInfo::getFixedStack(
4398 DAG.getMachineFunction(), NewReturnAddrFI));
4399 return Chain;
4400}
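// Illustrative sketch (not part of this file): the FPDiff arithmetic used by
// the two helpers above, worked through with made-up numbers. If the caller
// reserved 16 bytes of incoming argument space but the guaranteed tail call
// needs 32 bytes, the return address slot has to move:
static void fpDiffExample() {
  int BytesCallerPushed = 16;                // X86Info->getBytesToPopOnReturn()
  int NumBytes = 32;                         // outgoing argument bytes
  int SlotSize = 8;                          // return address slot on x86-64
  int FPDiff = BytesCallerPushed - NumBytes; // -16
  int NewRetAddrOffset = FPDiff - SlotSize;  // -24, the new fixed-object offset
  (void)FPDiff; (void)NewRetAddrOffset;
}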
4401
4402/// Returns a vector_shuffle mask for an movs{s|d}, movd
4403/// operation of specified width.
4404static SDValue getMOVL(SelectionDAG &DAG, const SDLoc &dl, MVT VT, SDValue V1,
4405 SDValue V2) {
4406 unsigned NumElems = VT.getVectorNumElements();
4407 SmallVector<int, 8> Mask;
4408 Mask.push_back(NumElems);
4409 for (unsigned i = 1; i != NumElems; ++i)
4410 Mask.push_back(i);
4411 return DAG.getVectorShuffle(VT, dl, V1, V2, Mask);
4412}
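// Illustrative sketch (not part of this file): the shuffle mask getMOVL builds,
// computed with plain ints. Lane 0 is taken from the second operand and lanes
// 1..N-1 from the first, which is the movss/movsd "merge one scalar into a
// vector" behaviour.
#include <vector>
static std::vector<int> movlMaskSketch(unsigned NumElems) {
  std::vector<int> Mask;
  Mask.push_back(static_cast<int>(NumElems)); // element 0 of V2
  for (unsigned I = 1; I != NumElems; ++I)
    Mask.push_back(static_cast<int>(I));      // elements 1..N-1 of V1
  return Mask;                                // NumElems == 4 -> {4, 1, 2, 3}
}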
4413
4414SDValue
4415X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
4416 SmallVectorImpl<SDValue> &InVals) const {
4417 SelectionDAG &DAG = CLI.DAG;
4418 SDLoc &dl = CLI.DL;
4419 SmallVectorImpl<ISD::OutputArg> &Outs = CLI.Outs;
4420 SmallVectorImpl<SDValue> &OutVals = CLI.OutVals;
4421 SmallVectorImpl<ISD::InputArg> &Ins = CLI.Ins;
4422 SDValue Chain = CLI.Chain;
4423 SDValue Callee = CLI.Callee;
4424 CallingConv::ID CallConv = CLI.CallConv;
4425 bool &isTailCall = CLI.IsTailCall;
4426 bool isVarArg = CLI.IsVarArg;
4427 const auto *CB = CLI.CB;
4428
4429 MachineFunction &MF = DAG.getMachineFunction();
4430 bool Is64Bit = Subtarget.is64Bit();
4431 bool IsWin64 = Subtarget.isCallingConvWin64(CallConv);
4432 bool IsSibcall = false;
4433 bool IsGuaranteeTCO = MF.getTarget().Options.GuaranteedTailCallOpt ||
4434 CallConv == CallingConv::Tail || CallConv == CallingConv::SwiftTail;
4435 bool IsCalleePopSRet = !IsGuaranteeTCO && hasCalleePopSRet(Outs, Subtarget);
4436 X86MachineFunctionInfo *X86Info = MF.getInfo<X86MachineFunctionInfo>();
4437 bool HasNCSR = (CB && isa<CallInst>(CB) &&
4438 CB->hasFnAttr("no_caller_saved_registers"));
4439 bool HasNoCfCheck = (CB && CB->doesNoCfCheck());
4440 bool IsIndirectCall = (CB && isa<CallInst>(CB) && CB->isIndirectCall());
4441 bool IsCFICall = IsIndirectCall && CLI.CFIType;
4442 const Module *M = MF.getMMI().getModule();
4443 Metadata *IsCFProtectionSupported = M->getModuleFlag("cf-protection-branch");
4444
4445 MachineFunction::CallSiteInfo CSInfo;
4446 if (CallConv == CallingConv::X86_INTR)
4447 report_fatal_error("X86 interrupts may not be called directly");
4448
4449 bool IsMustTail = CLI.CB && CLI.CB->isMustTailCall();
4450 if (Subtarget.isPICStyleGOT() && !IsGuaranteeTCO && !IsMustTail) {
4451 // If we are using a GOT, disable tail calls to external symbols with
4452 // default visibility. Tail calling such a symbol requires using a GOT
4453 // relocation, which forces early binding of the symbol. This breaks code
4454 // that requires lazy function symbol resolution. Using musttail or
4455 // GuaranteedTailCallOpt will override this.
4456 GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee);
4457 if (!G || (!G->getGlobal()->hasLocalLinkage() &&
4458 G->getGlobal()->hasDefaultVisibility()))
4459 isTailCall = false;
4460 }
4461
4462 if (isTailCall && !IsMustTail) {
4463 // Check if it's really possible to do a tail call.
4464 isTailCall = IsEligibleForTailCallOptimization(
4465 Callee, CallConv, IsCalleePopSRet, isVarArg, CLI.RetTy, Outs, OutVals,
4466 Ins, DAG);
4467
4468 // Sibcalls are automatically detected tailcalls which do not require
4469 // ABI changes.
4470 if (!IsGuaranteeTCO && isTailCall)
4471 IsSibcall = true;
4472
4473 if (isTailCall)
4474 ++NumTailCalls;
4475 }
4476
4477 if (IsMustTail && !isTailCall)
4478 report_fatal_error("failed to perform tail call elimination on a call "
4479 "site marked musttail");
4480
4481 assert(!(isVarArg && canGuaranteeTCO(CallConv)) &&
4482        "Var args not supported with calling convention fastcc, ghc or hipe");
4483
4484 // Analyze operands of the call, assigning locations to each operand.
4485 SmallVector<CCValAssign, 16> ArgLocs;
4486 CCState CCInfo(CallConv, isVarArg, MF, ArgLocs, *DAG.getContext());
4487
4488 // Allocate shadow area for Win64.
4489 if (IsWin64)
4490 CCInfo.AllocateStack(32, Align(8));
4491
4492 CCInfo.AnalyzeArguments(Outs, CC_X86);
4493
4494 // In vectorcall calling convention a second pass is required for the HVA
4495 // types.
4496 if (CallingConv::X86_VectorCall == CallConv) {
4497 CCInfo.AnalyzeArgumentsSecondPass(Outs, CC_X86);
4498 }
4499
4500 // Get a count of how many bytes are to be pushed on the stack.
4501 unsigned NumBytes = CCInfo.getAlignedCallFrameSize();
4502 if (IsSibcall)
4503 // This is a sibcall. The memory operands are available in caller's
4504 // own caller's stack.
4505 NumBytes = 0;
4506 else if (IsGuaranteeTCO && canGuaranteeTCO(CallConv))
4507 NumBytes = GetAlignedArgumentStackSize(NumBytes, DAG);
4508
4509 int FPDiff = 0;
4510 if (isTailCall &&
4511 shouldGuaranteeTCO(CallConv,
4512 MF.getTarget().Options.GuaranteedTailCallOpt)) {
4513 // Lower arguments at fp - stackoffset + fpdiff.
4514 unsigned NumBytesCallerPushed = X86Info->getBytesToPopOnReturn();
4515
4516 FPDiff = NumBytesCallerPushed - NumBytes;
4517
4518 // Set the delta of movement of the returnaddr stackslot.
4519 // But only set if delta is greater than previous delta.
4520 if (FPDiff < X86Info->getTCReturnAddrDelta())
4521 X86Info->setTCReturnAddrDelta(FPDiff);
4522 }
4523
4524 unsigned NumBytesToPush = NumBytes;
4525 unsigned NumBytesToPop = NumBytes;
4526
4527 // If we have an inalloca argument, all stack space has already been allocated
4528 // for us and is right at the top of the stack. We don't support multiple
4529 // arguments passed in memory when using inalloca.
4530 if (!Outs.empty() && Outs.back().Flags.isInAlloca()) {
4531 NumBytesToPush = 0;
4532 if (!ArgLocs.back().isMemLoc())
4533 report_fatal_error("cannot use inalloca attribute on a register "
4534 "parameter");
4535 if (ArgLocs.back().getLocMemOffset() != 0)
4536 report_fatal_error("any parameter with the inalloca attribute must be "
4537 "the only memory argument");
4538 } else if (CLI.IsPreallocated) {
4539 assert(ArgLocs.back().isMemLoc() &&
4540        "cannot use preallocated attribute on a register "
4541        "parameter");
4542 SmallVector<size_t, 4> PreallocatedOffsets;
4543 for (size_t i = 0; i < CLI.OutVals.size(); ++i) {
4544 if (CLI.CB->paramHasAttr(i, Attribute::Preallocated)) {
4545 PreallocatedOffsets.push_back(ArgLocs[i].getLocMemOffset());
4546 }
4547 }
4548 auto *MFI = DAG.getMachineFunction().getInfo<X86MachineFunctionInfo>();
4549 size_t PreallocatedId = MFI->getPreallocatedIdForCallSite(CLI.CB);
4550 MFI->setPreallocatedStackSize(PreallocatedId, NumBytes);
4551 MFI->setPreallocatedArgOffsets(PreallocatedId, PreallocatedOffsets);
4552 NumBytesToPush = 0;
4553 }
4554
4555 if (!IsSibcall && !IsMustTail)
4556 Chain = DAG.getCALLSEQ_START(Chain, NumBytesToPush,
4557 NumBytes - NumBytesToPush, dl);
4558
4559 SDValue RetAddrFrIdx;
4560 // Load return address for tail calls.
4561 if (isTailCall && FPDiff)
4562 Chain = EmitTailCallLoadRetAddr(DAG, RetAddrFrIdx, Chain, isTailCall,
4563 Is64Bit, FPDiff, dl);
4564
4565 SmallVector<std::pair<Register, SDValue>, 8> RegsToPass;
4566 SmallVector<SDValue, 8> MemOpChains;
4567 SDValue StackPtr;
4568
4569 // The next loop assumes that the locations are in the same order as the
4570 // input arguments.
4571 assert(isSortedByValueNo(ArgLocs) &&(static_cast <bool> (isSortedByValueNo(ArgLocs) &&
"Argument Location list must be sorted before lowering") ? void
(0) : __assert_fail ("isSortedByValueNo(ArgLocs) && \"Argument Location list must be sorted before lowering\""
, "llvm/lib/Target/X86/X86ISelLowering.cpp", 4572, __extension__
__PRETTY_FUNCTION__))
4572 "Argument Location list must be sorted before lowering")(static_cast <bool> (isSortedByValueNo(ArgLocs) &&
"Argument Location list must be sorted before lowering") ? void
(0) : __assert_fail ("isSortedByValueNo(ArgLocs) && \"Argument Location list must be sorted before lowering\""
, "llvm/lib/Target/X86/X86ISelLowering.cpp", 4572, __extension__
__PRETTY_FUNCTION__))
;
4573
4574 // Walk the register/memloc assignments, inserting copies/loads. In the case
4575 // of tail call optimization, arguments are handled later.
4576 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
4577 for (unsigned I = 0, OutIndex = 0, E = ArgLocs.size(); I != E;
4578 ++I, ++OutIndex) {
4579 assert(OutIndex < Outs.size() && "Invalid Out index");
4580 // Skip inalloca/preallocated arguments, they have already been written.
4581 ISD::ArgFlagsTy Flags = Outs[OutIndex].Flags;
4582 if (Flags.isInAlloca() || Flags.isPreallocated())
4583 continue;
4584
4585 CCValAssign &VA = ArgLocs[I];
4586 EVT RegVT = VA.getLocVT();
4587 SDValue Arg = OutVals[OutIndex];
4588 bool isByVal = Flags.isByVal();
4589
4590 // Promote the value if needed.
4591 switch (VA.getLocInfo()) {
4592 default: llvm_unreachable("Unknown loc info!");
4593 case CCValAssign::Full: break;
4594 case CCValAssign::SExt:
4595 Arg = DAG.getNode(ISD::SIGN_EXTEND, dl, RegVT, Arg);
4596 break;
4597 case CCValAssign::ZExt:
4598 Arg = DAG.getNode(ISD::ZERO_EXTEND, dl, RegVT, Arg);
4599 break;
4600 case CCValAssign::AExt:
4601 if (Arg.getValueType().isVector() &&
4602 Arg.getValueType().getVectorElementType() == MVT::i1)
4603 Arg = lowerMasksToReg(Arg, RegVT, dl, DAG);
4604 else if (RegVT.is128BitVector()) {
4605 // Special case: passing MMX values in XMM registers.
4606 Arg = DAG.getBitcast(MVT::i64, Arg);
4607 Arg = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Arg);
4608 Arg = getMOVL(DAG, dl, MVT::v2i64, DAG.getUNDEF(MVT::v2i64), Arg);
4609 } else
4610 Arg = DAG.getNode(ISD::ANY_EXTEND, dl, RegVT, Arg);
4611 break;
4612 case CCValAssign::BCvt:
4613 Arg = DAG.getBitcast(RegVT, Arg);
4614 break;
4615 case CCValAssign::Indirect: {
4616 if (isByVal) {
4617 // Memcpy the argument to a temporary stack slot to prevent
4618 // the caller from seeing any modifications the callee may make
4619 // as guaranteed by the `byval` attribute.
4620 int FrameIdx = MF.getFrameInfo().CreateStackObject(
4621 Flags.getByValSize(),
4622 std::max(Align(16), Flags.getNonZeroByValAlign()), false);
4623 SDValue StackSlot =
4624 DAG.getFrameIndex(FrameIdx, getPointerTy(DAG.getDataLayout()));
4625 Chain =
4626 CreateCopyOfByValArgument(Arg, StackSlot, Chain, Flags, DAG, dl);
4627 // From now on treat this as a regular pointer
4628 Arg = StackSlot;
4629 isByVal = false;
4630 } else {
4631 // Store the argument.
4632 SDValue SpillSlot = DAG.CreateStackTemporary(VA.getValVT());
4633 int FI = cast<FrameIndexSDNode>(SpillSlot)->getIndex();
4634 Chain = DAG.getStore(
4635 Chain, dl, Arg, SpillSlot,
4636 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI));
4637 Arg = SpillSlot;
4638 }
4639 break;
4640 }
4641 }
4642
4643 if (VA.needsCustom()) {
4644 assert(VA.getValVT() == MVT::v64i1 &&
4645        "Currently the only custom case is when we split v64i1 to 2 regs");
4646 // Split v64i1 value into two registers
4647 Passv64i1ArgInRegs(dl, DAG, Arg, RegsToPass, VA, ArgLocs[++I], Subtarget);
4648 } else if (VA.isRegLoc()) {
4649 RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
4650 const TargetOptions &Options = DAG.getTarget().Options;
4651 if (Options.EmitCallSiteInfo)
4652 CSInfo.emplace_back(VA.getLocReg(), I);
4653 if (isVarArg && IsWin64) {
4654 // Win64 ABI requires argument XMM reg to be copied to the corresponding
4655 // shadow reg if callee is a varargs function.
4656 Register ShadowReg;
4657 switch (VA.getLocReg()) {
4658 case X86::XMM0: ShadowReg = X86::RCX; break;
4659 case X86::XMM1: ShadowReg = X86::RDX; break;
4660 case X86::XMM2: ShadowReg = X86::R8; break;
4661 case X86::XMM3: ShadowReg = X86::R9; break;
4662 }
4663 if (ShadowReg)
4664 RegsToPass.push_back(std::make_pair(ShadowReg, Arg));
4665 }
4666 } else if (!IsSibcall && (!isTailCall || isByVal)) {
4667 assert(VA.isMemLoc());
4668 if (!StackPtr.getNode())
4669 StackPtr = DAG.getCopyFromReg(Chain, dl, RegInfo->getStackRegister(),
4670 getPointerTy(DAG.getDataLayout()));
4671 MemOpChains.push_back(LowerMemOpCallTo(Chain, StackPtr, Arg,
4672 dl, DAG, VA, Flags, isByVal));
4673 }
4674 }
4675
4676 if (!MemOpChains.empty())
4677 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains);
4678
4679 if (Subtarget.isPICStyleGOT()) {
4680 // ELF / PIC requires GOT in the EBX register before function calls via PLT
4681 // GOT pointer (except regcall).
4682 if (!isTailCall) {
4683 // An indirect call with the RegCall calling convention may use up all the
4684 // general registers, so it is not suitable to bind the EBX register for the
4685 // GOT address; just let the register allocator handle it.
4686 if (CallConv != CallingConv::X86_RegCall)
4687 RegsToPass.push_back(std::make_pair(
4688 Register(X86::EBX), DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(),
4689 getPointerTy(DAG.getDataLayout()))));
4690 } else {
4691 // If we are tail calling and generating PIC/GOT style code load the
4692 // address of the callee into ECX. The value in ecx is used as target of
4693 // the tail jump. This is done to circumvent the ebx/callee-saved problem
4694 // for tail calls on PIC/GOT architectures. Normally we would just put the
4695 // address of GOT into ebx and then call target@PLT. But for tail calls
4696 // ebx would be restored (since ebx is callee saved) before jumping to the
4697 // target@PLT.
4698
4699 // Note: The actual moving to ECX is done further down.
4700 GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee);
4701 if (G && !G->getGlobal()->hasLocalLinkage() &&
4702 G->getGlobal()->hasDefaultVisibility())
4703 Callee = LowerGlobalAddress(Callee, DAG);
4704 else if (isa<ExternalSymbolSDNode>(Callee))
4705 Callee = LowerExternalSymbol(Callee, DAG);
4706 }
4707 }
4708
4709 if (Is64Bit && isVarArg && !IsWin64 && !IsMustTail &&
4710 (Subtarget.hasSSE1() || !M->getModuleFlag("SkipRaxSetup"))) {
4711 // From AMD64 ABI document:
4712 // For calls that may call functions that use varargs or stdargs
4713 // (prototype-less calls or calls to functions containing ellipsis (...) in
4714 // the declaration) %al is used as hidden argument to specify the number
4715 // of SSE registers used. The contents of %al do not need to match exactly
4716 // the number of registers, but must be an upper bound on the number of SSE
4717 // registers used and be in the range 0 - 8 inclusive.
4718
4719 // Count the number of XMM registers allocated.
4720 static const MCPhysReg XMMArgRegs[] = {
4721 X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3,
4722 X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7
4723 };
4724 unsigned NumXMMRegs = CCInfo.getFirstUnallocated(XMMArgRegs);
4725 assert((Subtarget.hasSSE1() || !NumXMMRegs) &&
4726        "SSE registers cannot be used when SSE is disabled");
4727 RegsToPass.push_back(std::make_pair(Register(X86::AL),
4728 DAG.getConstant(NumXMMRegs, dl,
4729 MVT::i8)));
4730 }
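// Illustrative sketch (not part of this file, meant as its own small
// translation unit): the %al convention described above as seen from C++. For
// this variadic call the compiler passes 2.5 in %xmm0 and materializes an
// upper bound on the number of vector registers used (1 here) into %al right
// before the call -- the same value copied into X86::AL above.
#include <cstdio>
int main() {
  std::printf("%f\n", 2.5);
  return 0;
}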
4731
4732 if (isVarArg && IsMustTail) {
4733 const auto &Forwards = X86Info->getForwardedMustTailRegParms();
4734 for (const auto &F : Forwards) {
4735 SDValue Val = DAG.getCopyFromReg(Chain, dl, F.VReg, F.VT);
4736 RegsToPass.push_back(std::make_pair(F.PReg, Val));
4737 }
4738 }
4739
4740 // For tail calls lower the arguments to the 'real' stack slots. Sibcalls
4741 // don't need this because the eligibility check rejects calls that require
4742 // shuffling arguments passed in memory.
4743 if (!IsSibcall && isTailCall) {
4744 // Force all the incoming stack arguments to be loaded from the stack
4745 // before any new outgoing arguments are stored to the stack, because the
4746 // outgoing stack slots may alias the incoming argument stack slots, and
4747 // the alias isn't otherwise explicit. This is slightly more conservative
4748 // than necessary, because it means that each store effectively depends
4749 // on every argument instead of just those arguments it would clobber.
4750 SDValue ArgChain = DAG.getStackArgumentTokenFactor(Chain);
4751
4752 SmallVector<SDValue, 8> MemOpChains2;
4753 SDValue FIN;
4754 int FI = 0;
4755 for (unsigned I = 0, OutsIndex = 0, E = ArgLocs.size(); I != E;
4756 ++I, ++OutsIndex) {
4757 CCValAssign &VA = ArgLocs[I];
4758
4759 if (VA.isRegLoc()) {
4760 if (VA.needsCustom()) {
4761 assert((CallConv == CallingConv::X86_RegCall) &&
4762        "Expecting custom case only in regcall calling convention");
4763 // This means that we are in special case where one argument was
4764 // passed through two register locations - Skip the next location
4765 ++I;
4766 }
4767
4768 continue;
4769 }
4770
4771 assert(VA.isMemLoc());
4772 SDValue Arg = OutVals[OutsIndex];
4773 ISD::ArgFlagsTy Flags = Outs[OutsIndex].Flags;
4774 // Skip inalloca/preallocated arguments. They don't require any work.
4775 if (Flags.isInAlloca() || Flags.isPreallocated())
4776 continue;
4777 // Create frame index.
4778 int32_t Offset = VA.getLocMemOffset()+FPDiff;
4779 uint32_t OpSize = (VA.getLocVT().getSizeInBits()+7)/8;
4780 FI = MF.getFrameInfo().CreateFixedObject(OpSize, Offset, true);
4781 FIN = DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout()));
4782
4783 if (Flags.isByVal()) {
4784 // Copy relative to framepointer.
4785 SDValue Source = DAG.getIntPtrConstant(VA.getLocMemOffset(), dl);
4786 if (!StackPtr.getNode())
4787 StackPtr = DAG.getCopyFromReg(Chain, dl, RegInfo->getStackRegister(),
4788 getPointerTy(DAG.getDataLayout()));
4789 Source = DAG.getNode(ISD::ADD, dl, getPointerTy(DAG.getDataLayout()),
4790 StackPtr, Source);
4791
4792 MemOpChains2.push_back(CreateCopyOfByValArgument(Source, FIN,
4793 ArgChain,
4794 Flags, DAG, dl));
4795 } else {
4796 // Store relative to framepointer.
4797 MemOpChains2.push_back(DAG.getStore(
4798 ArgChain, dl, Arg, FIN,
4799 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI)));
4800 }
4801 }
4802
4803 if (!MemOpChains2.empty())
4804 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains2);
4805
4806 // Store the return address to the appropriate stack slot.
4807 Chain = EmitTailCallStoreRetAddr(DAG, MF, Chain, RetAddrFrIdx,
4808 getPointerTy(DAG.getDataLayout()),
4809 RegInfo->getSlotSize(), FPDiff, dl);
4810 }
4811
4812 // Build a sequence of copy-to-reg nodes chained together with token chain
4813 // and flag operands which copy the outgoing args into registers.
4814 SDValue InFlag;
4815 for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
4816 Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first,
4817 RegsToPass[i].second, InFlag);
4818 InFlag = Chain.getValue(1);
4819 }
4820
4821 if (DAG.getTarget().getCodeModel() == CodeModel::Large) {
4822    assert(Is64Bit && "Large code model is only legal in 64-bit mode.");
4823 // In the 64-bit large code model, we have to make all calls
4824 // through a register, since the call instruction's 32-bit
4825 // pc-relative offset may not be large enough to hold the whole
4826 // address.
4827 } else if (Callee->getOpcode() == ISD::GlobalAddress ||
4828 Callee->getOpcode() == ISD::ExternalSymbol) {
4829 // Lower direct calls to global addresses and external symbols. Setting
4830 // ForCall to true here has the effect of removing WrapperRIP when possible
4831 // to allow direct calls to be selected without first materializing the
4832 // address into a register.
4833 Callee = LowerGlobalOrExternal(Callee, DAG, /*ForCall=*/true);
4834 } else if (Subtarget.isTarget64BitILP32() &&
4835 Callee.getValueType() == MVT::i32) {
4836 // Zero-extend the 32-bit Callee address into a 64-bit according to x32 ABI
4837 Callee = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, Callee);
4838 }
4839
4840 // Returns a chain & a flag for retval copy to use.
4841 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
4842 SmallVector<SDValue, 8> Ops;
4843
4844 if (!IsSibcall && isTailCall && !IsMustTail) {
4845 Chain = DAG.getCALLSEQ_END(Chain, NumBytesToPop, 0, InFlag, dl);
4846 InFlag = Chain.getValue(1);
4847 }
4848
4849 Ops.push_back(Chain);
4850 Ops.push_back(Callee);
4851
4852 if (isTailCall)
4853 Ops.push_back(DAG.getTargetConstant(FPDiff, dl, MVT::i32));
4854
4855 // Add argument registers to the end of the list so that they are known live
4856 // into the call.
4857 for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i)
4858 Ops.push_back(DAG.getRegister(RegsToPass[i].first,
4859 RegsToPass[i].second.getValueType()));
4860
4861 // Add a register mask operand representing the call-preserved registers.
4862 const uint32_t *Mask = [&]() {
4863 auto AdaptedCC = CallConv;
4864 // If HasNCSR is asserted (attribute NoCallerSavedRegisters exists),
4865 // use X86_INTR calling convention because it has the same CSR mask
4866 // (same preserved registers).
4867 if (HasNCSR)
4868 AdaptedCC = (CallingConv::ID)CallingConv::X86_INTR;
4869    // If NoCalleeSavedRegisters is requested, then use GHC since it happens
4870 // to use the CSR_NoRegs_RegMask.
4871 if (CB && CB->hasFnAttr("no_callee_saved_registers"))
4872 AdaptedCC = (CallingConv::ID)CallingConv::GHC;
4873 return RegInfo->getCallPreservedMask(MF, AdaptedCC);
4874 }();
4875  assert(Mask && "Missing call preserved mask for calling convention");
4876
4877 // If this is an invoke in a 32-bit function using a funclet-based
4878 // personality, assume the function clobbers all registers. If an exception
4879 // is thrown, the runtime will not restore CSRs.
4880 // FIXME: Model this more precisely so that we can register allocate across
4881 // the normal edge and spill and fill across the exceptional edge.
4882 if (!Is64Bit && CLI.CB && isa<InvokeInst>(CLI.CB)) {
4883 const Function &CallerFn = MF.getFunction();
4884 EHPersonality Pers =
4885 CallerFn.hasPersonalityFn()
4886 ? classifyEHPersonality(CallerFn.getPersonalityFn())
4887 : EHPersonality::Unknown;
4888 if (isFuncletEHPersonality(Pers))
4889 Mask = RegInfo->getNoPreservedMask();
4890 }
4891
4892 // Define a new register mask from the existing mask.
4893 uint32_t *RegMask = nullptr;
4894
4895 // In some calling conventions we need to remove the used physical registers
4896 // from the reg mask.
4897 if (shouldDisableCalleeSavedRegisterCC(CallConv) || HasNCSR) {
4898 const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
4899
4900 // Allocate a new Reg Mask and copy Mask.
4901 RegMask = MF.allocateRegMask();
4902 unsigned RegMaskSize = MachineOperand::getRegMaskSize(TRI->getNumRegs());
4903 memcpy(RegMask, Mask, sizeof(RegMask[0]) * RegMaskSize);
4904
4905 // Make sure all sub registers of the argument registers are reset
4906 // in the RegMask.
4907 for (auto const &RegPair : RegsToPass)
4908 for (MCSubRegIterator SubRegs(RegPair.first, TRI, /*IncludeSelf=*/true);
4909 SubRegs.isValid(); ++SubRegs)
4910 RegMask[*SubRegs / 32] &= ~(1u << (*SubRegs % 32));
4911
4912 // Create the RegMask Operand according to our updated mask.
4913 Ops.push_back(DAG.getRegisterMask(RegMask));
4914 } else {
4915 // Create the RegMask Operand according to the static mask.
4916 Ops.push_back(DAG.getRegisterMask(Mask));
4917 }
4918
4919 if (InFlag.getNode())
4920 Ops.push_back(InFlag);
4921
4922 if (isTailCall) {
4923 // We used to do:
4924 //// If this is the first return lowered for this function, add the regs
4925 //// to the liveout set for the function.
4926 // This isn't right, although it's probably harmless on x86; liveouts
4927 // should be computed from returns not tail calls. Consider a void
4928 // function making a tail call to a function returning int.
4929 MF.getFrameInfo().setHasTailCall();
4930 SDValue Ret = DAG.getNode(X86ISD::TC_RETURN, dl, NodeTys, Ops);
4931
4932 if (IsCFICall)
4933 Ret.getNode()->setCFIType(CLI.CFIType->getZExtValue());
4934
4935 DAG.addCallSiteInfo(Ret.getNode(), std::move(CSInfo));
4936 return Ret;
4937 }
4938
4939 if (HasNoCfCheck && IsCFProtectionSupported && IsIndirectCall) {
4940 Chain = DAG.getNode(X86ISD::NT_CALL, dl, NodeTys, Ops);
4941 } else if (CLI.CB && objcarc::hasAttachedCallOpBundle(CLI.CB)) {
4942 // Calls with a "clang.arc.attachedcall" bundle are special. They should be
4943 // expanded to the call, directly followed by a special marker sequence and
4944 // a call to a ObjC library function. Use the CALL_RVMARKER to do that.
4945    assert(!isTailCall &&
4946           "tail calls cannot be marked with clang.arc.attachedcall");
4947    assert(Is64Bit && "clang.arc.attachedcall is only supported in 64bit mode");
4948
4949 // Add a target global address for the retainRV/claimRV runtime function
4950 // just before the call target.
4951 Function *ARCFn = *objcarc::getAttachedARCFunction(CLI.CB);
4952 auto PtrVT = getPointerTy(DAG.getDataLayout());
4953 auto GA = DAG.getTargetGlobalAddress(ARCFn, dl, PtrVT);
4954 Ops.insert(Ops.begin() + 1, GA);
4955 Chain = DAG.getNode(X86ISD::CALL_RVMARKER, dl, NodeTys, Ops);
4956 } else {
4957 Chain = DAG.getNode(X86ISD::CALL, dl, NodeTys, Ops);
4958 }
4959
4960 if (IsCFICall)
4961 Chain.getNode()->setCFIType(CLI.CFIType->getZExtValue());
4962
4963 InFlag = Chain.getValue(1);
4964 DAG.addNoMergeSiteInfo(Chain.getNode(), CLI.NoMerge);
4965 DAG.addCallSiteInfo(Chain.getNode(), std::move(CSInfo));
4966
4967 // Save heapallocsite metadata.
4968 if (CLI.CB)
4969 if (MDNode *HeapAlloc = CLI.CB->getMetadata("heapallocsite"))
4970 DAG.addHeapAllocSite(Chain.getNode(), HeapAlloc);
4971
4972 // Create the CALLSEQ_END node.
4973 unsigned NumBytesForCalleeToPop = 0; // Callee pops nothing.
4974 if (X86::isCalleePop(CallConv, Is64Bit, isVarArg,
4975 DAG.getTarget().Options.GuaranteedTailCallOpt))
4976 NumBytesForCalleeToPop = NumBytes; // Callee pops everything
4977 else if (!canGuaranteeTCO(CallConv) && IsCalleePopSRet)
4978 // If this call passes a struct-return pointer, the callee
4979 // pops that struct pointer.
4980 NumBytesForCalleeToPop = 4;
4981
4982 // Returns a flag for retval copy to use.
4983 if (!IsSibcall) {
4984 Chain = DAG.getCALLSEQ_END(Chain, NumBytesToPop, NumBytesForCalleeToPop,
4985 InFlag, dl);
4986 InFlag = Chain.getValue(1);
4987 }
4988
4989 // Handle result values, copying them out of physregs into vregs that we
4990 // return.
4991 return LowerCallResult(Chain, InFlag, CallConv, isVarArg, Ins, dl, DAG,
4992 InVals, RegMask);
4993}
4994
4995//===----------------------------------------------------------------------===//
4996// Fast Calling Convention (tail call) implementation
4997//===----------------------------------------------------------------------===//
4998
4999 // Like stdcall, the callee cleans up the arguments, except that ECX is
5000 // reserved for storing the address of the tail-called function. Only 2
5001 // registers are free for argument passing (inreg). Tail call optimization is
5002 // performed provided:
5003// * tailcallopt is enabled
5004// * caller/callee are fastcc
5005 // On the X86_64 architecture with GOT-style position-independent code, only
5006 // local (within-module) calls are supported at the moment.
5007 // To keep the stack aligned according to the platform ABI, the function
5008 // GetAlignedArgumentStackSize ensures that the argument delta is always a
5009 // multiple of the stack alignment. (Dynamic linkers need this - Darwin's dyld for example)
5010 // If a tail-called callee has more arguments than the caller, the caller
5011 // needs to make sure that there is room to move the RETADDR to. This is
5012// achieved by reserving an area the size of the argument delta right after the
5013// original RETADDR, but before the saved framepointer or the spilled registers
5014// e.g. caller(arg1, arg2) calls callee(arg1, arg2,arg3,arg4)
5015// stack layout:
5016// arg1
5017// arg2
5018// RETADDR
5019// [ new RETADDR
5020// move area ]
5021// (possible EBP)
5022// ESI
5023// EDI
5024// local1 ..
5025
5026/// Align the stack size to e.g. 16n + 12, so that a 16-byte alignment
5027/// requirement is met once the return-address slot is accounted for.
5028unsigned
5029X86TargetLowering::GetAlignedArgumentStackSize(const unsigned StackSize,
5030 SelectionDAG &DAG) const {
5031 const Align StackAlignment = Subtarget.getFrameLowering()->getStackAlign();
5032 const uint64_t SlotSize = Subtarget.getRegisterInfo()->getSlotSize();
5033  assert(StackSize % SlotSize == 0 &&
5034         "StackSize must be a multiple of SlotSize");
5035 return alignTo(StackSize + SlotSize, StackAlignment) - SlotSize;
5036}
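The computation above keeps the argument area congruent to -SlotSize modulo the stack alignment, so the stack is aligned again once the return address slot is pushed. The following is a minimal standalone sketch of that arithmetic, not part of the LLVM source; the helper names and the 4/8-byte slot sizes are illustrative assumptions.

#include <cassert>
#include <cstdint>
#include <cstdio>

// Round Value up to the next multiple of Align.
static uint64_t alignTo(uint64_t Value, uint64_t Align) {
  return (Value + Align - 1) / Align * Align;
}

// Mirrors GetAlignedArgumentStackSize: the result plus one slot (RETADDR) is a
// multiple of the stack alignment.
static uint64_t alignedArgStackSize(uint64_t StackSize, uint64_t SlotSize,
                                    uint64_t StackAlign) {
  assert(StackSize % SlotSize == 0 && "StackSize must be a multiple of SlotSize");
  return alignTo(StackSize + SlotSize, StackAlign) - SlotSize;
}

int main() {
  // With 4-byte slots and 16-byte alignment, 20 bytes of arguments become 28,
  // i.e. 16n + 12, so the stack is 16-byte aligned once RETADDR is pushed.
  std::printf("%llu\n", (unsigned long long)alignedArgStackSize(20, 4, 16)); // 28
  // With 8-byte slots, 40 bytes stay at 40, since 40 + 8 = 48 is 16-aligned.
  std::printf("%llu\n", (unsigned long long)alignedArgStackSize(40, 8, 16)); // 40
}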
5037
5038/// Return true if the given stack call argument is already available in the
5039/// same position (relatively) of the caller's incoming argument stack.
5040static
5041bool MatchingStackOffset(SDValue Arg, unsigned Offset, ISD::ArgFlagsTy Flags,
5042 MachineFrameInfo &MFI, const MachineRegisterInfo *MRI,
5043 const X86InstrInfo *TII, const CCValAssign &VA) {
5044 unsigned Bytes = Arg.getValueSizeInBits() / 8;
5045
5046 for (;;) {
5047 // Look through nodes that don't alter the bits of the incoming value.
5048 unsigned Op = Arg.getOpcode();
5049 if (Op == ISD::ZERO_EXTEND || Op == ISD::ANY_EXTEND || Op == ISD::BITCAST) {
5050 Arg = Arg.getOperand(0);
5051 continue;
5052 }
5053 if (Op == ISD::TRUNCATE) {
5054 const SDValue &TruncInput = Arg.getOperand(0);
5055 if (TruncInput.getOpcode() == ISD::AssertZext &&
5056 cast<VTSDNode>(TruncInput.getOperand(1))->getVT() ==
5057 Arg.getValueType()) {
5058 Arg = TruncInput.getOperand(0);
5059 continue;
5060 }
5061 }
5062 break;
5063 }
5064
5065  int FI = INT_MAX;
5066 if (Arg.getOpcode() == ISD::CopyFromReg) {
5067 Register VR = cast<RegisterSDNode>(Arg.getOperand(1))->getReg();
5068 if (!VR.isVirtual())
5069 return false;
5070 MachineInstr *Def = MRI->getVRegDef(VR);
5071 if (!Def)
5072 return false;
5073 if (!Flags.isByVal()) {
5074 if (!TII->isLoadFromStackSlot(*Def, FI))
5075 return false;
5076 } else {
5077 unsigned Opcode = Def->getOpcode();
5078 if ((Opcode == X86::LEA32r || Opcode == X86::LEA64r ||
5079 Opcode == X86::LEA64_32r) &&
5080 Def->getOperand(1).isFI()) {
5081 FI = Def->getOperand(1).getIndex();
5082 Bytes = Flags.getByValSize();
5083 } else
5084 return false;
5085 }
5086 } else if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(Arg)) {
5087 if (Flags.isByVal())
5088 // ByVal argument is passed in as a pointer but it's now being
5089 // dereferenced. e.g.
5090 // define @foo(%struct.X* %A) {
5091 // tail call @bar(%struct.X* byval %A)
5092 // }
5093 return false;
5094 SDValue Ptr = Ld->getBasePtr();
5095 FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr);
5096 if (!FINode)
5097 return false;
5098 FI = FINode->getIndex();
5099 } else if (Arg.getOpcode() == ISD::FrameIndex && Flags.isByVal()) {
5100 FrameIndexSDNode *FINode = cast<FrameIndexSDNode>(Arg);
5101 FI = FINode->getIndex();
5102 Bytes = Flags.getByValSize();
5103 } else
5104 return false;
5105
5106  assert(FI != INT_MAX);
5107 if (!MFI.isFixedObjectIndex(FI))
5108 return false;
5109
5110 if (Offset != MFI.getObjectOffset(FI))
5111 return false;
5112
5113 // If this is not byval, check that the argument stack object is immutable.
5114 // inalloca and argument copy elision can create mutable argument stack
5115 // objects. Byval objects can be mutated, but a byval call intends to pass the
5116 // mutated memory.
5117 if (!Flags.isByVal() && !MFI.isImmutableObjectIndex(FI))
5118 return false;
5119
5120 if (VA.getLocVT().getFixedSizeInBits() >
5121 Arg.getValueSizeInBits().getFixedValue()) {
5122 // If the argument location is wider than the argument type, check that any
5123 // extension flags match.
5124 if (Flags.isZExt() != MFI.isObjectZExt(FI) ||
5125 Flags.isSExt() != MFI.isObjectSExt(FI)) {
5126 return false;
5127 }
5128 }
5129
5130 return Bytes == MFI.getObjectSize(FI);
5131}
5132
5133/// Check whether the call is eligible for tail call optimization. Targets
5134/// that want to do tail call optimization should implement this function.
5135bool X86TargetLowering::IsEligibleForTailCallOptimization(
5136 SDValue Callee, CallingConv::ID CalleeCC, bool IsCalleePopSRet,
5137 bool isVarArg, Type *RetTy, const SmallVectorImpl<ISD::OutputArg> &Outs,
5138 const SmallVectorImpl<SDValue> &OutVals,
5139 const SmallVectorImpl<ISD::InputArg> &Ins, SelectionDAG &DAG) const {
5140 if (!mayTailCallThisCC(CalleeCC))
5141 return false;
5142
5143 // If -tailcallopt is specified, make fastcc functions tail-callable.
5144 MachineFunction &MF = DAG.getMachineFunction();
5145 const Function &CallerF = MF.getFunction();
5146
5147 // If the function return type is x86_fp80 and the callee return type is not,
5148 // then the FP_EXTEND of the call result is not a nop. It's not safe to
5149 // perform a tailcall optimization here.
5150 if (CallerF.getReturnType()->isX86_FP80Ty() && !RetTy->isX86_FP80Ty())
5151 return false;
5152
5153 CallingConv::ID CallerCC = CallerF.getCallingConv();
5154 bool CCMatch = CallerCC == CalleeCC;
5155 bool IsCalleeWin64 = Subtarget.isCallingConvWin64(CalleeCC);
5156 bool IsCallerWin64 = Subtarget.isCallingConvWin64(CallerCC);
5157 bool IsGuaranteeTCO = DAG.getTarget().Options.GuaranteedTailCallOpt ||
5158 CalleeCC == CallingConv::Tail || CalleeCC == CallingConv::SwiftTail;
5159
5160 // Win64 functions have extra shadow space for argument homing. Don't do the
5161 // sibcall if the caller and callee have mismatched expectations for this
5162 // space.
5163 if (IsCalleeWin64 != IsCallerWin64)
5164 return false;
5165
5166 if (IsGuaranteeTCO) {
5167 if (canGuaranteeTCO(CalleeCC) && CCMatch)
5168 return true;
5169 return false;
5170 }
5171
5172 // Look for obvious safe cases to perform tail call optimization that do not
5173 // require ABI changes. This is what gcc calls sibcall.
5174
5175 // Can't do sibcall if stack needs to be dynamically re-aligned. PEI needs to
5176 // emit a special epilogue.
5177 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
5178 if (RegInfo->hasStackRealignment(MF))
5179 return false;
5180
5181 // Also avoid sibcall optimization if we're an sret return fn and the callee
5182 // is incompatible. See comment in LowerReturn about why hasStructRetAttr is
5183 // insufficient.
5184 if (MF.getInfo<X86MachineFunctionInfo>()->getSRetReturnReg()) {
5185 // For a compatible tail call the callee must return our sret pointer. So it
5186 // needs to be (a) an sret function itself and (b) we pass our sret as its
5187 // sret. Condition #b is harder to determine.
5188 return false;
5189 } else if (IsCalleePopSRet)
5190 // The callee pops an sret, so we cannot tail-call, as our caller doesn't
5191 // expect that.
5192 return false;
5193
5194 // Do not sibcall optimize vararg calls unless all arguments are passed via
5195 // registers.
5196 LLVMContext &C = *DAG.getContext();
5197 if (isVarArg && !Outs.empty()) {
5198 // Optimizing for varargs on Win64 is unlikely to be safe without
5199 // additional testing.
5200 if (IsCalleeWin64 || IsCallerWin64)
5201 return false;
5202
5203 SmallVector<CCValAssign, 16> ArgLocs;
5204 CCState CCInfo(CalleeCC, isVarArg, MF, ArgLocs, C);
5205
5206 CCInfo.AnalyzeCallOperands(Outs, CC_X86);
5207 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i)
5208 if (!ArgLocs[i].isRegLoc())
5209 return false;
5210 }
5211
5212 // If the call result is in ST0 / ST1, it needs to be popped off the x87
5213 // stack. Therefore, if it's not used by the call it is not safe to optimize
5214 // this into a sibcall.
5215 bool Unused = false;
5216 for (unsigned i = 0, e = Ins.size(); i != e; ++i) {
5217 if (!Ins[i].Used) {
5218 Unused = true;
5219 break;
5220 }
5221 }
5222 if (Unused) {
5223 SmallVector<CCValAssign, 16> RVLocs;
5224 CCState CCInfo(CalleeCC, false, MF, RVLocs, C);
5225 CCInfo.AnalyzeCallResult(Ins, RetCC_X86);
5226 for (unsigned i = 0, e = RVLocs.size(); i != e; ++i) {
5227 CCValAssign &VA = RVLocs[i];
5228 if (VA.getLocReg() == X86::FP0 || VA.getLocReg() == X86::FP1)
5229 return false;
5230 }
5231 }
5232
5233 // Check that the call results are passed in the same way.
5234 if (!CCState::resultsCompatible(CalleeCC, CallerCC, MF, C, Ins,
5235 RetCC_X86, RetCC_X86))
5236 return false;
5237 // The callee has to preserve all registers the caller needs to preserve.
5238 const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
5239 const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC);
5240 if (!CCMatch) {
5241 const uint32_t *CalleePreserved = TRI->getCallPreservedMask(MF, CalleeCC);
5242 if (!TRI->regmaskSubsetEqual(CallerPreserved, CalleePreserved))
5243 return false;
5244 }
5245
5246 unsigned StackArgsSize = 0;
5247
5248 // If the callee takes no arguments then go on to check the results of the
5249 // call.
5250 if (!Outs.empty()) {
5251 // Check if stack adjustment is needed. For now, do not do this if any
5252 // argument is passed on the stack.
5253 SmallVector<CCValAssign, 16> ArgLocs;
5254 CCState CCInfo(CalleeCC, isVarArg, MF, ArgLocs, C);
5255
5256 // Allocate shadow area for Win64
5257 if (IsCalleeWin64)
5258 CCInfo.AllocateStack(32, Align(8));
5259
5260 CCInfo.AnalyzeCallOperands(Outs, CC_X86);
5261 StackArgsSize = CCInfo.getNextStackOffset();
5262
5263 if (CCInfo.getNextStackOffset()) {
5264 // Check if the arguments are already laid out in the right way as
5265 // the caller's fixed stack objects.
5266 MachineFrameInfo &MFI = MF.getFrameInfo();
5267 const MachineRegisterInfo *MRI = &MF.getRegInfo();
5268 const X86InstrInfo *TII = Subtarget.getInstrInfo();
5269 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
5270 CCValAssign &VA = ArgLocs[i];
5271 SDValue Arg = OutVals[i];
5272 ISD::ArgFlagsTy Flags = Outs[i].Flags;
5273 if (VA.getLocInfo() == CCValAssign::Indirect)
5274 return false;
5275 if (!VA.isRegLoc()) {
5276 if (!MatchingStackOffset(Arg, VA.getLocMemOffset(), Flags,
5277 MFI, MRI, TII, VA))
5278 return false;
5279 }
5280 }
5281 }
5282
5283 bool PositionIndependent = isPositionIndependent();
5284 // If the tailcall address may be in a register, then make sure it's
5285 // possible to register allocate for it. In 32-bit, the call address can
5286 // only target EAX, EDX, or ECX since the tail call must be scheduled after
5287 // callee-saved registers are restored. These happen to be the same
5288 // registers used to pass 'inreg' arguments so watch out for those.
5289 if (!Subtarget.is64Bit() && ((!isa<GlobalAddressSDNode>(Callee) &&
5290 !isa<ExternalSymbolSDNode>(Callee)) ||
5291 PositionIndependent)) {
5292 unsigned NumInRegs = 0;
5293 // In PIC we need an extra register to formulate the address computation
5294 // for the callee.
5295 unsigned MaxInRegs = PositionIndependent ? 2 : 3;
5296
5297 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
5298 CCValAssign &VA = ArgLocs[i];
5299 if (!VA.isRegLoc())
5300 continue;
5301 Register Reg = VA.getLocReg();
5302 switch (Reg) {
5303 default: break;
5304 case X86::EAX: case X86::EDX: case X86::ECX:
5305 if (++NumInRegs == MaxInRegs)
5306 return false;
5307 break;
5308 }
5309 }
5310 }
5311
5312 const MachineRegisterInfo &MRI = MF.getRegInfo();
5313 if (!parametersInCSRMatch(MRI, CallerPreserved, ArgLocs, OutVals))
5314 return false;
5315 }
5316
5317 bool CalleeWillPop =
5318 X86::isCalleePop(CalleeCC, Subtarget.is64Bit(), isVarArg,
5319 MF.getTarget().Options.GuaranteedTailCallOpt);
5320
5321 if (unsigned BytesToPop =
5322 MF.getInfo<X86MachineFunctionInfo>()->getBytesToPopOnReturn()) {
5323 // If we have bytes to pop, the callee must pop them.
5324 bool CalleePopMatches = CalleeWillPop && BytesToPop == StackArgsSize;
5325 if (!CalleePopMatches)
5326 return false;
5327 } else if (CalleeWillPop && StackArgsSize > 0) {
5328 // If we don't have bytes to pop, make sure the callee doesn't pop any.
5329 return false;
5330 }
5331
5332 return true;
5333}
5334
5335FastISel *
5336X86TargetLowering::createFastISel(FunctionLoweringInfo &funcInfo,
5337 const TargetLibraryInfo *libInfo) const {
5338 return X86::createFastISel(funcInfo, libInfo);
5339}
5340
5341//===----------------------------------------------------------------------===//
5342// Other Lowering Hooks
5343//===----------------------------------------------------------------------===//
5344
5345bool X86::mayFoldLoad(SDValue Op, const X86Subtarget &Subtarget,
5346 bool AssumeSingleUse) {
5347 if (!AssumeSingleUse && !Op.hasOneUse())
5348 return false;
5349 if (!ISD::isNormalLoad(Op.getNode()))
5350 return false;
5351
5352 // If this is an unaligned vector, make sure the target supports folding it.
5353 auto *Ld = cast<LoadSDNode>(Op.getNode());
5354 if (!Subtarget.hasAVX() && !Subtarget.hasSSEUnalignedMem() &&
5355 Ld->getValueSizeInBits(0) == 128 && Ld->getAlign() < Align(16))
5356 return false;
5357
5358 // TODO: If this is a non-temporal load and the target has an instruction
5359 // for it, it should not be folded. See "useNonTemporalLoad()".
5360
5361 return true;
5362}
5363
5364bool X86::mayFoldLoadIntoBroadcastFromMem(SDValue Op, MVT EltVT,
5365 const X86Subtarget &Subtarget,
5366 bool AssumeSingleUse) {
5367  assert(Subtarget.hasAVX() && "Expected AVX for broadcast from memory");
5368 if (!X86::mayFoldLoad(Op, Subtarget, AssumeSingleUse))
5369 return false;
5370
5371  // We cannot replace a wide volatile load with a broadcast-from-memory,
5372 // because that would narrow the load, which isn't legal for volatiles.
5373 auto *Ld = cast<LoadSDNode>(Op.getNode());
5374 return !Ld->isVolatile() ||
5375 Ld->getValueSizeInBits(0) == EltVT.getScalarSizeInBits();
5376}
5377
5378bool X86::mayFoldIntoStore(SDValue Op) {
5379 return Op.hasOneUse() && ISD::isNormalStore(*Op.getNode()->use_begin());
5380}
5381
5382bool X86::mayFoldIntoZeroExtend(SDValue Op) {
5383 if (Op.hasOneUse()) {
5384 unsigned Opcode = Op.getNode()->use_begin()->getOpcode();
5385 return (ISD::ZERO_EXTEND == Opcode);
5386 }
5387 return false;
5388}
5389
5390static bool isTargetShuffle(unsigned Opcode) {
5391 switch(Opcode) {
5392 default: return false;
5393 case X86ISD::BLENDI:
5394 case X86ISD::PSHUFB:
5395 case X86ISD::PSHUFD:
5396 case X86ISD::PSHUFHW:
5397 case X86ISD::PSHUFLW:
5398 case X86ISD::SHUFP:
5399 case X86ISD::INSERTPS:
5400 case X86ISD::EXTRQI:
5401 case X86ISD::INSERTQI:
5402 case X86ISD::VALIGN:
5403 case X86ISD::PALIGNR:
5404 case X86ISD::VSHLDQ:
5405 case X86ISD::VSRLDQ:
5406 case X86ISD::MOVLHPS:
5407 case X86ISD::MOVHLPS:
5408 case X86ISD::MOVSHDUP:
5409 case X86ISD::MOVSLDUP:
5410 case X86ISD::MOVDDUP:
5411 case X86ISD::MOVSS:
5412 case X86ISD::MOVSD:
5413 case X86ISD::MOVSH:
5414 case X86ISD::UNPCKL:
5415 case X86ISD::UNPCKH:
5416 case X86ISD::VBROADCAST:
5417 case X86ISD::VPERMILPI:
5418 case X86ISD::VPERMILPV:
5419 case X86ISD::VPERM2X128:
5420 case X86ISD::SHUF128:
5421 case X86ISD::VPERMIL2:
5422 case X86ISD::VPERMI:
5423 case X86ISD::VPPERM:
5424 case X86ISD::VPERMV:
5425 case X86ISD::VPERMV3:
5426 case X86ISD::VZEXT_MOVL:
5427 return true;
5428 }
5429}
5430
5431static bool isTargetShuffleVariableMask(unsigned Opcode) {
5432 switch (Opcode) {
5433 default: return false;
5434 // Target Shuffles.
5435 case X86ISD::PSHUFB:
5436 case X86ISD::VPERMILPV:
5437 case X86ISD::VPERMIL2:
5438 case X86ISD::VPPERM:
5439 case X86ISD::VPERMV:
5440 case X86ISD::VPERMV3:
5441 return true;
5442 // 'Faux' Target Shuffles.
5443 case ISD::OR:
5444 case ISD::AND:
5445 case X86ISD::ANDNP:
5446 return true;
5447 }
5448}
5449
5450SDValue X86TargetLowering::getReturnAddressFrameIndex(SelectionDAG &DAG) const {
5451 MachineFunction &MF = DAG.getMachineFunction();
5452 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
5453 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
5454 int ReturnAddrIndex = FuncInfo->getRAIndex();
5455
5456 if (ReturnAddrIndex == 0) {
5457 // Set up a frame object for the return address.
5458 unsigned SlotSize = RegInfo->getSlotSize();
5459 ReturnAddrIndex = MF.getFrameInfo().CreateFixedObject(SlotSize,
5460 -(int64_t)SlotSize,
5461 false);
5462 FuncInfo->setRAIndex(ReturnAddrIndex);
5463 }
5464
5465 return DAG.getFrameIndex(ReturnAddrIndex, getPointerTy(DAG.getDataLayout()));
5466}
5467
5468bool X86::isOffsetSuitableForCodeModel(int64_t Offset, CodeModel::Model M,
5469 bool hasSymbolicDisplacement) {
5470 // Offset should fit into 32 bit immediate field.
5471 if (!isInt<32>(Offset))
5472 return false;
5473
5474 // If we don't have a symbolic displacement - we don't have any extra
5475 // restrictions.
5476 if (!hasSymbolicDisplacement)
5477 return true;
5478
5479 // FIXME: Some tweaks might be needed for medium code model.
5480 if (M != CodeModel::Small && M != CodeModel::Kernel)
5481 return false;
5482
5483  // For the small code model we assume that the latest object is 16MB before
5484  // the end of the 31-bit boundary. We may also accept pretty large negative
5485  // constants knowing that all objects are in the positive half of the address space.
5486 if (M == CodeModel::Small && Offset < 16*1024*1024)
5487 return true;
5488
5489  // For the kernel code model we know that all objects reside in the negative
5490  // half of the 32-bit address space. We may not accept negative offsets, since
5491  // they may be just off and we may accept pretty large positive ones.
5492 if (M == CodeModel::Kernel && Offset >= 0)
5493 return true;
5494
5495 return false;
5496}
5497
5498/// Determines whether the callee is required to pop its own arguments.
5499/// Callee pop is necessary to support tail calls.
5500bool X86::isCalleePop(CallingConv::ID CallingConv,
5501 bool is64Bit, bool IsVarArg, bool GuaranteeTCO) {
5502 // If GuaranteeTCO is true, we force some calls to be callee pop so that we
5503 // can guarantee TCO.
5504 if (!IsVarArg && shouldGuaranteeTCO(CallingConv, GuaranteeTCO))
5505 return true;
5506
5507 switch (CallingConv) {
5508 default:
5509 return false;
5510 case CallingConv::X86_StdCall:
5511 case CallingConv::X86_FastCall:
5512 case CallingConv::X86_ThisCall:
5513 case CallingConv::X86_VectorCall:
5514 return !is64Bit;
5515 }
5516}
5517
5518/// Return true if the condition is a signed comparison operation.
5519static bool isX86CCSigned(unsigned X86CC) {
5520 switch (X86CC) {
5521 default:
5522 llvm_unreachable("Invalid integer condition!")::llvm::llvm_unreachable_internal("Invalid integer condition!"
, "llvm/lib/Target/X86/X86ISelLowering.cpp", 5522)
;
5523 case X86::COND_E:
5524 case X86::COND_NE:
5525 case X86::COND_B:
5526 case X86::COND_A:
5527 case X86::COND_BE:
5528 case X86::COND_AE:
5529 return false;
5530 case X86::COND_G:
5531 case X86::COND_GE:
5532 case X86::COND_L:
5533 case X86::COND_LE:
5534 return true;
5535 }
5536}
5537
5538static X86::CondCode TranslateIntegerX86CC(ISD::CondCode SetCCOpcode) {
5539 switch (SetCCOpcode) {
5540  default: llvm_unreachable("Invalid integer condition!");
5541 case ISD::SETEQ: return X86::COND_E;
5542 case ISD::SETGT: return X86::COND_G;
5543 case ISD::SETGE: return X86::COND_GE;
5544 case ISD::SETLT: return X86::COND_L;
5545 case ISD::SETLE: return X86::COND_LE;
5546 case ISD::SETNE: return X86::COND_NE;
5547 case ISD::SETULT: return X86::COND_B;
5548 case ISD::SETUGT: return X86::COND_A;
5549 case ISD::SETULE: return X86::COND_BE;
5550 case ISD::SETUGE: return X86::COND_AE;
5551 }
5552}
5553
5554/// Do a one-to-one translation of a ISD::CondCode to the X86-specific
5555/// condition code, returning the condition code and the LHS/RHS of the
5556/// comparison to make.
5557static X86::CondCode TranslateX86CC(ISD::CondCode SetCCOpcode, const SDLoc &DL,
5558 bool isFP, SDValue &LHS, SDValue &RHS,
5559 SelectionDAG &DAG) {
5560 if (!isFP) {
5561 if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS)) {
5562 if (SetCCOpcode == ISD::SETGT && RHSC->isAllOnes()) {
5563 // X > -1 -> X == 0, jump !sign.
5564 RHS = DAG.getConstant(0, DL, RHS.getValueType());
5565 return X86::COND_NS;
5566 }
5567 if (SetCCOpcode == ISD::SETLT && RHSC->isZero()) {
5568 // X < 0 -> X == 0, jump on sign.
5569 return X86::COND_S;
5570 }
5571 if (SetCCOpcode == ISD::SETGE && RHSC->isZero()) {
5572 // X >= 0 -> X == 0, jump on !sign.
5573 return X86::COND_NS;
5574 }
5575 if (SetCCOpcode == ISD::SETLT && RHSC->isOne()) {
5576 // X < 1 -> X <= 0
5577 RHS = DAG.getConstant(0, DL, RHS.getValueType());
5578 return X86::COND_LE;
5579 }
5580 }
5581
5582 return TranslateIntegerX86CC(SetCCOpcode);
5583 }
5584
5585 // First determine if it is required or is profitable to flip the operands.
5586
5587 // If LHS is a foldable load, but RHS is not, flip the condition.
5588 if (ISD::isNON_EXTLoad(LHS.getNode()) &&
5589 !ISD::isNON_EXTLoad(RHS.getNode())) {
5590 SetCCOpcode = getSetCCSwappedOperands(SetCCOpcode);
5591 std::swap(LHS, RHS);
5592 }
5593
5594 switch (SetCCOpcode) {
5595 default: break;
5596 case ISD::SETOLT:
5597 case ISD::SETOLE:
5598 case ISD::SETUGT:
5599 case ISD::SETUGE:
5600 std::swap(LHS, RHS);
5601 break;
5602 }
5603
5604 // On a floating point condition, the flags are set as follows:
5605 // ZF PF CF op
5606 // 0 | 0 | 0 | X > Y
5607 // 0 | 0 | 1 | X < Y
5608 // 1 | 0 | 0 | X == Y
5609 // 1 | 1 | 1 | unordered
5610 switch (SetCCOpcode) {
5611  default: llvm_unreachable("Condcode should be pre-legalized away");
5612 case ISD::SETUEQ:
5613 case ISD::SETEQ: return X86::COND_E;
5614 case ISD::SETOLT: // flipped
5615 case ISD::SETOGT:
5616 case ISD::SETGT: return X86::COND_A;
5617 case ISD::SETOLE: // flipped
5618 case ISD::SETOGE:
5619 case ISD::SETGE: return X86::COND_AE;
5620 case ISD::SETUGT: // flipped
5621 case ISD::SETULT:
5622 case ISD::SETLT: return X86::COND_B;
5623 case ISD::SETUGE: // flipped
5624 case ISD::SETULE:
5625 case ISD::SETLE: return X86::COND_BE;
5626 case ISD::SETONE:
5627 case ISD::SETNE: return X86::COND_NE;
5628 case ISD::SETUO: return X86::COND_P;
5629 case ISD::SETO: return X86::COND_NP;
5630 case ISD::SETOEQ:
5631 case ISD::SETUNE: return X86::COND_INVALID;
5632 }
5633}
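For reference, the ZF/PF/CF table above can be modeled outside of LLVM. The sketch below is a hypothetical, standalone illustration of why SETOGT maps to COND_A and SETUO maps to COND_P; the fpCompareFlags helper is an assumption made for the example, not an LLVM or x86 API.

#include <cmath>
#include <cstdio>

// Model the flags a UCOMISS-style compare of X against Y would produce.
struct Flags { bool ZF, PF, CF; };

static Flags fpCompareFlags(float X, float Y) {
  if (std::isnan(X) || std::isnan(Y)) return {true, true, true};   // unordered
  if (X > Y)  return {false, false, false};                        // X > Y
  if (X < Y)  return {false, false, true};                         // X < Y
  return {true, false, false};                                     // X == Y
}

int main() {
  // COND_A ("above") is ZF == 0 && CF == 0, which is why SETOGT maps to it.
  Flags F = fpCompareFlags(2.0f, 1.0f);
  std::printf("2.0 > 1.0 taken as COND_A: %d\n", !F.ZF && !F.CF);
  // An unordered compare sets all three flags, so SETUO maps to COND_P.
  Flags U = fpCompareFlags(NAN, 1.0f);
  std::printf("NaN vs 1.0 taken as COND_P: %d\n", U.PF);
}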
5634
5635/// Is there a floating point cmov for the specific X86 condition code?
5636/// Current x86 isa includes the following FP cmov instructions:
5637/// fcmovb, fcmovbe, fcmove, fcmovu, fcmovae, fcmova, fcmovne, fcmovnu.
5638static bool hasFPCMov(unsigned X86CC) {
5639 switch (X86CC) {
5640 default:
5641 return false;
5642 case X86::COND_B:
5643 case X86::COND_BE:
5644 case X86::COND_E:
5645 case X86::COND_P:
5646 case X86::COND_A:
5647 case X86::COND_AE:
5648 case X86::COND_NE:
5649 case X86::COND_NP:
5650 return true;
5651 }
5652}
5653
5654static bool useVPTERNLOG(const X86Subtarget &Subtarget, MVT VT) {
5655 return Subtarget.hasVLX() || Subtarget.canExtendTo512DQ() ||
5656 VT.is512BitVector();
5657}
5658
5659bool X86TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
5660 const CallInst &I,
5661 MachineFunction &MF,
5662 unsigned Intrinsic) const {
5663 Info.flags = MachineMemOperand::MONone;
5664 Info.offset = 0;
5665
5666 const IntrinsicData* IntrData = getIntrinsicWithChain(Intrinsic);
5667 if (!IntrData) {
5668 switch (Intrinsic) {
5669 case Intrinsic::x86_aesenc128kl:
5670 case Intrinsic::x86_aesdec128kl:
5671 Info.opc = ISD::INTRINSIC_W_CHAIN;
5672 Info.ptrVal = I.getArgOperand(1);
5673 Info.memVT = EVT::getIntegerVT(I.getType()->getContext(), 48);
5674 Info.align = Align(1);
5675 Info.flags |= MachineMemOperand::MOLoad;
5676 return true;
5677 case Intrinsic::x86_aesenc256kl:
5678 case Intrinsic::x86_aesdec256kl:
5679 Info.opc = ISD::INTRINSIC_W_CHAIN;
5680 Info.ptrVal = I.getArgOperand(1);
5681 Info.memVT = EVT::getIntegerVT(I.getType()->getContext(), 64);
5682 Info.align = Align(1);
5683 Info.flags |= MachineMemOperand::MOLoad;
5684 return true;
5685 case Intrinsic::x86_aesencwide128kl:
5686 case Intrinsic::x86_aesdecwide128kl:
5687 Info.opc = ISD::INTRINSIC_W_CHAIN;
5688 Info.ptrVal = I.getArgOperand(0);
5689 Info.memVT = EVT::getIntegerVT(I.getType()->getContext(), 48);
5690 Info.align = Align(1);
5691 Info.flags |= MachineMemOperand::MOLoad;
5692 return true;
5693 case Intrinsic::x86_aesencwide256kl:
5694 case Intrinsic::x86_aesdecwide256kl:
5695 Info.opc = ISD::INTRINSIC_W_CHAIN;
5696 Info.ptrVal = I.getArgOperand(0);
5697 Info.memVT = EVT::getIntegerVT(I.getType()->getContext(), 64);
5698 Info.align = Align(1);
5699 Info.flags |= MachineMemOperand::MOLoad;
5700 return true;
5701 case Intrinsic::x86_cmpccxadd32:
5702 case Intrinsic::x86_cmpccxadd64:
5703 case Intrinsic::x86_atomic_bts:
5704 case Intrinsic::x86_atomic_btc:
5705 case Intrinsic::x86_atomic_btr: {
5706 Info.opc = ISD::INTRINSIC_W_CHAIN;
5707 Info.ptrVal = I.getArgOperand(0);
5708 unsigned Size = I.getType()->getScalarSizeInBits();
5709 Info.memVT = EVT::getIntegerVT(I.getType()->getContext(), Size);
5710 Info.align = Align(Size);
5711 Info.flags |= MachineMemOperand::MOLoad | MachineMemOperand::MOStore |
5712 MachineMemOperand::MOVolatile;
5713 return true;
5714 }
5715 case Intrinsic::x86_atomic_bts_rm:
5716 case Intrinsic::x86_atomic_btc_rm:
5717 case Intrinsic::x86_atomic_btr_rm: {
5718 Info.opc = ISD::INTRINSIC_W_CHAIN;
5719 Info.ptrVal = I.getArgOperand(0);
5720 unsigned Size = I.getArgOperand(1)->getType()->getScalarSizeInBits();
5721 Info.memVT = EVT::getIntegerVT(I.getType()->getContext(), Size);
5722 Info.align = Align(Size);
5723 Info.flags |= MachineMemOperand::MOLoad | MachineMemOperand::MOStore |
5724 MachineMemOperand::MOVolatile;
5725 return true;
5726 }
5727 case Intrinsic::x86_aadd32:
5728 case Intrinsic::x86_aadd64:
5729 case Intrinsic::x86_aand32:
5730 case Intrinsic::x86_aand64:
5731 case Intrinsic::x86_aor32:
5732 case Intrinsic::x86_aor64:
5733 case Intrinsic::x86_axor32:
5734 case Intrinsic::x86_axor64:
5735 case Intrinsic::x86_atomic_add_cc:
5736 case Intrinsic::x86_atomic_sub_cc:
5737 case Intrinsic::x86_atomic_or_cc:
5738 case Intrinsic::x86_atomic_and_cc:
5739 case Intrinsic::x86_atomic_xor_cc: {
5740 Info.opc = ISD::INTRINSIC_W_CHAIN;
5741 Info.ptrVal = I.getArgOperand(0);
5742 unsigned Size = I.getArgOperand(1)->getType()->getScalarSizeInBits();
5743 Info.memVT = EVT::getIntegerVT(I.getType()->getContext(), Size);
5744 Info.align = Align(Size);
5745 Info.flags |= MachineMemOperand::MOLoad | MachineMemOperand::MOStore |
5746 MachineMemOperand::MOVolatile;
5747 return true;
5748 }
5749 }
5750 return false;
5751 }
5752
5753 switch (IntrData->Type) {
5754 case TRUNCATE_TO_MEM_VI8:
5755 case TRUNCATE_TO_MEM_VI16:
5756 case TRUNCATE_TO_MEM_VI32: {
5757 Info.opc = ISD::INTRINSIC_VOID;
5758 Info.ptrVal = I.getArgOperand(0);
5759 MVT VT = MVT::getVT(I.getArgOperand(1)->getType());
5760 MVT ScalarVT = MVT::INVALID_SIMPLE_VALUE_TYPE;
5761 if (IntrData->Type == TRUNCATE_TO_MEM_VI8)
5762 ScalarVT = MVT::i8;
5763 else if (IntrData->Type == TRUNCATE_TO_MEM_VI16)
5764 ScalarVT = MVT::i16;
5765 else if (IntrData->Type == TRUNCATE_TO_MEM_VI32)
5766 ScalarVT = MVT::i32;
5767
5768 Info.memVT = MVT::getVectorVT(ScalarVT, VT.getVectorNumElements());
5769 Info.align = Align(1);
5770 Info.flags |= MachineMemOperand::MOStore;
5771 break;
5772 }
5773 case GATHER:
5774 case GATHER_AVX2: {
5775 Info.opc = ISD::INTRINSIC_W_CHAIN;
5776 Info.ptrVal = nullptr;
5777 MVT DataVT = MVT::getVT(I.getType());
5778 MVT IndexVT = MVT::getVT(I.getArgOperand(2)->getType());
5779 unsigned NumElts = std::min(DataVT.getVectorNumElements(),
5780 IndexVT.getVectorNumElements());
5781 Info.memVT = MVT::getVectorVT(DataVT.getVectorElementType(), NumElts);
5782 Info.align = Align(1);
5783 Info.flags |= MachineMemOperand::MOLoad;
5784 break;
5785 }
5786 case SCATTER: {
5787 Info.opc = ISD::INTRINSIC_VOID;
5788 Info.ptrVal = nullptr;
5789 MVT DataVT = MVT::getVT(I.getArgOperand(3)->getType());
5790 MVT IndexVT = MVT::getVT(I.getArgOperand(2)->getType());
5791 unsigned NumElts = std::min(DataVT.getVectorNumElements(),
5792 IndexVT.getVectorNumElements());
5793 Info.memVT = MVT::getVectorVT(DataVT.getVectorElementType(), NumElts);
5794 Info.align = Align(1);
5795 Info.flags |= MachineMemOperand::MOStore;
5796 break;
5797 }
5798 default:
5799 return false;
5800 }
5801
5802 return true;
5803}
5804
5805/// Returns true if the target can instruction select the
5806/// specified FP immediate natively. If false, the legalizer will
5807/// materialize the FP immediate as a load from a constant pool.
5808bool X86TargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT,
5809 bool ForCodeSize) const {
5810 for (const APFloat &FPImm : LegalFPImmediates)
5811 if (Imm.bitwiseIsEqual(FPImm))
5812 return true;
5813 return false;
5814}
5815
5816bool X86TargetLowering::shouldReduceLoadWidth(SDNode *Load,
5817 ISD::LoadExtType ExtTy,
5818 EVT NewVT) const {
5819  assert(cast<LoadSDNode>(Load)->isSimple() && "illegal to narrow");
5820
5821 // "ELF Handling for Thread-Local Storage" specifies that R_X86_64_GOTTPOFF
5822 // relocation target a movq or addq instruction: don't let the load shrink.
5823 SDValue BasePtr = cast<LoadSDNode>(Load)->getBasePtr();
5824 if (BasePtr.getOpcode() == X86ISD::WrapperRIP)
5825 if (const auto *GA = dyn_cast<GlobalAddressSDNode>(BasePtr.getOperand(0)))
5826 return GA->getTargetFlags() != X86II::MO_GOTTPOFF;
5827
5828 // If this is an (1) AVX vector load with (2) multiple uses and (3) all of
5829 // those uses are extracted directly into a store, then the extract + store
5830 // can be store-folded. Therefore, it's probably not worth splitting the load.
5831 EVT VT = Load->getValueType(0);
5832 if ((VT.is256BitVector() || VT.is512BitVector()) && !Load->hasOneUse()) {
5833 for (auto UI = Load->use_begin(), UE = Load->use_end(); UI != UE; ++UI) {
5834 // Skip uses of the chain value. Result 0 of the node is the load value.
5835 if (UI.getUse().getResNo() != 0)
5836 continue;
5837
5838 // If this use is not an extract + store, it's probably worth splitting.
5839 if (UI->getOpcode() != ISD::EXTRACT_SUBVECTOR || !UI->hasOneUse() ||
5840 UI->use_begin()->getOpcode() != ISD::STORE)
5841 return true;
5842 }
5843 // All non-chain uses are extract + store.
5844 return false;
5845 }
5846
5847 return true;
5848}
5849
5850/// Returns true if it is beneficial to convert a load of a constant
5851/// to just the constant itself.
5852bool X86TargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm,
5853 Type *Ty) const {
5854  assert(Ty->isIntegerTy());
5855
5856 unsigned BitSize = Ty->getPrimitiveSizeInBits();
5857 if (BitSize == 0 || BitSize > 64)
5858 return false;
5859 return true;
5860}
5861
5862bool X86TargetLowering::reduceSelectOfFPConstantLoads(EVT CmpOpVT) const {
5863 // If we are using XMM registers in the ABI and the condition of the select is
5864 // a floating-point compare and we have blendv or conditional move, then it is
5865 // cheaper to select instead of doing a cross-register move and creating a
5866 // load that depends on the compare result.
5867 bool IsFPSetCC = CmpOpVT.isFloatingPoint() && CmpOpVT != MVT::f128;
5868 return !IsFPSetCC || !Subtarget.isTarget64BitLP64() || !Subtarget.hasAVX();
5869}
5870
5871bool X86TargetLowering::convertSelectOfConstantsToMath(EVT VT) const {
5872 // TODO: It might be a win to ease or lift this restriction, but the generic
5873 // folds in DAGCombiner conflict with vector folds for an AVX512 target.
5874 if (VT.isVector() && Subtarget.hasAVX512())
5875 return false;
5876
5877 return true;
5878}
5879
5880bool X86TargetLowering::decomposeMulByConstant(LLVMContext &Context, EVT VT,
5881 SDValue C) const {
5882 // TODO: We handle scalars using custom code, but generic combining could make
5883 // that unnecessary.
5884 APInt MulC;
5885 if (!ISD::isConstantSplatVector(C.getNode(), MulC))
5886 return false;
5887
5888  // Find the type this will be legalized to. Otherwise we might prematurely
5889 // convert this to shl+add/sub and then still have to type legalize those ops.
5890 // Another choice would be to defer the decision for illegal types until
5891 // after type legalization. But constant splat vectors of i64 can't make it
5892 // through type legalization on 32-bit targets so we would need to special
5893 // case vXi64.
5894 while (getTypeAction(Context, VT) != TypeLegal)
5895 VT = getTypeToTransformTo(Context, VT);
5896
5897 // If vector multiply is legal, assume that's faster than shl + add/sub.
5898 // Multiply is a complex op with higher latency and lower throughput in
5899 // most implementations, sub-vXi32 vector multiplies are always fast,
5900 // vXi32 mustn't have a SlowMULLD implementation, and anything larger (vXi64)
5901 // is always going to be slow.
5902 unsigned EltSizeInBits = VT.getScalarSizeInBits();
5903 if (isOperationLegal(ISD::MUL, VT) && EltSizeInBits <= 32 &&
5904 (EltSizeInBits != 32 || !Subtarget.isPMULLDSlow()))
5905 return false;
5906
5907 // shl+add, shl+sub, shl+add+neg
5908 return (MulC + 1).isPowerOf2() || (MulC - 1).isPowerOf2() ||
5909 (1 - MulC).isPowerOf2() || (-(MulC + 1)).isPowerOf2();
5910}
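As a worked example of the power-of-two test above: 5 decomposes because 5 - 1 = 4 (x * 5 becomes (x << 2) + x), 7 because 7 + 1 = 8 (x * 7 becomes (x << 3) - x), and -3 because 1 - (-3) = 4 (x * -3 becomes x - (x << 2)). Below is a minimal standalone sketch of that check using plain int64_t rather than APInt; it is illustrative only, not LLVM code.

#include <cstdint>
#include <cstdio>

static bool isPow2(int64_t V) { return V > 0 && (V & (V - 1)) == 0; }

// Mirrors the check above: one of (MulC + 1), (MulC - 1), (1 - MulC),
// -(MulC + 1) must be a power of two for the shl+add/sub decomposition.
static bool decomposable(int64_t MulC) {
  return isPow2(MulC + 1) || isPow2(MulC - 1) || isPow2(1 - MulC) ||
         isPow2(-(MulC + 1));
}

int main() {
  std::printf("5: %d  7: %d  -3: %d  11: %d\n",
              decomposable(5), decomposable(7), decomposable(-3),
              decomposable(11)); // 11 does not decompose this way
}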
5911
5912bool X86TargetLowering::isExtractSubvectorCheap(EVT ResVT, EVT SrcVT,
5913 unsigned Index) const {
5914 if (!isOperationLegalOrCustom(ISD::EXTRACT_SUBVECTOR, ResVT))
5915 return false;
5916
5917 // Mask vectors support all subregister combinations and operations that
5918 // extract half of vector.
5919 if (ResVT.getVectorElementType() == MVT::i1)
5920 return Index == 0 || ((ResVT.getSizeInBits() == SrcVT.getSizeInBits()*2) &&
5921 (Index == ResVT.getVectorNumElements()));
5922
5923 return (Index % ResVT.getVectorNumElements()) == 0;
5924}
5925
5926bool X86TargetLowering::shouldScalarizeBinop(SDValue VecOp) const {
5927 unsigned Opc = VecOp.getOpcode();
5928
5929 // Assume target opcodes can't be scalarized.
5930 // TODO - do we have any exceptions?
5931 if (Opc >= ISD::BUILTIN_OP_END)
5932 return false;
5933
5934 // If the vector op is not supported, try to convert to scalar.
5935 EVT VecVT = VecOp.getValueType();
5936 if (!isOperationLegalOrCustomOrPromote(Opc, VecVT))
5937 return true;
5938
5939 // If the vector op is supported, but the scalar op is not, the transform may
5940 // not be worthwhile.
5941 EVT ScalarVT = VecVT.getScalarType();
5942 return isOperationLegalOrCustomOrPromote(Opc, ScalarVT);
5943}
5944
5945bool X86TargetLowering::shouldFormOverflowOp(unsigned Opcode, EVT VT,
5946 bool) const {
5947 // TODO: Allow vectors?
5948 if (VT.isVector())
5949 return false;
5950 return VT.isSimple() || !isOperationExpand(Opcode, VT);
5951}
5952
5953bool X86TargetLowering::isCheapToSpeculateCttz(Type *Ty) const {
5954 // Speculate cttz only if we can directly use TZCNT or can promote to i32.
5955 return Subtarget.hasBMI() ||
5956 (!Ty->isVectorTy() && Ty->getScalarSizeInBits() < 32);
5957}
5958
5959bool X86TargetLowering::isCheapToSpeculateCtlz(Type *Ty) const {
5960 // Speculate ctlz only if we can directly use LZCNT.
5961 return Subtarget.hasLZCNT();
5962}
5963
5964bool X86TargetLowering::hasBitPreservingFPLogic(EVT VT) const {
5965 return VT == MVT::f32 || VT == MVT::f64 || VT.isVector();
5966}
5967
5968bool X86TargetLowering::ShouldShrinkFPConstant(EVT VT) const {
5969 // Don't shrink FP constpool if SSE2 is available since cvtss2sd is more
5970 // expensive than a straight movsd. On the other hand, it's important to
5971 // shrink long double fp constant since fldt is very slow.
5972 return !Subtarget.hasSSE2() || VT == MVT::f80;
5973}
5974
5975bool X86TargetLowering::isScalarFPTypeInSSEReg(EVT VT) const {
5976 return (VT == MVT::f64 && Subtarget.hasSSE2()) ||
5977 (VT == MVT::f32 && Subtarget.hasSSE1()) || VT == MVT::f16;
5978}
5979
5980bool X86TargetLowering::isLoadBitCastBeneficial(EVT LoadVT, EVT BitcastVT,
5981 const SelectionDAG &DAG,
5982 const MachineMemOperand &MMO) const {
5983 if (!Subtarget.hasAVX512() && !LoadVT.isVector() && BitcastVT.isVector() &&
5984 BitcastVT.getVectorElementType() == MVT::i1)
5985 return false;
5986
5987 if (!Subtarget.hasDQI() && BitcastVT == MVT::v8i1 && LoadVT == MVT::i8)
5988 return false;
5989
5990 // If both types are legal vectors, it's always ok to convert them.
5991 if (LoadVT.isVector() && BitcastVT.isVector() &&
5992 isTypeLegal(LoadVT) && isTypeLegal(BitcastVT))
5993 return true;
5994
5995 return TargetLowering::isLoadBitCastBeneficial(LoadVT, BitcastVT, DAG, MMO);
5996}
5997
5998bool X86TargetLowering::canMergeStoresTo(unsigned AddressSpace, EVT MemVT,
5999 const MachineFunction &MF) const {
6000  // Do not merge to a float value size (128 bits) if no implicit
6001  // float attribute is set.
6002 bool NoFloat = MF.getFunction().hasFnAttribute(Attribute::NoImplicitFloat);
6003
6004 if (NoFloat) {
6005 unsigned MaxIntSize = Subtarget.is64Bit() ? 64 : 32;
6006 return (MemVT.getSizeInBits() <= MaxIntSize);
6007 }
6008 // Make sure we don't merge greater than our preferred vector
6009 // width.
6010 if (MemVT.getSizeInBits() > Subtarget.getPreferVectorWidth())
6011 return false;
6012
6013 return true;
6014}
6015
6016bool X86TargetLowering::isCtlzFast() const {
6017 return Subtarget.hasFastLZCNT();
6018}
6019
6020bool X86TargetLowering::isMaskAndCmp0FoldingBeneficial(
6021 const Instruction &AndI) const {
6022 return true;
6023}
6024
6025bool X86TargetLowering::hasAndNotCompare(SDValue Y) const {
6026 EVT VT = Y.getValueType();
6027
6028 if (VT.isVector())
6029 return false;
6030
6031 if (!Subtarget.hasBMI())
6032 return false;
6033
6034 // There are only 32-bit and 64-bit forms for 'andn'.
6035 if (VT != MVT::i32 && VT != MVT::i64)
6036 return false;
6037
6038 return !isa<ConstantSDNode>(Y);
6039}
6040
6041bool X86TargetLowering::hasAndNot(SDValue Y) const {
6042 EVT VT = Y.getValueType();
6043
6044 if (!VT.isVector())
6045 return hasAndNotCompare(Y);
6046
6047 // Vector.
6048
6049 if (!Subtarget.hasSSE1() || VT.getSizeInBits() < 128)
6050 return false;
6051
6052 if (VT == MVT::v4i32)
6053 return true;
6054
6055 return Subtarget.hasSSE2();
6056}
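The 'andn' pattern referred to above is simply ~X & Y. A small standalone sketch follows (not LLVM code); with BMI enabled, a compiler is free to select a single ANDN for the helper below.

#include <cstdint>
#include <cstdio>

// The and-not pattern: clear in Y every bit that is set in X.
static uint64_t andNot(uint64_t X, uint64_t Y) { return ~X & Y; }

int main() {
  // 0xffffffff with the 0x0f0f0f0f bits cleared is 0xf0f0f0f0.
  std::printf("%llx\n", (unsigned long long)andNot(0x0f0f0f0f, 0xffffffff));
}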
6057
6058bool X86TargetLowering::hasBitTest(SDValue X, SDValue Y) const {
6059 return X.getValueType().isScalarInteger(); // 'bt'
6060}
6061
6062bool X86TargetLowering::
6063 shouldProduceAndByConstByHoistingConstFromShiftsLHSOfAnd(
6064 SDValue X, ConstantSDNode *XC, ConstantSDNode *CC, SDValue Y,
6065 unsigned OldShiftOpcode, unsigned NewShiftOpcode,
6066 SelectionDAG &DAG) const {
6067 // Does baseline recommend not to perform the fold by default?
6068 if (!TargetLowering::shouldProduceAndByConstByHoistingConstFromShiftsLHSOfAnd(
6069 X, XC, CC, Y, OldShiftOpcode, NewShiftOpcode, DAG))
6070 return false;
6071 // For scalars this transform is always beneficial.
6072 if (X.getValueType().isScalarInteger())
6073 return true;
6074 // If all the shift amounts are identical, then transform is beneficial even
6075 // with rudimentary SSE2 shifts.
6076 if (DAG.isSplatValue(Y, /*AllowUndefs=*/true))
6077 return true;
6078  // If we have AVX2 with its powerful shift operations, then it's also good.
6079 if (Subtarget.hasAVX2())
6080 return true;
6081 // Pre-AVX2 vector codegen for this pattern is best for variant with 'shl'.
6082 return NewShiftOpcode == ISD::SHL;
6083}
6084
6085bool X86TargetLowering::preferScalarizeSplat(unsigned Opc) const {
6086 return Opc != ISD::FP_EXTEND;
6087}
6088
6089bool X86TargetLowering::shouldFoldConstantShiftPairToMask(
6090 const SDNode *N, CombineLevel Level) const {
6091  assert(((N->getOpcode() == ISD::SHL &&
6092           N->getOperand(0).getOpcode() == ISD::SRL) ||
6093          (N->getOpcode() == ISD::SRL &&
6094           N->getOperand(0).getOpcode() == ISD::SHL)) &&
6095         "Expected shift-shift mask");
6096 // TODO: Should we always create i64 masks? Or only folded immediates?
6097 EVT VT = N->getValueType(0);
6098 if ((Subtarget.hasFastVectorShiftMasks() && VT.isVector()) ||
6099 (Subtarget.hasFastScalarShiftMasks() && !VT.isVector())) {
6100 // Only fold if the shift values are equal - so it folds to AND.
6101 // TODO - we should fold if either is a non-uniform vector but we don't do
6102 // the fold for non-splats yet.
6103 return N->getOperand(1) == N->getOperand(0).getOperand(1);
6104 }
6105 return TargetLoweringBase::shouldFoldConstantShiftPairToMask(N, Level);
6106}
6107
6108bool X86TargetLowering::shouldFoldMaskToVariableShiftPair(SDValue Y) const {
6109 EVT VT = Y.getValueType();
6110
6111 // For vectors, we don't have a preference, but we probably want a mask.
6112 if (VT.isVector())
6113 return false;
6114
6115 // 64-bit shifts on 32-bit targets produce really bad bloated code.
6116 if (VT == MVT::i64 && !Subtarget.is64Bit())
6117 return false;
6118
6119 return true;
6120}
6121
6122TargetLowering::ShiftLegalizationStrategy
6123X86TargetLowering::preferredShiftLegalizationStrategy(
6124 SelectionDAG &DAG, SDNode *N, unsigned ExpansionFactor) const {
6125 if (DAG.getMachineFunction().getFunction().hasMinSize() &&
6126 !Subtarget.isOSWindows())
6127 return ShiftLegalizationStrategy::LowerToLibcall;
6128 return TargetLowering::preferredShiftLegalizationStrategy(DAG, N,
6129 ExpansionFactor);
6130}
6131
6132bool X86TargetLowering::shouldSplatInsEltVarIndex(EVT VT) const {
6133 // Any legal vector type can be splatted more efficiently than
6134 // loading/spilling from memory.
6135 return isTypeLegal(VT);
6136}
6137
6138MVT X86TargetLowering::hasFastEqualityCompare(unsigned NumBits) const {
6139 MVT VT = MVT::getIntegerVT(NumBits);
6140 if (isTypeLegal(VT))
6141 return VT;
6142
6143 // PMOVMSKB can handle this.
6144 if (NumBits == 128 && isTypeLegal(MVT::v16i8))
6145 return MVT::v16i8;
6146
6147 // VPMOVMSKB can handle this.
6148 if (NumBits == 256 && isTypeLegal(MVT::v32i8))
6149 return MVT::v32i8;
6150
6151 // TODO: Allow 64-bit type for 32-bit target.
6152 // TODO: 512-bit types should be allowed, but make sure that those
6153 // cases are handled in combineVectorSizedSetCCEquality().
6154
6155 return MVT::INVALID_SIMPLE_VALUE_TYPE;
6156}
6157
6158/// Val is the undef sentinel value or equal to the specified value.
6159static bool isUndefOrEqual(int Val, int CmpVal) {
6160 return ((Val == SM_SentinelUndef) || (Val == CmpVal));
6161}
6162
6163/// Return true if every element in Mask is the undef sentinel value or equal to
6164/// the specified value.
6165static bool isUndefOrEqual(ArrayRef<int> Mask, int CmpVal) {
6166 return llvm::all_of(Mask, [CmpVal](int M) {
6167 return (M == SM_SentinelUndef) || (M == CmpVal);
6168 });
6169}
6170
6171/// Val is either the undef or zero sentinel value.
6172static bool isUndefOrZero(int Val) {
6173 return ((Val == SM_SentinelUndef) || (Val == SM_SentinelZero));
6174}
6175
6176/// Return true if every element in Mask, beginning from position Pos and ending
6177/// in Pos+Size is the undef sentinel value.
6178static bool isUndefInRange(ArrayRef<int> Mask, unsigned Pos, unsigned Size) {
6179 return llvm::all_of(Mask.slice(Pos, Size),
6180 [](int M) { return M == SM_SentinelUndef; });
6181}
6182
6183/// Return true if the mask creates a vector whose lower half is undefined.
6184static bool isUndefLowerHalf(ArrayRef<int> Mask) {
6185 unsigned NumElts = Mask.size();
6186 return isUndefInRange(Mask, 0, NumElts / 2);
6187}
6188
6189/// Return true if the mask creates a vector whose upper half is undefined.
6190static bool isUndefUpperHalf(ArrayRef<int> Mask) {
6191 unsigned NumElts = Mask.size();
6192 return isUndefInRange(Mask, NumElts / 2, NumElts / 2);
6193}
6194
6195/// Return true if Val falls within the specified half-open range [Low, Hi).
6196static bool isInRange(int Val, int Low, int Hi) {
6197 return (Val >= Low && Val < Hi);
6198}
6199
6200/// Return true if the value of any element in Mask falls within the specified
6201/// half-open range [Low, Hi).
6202static bool isAnyInRange(ArrayRef<int> Mask, int Low, int Hi) {
6203 return llvm::any_of(Mask, [Low, Hi](int M) { return isInRange(M, Low, Hi); });
6204}
6205
6206/// Return true if the value of any element in Mask is the zero sentinel value.
6207static bool isAnyZero(ArrayRef<int> Mask) {
6208 return llvm::any_of(Mask, [](int M) { return M == SM_SentinelZero; });
6209}
6210
6211/// Return true if the value of any element in Mask is the zero or undef
6212/// sentinel values.
6213static bool isAnyZeroOrUndef(ArrayRef<int> Mask) {
6214 return llvm::any_of(Mask, [](int M) {
6215 return M == SM_SentinelZero || M == SM_SentinelUndef;
6216 });
6217}
6218
6219/// Return true if Val is undef or if its value falls within the
6220/// specified half-open range [Low, Hi).
6221static bool isUndefOrInRange(int Val, int Low, int Hi) {
6222 return (Val == SM_SentinelUndef) || isInRange(Val, Low, Hi);
6223}
6224
6225/// Return true if every element in Mask is undef or if its value
6226/// falls within the specified half-open range [Low, Hi).
6227static bool isUndefOrInRange(ArrayRef<int> Mask, int Low, int Hi) {
6228 return llvm::all_of(
6229 Mask, [Low, Hi](int M) { return isUndefOrInRange(M, Low, Hi); });
6230}
6231
6232/// Return true if Val is undef, zero or if its value falls within the
6233/// specified half-open range [Low, Hi).
6234static bool isUndefOrZeroOrInRange(int Val, int Low, int Hi) {
6235 return isUndefOrZero(Val) || isInRange(Val, Low, Hi);
6236}
6237
6238/// Return true if every element in Mask is undef, zero or if its value
6239/// falls within the specified half-open range [Low, Hi).
6240static bool isUndefOrZeroOrInRange(ArrayRef<int> Mask, int Low, int Hi) {
6241 return llvm::all_of(
6242 Mask, [Low, Hi](int M) { return isUndefOrZeroOrInRange(M, Low, Hi); });
6243}
6244
6245/// Return true if every element in Mask, beginning
6246/// from position Pos and ending in Pos + Size, falls within the specified
6247/// sequence (Low, Low + Step, ..., Low + (Size - 1) * Step) or is undef.
6248static bool isSequentialOrUndefInRange(ArrayRef<int> Mask, unsigned Pos,
6249 unsigned Size, int Low, int Step = 1) {
6250 for (unsigned i = Pos, e = Pos + Size; i != e; ++i, Low += Step)
6251 if (!isUndefOrEqual(Mask[i], Low))
6252 return false;
6253 return true;
6254}
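// Illustrative example (mask values chosen for illustration only): with
// Mask = {4, 5, -1, 7}, Pos = 0, Size = 4, Low = 4 and Step = 1 this returns
// true, since index 2 is SM_SentinelUndef and the rest match the expected
// sequence 4, 5, 6, 7. With Low = 0 and Step = 2 the mask {0, 2, -1, 6} also
// passes, while {4, 6, 5, 7} fails at index 1 (6 != 5).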
6255
6256/// Return true if every element in Mask, beginning
6257/// from position Pos and ending in Pos+Size, falls within the specified
6258/// sequential range (Low, Low+Size], or is undef or is zero.
6259static bool isSequentialOrUndefOrZeroInRange(ArrayRef<int> Mask, unsigned Pos,
6260 unsigned Size, int Low,
6261 int Step = 1) {
6262 for (unsigned i = Pos, e = Pos + Size; i != e; ++i, Low += Step)
6263 if (!isUndefOrZero(Mask[i]) && Mask[i] != Low)
6264 return false;
6265 return true;
6266}
6267
6268/// Return true if every element in Mask, beginning
6269/// from position Pos and ending in Pos+Size is undef or is zero.
6270static bool isUndefOrZeroInRange(ArrayRef<int> Mask, unsigned Pos,
6271 unsigned Size) {
6272 return llvm::all_of(Mask.slice(Pos, Size), isUndefOrZero);
6273}
6274
6275/// Helper function to test whether a shuffle mask could be
6276/// simplified by widening the elements being shuffled.
6277///
6278/// Appends the mask for wider elements in WidenedMask if valid. Otherwise
6279/// leaves it in an unspecified state.
6280///
6281/// NOTE: This must handle normal vector shuffle masks and *target* vector
6282/// shuffle masks. The latter have the special property of a '-2' representing
6283/// a zero-ed lane of a vector.
6284static bool canWidenShuffleElements(ArrayRef<int> Mask,
6285 SmallVectorImpl<int> &WidenedMask) {
6286 WidenedMask.assign(Mask.size() / 2, 0);
6287 for (int i = 0, Size = Mask.size(); i < Size; i += 2) {
6288 int M0 = Mask[i];
6289 int M1 = Mask[i + 1];
6290
6291 // If both elements are undef, it's trivial.
6292 if (M0 == SM_SentinelUndef && M1 == SM_SentinelUndef) {
6293 WidenedMask[i / 2] = SM_SentinelUndef;
6294 continue;
6295 }
6296
6297 // Check for an undef mask and a mask value properly aligned to fit with
6298 // a pair of values. If we find such a case, use the non-undef mask's value.
6299 if (M0 == SM_SentinelUndef && M1 >= 0 && (M1 % 2) == 1) {
6300 WidenedMask[i / 2] = M1 / 2;
6301 continue;
6302 }
6303 if (M1 == SM_SentinelUndef && M0 >= 0 && (M0 % 2) == 0) {
6304 WidenedMask[i / 2] = M0 / 2;
6305 continue;
6306 }
6307
6308 // When zeroing, we need to spread the zeroing across both lanes to widen.
6309 if (M0 == SM_SentinelZero || M1 == SM_SentinelZero) {
6310 if ((M0 == SM_SentinelZero || M0 == SM_SentinelUndef) &&
6311 (M1 == SM_SentinelZero || M1 == SM_SentinelUndef)) {
6312 WidenedMask[i / 2] = SM_SentinelZero;
6313 continue;
6314 }
6315 return false;
6316 }
6317
6318 // Finally check if the two mask values are adjacent and aligned with
6319 // a pair.
6320 if (M0 != SM_SentinelUndef && (M0 % 2) == 0 && (M0 + 1) == M1) {
6321 WidenedMask[i / 2] = M0 / 2;
6322 continue;
6323 }
6324
6325 // Otherwise we can't safely widen the elements used in this shuffle.
6326 return false;
6327 }
6328   assert(WidenedMask.size() == Mask.size() / 2 &&
6329          "Incorrect size of mask after widening the elements!");
6330
6331 return true;
6332}
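// Illustrative examples (mask values chosen for illustration only): the v4
// mask {0, 1, 6, 7} widens to {0, 3}; {-1, 1, 2, 3} widens to {0, 1} because
// the undef element pairs with the odd-aligned '1'; {-2, -1, 0, 1} widens to
// {SM_SentinelZero, 0} since a zero/undef pair becomes a zeroed wide lane;
// {1, 2, 3, 4} cannot be widened because its first pair is not 2-aligned.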
6333
6334static bool canWidenShuffleElements(ArrayRef<int> Mask,
6335 const APInt &Zeroable,
6336 bool V2IsZero,
6337 SmallVectorImpl<int> &WidenedMask) {
6338 // Create an alternative mask with info about zeroable elements.
6339 // Here we do not set undef elements as zeroable.
6340 SmallVector<int, 64> ZeroableMask(Mask);
6341 if (V2IsZero) {
6342     assert(!Zeroable.isZero() && "V2's non-undef elements are used?!");
6343 for (int i = 0, Size = Mask.size(); i != Size; ++i)
6344 if (Mask[i] != SM_SentinelUndef && Zeroable[i])
6345 ZeroableMask[i] = SM_SentinelZero;
6346 }
6347 return canWidenShuffleElements(ZeroableMask, WidenedMask);
6348}
6349
6350static bool canWidenShuffleElements(ArrayRef<int> Mask) {
6351 SmallVector<int, 32> WidenedMask;
6352 return canWidenShuffleElements(Mask, WidenedMask);
6353}
6354
6355// Attempt to narrow/widen shuffle mask until it matches the target number of
6356// elements.
6357static bool scaleShuffleElements(ArrayRef<int> Mask, unsigned NumDstElts,
6358 SmallVectorImpl<int> &ScaledMask) {
6359 unsigned NumSrcElts = Mask.size();
6360   assert(((NumSrcElts % NumDstElts) == 0 || (NumDstElts % NumSrcElts) == 0) &&
6361          "Illegal shuffle scale factor");
6362
6363 // Narrowing is guaranteed to work.
6364 if (NumDstElts >= NumSrcElts) {
6365 int Scale = NumDstElts / NumSrcElts;
6366 llvm::narrowShuffleMaskElts(Scale, Mask, ScaledMask);
6367 return true;
6368 }
6369
6370 // We have to repeat the widening until we reach the target size, but we can
6371 // split out the first widening as it sets up ScaledMask for us.
6372 if (canWidenShuffleElements(Mask, ScaledMask)) {
6373 while (ScaledMask.size() > NumDstElts) {
6374 SmallVector<int, 16> WidenedMask;
6375 if (!canWidenShuffleElements(ScaledMask, WidenedMask))
6376 return false;
6377 ScaledMask = std::move(WidenedMask);
6378 }
6379 return true;
6380 }
6381
6382 return false;
6383}
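// Illustrative example (mask values assumed): scaling the 2-element mask
// {0, 1} to NumDstElts = 4 narrows each lane and yields {0, 1, 2, 3};
// scaling the 4-element mask {0, 1, 6, 7} to NumDstElts = 2 widens pairs and
// yields {0, 3}; the 4-element mask {1, 0, 3, 2} cannot be scaled down to 2
// elements because neither pair is contiguous and 2-aligned.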
6384
6385/// Returns true if Elt is a constant zero or a floating point constant +0.0.
6386bool X86::isZeroNode(SDValue Elt) {
6387 return isNullConstant(Elt) || isNullFPConstant(Elt);
6388}
6389
6390// Build a vector of constants.
6391// Use an UNDEF node if MaskElt == -1.
6392// Split 64-bit constants in 32-bit mode.
6393static SDValue getConstVector(ArrayRef<int> Values, MVT VT, SelectionDAG &DAG,
6394 const SDLoc &dl, bool IsMask = false) {
6395
6396 SmallVector<SDValue, 32> Ops;
6397 bool Split = false;
6398
6399 MVT ConstVecVT = VT;
6400 unsigned NumElts = VT.getVectorNumElements();
6401 bool In64BitMode = DAG.getTargetLoweringInfo().isTypeLegal(MVT::i64);
6402 if (!In64BitMode && VT.getVectorElementType() == MVT::i64) {
6403 ConstVecVT = MVT::getVectorVT(MVT::i32, NumElts * 2);
6404 Split = true;
6405 }
6406
6407 MVT EltVT = ConstVecVT.getVectorElementType();
6408 for (unsigned i = 0; i < NumElts; ++i) {
6409 bool IsUndef = Values[i] < 0 && IsMask;
6410 SDValue OpNode = IsUndef ? DAG.getUNDEF(EltVT) :
6411 DAG.getConstant(Values[i], dl, EltVT);
6412 Ops.push_back(OpNode);
6413 if (Split)
6414 Ops.push_back(IsUndef ? DAG.getUNDEF(EltVT) :
6415 DAG.getConstant(0, dl, EltVT));
6416 }
6417 SDValue ConstsNode = DAG.getBuildVector(ConstVecVT, dl, Ops);
6418 if (Split)
6419 ConstsNode = DAG.getBitcast(VT, ConstsNode);
6420 return ConstsNode;
6421}
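// Illustrative example (assuming a 32-bit target where MVT::i64 is not
// legal): building the v2i64 mask constant {3, 1} emits the v4i32
// build_vector {3, 0, 1, 0}, i.e. each 64-bit element becomes its low 32 bits
// followed by a zero high half, and the result is bitcast back to v2i64.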
6422
6423static SDValue getConstVector(ArrayRef<APInt> Bits, APInt &Undefs,
6424 MVT VT, SelectionDAG &DAG, const SDLoc &dl) {
6425   assert(Bits.size() == Undefs.getBitWidth() &&
6426          "Unequal constant and undef arrays");
6427 SmallVector<SDValue, 32> Ops;
6428 bool Split = false;
6429
6430 MVT ConstVecVT = VT;
6431 unsigned NumElts = VT.getVectorNumElements();
6432 bool In64BitMode = DAG.getTargetLoweringInfo().isTypeLegal(MVT::i64);
6433 if (!In64BitMode && VT.getVectorElementType() == MVT::i64) {
6434 ConstVecVT = MVT::getVectorVT(MVT::i32, NumElts * 2);
6435 Split = true;
6436 }
6437
6438 MVT EltVT = ConstVecVT.getVectorElementType();
6439 for (unsigned i = 0, e = Bits.size(); i != e; ++i) {
6440 if (Undefs[i]) {
6441 Ops.append(Split ? 2 : 1, DAG.getUNDEF(EltVT));
6442 continue;
6443 }
6444 const APInt &V = Bits[i];
6445     assert(V.getBitWidth() == VT.getScalarSizeInBits() && "Unexpected sizes");
6446 if (Split) {
6447 Ops.push_back(DAG.getConstant(V.trunc(32), dl, EltVT));
6448 Ops.push_back(DAG.getConstant(V.lshr(32).trunc(32), dl, EltVT));
6449 } else if (EltVT == MVT::f32) {
6450 APFloat FV(APFloat::IEEEsingle(), V);
6451 Ops.push_back(DAG.getConstantFP(FV, dl, EltVT));
6452 } else if (EltVT == MVT::f64) {
6453 APFloat FV(APFloat::IEEEdouble(), V);
6454 Ops.push_back(DAG.getConstantFP(FV, dl, EltVT));
6455 } else {
6456 Ops.push_back(DAG.getConstant(V, dl, EltVT));
6457 }
6458 }
6459
6460 SDValue ConstsNode = DAG.getBuildVector(ConstVecVT, dl, Ops);
6461 return DAG.getBitcast(VT, ConstsNode);
6462}
6463
6464/// Returns a vector of specified type with all zero elements.
6465static SDValue getZeroVector(MVT VT, const X86Subtarget &Subtarget,
6466 SelectionDAG &DAG, const SDLoc &dl) {
6467   assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector() ||
6468           VT.getVectorElementType() == MVT::i1) &&
6469          "Unexpected vector type");
6470
6471 // Try to build SSE/AVX zero vectors as <N x i32> bitcasted to their dest
6472 // type. This ensures they get CSE'd. But if the integer type is not
6473 // available, use a floating-point +0.0 instead.
6474 SDValue Vec;
6475 if (!Subtarget.hasSSE2() && VT.is128BitVector()) {
6476 Vec = DAG.getConstantFP(+0.0, dl, MVT::v4f32);
6477 } else if (VT.isFloatingPoint()) {
6478 Vec = DAG.getConstantFP(+0.0, dl, VT);
6479 } else if (VT.getVectorElementType() == MVT::i1) {
6480     assert((Subtarget.hasBWI() || VT.getVectorNumElements() <= 16) &&
6481            "Unexpected vector type");
6482 Vec = DAG.getConstant(0, dl, VT);
6483 } else {
6484 unsigned Num32BitElts = VT.getSizeInBits() / 32;
6485 Vec = DAG.getConstant(0, dl, MVT::getVectorVT(MVT::i32, Num32BitElts));
6486 }
6487 return DAG.getBitcast(VT, Vec);
6488}
6489
6490// Helper to determine if the ops are both subvectors extracted from a single
6491// source. If commuting is allowed they don't have to be in order (Lo/Hi).
6492static SDValue getSplitVectorSrc(SDValue LHS, SDValue RHS, bool AllowCommute) {
6493 if (LHS.getOpcode() != ISD::EXTRACT_SUBVECTOR ||
6494 RHS.getOpcode() != ISD::EXTRACT_SUBVECTOR ||
6495 LHS.getValueType() != RHS.getValueType() ||
6496 LHS.getOperand(0) != RHS.getOperand(0))
6497 return SDValue();
6498
6499 SDValue Src = LHS.getOperand(0);
6500 if (Src.getValueSizeInBits() != (LHS.getValueSizeInBits() * 2))
6501 return SDValue();
6502
6503 unsigned NumElts = LHS.getValueType().getVectorNumElements();
6504 if ((LHS.getConstantOperandAPInt(1) == 0 &&
6505 RHS.getConstantOperandAPInt(1) == NumElts) ||
6506 (AllowCommute && RHS.getConstantOperandAPInt(1) == 0 &&
6507 LHS.getConstantOperandAPInt(1) == NumElts))
6508 return Src;
6509
6510 return SDValue();
6511}
6512
6513static SDValue extractSubVector(SDValue Vec, unsigned IdxVal, SelectionDAG &DAG,
6514 const SDLoc &dl, unsigned vectorWidth) {
6515 EVT VT = Vec.getValueType();
6516 EVT ElVT = VT.getVectorElementType();
6517 unsigned Factor = VT.getSizeInBits() / vectorWidth;
6518 EVT ResultVT = EVT::getVectorVT(*DAG.getContext(), ElVT,
6519 VT.getVectorNumElements() / Factor);
6520
6521 // Extract the relevant vectorWidth bits. Generate an EXTRACT_SUBVECTOR
6522 unsigned ElemsPerChunk = vectorWidth / ElVT.getSizeInBits();
6523   assert(isPowerOf2_32(ElemsPerChunk) && "Elements per chunk not power of 2");
6524
6525 // This is the index of the first element of the vectorWidth-bit chunk
6526 // we want. Since ElemsPerChunk is a power of 2 just need to clear bits.
6527 IdxVal &= ~(ElemsPerChunk - 1);
6528
6529 // If the input is a buildvector just emit a smaller one.
6530 if (Vec.getOpcode() == ISD::BUILD_VECTOR)
6531 return DAG.getBuildVector(ResultVT, dl,
6532 Vec->ops().slice(IdxVal, ElemsPerChunk));
6533
6534 SDValue VecIdx = DAG.getIntPtrConstant(IdxVal, dl);
6535 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, ResultVT, Vec, VecIdx);
6536}
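// Illustrative example (types assumed): extracting with vectorWidth = 128
// from a v8i32 source gives Factor = 2 and ElemsPerChunk = 4, so a request
// for IdxVal = 5 is rounded down to 4 and the upper v4i32 half is returned.
// If the source is itself a BUILD_VECTOR, a smaller 4-element build_vector
// is emitted directly instead of an EXTRACT_SUBVECTOR node.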
6537
6538/// Generate a DAG to grab 128-bits from a vector > 128 bits. This
6539/// sets things up to match to an AVX VEXTRACTF128 / VEXTRACTI128
6540/// or AVX-512 VEXTRACTF32x4 / VEXTRACTI32x4
6541/// instructions or a simple subregister reference. Idx is an index in the
6542/// 128 bits we want. It need not be aligned to a 128-bit boundary. That makes
6543/// lowering EXTRACT_VECTOR_ELT operations easier.
6544static SDValue extract128BitVector(SDValue Vec, unsigned IdxVal,
6545 SelectionDAG &DAG, const SDLoc &dl) {
6546   assert((Vec.getValueType().is256BitVector() ||
6547           Vec.getValueType().is512BitVector()) && "Unexpected vector size!");
6548 return extractSubVector(Vec, IdxVal, DAG, dl, 128);
6549}
6550
6551/// Generate a DAG to grab 256-bits from a 512-bit vector.
6552static SDValue extract256BitVector(SDValue Vec, unsigned IdxVal,
6553 SelectionDAG &DAG, const SDLoc &dl) {
6554   assert(Vec.getValueType().is512BitVector() && "Unexpected vector size!");
6555 return extractSubVector(Vec, IdxVal, DAG, dl, 256);
6556}
6557
6558static SDValue insertSubVector(SDValue Result, SDValue Vec, unsigned IdxVal,
6559 SelectionDAG &DAG, const SDLoc &dl,
6560 unsigned vectorWidth) {
6561   assert((vectorWidth == 128 || vectorWidth == 256) &&
6562          "Unsupported vector width");
6563 // Inserting UNDEF is a no-op; just return Result.
6564 if (Vec.isUndef())
6565 return Result;
6566 EVT VT = Vec.getValueType();
6567 EVT ElVT = VT.getVectorElementType();
6568 EVT ResultVT = Result.getValueType();
6569
6570 // Insert the relevant vectorWidth bits.
6571 unsigned ElemsPerChunk = vectorWidth/ElVT.getSizeInBits();
6572   assert(isPowerOf2_32(ElemsPerChunk) && "Elements per chunk not power of 2");
6573
6574 // This is the index of the first element of the vectorWidth-bit chunk
6575 // we want. Since ElemsPerChunk is a power of 2 just need to clear bits.
6576 IdxVal &= ~(ElemsPerChunk - 1);
6577
6578 SDValue VecIdx = DAG.getIntPtrConstant(IdxVal, dl);
6579 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResultVT, Result, Vec, VecIdx);
6580}
6581
6582/// Generate a DAG to put 128-bits into a vector > 128 bits. This
6583/// sets things up to match to an AVX VINSERTF128/VINSERTI128 or
6584/// AVX-512 VINSERTF32x4/VINSERTI32x4 instructions or a
6585/// simple superregister reference. Idx is an index in the 128 bits
6586/// we want. It need not be aligned to a 128-bit boundary. That makes
6587/// lowering INSERT_VECTOR_ELT operations easier.
6588static SDValue insert128BitVector(SDValue Result, SDValue Vec, unsigned IdxVal,
6589 SelectionDAG &DAG, const SDLoc &dl) {
6590   assert(Vec.getValueType().is128BitVector() && "Unexpected vector size!");
6591 return insertSubVector(Result, Vec, IdxVal, DAG, dl, 128);
6592}
6593
6594/// Widen a vector to a larger size with the same scalar type, with the new
6595/// elements either zero or undef.
6596static SDValue widenSubVector(MVT VT, SDValue Vec, bool ZeroNewElements,
6597 const X86Subtarget &Subtarget, SelectionDAG &DAG,
6598 const SDLoc &dl) {
6599   assert(Vec.getValueSizeInBits().getFixedValue() < VT.getFixedSizeInBits() &&
6600          Vec.getValueType().getScalarType() == VT.getScalarType() &&
6601          "Unsupported vector widening type");
6602 SDValue Res = ZeroNewElements ? getZeroVector(VT, Subtarget, DAG, dl)
6603 : DAG.getUNDEF(VT);
6604 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, VT, Res, Vec,
6605 DAG.getIntPtrConstant(0, dl));
6606}
6607
6608/// Widen a vector to a larger size with the same scalar type, with the new
6609/// elements either zero or undef.
6610static SDValue widenSubVector(SDValue Vec, bool ZeroNewElements,
6611 const X86Subtarget &Subtarget, SelectionDAG &DAG,
6612 const SDLoc &dl, unsigned WideSizeInBits) {
6613   assert(Vec.getValueSizeInBits() < WideSizeInBits &&
6614          (WideSizeInBits % Vec.getScalarValueSizeInBits()) == 0 &&
6615          "Unsupported vector widening type");
6616 unsigned WideNumElts = WideSizeInBits / Vec.getScalarValueSizeInBits();
6617 MVT SVT = Vec.getSimpleValueType().getScalarType();
6618 MVT VT = MVT::getVectorVT(SVT, WideNumElts);
6619 return widenSubVector(VT, Vec, ZeroNewElements, Subtarget, DAG, dl);
6620}
6621
6622// Helper function to collect subvector ops that are concatenated together,
6623// either by ISD::CONCAT_VECTORS or an ISD::INSERT_SUBVECTOR series.
6624// The subvectors in Ops are guaranteed to be the same type.
6625static bool collectConcatOps(SDNode *N, SmallVectorImpl<SDValue> &Ops,
6626 SelectionDAG &DAG) {
6627   assert(Ops.empty() && "Expected an empty ops vector");
6628
6629 if (N->getOpcode() == ISD::CONCAT_VECTORS) {
6630 Ops.append(N->op_begin(), N->op_end());
6631 return true;
6632 }
6633
6634 if (N->getOpcode() == ISD::INSERT_SUBVECTOR) {
6635 SDValue Src = N->getOperand(0);
6636 SDValue Sub = N->getOperand(1);
6637 const APInt &Idx = N->getConstantOperandAPInt(2);
6638 EVT VT = Src.getValueType();
6639 EVT SubVT = Sub.getValueType();
6640
6641 // TODO - Handle more general insert_subvector chains.
6642 if (VT.getSizeInBits() == (SubVT.getSizeInBits() * 2)) {
6643 // insert_subvector(undef, x, lo)
6644 if (Idx == 0 && Src.isUndef()) {
6645 Ops.push_back(Sub);
6646 Ops.push_back(DAG.getUNDEF(SubVT));
6647 return true;
6648 }
6649 if (Idx == (VT.getVectorNumElements() / 2)) {
6650 // insert_subvector(insert_subvector(undef, x, lo), y, hi)
6651 if (Src.getOpcode() == ISD::INSERT_SUBVECTOR &&
6652 Src.getOperand(1).getValueType() == SubVT &&
6653 isNullConstant(Src.getOperand(2))) {
6654 Ops.push_back(Src.getOperand(1));
6655 Ops.push_back(Sub);
6656 return true;
6657 }
6658 // insert_subvector(x, extract_subvector(x, lo), hi)
6659 if (Sub.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
6660 Sub.getOperand(0) == Src && isNullConstant(Sub.getOperand(1))) {
6661 Ops.append(2, Sub);
6662 return true;
6663 }
6664 // insert_subvector(undef, x, hi)
6665 if (Src.isUndef()) {
6666 Ops.push_back(DAG.getUNDEF(SubVT));
6667 Ops.push_back(Sub);
6668 return true;
6669 }
6670 }
6671 }
6672 }
6673
6674 return false;
6675}
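// Illustrative examples of the patterns matched above (operands assumed):
//   concat_vectors(a, b)                                     -> Ops = {a, b}
//   insert_subvector(undef, x, 0)                            -> Ops = {x, undef}
//   insert_subvector(insert_subvector(undef, x, 0), y, N/2)  -> Ops = {x, y}
//   insert_subvector(x, extract_subvector(x, 0), N/2)        -> Ops holds the
//                                                               low half of x twice
// where N is the number of elements of the wide vector type.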
6676
6677static std::pair<SDValue, SDValue> splitVector(SDValue Op, SelectionDAG &DAG,
6678 const SDLoc &dl) {
6679 EVT VT = Op.getValueType();
6680 unsigned NumElems = VT.getVectorNumElements();
6681 unsigned SizeInBits = VT.getSizeInBits();
6682   assert((NumElems % 2) == 0 && (SizeInBits % 2) == 0 &&
6683          "Can't split odd sized vector");
6684
6685 // If this is a splat value (with no-undefs) then use the lower subvector,
6686 // which should be a free extraction.
6687 SDValue Lo = extractSubVector(Op, 0, DAG, dl, SizeInBits / 2);
6688 if (DAG.isSplatValue(Op, /*AllowUndefs*/ false))
6689 return std::make_pair(Lo, Lo);
6690
6691 SDValue Hi = extractSubVector(Op, NumElems / 2, DAG, dl, SizeInBits / 2);
6692 return std::make_pair(Lo, Hi);
6693}
6694
6695/// Break an operation into 2 half sized ops and then concatenate the results.
6696static SDValue splitVectorOp(SDValue Op, SelectionDAG &DAG) {
6697 unsigned NumOps = Op.getNumOperands();
6698 EVT VT = Op.getValueType();
6699 SDLoc dl(Op);
6700
6701 // Extract the LHS Lo/Hi vectors
6702 SmallVector<SDValue> LoOps(NumOps, SDValue());
6703 SmallVector<SDValue> HiOps(NumOps, SDValue());
6704 for (unsigned I = 0; I != NumOps; ++I) {
6705 SDValue SrcOp = Op.getOperand(I);
6706 if (!SrcOp.getValueType().isVector()) {
6707 LoOps[I] = HiOps[I] = SrcOp;
6708 continue;
6709 }
6710 std::tie(LoOps[I], HiOps[I]) = splitVector(SrcOp, DAG, dl);
6711 }
6712
6713 EVT LoVT, HiVT;
6714 std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
6715 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
6716 DAG.getNode(Op.getOpcode(), dl, LoVT, LoOps),
6717 DAG.getNode(Op.getOpcode(), dl, HiVT, HiOps));
6718}
6719
6720/// Break a unary integer operation into 2 half-sized ops and then
6721/// concatenate the result back.
6722static SDValue splitVectorIntUnary(SDValue Op, SelectionDAG &DAG) {
6723 // Make sure we only try to split 256/512-bit types to avoid creating
6724 // narrow vectors.
6725 EVT VT = Op.getValueType();
6726 (void)VT;
6727   assert((Op.getOperand(0).getValueType().is256BitVector() ||
6728           Op.getOperand(0).getValueType().is512BitVector()) &&
6729          (VT.is256BitVector() || VT.is512BitVector()) && "Unsupported VT!");
6730   assert(Op.getOperand(0).getValueType().getVectorNumElements() ==
6731              VT.getVectorNumElements() &&
6732          "Unexpected VTs!");
6733 return splitVectorOp(Op, DAG);
6734}
6735
6736/// Break a binary integer operation into 2 half sized ops and then
6737/// concatenate the result back.
6738static SDValue splitVectorIntBinary(SDValue Op, SelectionDAG &DAG) {
6739 // Assert that all the types match.
6740 EVT VT = Op.getValueType();
6741 (void)VT;
6742   assert(Op.getOperand(0).getValueType() == VT &&
6743          Op.getOperand(1).getValueType() == VT && "Unexpected VTs!");
6744   assert((VT.is256BitVector() || VT.is512BitVector()) && "Unsupported VT!");
6745 return splitVectorOp(Op, DAG);
6746}
6747
6748// Helper for splitting operands of an operation into legal target-sized parts
6749// and applying a function to each part.
6750// Useful for operations that are available on SSE2 in 128-bit, on AVX2 in
6751// 256-bit and on AVX512BW in 512-bit. The argument VT is the type used for
6752// deciding if/how to split Ops. Ops elements do *not* have to be of type VT.
6753// The argument Builder is a function that will be applied to each split part:
6754// SDValue Builder(SelectionDAG &G, SDLoc DL, ArrayRef<SDValue> Ops)
6755template <typename F>
6756SDValue SplitOpsAndApply(SelectionDAG &DAG, const X86Subtarget &Subtarget,
6757 const SDLoc &DL, EVT VT, ArrayRef<SDValue> Ops,
6758 F Builder, bool CheckBWI = true) {
6759   assert(Subtarget.hasSSE2() && "Target assumed to support at least SSE2");
6760 unsigned NumSubs = 1;
6761 if ((CheckBWI && Subtarget.useBWIRegs()) ||
6762 (!CheckBWI && Subtarget.useAVX512Regs())) {
6763 if (VT.getSizeInBits() > 512) {
6764 NumSubs = VT.getSizeInBits() / 512;
6765       assert((VT.getSizeInBits() % 512) == 0 && "Illegal vector size");
6766 }
6767 } else if (Subtarget.hasAVX2()) {
6768 if (VT.getSizeInBits() > 256) {
6769 NumSubs = VT.getSizeInBits() / 256;
6770       assert((VT.getSizeInBits() % 256) == 0 && "Illegal vector size");
6771 }
6772 } else {
6773 if (VT.getSizeInBits() > 128) {
6774 NumSubs = VT.getSizeInBits() / 128;
6775       assert((VT.getSizeInBits() % 128) == 0 && "Illegal vector size");
6776 }
6777 }
6778
6779 if (NumSubs == 1)
6780 return Builder(DAG, DL, Ops);
6781
6782 SmallVector<SDValue, 4> Subs;
6783 for (unsigned i = 0; i != NumSubs; ++i) {
6784 SmallVector<SDValue, 2> SubOps;
6785 for (SDValue Op : Ops) {
6786 EVT OpVT = Op.getValueType();
6787 unsigned NumSubElts = OpVT.getVectorNumElements() / NumSubs;
6788 unsigned SizeSub = OpVT.getSizeInBits() / NumSubs;
6789 SubOps.push_back(extractSubVector(Op, i * NumSubElts, DAG, DL, SizeSub));
6790 }
6791 Subs.push_back(Builder(DAG, DL, SubOps));
6792 }
6793 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Subs);
6794}
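// Illustrative caller-side sketch (operand names LHS/RHS assumed): splitting
// a wide integer add into the widest legal pieces and letting the helper
// re-concatenate the result.
//   auto AddBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
//                        ArrayRef<SDValue> Ops) {
//     return DAG.getNode(ISD::ADD, DL, Ops[0].getValueType(), Ops[0], Ops[1]);
//   };
//   SDValue Res = SplitOpsAndApply(DAG, Subtarget, DL, VT, {LHS, RHS},
//                                  AddBuilder, /*CheckBWI=*/true);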
6795
6796// Helper function that extends a non-512-bit vector op to 512-bits on non-VLX
6797// targets.
6798static SDValue getAVX512Node(unsigned Opcode, const SDLoc &DL, MVT VT,
6799 ArrayRef<SDValue> Ops, SelectionDAG &DAG,
6800 const X86Subtarget &Subtarget) {
6801   assert(Subtarget.hasAVX512() && "AVX512 target expected");
6802 MVT SVT = VT.getScalarType();
6803
6804 // If we have a 32/64 splatted constant, splat it to DstTy to
6805 // encourage a foldable broadcast'd operand.
6806 auto MakeBroadcastOp = [&](SDValue Op, MVT OpVT, MVT DstVT) {
6807 unsigned OpEltSizeInBits = OpVT.getScalarSizeInBits();
6808 // AVX512 broadcasts 32/64-bit operands.
6809 // TODO: Support float once getAVX512Node is used by fp-ops.
6810 if (!OpVT.isInteger() || OpEltSizeInBits < 32 ||
6811 !DAG.getTargetLoweringInfo().isTypeLegal(SVT))
6812 return SDValue();
6813 // If we're not widening, don't bother if we're not bitcasting.
6814 if (OpVT == DstVT && Op.getOpcode() != ISD::BITCAST)
6815 return SDValue();
6816 if (auto *BV = dyn_cast<BuildVectorSDNode>(peekThroughBitcasts(Op))) {
6817 APInt SplatValue, SplatUndef;
6818 unsigned SplatBitSize;
6819 bool HasAnyUndefs;
6820 if (BV->isConstantSplat(SplatValue, SplatUndef, SplatBitSize,
6821 HasAnyUndefs, OpEltSizeInBits) &&
6822 !HasAnyUndefs && SplatValue.getBitWidth() == OpEltSizeInBits)
6823 return DAG.getConstant(SplatValue, DL, DstVT);
6824 }
6825 return SDValue();
6826 };
6827
6828 bool Widen = !(Subtarget.hasVLX() || VT.is512BitVector());
6829
6830 MVT DstVT = VT;
6831 if (Widen)
6832 DstVT = MVT::getVectorVT(SVT, 512 / SVT.getSizeInBits());
6833
6834 // Canonicalize src operands.
6835 SmallVector<SDValue> SrcOps(Ops.begin(), Ops.end());
6836 for (SDValue &Op : SrcOps) {
6837 MVT OpVT = Op.getSimpleValueType();
6838 // Just pass through scalar operands.
6839 if (!OpVT.isVector())
6840 continue;
6841     assert(OpVT == VT && "Vector type mismatch");
6842
6843 if (SDValue BroadcastOp = MakeBroadcastOp(Op, OpVT, DstVT)) {
6844 Op = BroadcastOp;
6845 continue;
6846 }
6847
6848 // Just widen the subvector by inserting into an undef wide vector.
6849 if (Widen)
6850 Op = widenSubVector(Op, false, Subtarget, DAG, DL, 512);
6851 }
6852
6853 SDValue Res = DAG.getNode(Opcode, DL, DstVT, SrcOps);
6854
6855 // Perform the 512-bit op then extract the bottom subvector.
6856 if (Widen)
6857 Res = extractSubVector(Res, 0, DAG, DL, VT.getSizeInBits());
6858 return Res;
6859}
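// Illustrative example (types assumed): on an AVX512F target without VLX, a
// v8i32 node is widened here to v16i32 (each vector operand is inserted into
// an undef 512-bit vector), the 512-bit op is created, and the low 256 bits
// are extracted back out. A splatted 32/64-bit constant operand is instead
// re-emitted as a v16i32 constant so the 512-bit instruction can fold it as
// a broadcast.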
6860
6861/// Insert i1-subvector to i1-vector.
6862static SDValue insert1BitVector(SDValue Op, SelectionDAG &DAG,
6863 const X86Subtarget &Subtarget) {
6864
6865 SDLoc dl(Op);
6866 SDValue Vec = Op.getOperand(0);
6867 SDValue SubVec = Op.getOperand(1);
6868 SDValue Idx = Op.getOperand(2);
6869 unsigned IdxVal = Op.getConstantOperandVal(2);
6870
6871 // Inserting undef is a nop. We can just return the original vector.
6872 if (SubVec.isUndef())
6873 return Vec;
6874
6875 if (IdxVal == 0 && Vec.isUndef()) // the operation is legal
6876 return Op;
6877
6878 MVT OpVT = Op.getSimpleValueType();
6879 unsigned NumElems = OpVT.getVectorNumElements();
6880 SDValue ZeroIdx = DAG.getIntPtrConstant(0, dl);
6881
6882 // Extend to natively supported kshift.
6883 MVT WideOpVT = OpVT;
6884 if ((!Subtarget.hasDQI() && NumElems == 8) || NumElems < 8)
6885 WideOpVT = Subtarget.hasDQI() ? MVT::v8i1 : MVT::v16i1;
6886
6887 // Inserting into the lsbs of a zero vector is legal. ISel will insert shifts
6888 // if necessary.
6889 if (IdxVal == 0 && ISD::isBuildVectorAllZeros(Vec.getNode())) {
6890 // May need to promote to a legal type.
6891 Op = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
6892 DAG.getConstant(0, dl, WideOpVT),
6893 SubVec, Idx);
6894 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, Op, ZeroIdx);
6895 }
6896
6897 MVT SubVecVT = SubVec.getSimpleValueType();
6898 unsigned SubVecNumElems = SubVecVT.getVectorNumElements();
6899   assert(IdxVal + SubVecNumElems <= NumElems &&
6900          IdxVal % SubVecVT.getSizeInBits() == 0 &&
6901          "Unexpected index value in INSERT_SUBVECTOR");
6902
6903 SDValue Undef = DAG.getUNDEF(WideOpVT);
6904
6905 if (IdxVal == 0) {
6906 // Zero lower bits of the Vec
6907 SDValue ShiftBits = DAG.getTargetConstant(SubVecNumElems, dl, MVT::i8);
6908 Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT, Undef, Vec,
6909 ZeroIdx);
6910 Vec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Vec, ShiftBits);
6911 Vec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, Vec, ShiftBits);
6912 // Merge them together, SubVec should be zero extended.
6913 SubVec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
6914 DAG.getConstant(0, dl, WideOpVT),
6915 SubVec, ZeroIdx);
6916 Op = DAG.getNode(ISD::OR, dl, WideOpVT, Vec, SubVec);
6917 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, Op, ZeroIdx);
6918 }
6919
6920 SubVec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
6921 Undef, SubVec, ZeroIdx);
6922
6923 if (Vec.isUndef()) {
6924     assert(IdxVal != 0 && "Unexpected index");
6925 SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
6926 DAG.getTargetConstant(IdxVal, dl, MVT::i8));
6927 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, SubVec, ZeroIdx);
6928 }
6929
6930 if (ISD::isBuildVectorAllZeros(Vec.getNode())) {
6931     assert(IdxVal != 0 && "Unexpected index");
6932 // If upper elements of Vec are known undef, then just shift into place.
6933 if (llvm::all_of(Vec->ops().slice(IdxVal + SubVecNumElems),
6934 [](SDValue V) { return V.isUndef(); })) {
6935 SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
6936 DAG.getTargetConstant(IdxVal, dl, MVT::i8));
6937 } else {
6938 NumElems = WideOpVT.getVectorNumElements();
6939 unsigned ShiftLeft = NumElems - SubVecNumElems;
6940 unsigned ShiftRight = NumElems - SubVecNumElems - IdxVal;
6941 SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
6942 DAG.getTargetConstant(ShiftLeft, dl, MVT::i8));
6943 if (ShiftRight != 0)
6944 SubVec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, SubVec,
6945 DAG.getTargetConstant(ShiftRight, dl, MVT::i8));
6946 }
6947 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, SubVec, ZeroIdx);
6948 }
6949
6950 // Simple case when we put subvector in the upper part
6951 if (IdxVal + SubVecNumElems == NumElems) {
6952 SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
6953 DAG.getTargetConstant(IdxVal, dl, MVT::i8));
6954 if (SubVecNumElems * 2 == NumElems) {
6955 // Special case, use legal zero extending insert_subvector. This allows
6956 // isel to optimize when bits are known zero.
6957 Vec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, SubVecVT, Vec, ZeroIdx);
6958 Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
6959 DAG.getConstant(0, dl, WideOpVT),
6960 Vec, ZeroIdx);
6961 } else {
6962 // Otherwise use explicit shifts to zero the bits.
6963 Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
6964 Undef, Vec, ZeroIdx);
6965 NumElems = WideOpVT.getVectorNumElements();
6966 SDValue ShiftBits = DAG.getTargetConstant(NumElems - IdxVal, dl, MVT::i8);
6967 Vec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, Vec, ShiftBits);
6968 Vec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Vec, ShiftBits);
6969 }
6970 Op = DAG.getNode(ISD::OR, dl, WideOpVT, Vec, SubVec);
6971 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, Op, ZeroIdx);
6972 }
6973
6974 // Inserting into the middle is more complicated.
6975
6976 NumElems = WideOpVT.getVectorNumElements();
6977
6978 // Widen the vector if needed.
6979 Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT, Undef, Vec, ZeroIdx);
6980
6981 unsigned ShiftLeft = NumElems - SubVecNumElems;
6982 unsigned ShiftRight = NumElems - SubVecNumElems - IdxVal;
6983
6984 // Do an optimization for the most frequently used types.
6985 if (WideOpVT != MVT::v64i1 || Subtarget.is64Bit()) {
6986 APInt Mask0 = APInt::getBitsSet(NumElems, IdxVal, IdxVal + SubVecNumElems);
6987 Mask0.flipAllBits();
6988 SDValue CMask0 = DAG.getConstant(Mask0, dl, MVT::getIntegerVT(NumElems));
6989 SDValue VMask0 = DAG.getNode(ISD::BITCAST, dl, WideOpVT, CMask0);
6990 Vec = DAG.getNode(ISD::AND, dl, WideOpVT, Vec, VMask0);
6991 SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
6992 DAG.getTargetConstant(ShiftLeft, dl, MVT::i8));
6993 SubVec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, SubVec,
6994 DAG.getTargetConstant(ShiftRight, dl, MVT::i8));
6995 Op = DAG.getNode(ISD::OR, dl, WideOpVT, Vec, SubVec);
6996
6997 // Reduce to original width if needed.
6998 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, Op, ZeroIdx);
6999 }
7000
7001 // Clear the upper bits of the subvector and move it to its insert position.
7002 SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
7003 DAG.getTargetConstant(ShiftLeft, dl, MVT::i8));
7004 SubVec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, SubVec,
7005 DAG.getTargetConstant(ShiftRight, dl, MVT::i8));
7006
7007 // Isolate the bits below the insertion point.
7008 unsigned LowShift = NumElems - IdxVal;
7009 SDValue Low = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, Vec,
7010 DAG.getTargetConstant(LowShift, dl, MVT::i8));
7011 Low = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Low,
7012 DAG.getTargetConstant(LowShift, dl, MVT::i8));
7013
7014 // Isolate the bits after the last inserted bit.
7015 unsigned HighShift = IdxVal + SubVecNumElems;
7016 SDValue High = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Vec,
7017 DAG.getTargetConstant(HighShift, dl, MVT::i8));
7018 High = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, High,
7019 DAG.getTargetConstant(HighShift, dl, MVT::i8));
7020
7021 // Now OR all 3 pieces together.
7022 Vec = DAG.getNode(ISD::OR, dl, WideOpVT, Low, High);
7023 SubVec = DAG.getNode(ISD::OR, dl, WideOpVT, SubVec, Vec);
7024
7025 // Reduce to original width if needed.
7026 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, SubVec, ZeroIdx);
7027}
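// Worked example of the mask-and-OR fast path above (concrete sizes assumed):
// inserting a v2i1 subvector into a v8i1 vector at IdxVal = 2 on a DQI target
// keeps WideOpVT = v8i1 and gives ShiftLeft = 6, ShiftRight = 4. Vec is ANDed
// with the inverted mask 0b11110011 to clear bits 2-3, SubVec is shifted left
// by 6 and then right by 4 so its two bits land zero-extended in positions
// 2-3, and the two values are ORed back together.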
7028
7029static SDValue concatSubVectors(SDValue V1, SDValue V2, SelectionDAG &DAG,
7030 const SDLoc &dl) {
7031   assert(V1.getValueType() == V2.getValueType() && "subvector type mismatch");
7032 EVT SubVT = V1.getValueType();
7033 EVT SubSVT = SubVT.getScalarType();
7034 unsigned SubNumElts = SubVT.getVectorNumElements();
7035 unsigned SubVectorWidth = SubVT.getSizeInBits();
7036 EVT VT = EVT::getVectorVT(*DAG.getContext(), SubSVT, 2 * SubNumElts);
7037 SDValue V = insertSubVector(DAG.getUNDEF(VT), V1, 0, DAG, dl, SubVectorWidth);
7038 return insertSubVector(V, V2, SubNumElts, DAG, dl, SubVectorWidth);
7039}
7040
7041/// Returns a vector of specified type with all bits set.
7042/// Always build ones vectors as <4 x i32>, <8 x i32> or <16 x i32>.
7043/// Then bitcast to their original type, ensuring they get CSE'd.
7044static SDValue getOnesVector(EVT VT, SelectionDAG &DAG, const SDLoc &dl) {
7045   assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()) &&
7046          "Expected a 128/256/512-bit vector type");
7047
7048 APInt Ones = APInt::getAllOnes(32);
7049 unsigned NumElts = VT.getSizeInBits() / 32;
7050 SDValue Vec = DAG.getConstant(Ones, dl, MVT::getVectorVT(MVT::i32, NumElts));
7051 return DAG.getBitcast(VT, Vec);
7052}
7053
7054static SDValue getEXTEND_VECTOR_INREG(unsigned Opcode, const SDLoc &DL, EVT VT,
7055 SDValue In, SelectionDAG &DAG) {
7056 EVT InVT = In.getValueType();
7057   assert(VT.isVector() && InVT.isVector() && "Expected vector VTs.");
7058   assert((ISD::ANY_EXTEND == Opcode || ISD::SIGN_EXTEND == Opcode ||
7059           ISD::ZERO_EXTEND == Opcode) &&
7060          "Unknown extension opcode");
7061
7062 // For 256-bit vectors, we only need the lower (128-bit) input half.
7063 // For 512-bit vectors, we only need the lower input half or quarter.
7064 if (InVT.getSizeInBits() > 128) {
7065   assert(VT.getSizeInBits() == InVT.getSizeInBits() &&
7066          "Expected VTs to be the same size!");
7067 unsigned Scale = VT.getScalarSizeInBits() / InVT.getScalarSizeInBits();
7068 In = extractSubVector(In, 0, DAG, DL,
7069 std::max(128U, (unsigned)VT.getSizeInBits() / Scale));
7070 InVT = In.getValueType();
7071 }
7072
7073 if (VT.getVectorNumElements() != InVT.getVectorNumElements())
7074 Opcode = DAG.getOpcode_EXTEND_VECTOR_INREG(Opcode);
7075
7076 return DAG.getNode(Opcode, DL, VT, In);
7077}
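// Worked example: zero-extending a 256-bit v32i8 input to v8i32 first
// extracts the low 128 bits (v16i8); since only 8 of those 16 elements feed
// the result, the opcode is switched to ISD::ZERO_EXTEND_VECTOR_INREG via
// getOpcode_EXTEND_VECTOR_INREG above.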
7078
7079// Match (xor X, -1) -> X.
7080// Match extract_subvector(xor X, -1) -> extract_subvector(X).
7081// Match concat_vectors(xor X, -1, xor Y, -1) -> concat_vectors(X, Y).
7082static SDValue IsNOT(SDValue V, SelectionDAG &DAG) {
7083 V = peekThroughBitcasts(V);
7084 if (V.getOpcode() == ISD::XOR &&
7085 (ISD::isBuildVectorAllOnes(V.getOperand(1).getNode()) ||
7086 isAllOnesConstant(V.getOperand(1))))
7087 return V.getOperand(0);
7088 if (V.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
7089 (isNullConstant(V.getOperand(1)) || V.getOperand(0).hasOneUse())) {
7090 if (SDValue Not = IsNOT(V.getOperand(0), DAG)) {
7091 Not = DAG.getBitcast(V.getOperand(0).getValueType(), Not);
7092 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, SDLoc(Not), V.getValueType(),
7093 Not, V.getOperand(1));
7094 }
7095 }
7096 SmallVector<SDValue, 2> CatOps;
7097 if (collectConcatOps(V.getNode(), CatOps, DAG)) {
7098 for (SDValue &CatOp : CatOps) {
7099 SDValue NotCat = IsNOT(CatOp, DAG);
7100 if (!NotCat) return SDValue();
7101 CatOp = DAG.getBitcast(CatOp.getValueType(), NotCat);
7102 }
7103 return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(V), V.getValueType(), CatOps);
7104 }
7105 return SDValue();
7106}
7107
7108void llvm::createUnpackShuffleMask(EVT VT, SmallVectorImpl<int> &Mask,
7109 bool Lo, bool Unary) {
7110 assert(VT.getScalarType().isSimple() && (VT.getSizeInBits() % 128) == 0 &&
7111        "Illegal vector type to unpack");
7112 assert(Mask.empty() && "Expected an empty shuffle mask vector");
7113 int NumElts = VT.getVectorNumElements();
7114 int NumEltsInLane = 128 / VT.getScalarSizeInBits();
7115 for (int i = 0; i < NumElts; ++i) {
7116 unsigned LaneStart = (i / NumEltsInLane) * NumEltsInLane;
7117 int Pos = (i % NumEltsInLane) / 2 + LaneStart;
7118 Pos += (Unary ? 0 : NumElts * (i % 2));
7119 Pos += (Lo ? 0 : NumEltsInLane / 2);
7120 Mask.push_back(Pos);
7121 }
7122}
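// Worked example: for VT = v8i16, Lo = true, Unary = false this produces
// <0,8,1,9,2,10,3,11> (the PUNPCKLWD pattern); with Lo = false it produces
// <4,12,5,13,6,14,7,15> (PUNPCKHWD).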
7123
7124/// Similar to unpacklo/unpackhi, but without the 128-bit lane limitation
7125/// imposed by AVX and specific to the unary pattern. Example:
7126/// v8iX Lo --> <0, 0, 1, 1, 2, 2, 3, 3>
7127/// v8iX Hi --> <4, 4, 5, 5, 6, 6, 7, 7>
7128void llvm::createSplat2ShuffleMask(MVT VT, SmallVectorImpl<int> &Mask,
7129 bool Lo) {
7130 assert(Mask.empty() && "Expected an empty shuffle mask vector");
7131 int NumElts = VT.getVectorNumElements();
7132 for (int i = 0; i < NumElts; ++i) {
7133 int Pos = i / 2;
7134 Pos += (Lo ? 0 : NumElts / 2);
7135 Mask.push_back(Pos);
7136 }
7137}
7138
7139// Attempt to constant fold, else just create a VECTOR_SHUFFLE.
7140static SDValue getVectorShuffle(SelectionDAG &DAG, EVT VT, const SDLoc &dl,
7141 SDValue V1, SDValue V2, ArrayRef<int> Mask) {
7142 if ((ISD::isBuildVectorOfConstantSDNodes(V1.getNode()) || V1.isUndef()) &&
7143 (ISD::isBuildVectorOfConstantSDNodes(V2.getNode()) || V2.isUndef())) {
7144 SmallVector<SDValue> Ops(Mask.size(), DAG.getUNDEF(VT.getScalarType()));
7145 for (int I = 0, NumElts = Mask.size(); I != NumElts; ++I) {
7146 int M = Mask[I];
7147 if (M < 0)
7148 continue;
7149 SDValue V = (M < NumElts) ? V1 : V2;
7150 if (V.isUndef())
7151 continue;
7152 Ops[I] = V.getOperand(M % NumElts);
7153 }
7154 return DAG.getBuildVector(VT, dl, Ops);
7155 }
7156
7157 return DAG.getVectorShuffle(VT, dl, V1, V2, Mask);
7158}
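// Worked example: shuffling build_vector<1,2,3,4> and build_vector<5,6,7,8>
// with mask <0,5,2,7> constant-folds straight to build_vector<1,6,3,8>
// instead of emitting a VECTOR_SHUFFLE node.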
7159
7160/// Returns a vector_shuffle node for an unpackl operation.
7161static SDValue getUnpackl(SelectionDAG &DAG, const SDLoc &dl, EVT VT,
7162 SDValue V1, SDValue V2) {
7163 SmallVector<int, 8> Mask;
7164 createUnpackShuffleMask(VT, Mask, /* Lo = */ true, /* Unary = */ false);
7165 return getVectorShuffle(DAG, VT, dl, V1, V2, Mask);
7166}
7167
7168/// Returns a vector_shuffle node for an unpackh operation.
7169static SDValue getUnpackh(SelectionDAG &DAG, const SDLoc &dl, EVT VT,
7170 SDValue V1, SDValue V2) {
7171 SmallVector<int, 8> Mask;
7172 createUnpackShuffleMask(VT, Mask, /* Lo = */ false, /* Unary = */ false);
7173 return getVectorShuffle(DAG, VT, dl, V1, V2, Mask);
7174}
7175
7176/// Returns a node that packs the LHS + RHS nodes together at half width.
7177/// May return X86ISD::PACKSS/PACKUS, packing the top/bottom half.
7178/// TODO: Add subvector splitting if/when we have a need for it.
7179static SDValue getPack(SelectionDAG &DAG, const X86Subtarget &Subtarget,
7180 const SDLoc &dl, MVT VT, SDValue LHS, SDValue RHS,
7181 bool PackHiHalf = false) {
7182 MVT OpVT = LHS.getSimpleValueType();
7183 unsigned EltSizeInBits = VT.getScalarSizeInBits();
7184 bool UsePackUS = Subtarget.hasSSE41() || EltSizeInBits == 8;
7185 assert(OpVT == RHS.getSimpleValueType() &&
7186        VT.getSizeInBits() == OpVT.getSizeInBits() &&
7187        (EltSizeInBits * 2) == OpVT.getScalarSizeInBits() &&
7188        "Unexpected PACK operand types");
7189 assert((EltSizeInBits == 8 || EltSizeInBits == 16 || EltSizeInBits == 32) &&
7190        "Unexpected PACK result type");
7191
7192 // Rely on vector shuffles for vXi64 -> vXi32 packing.
7193 if (EltSizeInBits == 32) {
7194 SmallVector<int> PackMask;
7195 int Offset = PackHiHalf ? 1 : 0;
7196 int NumElts = VT.getVectorNumElements();
7197 for (int I = 0; I != NumElts; I += 4) {
7198 PackMask.push_back(I + Offset);
7199 PackMask.push_back(I + Offset + 2);
7200 PackMask.push_back(I + Offset + NumElts);
7201 PackMask.push_back(I + Offset + NumElts + 2);
7202 }
7203 return DAG.getVectorShuffle(VT, dl, DAG.getBitcast(VT, LHS),
7204 DAG.getBitcast(VT, RHS), PackMask);
7205 }
7206
7207 // See if we already have sufficient leading bits for PACKSS/PACKUS.
7208 if (!PackHiHalf) {
7209 if (UsePackUS &&
7210 DAG.computeKnownBits(LHS).countMaxActiveBits() <= EltSizeInBits &&
7211 DAG.computeKnownBits(RHS).countMaxActiveBits() <= EltSizeInBits)
7212 return DAG.getNode(X86ISD::PACKUS, dl, VT, LHS, RHS);
7213
7214 if (DAG.ComputeMaxSignificantBits(LHS) <= EltSizeInBits &&
7215 DAG.ComputeMaxSignificantBits(RHS) <= EltSizeInBits)
7216 return DAG.getNode(X86ISD::PACKSS, dl, VT, LHS, RHS);
7217 }
7218
7219 // Fallback to sign/zero extending the requested half and pack.
7220 SDValue Amt = DAG.getTargetConstant(EltSizeInBits, dl, MVT::i8);
7221 if (UsePackUS) {
7222 if (PackHiHalf) {
7223 LHS = DAG.getNode(X86ISD::VSRLI, dl, OpVT, LHS, Amt);
7224 RHS = DAG.getNode(X86ISD::VSRLI, dl, OpVT, RHS, Amt);
7225 } else {
7226 SDValue Mask = DAG.getConstant((1ULL << EltSizeInBits) - 1, dl, OpVT);
7227 LHS = DAG.getNode(ISD::AND, dl, OpVT, LHS, Mask);
7228 RHS = DAG.getNode(ISD::AND, dl, OpVT, RHS, Mask);
7229 };
7230 return DAG.getNode(X86ISD::PACKUS, dl, VT, LHS, RHS);
7231 };
7232
7233 if (!PackHiHalf) {
7234 LHS = DAG.getNode(X86ISD::VSHLI, dl, OpVT, LHS, Amt);
7235 RHS = DAG.getNode(X86ISD::VSHLI, dl, OpVT, RHS, Amt);
7236 }
7237 LHS = DAG.getNode(X86ISD::VSRAI, dl, OpVT, LHS, Amt);
7238 RHS = DAG.getNode(X86ISD::VSRAI, dl, OpVT, RHS, Amt);
7239 return DAG.getNode(X86ISD::PACKSS, dl, VT, LHS, RHS);
7240}
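// Worked example of the EltSizeInBits == 32 path above: packing the low i32
// halves of two v4i64 operands into v8i32 (PackHiHalf = false) uses the
// shuffle mask <0,2,8,10,4,6,12,14>, i.e. the even i32 elements of LHS and
// RHS interleaved per 128-bit lane; PackHiHalf = true shifts every index up
// by one.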
7241
7242/// Return a vector_shuffle of the specified vector and a zero or undef vector.
7243/// This produces a shuffle where the low element of V2 is swizzled into the
7244/// zero/undef vector, landing at element Idx.
7245/// This produces a shuffle mask like 4,1,2,3 (idx=0) or 0,1,2,4 (idx=3).
7246static SDValue getShuffleVectorZeroOrUndef(SDValue V2, int Idx,
7247 bool IsZero,
7248 const X86Subtarget &Subtarget,
7249 SelectionDAG &DAG) {
7250 MVT VT = V2.getSimpleValueType();
7251 SDValue V1 = IsZero
7252 ? getZeroVector(VT, Subtarget, DAG, SDLoc(V2)) : DAG.getUNDEF(VT);
7253 int NumElems = VT.getVectorNumElements();
7254 SmallVector<int, 16> MaskVec(NumElems);
7255 for (int i = 0; i != NumElems; ++i)
7256 // If this is the insertion idx, put the low elt of V2 here.
7257 MaskVec[i] = (i == Idx) ? NumElems : i;
7258 return DAG.getVectorShuffle(VT, SDLoc(V2), V1, V2, MaskVec);
7259}
7260
7261static const Constant *getTargetConstantFromBasePtr(SDValue Ptr) {
7262 if (Ptr.getOpcode() == X86ISD::Wrapper ||
7263 Ptr.getOpcode() == X86ISD::WrapperRIP)
7264 Ptr = Ptr.getOperand(0);
7265
7266 auto *CNode = dyn_cast<ConstantPoolSDNode>(Ptr);
7267 if (!CNode || CNode->isMachineConstantPoolEntry() || CNode->getOffset() != 0)
7268 return nullptr;
7269
7270 return CNode->getConstVal();
7271}
7272
7273static const Constant *getTargetConstantFromNode(LoadSDNode *Load) {
7274 if (!Load || !ISD::isNormalLoad(Load))
7275 return nullptr;
7276 return getTargetConstantFromBasePtr(Load->getBasePtr());
7277}
7278
7279static const Constant *getTargetConstantFromNode(SDValue Op) {
7280 Op = peekThroughBitcasts(Op);
7281 return getTargetConstantFromNode(dyn_cast<LoadSDNode>(Op));
7282}
7283
7284const Constant *
7285X86TargetLowering::getTargetConstantFromLoad(LoadSDNode *LD) const {
7286 assert(LD && "Unexpected null LoadSDNode");
7287 return getTargetConstantFromNode(LD);
7288}
7289
7290// Extract raw constant bits from constant pools.
7291static bool getTargetConstantBitsFromNode(SDValue Op, unsigned EltSizeInBits,
7292 APInt &UndefElts,
7293 SmallVectorImpl<APInt> &EltBits,
7294 bool AllowWholeUndefs = true,
7295 bool AllowPartialUndefs = true) {
7296 assert(EltBits.empty() && "Expected an empty EltBits vector");
7297
7298 Op = peekThroughBitcasts(Op);
7299
7300 EVT VT = Op.getValueType();
7301 unsigned SizeInBits = VT.getSizeInBits();
7302 assert((SizeInBits % EltSizeInBits) == 0 && "Can't split constant!");
7303 unsigned NumElts = SizeInBits / EltSizeInBits;
7304
7305 // Bitcast a source array of element bits to the target size.
7306 auto CastBitData = [&](APInt &UndefSrcElts, ArrayRef<APInt> SrcEltBits) {
7307 unsigned NumSrcElts = UndefSrcElts.getBitWidth();
7308 unsigned SrcEltSizeInBits = SrcEltBits[0].getBitWidth();
7309   assert((NumSrcElts * SrcEltSizeInBits) == SizeInBits &&
7310          "Constant bit sizes don't match");
7311
7312 // Don't split if we don't allow undef bits.
7313 bool AllowUndefs = AllowWholeUndefs || AllowPartialUndefs;
7314 if (UndefSrcElts.getBoolValue() && !AllowUndefs)
7315 return false;
7316
7317 // If we're already the right size, don't bother bitcasting.
7318 if (NumSrcElts == NumElts) {
7319 UndefElts = UndefSrcElts;
7320 EltBits.assign(SrcEltBits.begin(), SrcEltBits.end());
7321 return true;
7322 }
7323
7324 // Extract all the undef/constant element data and pack into single bitsets.
7325 APInt UndefBits(SizeInBits, 0);
7326 APInt MaskBits(SizeInBits, 0);
7327
7328 for (unsigned i = 0; i != NumSrcElts; ++i) {
7329 unsigned BitOffset = i * SrcEltSizeInBits;
7330 if (UndefSrcElts[i])
7331 UndefBits.setBits(BitOffset, BitOffset + SrcEltSizeInBits);
7332 MaskBits.insertBits(SrcEltBits[i], BitOffset);
7333 }
7334
7335 // Split the undef/constant single bitset data into the target elements.
7336 UndefElts = APInt(NumElts, 0);
7337 EltBits.resize(NumElts, APInt(EltSizeInBits, 0));
7338
7339 for (unsigned i = 0; i != NumElts; ++i) {
7340 unsigned BitOffset = i * EltSizeInBits;
7341 APInt UndefEltBits = UndefBits.extractBits(EltSizeInBits, BitOffset);
7342
7343 // Only treat an element as UNDEF if all bits are UNDEF.
7344 if (UndefEltBits.isAllOnes()) {
7345 if (!AllowWholeUndefs)
7346 return false;
7347 UndefElts.setBit(i);
7348 continue;
7349 }
7350
7351 // If only some bits are UNDEF then treat them as zero (or bail if not
7352 // supported).
7353 if (UndefEltBits.getBoolValue() && !AllowPartialUndefs)
7354 return false;
7355
7356 EltBits[i] = MaskBits.extractBits(EltSizeInBits, BitOffset);
7357 }
7358 return true;
7359 };
7360
7361 // Collect constant bits and insert into mask/undef bit masks.
7362 auto CollectConstantBits = [](const Constant *Cst, APInt &Mask, APInt &Undefs,
7363 unsigned UndefBitIndex) {
7364 if (!Cst)
7365 return false;
7366 if (isa<UndefValue>(Cst)) {
7367 Undefs.setBit(UndefBitIndex);
7368 return true;
7369 }
7370 if (auto *CInt = dyn_cast<ConstantInt>(Cst)) {
7371 Mask = CInt->getValue();
7372 return true;
7373 }
7374 if (auto *CFP = dyn_cast<ConstantFP>(Cst)) {
7375 Mask = CFP->getValueAPF().bitcastToAPInt();
7376 return true;
7377 }
7378 return false;
7379 };
7380
7381 // Handle UNDEFs.
7382 if (Op.isUndef()) {
7383 APInt UndefSrcElts = APInt::getAllOnes(NumElts);
7384 SmallVector<APInt, 64> SrcEltBits(NumElts, APInt(EltSizeInBits, 0));
7385 return CastBitData(UndefSrcElts, SrcEltBits);
7386 }
7387
7388 // Extract scalar constant bits.
7389 if (auto *Cst = dyn_cast<ConstantSDNode>(Op)) {
7390 APInt UndefSrcElts = APInt::getZero(1);
7391 SmallVector<APInt, 64> SrcEltBits(1, Cst->getAPIntValue());
7392 return CastBitData(UndefSrcElts, SrcEltBits);
7393 }
7394 if (auto *Cst = dyn_cast<ConstantFPSDNode>(Op)) {
7395 APInt UndefSrcElts = APInt::getZero(1);
7396 APInt RawBits = Cst->getValueAPF().bitcastToAPInt();
7397 SmallVector<APInt, 64> SrcEltBits(1, RawBits);
7398 return CastBitData(UndefSrcElts, SrcEltBits);
7399 }
7400
7401 // Extract constant bits from build vector.
7402 if (auto *BV = dyn_cast<BuildVectorSDNode>(Op)) {
7403 BitVector Undefs;
7404 SmallVector<APInt> SrcEltBits;
7405 unsigned SrcEltSizeInBits = VT.getScalarSizeInBits();
7406 if (BV->getConstantRawBits(true, SrcEltSizeInBits, SrcEltBits, Undefs)) {
7407 APInt UndefSrcElts = APInt::getNullValue(SrcEltBits.size());
7408 for (unsigned I = 0, E = SrcEltBits.size(); I != E; ++I)
7409 if (Undefs[I])
7410 UndefSrcElts.setBit(I);
7411 return CastBitData(UndefSrcElts, SrcEltBits);
7412 }
7413 }
7414
7415 // Extract constant bits from constant pool vector.
7416 if (auto *Cst = getTargetConstantFromNode(Op)) {
7417 Type *CstTy = Cst->getType();
7418 unsigned CstSizeInBits = CstTy->getPrimitiveSizeInBits();
7419 if (!CstTy->isVectorTy() || (CstSizeInBits % SizeInBits) != 0)
7420 return false;
7421
7422 unsigned SrcEltSizeInBits = CstTy->getScalarSizeInBits();
7423 unsigned NumSrcElts = SizeInBits / SrcEltSizeInBits;
7424
7425 APInt UndefSrcElts(NumSrcElts, 0);
7426 SmallVector<APInt, 64> SrcEltBits(NumSrcElts, APInt(SrcEltSizeInBits, 0));
7427 for (unsigned i = 0; i != NumSrcElts; ++i)
7428 if (!CollectConstantBits(Cst->getAggregateElement(i), SrcEltBits[i],
7429 UndefSrcElts, i))
7430 return false;
7431
7432 return CastBitData(UndefSrcElts, SrcEltBits);
7433 }
7434
7435 // Extract constant bits from a broadcasted constant pool scalar.
7436 if (Op.getOpcode() == X86ISD::VBROADCAST_LOAD &&
7437 EltSizeInBits <= VT.getScalarSizeInBits()) {
7438 auto *MemIntr = cast<MemIntrinsicSDNode>(Op);
7439 if (MemIntr->getMemoryVT().getScalarSizeInBits() != VT.getScalarSizeInBits())
7440 return false;
7441
7442 SDValue Ptr = MemIntr->getBasePtr();
7443 if (const Constant *C = getTargetConstantFromBasePtr(Ptr)) {
7444 unsigned SrcEltSizeInBits = C->getType()->getScalarSizeInBits();
7445 unsigned NumSrcElts = SizeInBits / SrcEltSizeInBits;
7446
7447 APInt UndefSrcElts(NumSrcElts, 0);
7448 SmallVector<APInt, 64> SrcEltBits(1, APInt(SrcEltSizeInBits, 0));
7449 if (CollectConstantBits(C, SrcEltBits[0], UndefSrcElts, 0)) {
7450 if (UndefSrcElts[0])
7451 UndefSrcElts.setBits(0, NumSrcElts);
7452 SrcEltBits.append(NumSrcElts - 1, SrcEltBits[0]);
7453 return CastBitData(UndefSrcElts, SrcEltBits);
7454 }
7455 }
7456 }
7457
7458 // Extract constant bits from a subvector broadcast.
7459 if (Op.getOpcode() == X86ISD::SUBV_BROADCAST_LOAD) {
7460 auto *MemIntr = cast<MemIntrinsicSDNode>(Op);
7461 SDValue Ptr = MemIntr->getBasePtr();
7462 // The source constant may be larger than the subvector broadcast, so make
7463 // sure we extract the correct subvector constants.
7464 if (const Constant *Cst = getTargetConstantFromBasePtr(Ptr)) {
7465 Type *CstTy = Cst->getType();
7466 unsigned CstSizeInBits = CstTy->getPrimitiveSizeInBits();
7467 unsigned SubVecSizeInBits = MemIntr->getMemoryVT().getStoreSizeInBits();
7468 if (!CstTy->isVectorTy() || (CstSizeInBits % SubVecSizeInBits) != 0 ||
7469 (SizeInBits % SubVecSizeInBits) != 0)
7470 return false;
7471 unsigned CstEltSizeInBits = CstTy->getScalarSizeInBits();
7472 unsigned NumSubElts = SubVecSizeInBits / CstEltSizeInBits;
7473 unsigned NumSubVecs = SizeInBits / SubVecSizeInBits;
7474 APInt UndefSubElts(NumSubElts, 0);
7475 SmallVector<APInt, 64> SubEltBits(NumSubElts * NumSubVecs,
7476 APInt(CstEltSizeInBits, 0));
7477 for (unsigned i = 0; i != NumSubElts; ++i) {
7478 if (!CollectConstantBits(Cst->getAggregateElement(i), SubEltBits[i],
7479 UndefSubElts, i))
7480 return false;
7481 for (unsigned j = 1; j != NumSubVecs; ++j)
7482 SubEltBits[i + (j * NumSubElts)] = SubEltBits[i];
7483 }
7484 UndefSubElts = APInt::getSplat(NumSubVecs * UndefSubElts.getBitWidth(),
7485 UndefSubElts);
7486 return CastBitData(UndefSubElts, SubEltBits);
7487 }
7488 }
7489
7490 // Extract a rematerialized scalar constant insertion.
7491 if (Op.getOpcode() == X86ISD::VZEXT_MOVL &&
7492 Op.getOperand(0).getOpcode() == ISD::SCALAR_TO_VECTOR &&
7493 isa<ConstantSDNode>(Op.getOperand(0).getOperand(0))) {
7494 unsigned SrcEltSizeInBits = VT.getScalarSizeInBits();
7495 unsigned NumSrcElts = SizeInBits / SrcEltSizeInBits;
7496
7497 APInt UndefSrcElts(NumSrcElts, 0);
7498 SmallVector<APInt, 64> SrcEltBits;
7499 auto *CN = cast<ConstantSDNode>(Op.getOperand(0).getOperand(0));
7500 SrcEltBits.push_back(CN->getAPIntValue().zextOrTrunc(SrcEltSizeInBits));
7501 SrcEltBits.append(NumSrcElts - 1, APInt(SrcEltSizeInBits, 0));
7502 return CastBitData(UndefSrcElts, SrcEltBits);
7503 }
7504
7505 // Insert constant bits from a base and sub vector sources.
7506 if (Op.getOpcode() == ISD::INSERT_SUBVECTOR) {
7507 // If this bitcasts to larger elements we might lose track of undefs, so to
7508 // be safe don't allow any.
7509 unsigned SrcEltSizeInBits = VT.getScalarSizeInBits();
7510 bool AllowUndefs = EltSizeInBits >= SrcEltSizeInBits;
7511
7512 APInt UndefSrcElts, UndefSubElts;
7513 SmallVector<APInt, 32> EltSrcBits, EltSubBits;
7514 if (getTargetConstantBitsFromNode(Op.getOperand(1), SrcEltSizeInBits,
7515 UndefSubElts, EltSubBits,
7516 AllowWholeUndefs && AllowUndefs,
7517 AllowPartialUndefs && AllowUndefs) &&
7518 getTargetConstantBitsFromNode(Op.getOperand(0), SrcEltSizeInBits,
7519 UndefSrcElts, EltSrcBits,
7520 AllowWholeUndefs && AllowUndefs,
7521 AllowPartialUndefs && AllowUndefs)) {
7522 unsigned BaseIdx = Op.getConstantOperandVal(2);
7523 UndefSrcElts.insertBits(UndefSubElts, BaseIdx);
7524 for (unsigned i = 0, e = EltSubBits.size(); i != e; ++i)
7525 EltSrcBits[BaseIdx + i] = EltSubBits[i];
7526 return CastBitData(UndefSrcElts, EltSrcBits);
7527 }
7528 }
7529
7530 // Extract constant bits from a subvector's source.
7531 if (Op.getOpcode() == ISD::EXTRACT_SUBVECTOR) {
7532 // TODO - support extract_subvector through bitcasts.
7533 if (EltSizeInBits != VT.getScalarSizeInBits())
7534 return false;
7535
7536 if (getTargetConstantBitsFromNode(Op.getOperand(0), EltSizeInBits,
7537 UndefElts, EltBits, AllowWholeUndefs,
7538 AllowPartialUndefs)) {
7539 EVT SrcVT = Op.getOperand(0).getValueType();
7540 unsigned NumSrcElts = SrcVT.getVectorNumElements();
7541 unsigned NumSubElts = VT.getVectorNumElements();
7542 unsigned BaseIdx = Op.getConstantOperandVal(1);
7543 UndefElts = UndefElts.extractBits(NumSubElts, BaseIdx);
7544 if ((BaseIdx + NumSubElts) != NumSrcElts)
7545 EltBits.erase(EltBits.begin() + BaseIdx + NumSubElts, EltBits.end());
7546 if (BaseIdx != 0)
7547 EltBits.erase(EltBits.begin(), EltBits.begin() + BaseIdx);
7548 return true;
7549 }
7550 }
7551
7552 // Extract constant bits from shuffle node sources.
7553 if (auto *SVN = dyn_cast<ShuffleVectorSDNode>(Op)) {
7554 // TODO - support shuffle through bitcasts.
7555 if (EltSizeInBits != VT.getScalarSizeInBits())
7556 return false;
7557
7558 ArrayRef<int> Mask = SVN->getMask();
7559 if ((!AllowWholeUndefs || !AllowPartialUndefs) &&
7560 llvm::any_of(Mask, [](int M) { return M < 0; }))
7561 return false;
7562
7563 APInt UndefElts0, UndefElts1;
7564 SmallVector<APInt, 32> EltBits0, EltBits1;
7565 if (isAnyInRange(Mask, 0, NumElts) &&
7566 !getTargetConstantBitsFromNode(Op.getOperand(0), EltSizeInBits,
7567 UndefElts0, EltBits0, AllowWholeUndefs,
7568 AllowPartialUndefs))
7569 return false;
7570 if (isAnyInRange(Mask, NumElts, 2 * NumElts) &&
7571 !getTargetConstantBitsFromNode(Op.getOperand(1), EltSizeInBits,
7572 UndefElts1, EltBits1, AllowWholeUndefs,
7573 AllowPartialUndefs))
7574 return false;
7575
7576 UndefElts = APInt::getZero(NumElts);
7577 for (int i = 0; i != (int)NumElts; ++i) {
7578 int M = Mask[i];
7579 if (M < 0) {
7580 UndefElts.setBit(i);
7581 EltBits.push_back(APInt::getZero(EltSizeInBits));
7582 } else if (M < (int)NumElts) {
7583 if (UndefElts0[M])
7584 UndefElts.setBit(i);
7585 EltBits.push_back(EltBits0[M]);
7586 } else {
7587 if (UndefElts1[M - NumElts])
7588 UndefElts.setBit(i);
7589 EltBits.push_back(EltBits1[M - NumElts]);
7590 }
7591 }
7592 return true;
7593 }
7594
7595 return false;
7596}
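// Worked example: querying a v4i32 constant for 64-bit elements repacks the
// four i32 values into two i64 results, while querying it for 8-bit elements
// splits each i32 into four bytes. A target element that is only partially
// undef is treated as zero, unless AllowPartialUndefs is false, in which case
// the whole query fails.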
7597
7598namespace llvm {
7599namespace X86 {
7600bool isConstantSplat(SDValue Op, APInt &SplatVal, bool AllowPartialUndefs) {
7601 APInt UndefElts;
7602 SmallVector<APInt, 16> EltBits;
7603 if (getTargetConstantBitsFromNode(Op, Op.getScalarValueSizeInBits(),
7604 UndefElts, EltBits, true,
7605 AllowPartialUndefs)) {
7606 int SplatIndex = -1;
7607 for (int i = 0, e = EltBits.size(); i != e; ++i) {
7608 if (UndefElts[i])
7609 continue;
7610 if (0 <= SplatIndex && EltBits[i] != EltBits[SplatIndex]) {
7611 SplatIndex = -1;
7612 break;
7613 }
7614 SplatIndex = i;
7615 }
7616 if (0 <= SplatIndex) {
7617 SplatVal = EltBits[SplatIndex];
7618 return true;
7619 }
7620 }
7621
7622 return false;
7623}
7624} // namespace X86
7625} // namespace llvm
7626
7627static bool getTargetShuffleMaskIndices(SDValue MaskNode,
7628 unsigned MaskEltSizeInBits,
7629 SmallVectorImpl<uint64_t> &RawMask,
7630 APInt &UndefElts) {
7631 // Extract the raw target constant bits.
7632 SmallVector<APInt, 64> EltBits;
7633 if (!getTargetConstantBitsFromNode(MaskNode, MaskEltSizeInBits, UndefElts,
7634 EltBits, /* AllowWholeUndefs */ true,
7635 /* AllowPartialUndefs */ false))
7636 return false;
7637
7638 // Insert the extracted elements into the mask.
7639 for (const APInt &Elt : EltBits)
7640 RawMask.push_back(Elt.getZExtValue());
7641
7642 return true;
7643}
7644
7645/// Create a shuffle mask that matches the PACKSS/PACKUS truncation.
7646/// A multi-stage pack shuffle mask is created by specifying NumStages > 1.
7647/// Note: This ignores saturation, so inputs must be checked first.
7648static void createPackShuffleMask(MVT VT, SmallVectorImpl<int> &Mask,
7649 bool Unary, unsigned NumStages = 1) {
7650 assert(Mask.empty() && "Expected an empty shuffle mask vector");
7651 unsigned NumElts = VT.getVectorNumElements();
7652 unsigned NumLanes = VT.getSizeInBits() / 128;
7653 unsigned NumEltsPerLane = 128 / VT.getScalarSizeInBits();
7654 unsigned Offset = Unary ? 0 : NumElts;
7655 unsigned Repetitions = 1u << (NumStages - 1);
7656 unsigned Increment = 1u << NumStages;
7657 assert((NumEltsPerLane >> NumStages) > 0 && "Illegal packing compaction");
7658
7659 for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
7660 for (unsigned Stage = 0; Stage != Repetitions; ++Stage) {
7661 for (unsigned Elt = 0; Elt != NumEltsPerLane; Elt += Increment)
7662 Mask.push_back(Elt + (Lane * NumEltsPerLane));
7663 for (unsigned Elt = 0; Elt != NumEltsPerLane; Elt += Increment)
7664 Mask.push_back(Elt + (Lane * NumEltsPerLane) + Offset);
7665 }
7666 }
7667}
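// Worked example: for VT = v16i8 with Unary = false and NumStages = 1 this
// yields <0,2,4,...,14,16,18,...,30> - the low byte of every i16 element of
// both inputs, the PACKSSWB/PACKUSWB pattern with saturation ignored.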
7668
7669// Split the demanded elts of a PACKSS/PACKUS node between its operands.
7670static void getPackDemandedElts(EVT VT, const APInt &DemandedElts,
7671 APInt &DemandedLHS, APInt &DemandedRHS) {
7672 int NumLanes = VT.getSizeInBits() / 128;
7673 int NumElts = DemandedElts.getBitWidth();
7674 int NumInnerElts = NumElts / 2;
7675 int NumEltsPerLane = NumElts / NumLanes;
7676 int NumInnerEltsPerLane = NumInnerElts / NumLanes;
7677
7678 DemandedLHS = APInt::getZero(NumInnerElts);
7679 DemandedRHS = APInt::getZero(NumInnerElts);
7680
7681 // Map DemandedElts to the packed operands.
7682 for (int Lane = 0; Lane != NumLanes; ++Lane) {
7683 for (int Elt = 0; Elt != NumInnerEltsPerLane; ++Elt) {
7684 int OuterIdx = (Lane * NumEltsPerLane) + Elt;
7685 int InnerIdx = (Lane * NumInnerEltsPerLane) + Elt;
7686 if (DemandedElts[OuterIdx])
7687 DemandedLHS.setBit(InnerIdx);
7688 if (DemandedElts[OuterIdx + NumInnerEltsPerLane])
7689 DemandedRHS.setBit(InnerIdx);
7690 }
7691 }
7692}
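// Worked example: for a v16i8 PACK node, result elements 0-7 come from
// elements 0-7 of the LHS and result elements 8-15 from elements 0-7 of the
// RHS, so demanding only result element 10 sets bit 2 of DemandedRHS.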
7693
7694// Split the demanded elts of a HADD/HSUB node between its operands.
7695static void getHorizDemandedElts(EVT VT, const APInt &DemandedElts,
7696 APInt &DemandedLHS, APInt &DemandedRHS) {
7697 int NumLanes = VT.getSizeInBits() / 128;
7698 int NumElts = DemandedElts.getBitWidth();
7699 int NumEltsPerLane = NumElts / NumLanes;
7700 int HalfEltsPerLane = NumEltsPerLane / 2;
7701
7702 DemandedLHS = APInt::getZero(NumElts);
7703 DemandedRHS = APInt::getZero(NumElts);
7704
7705 // Map DemandedElts to the horizontal operands.
7706 for (int Idx = 0; Idx != NumElts; ++Idx) {
7707 if (!DemandedElts[Idx])
7708 continue;
7709 int LaneIdx = (Idx / NumEltsPerLane) * NumEltsPerLane;
7710 int LocalIdx = Idx % NumEltsPerLane;
7711 if (LocalIdx < HalfEltsPerLane) {
7712 DemandedLHS.setBit(LaneIdx + 2 * LocalIdx + 0);
7713 DemandedLHS.setBit(LaneIdx + 2 * LocalIdx + 1);
7714 } else {
7715 LocalIdx -= HalfEltsPerLane;
7716 DemandedRHS.setBit(LaneIdx + 2 * LocalIdx + 0);
7717 DemandedRHS.setBit(LaneIdx + 2 * LocalIdx + 1);
7718 }
7719 }
7720}
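// Worked example: for a v8i32 HADD, result element 1 (lower half of lane 0)
// depends on LHS elements 2-3, while result element 6 (upper half of lane 1)
// depends on RHS elements 4-5.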
7721
7722/// Calculates the shuffle mask corresponding to the target-specific opcode.
7723/// If the mask could be calculated, returns it in \p Mask, returns the shuffle
7724/// operands in \p Ops, and returns true.
7725/// Sets \p IsUnary to true if only one source is used. Note that this will set
7726/// IsUnary for shuffles which use a single input multiple times, and in those
7727/// cases it will adjust the mask to only have indices within that single input.
7728/// It is an error to call this with non-empty Mask/Ops vectors.
7729static bool getTargetShuffleMask(SDNode *N, MVT VT, bool AllowSentinelZero,
7730 SmallVectorImpl<SDValue> &Ops,
7731 SmallVectorImpl<int> &Mask, bool &IsUnary) {
7732 unsigned NumElems = VT.getVectorNumElements();
7733 unsigned MaskEltSize = VT.getScalarSizeInBits();
7734 SmallVector<uint64_t, 32> RawMask;
7735 APInt RawUndefs;
7736 uint64_t ImmN;
7737
7738 assert(Mask.empty() && "getTargetShuffleMask expects an empty Mask vector");
7739 assert(Ops.empty() && "getTargetShuffleMask expects an empty Ops vector");
7740
7741 IsUnary = false;
7742 bool IsFakeUnary = false;
7743 switch (N->getOpcode()) {
7744 case X86ISD::BLENDI:
7745   assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7746   assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
7747 ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
7748 DecodeBLENDMask(NumElems, ImmN, Mask);
7749 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
7750 break;
7751 case X86ISD::SHUFP:
7752   assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7753   assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
7754 ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
7755 DecodeSHUFPMask(NumElems, MaskEltSize, ImmN, Mask);
7756 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
7757 break;
7758 case X86ISD::INSERTPS:
7759   assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7760   assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
7761 ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
7762 DecodeINSERTPSMask(ImmN, Mask);
7763 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
7764 break;
7765 case X86ISD::EXTRQI:
7766   assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7767 if (isa<ConstantSDNode>(N->getOperand(1)) &&
7768 isa<ConstantSDNode>(N->getOperand(2))) {
7769 int BitLen = N->getConstantOperandVal(1);
7770 int BitIdx = N->getConstantOperandVal(2);
7771 DecodeEXTRQIMask(NumElems, MaskEltSize, BitLen, BitIdx, Mask);
7772 IsUnary = true;
7773 }
7774 break;
7775 case X86ISD::INSERTQI:
7776   assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7777   assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
7778 if (isa<ConstantSDNode>(N->getOperand(2)) &&
7779 isa<ConstantSDNode>(N->getOperand(3))) {
7780 int BitLen = N->getConstantOperandVal(2);
7781 int BitIdx = N->getConstantOperandVal(3);
7782 DecodeINSERTQIMask(NumElems, MaskEltSize, BitLen, BitIdx, Mask);
7783 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
7784 }
7785 break;
7786 case X86ISD::UNPCKH:
7787   assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7788   assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
7789 DecodeUNPCKHMask(NumElems, MaskEltSize, Mask);
7790 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
7791 break;
7792 case X86ISD::UNPCKL:
7793   assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7794   assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
7795 DecodeUNPCKLMask(NumElems, MaskEltSize, Mask);
7796 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
7797 break;
7798 case X86ISD::MOVHLPS:
7799   assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7800   assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
7801 DecodeMOVHLPSMask(NumElems, Mask);
7802 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
7803 break;
7804 case X86ISD::MOVLHPS:
7805   assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7806   assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
7807 DecodeMOVLHPSMask(NumElems, Mask);
7808 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
7809 break;
7810 case X86ISD::VALIGN:
7811   assert((VT.getScalarType() == MVT::i32 || VT.getScalarType() == MVT::i64) &&
7812          "Only 32-bit and 64-bit elements are supported!");
7813   assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7814   assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
7815 ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
7816 DecodeVALIGNMask(NumElems, ImmN, Mask);
7817 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
7818 Ops.push_back(N->getOperand(1));
7819 Ops.push_back(N->getOperand(0));
7820 break;
7821 case X86ISD::PALIGNR:
7822   assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
7823   assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7824   assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
7825 ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
7826 DecodePALIGNRMask(NumElems, ImmN, Mask);
7827 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
7828 Ops.push_back(N->getOperand(1));
7829 Ops.push_back(N->getOperand(0));
7830 break;
7831 case X86ISD::VSHLDQ:
7832   assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
7833   assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7834 ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
7835 DecodePSLLDQMask(NumElems, ImmN, Mask);
7836 IsUnary = true;
7837 break;
7838 case X86ISD::VSRLDQ:
7839   assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
7840   assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7841 ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
7842 DecodePSRLDQMask(NumElems, ImmN, Mask);
7843 IsUnary = true;
7844 break;
7845 case X86ISD::PSHUFD:
7846 case X86ISD::VPERMILPI:
7847   assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7848 ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
7849 DecodePSHUFMask(NumElems, MaskEltSize, ImmN, Mask);
7850 IsUnary = true;
7851 break;
7852 case X86ISD::PSHUFHW:
7853   assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7854 ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
7855 DecodePSHUFHWMask(NumElems, ImmN, Mask);
7856 IsUnary = true;
7857 break;
7858 case X86ISD::PSHUFLW:
7859   assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7860 ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
7861 DecodePSHUFLWMask(NumElems, ImmN, Mask);
7862 IsUnary = true;
7863 break;
7864 case X86ISD::VZEXT_MOVL:
7865   assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7866 DecodeZeroMoveLowMask(NumElems, Mask);
7867 IsUnary = true;
7868 break;
7869 case X86ISD::VBROADCAST:
7870   // We only decode broadcasts of same-sized vectors; peeking through to
7871   // extracted subvectors is likely to cause hasOneUse issues with
7872 // SimplifyDemandedBits etc.
7873 if (N->getOperand(0).getValueType() == VT) {
7874 DecodeVectorBroadcast(NumElems, Mask);
7875 IsUnary = true;
7876 break;
7877 }
7878 return false;
7879 case X86ISD::VPERMILPV: {
7880   assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7881 IsUnary = true;
7882 SDValue MaskNode = N->getOperand(1);
7883 if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask,
7884 RawUndefs)) {
7885 DecodeVPERMILPMask(NumElems, MaskEltSize, RawMask, RawUndefs, Mask);
7886 break;
7887 }
7888 return false;
7889 }
7890 case X86ISD::PSHUFB: {
7891   assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
7892   assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7893   assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
7894 IsUnary = true;
7895 SDValue MaskNode = N->getOperand(1);
7896 if (getTargetShuffleMaskIndices(MaskNode, 8, RawMask, RawUndefs)) {
7897 DecodePSHUFBMask(RawMask, RawUndefs, Mask);
7898 break;
7899 }
7900 return false;
7901 }
7902 case X86ISD::VPERMI:
7903   assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7904 ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
7905 DecodeVPERMMask(NumElems, ImmN, Mask);
7906 IsUnary = true;
7907 break;
7908 case X86ISD::MOVSS:
7909 case X86ISD::MOVSD:
7910 case X86ISD::MOVSH:
7911   assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7912   assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
7913 DecodeScalarMoveMask(NumElems, /* IsLoad */ false, Mask);
7914 break;
7915 case X86ISD::VPERM2X128:
7916   assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7917   assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
7918 ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
7919 DecodeVPERM2X128Mask(NumElems, ImmN, Mask);
7920 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
7921 break;
7922 case X86ISD::SHUF128:
7923   assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7924   assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
7925 ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
7926 decodeVSHUF64x2FamilyMask(NumElems, MaskEltSize, ImmN, Mask);
7927 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
7928 break;
7929 case X86ISD::MOVSLDUP:
7930   assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7931 DecodeMOVSLDUPMask(NumElems, Mask);
7932 IsUnary = true;
7933 break;
7934 case X86ISD::MOVSHDUP:
7935   assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7936 DecodeMOVSHDUPMask(NumElems, Mask);
7937 IsUnary = true;
7938 break;
7939 case X86ISD::MOVDDUP:
7940   assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7941 DecodeMOVDDUPMask(NumElems, Mask);
7942 IsUnary = true;
7943 break;
7944 case X86ISD::VPERMIL2: {
7945   assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7946   assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
7947 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
7948 SDValue MaskNode = N->getOperand(2);
7949 SDValue CtrlNode = N->getOperand(3);
7950 if (ConstantSDNode *CtrlOp = dyn_cast<ConstantSDNode>(CtrlNode)) {
7951 unsigned CtrlImm = CtrlOp->getZExtValue();
7952 if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask,
7953 RawUndefs)) {
7954 DecodeVPERMIL2PMask(NumElems, MaskEltSize, CtrlImm, RawMask, RawUndefs,
7955 Mask);
7956 break;
7957 }
7958 }
7959 return false;
7960 }
7961 case X86ISD::VPPERM: {
7962 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7963 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
7964 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
7965 SDValue MaskNode = N->getOperand(2);
7966 if (getTargetShuffleMaskIndices(MaskNode, 8, RawMask, RawUndefs)) {
7967 DecodeVPPERMMask(RawMask, RawUndefs, Mask);
7968 break;
7969 }
7970 return false;
7971 }
7972 case X86ISD::VPERMV: {
7973 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
7974 IsUnary = true;
7975 // Unlike most shuffle nodes, VPERMV's mask operand is operand 0.
7976 Ops.push_back(N->getOperand(1));
7977 SDValue MaskNode = N->getOperand(0);
7978 if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask,
7979 RawUndefs)) {
7980 DecodeVPERMVMask(RawMask, RawUndefs, Mask);
7981 break;
7982 }
7983 return false;
7984 }
7985 case X86ISD::VPERMV3: {
7986 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
7987 assert(N->getOperand(2).getValueType() == VT && "Unexpected value type");
7988 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(2);
7989 // Unlike most shuffle nodes, VPERMV3's mask operand is the middle one.
7990 Ops.push_back(N->getOperand(0));
7991 Ops.push_back(N->getOperand(2));
7992 SDValue MaskNode = N->getOperand(1);
7993 if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask,
7994 RawUndefs)) {
7995 DecodeVPERMV3Mask(RawMask, RawUndefs, Mask);
7996 break;
7997 }
7998 return false;
7999 }
8000 default: llvm_unreachable("unknown target shuffle node");
8001 }
8002
8003 // Empty mask indicates the decode failed.
8004 if (Mask.empty())
8005 return false;
8006
8007 // Check if we're getting a shuffle mask with zero'd elements.
8008 if (!AllowSentinelZero && isAnyZero(Mask))
8009 return false;
8010
8011 // If we have a fake unary shuffle, the shuffle mask is spread across two
8012 // inputs that are actually the same node. Re-map the mask to always point
8013 // into the first input.
8014 if (IsFakeUnary)
8015 for (int &M : Mask)
8016 if (M >= (int)Mask.size())
8017 M -= Mask.size();
8018
8019 // If we didn't already add operands in the opcode-specific code, default to
8020 // adding 1 or 2 operands starting at 0.
8021 if (Ops.empty()) {
8022 Ops.push_back(N->getOperand(0));
8023 if (!IsUnary || IsFakeUnary)
8024 Ops.push_back(N->getOperand(1));
8025 }
8026
8027 return true;
8028}
8029
8030// Wrapper for getTargetShuffleMask that ignores the IsUnary result.
8031static bool getTargetShuffleMask(SDNode *N, MVT VT, bool AllowSentinelZero,
8032 SmallVectorImpl<SDValue> &Ops,
8033 SmallVectorImpl<int> &Mask) {
8034 bool IsUnary;
8035 return getTargetShuffleMask(N, VT, AllowSentinelZero, Ops, Mask, IsUnary);
8036}
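// A standalone sketch of the fake-unary remapping performed in
// getTargetShuffleMask above: when both shuffle inputs are the same node,
// mask indices that point at the "second" input (>= Mask.size()) are folded
// back into the first. Plain std::vector stands in for LLVM's SmallVector and
// the mask values are made up for illustration.
#include <cassert>
#include <vector>

static void remapFakeUnaryMask(std::vector<int> &Mask) {
  int Size = static_cast<int>(Mask.size());
  for (int &M : Mask)
    if (M >= Size)
      M -= Size;
}

int main() {
  // A 4-element shuffle of (V, V) with mask <0, 5, 2, 7> becomes <0, 1, 2, 3>.
  std::vector<int> Mask = {0, 5, 2, 7};
  remapFakeUnaryMask(Mask);
  assert((Mask == std::vector<int>{0, 1, 2, 3}));
  return 0;
}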
8037
8038/// Compute whether each element of a shuffle is zeroable.
8039///
8040/// A "zeroable" vector shuffle element is one which can be lowered to zero.
8041/// Either it is an undef element in the shuffle mask, the element of the input
8042/// referenced is undef, or the element of the input referenced is known to be
8043/// zero. Many x86 shuffles can zero lanes cheaply and we often want to handle
8044/// as many lanes with this technique as possible to simplify the remaining
8045/// shuffle.
8046static void computeZeroableShuffleElements(ArrayRef<int> Mask,
8047 SDValue V1, SDValue V2,
8048 APInt &KnownUndef, APInt &KnownZero) {
8049 int Size = Mask.size();
8050 KnownUndef = KnownZero = APInt::getZero(Size);
8051
8052 V1 = peekThroughBitcasts(V1);
8053 V2 = peekThroughBitcasts(V2);
8054
8055 bool V1IsZero = ISD::isBuildVectorAllZeros(V1.getNode());
8056 bool V2IsZero = ISD::isBuildVectorAllZeros(V2.getNode());
8057
8058 int VectorSizeInBits = V1.getValueSizeInBits();
8059 int ScalarSizeInBits = VectorSizeInBits / Size;
8060 assert(!(VectorSizeInBits % ScalarSizeInBits) && "Illegal shuffle mask size");
8061
8062 for (int i = 0; i < Size; ++i) {
8063 int M = Mask[i];
8064 // Handle the easy cases.
8065 if (M < 0) {
8066 KnownUndef.setBit(i);
8067 continue;
8068 }
8069 if ((M >= 0 && M < Size && V1IsZero) || (M >= Size && V2IsZero)) {
8070 KnownZero.setBit(i);
8071 continue;
8072 }
8073
8074 // Determine shuffle input and normalize the mask.
8075 SDValue V = M < Size ? V1 : V2;
8076 M %= Size;
8077
8078 // Currently we can only search BUILD_VECTOR for UNDEF/ZERO elements.
8079 if (V.getOpcode() != ISD::BUILD_VECTOR)
8080 continue;
8081
8082 // If the BUILD_VECTOR has fewer elements, then the bitcasted portion of
8083 // the (larger) source element must be UNDEF/ZERO.
8084 if ((Size % V.getNumOperands()) == 0) {
8085 int Scale = Size / V->getNumOperands();
8086 SDValue Op = V.getOperand(M / Scale);
8087 if (Op.isUndef())
8088 KnownUndef.setBit(i);
8089 if (X86::isZeroNode(Op))
8090 KnownZero.setBit(i);
8091 else if (ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(Op)) {
8092 APInt Val = Cst->getAPIntValue();
8093 Val = Val.extractBits(ScalarSizeInBits, (M % Scale) * ScalarSizeInBits);
8094 if (Val == 0)
8095 KnownZero.setBit(i);
8096 } else if (ConstantFPSDNode *Cst = dyn_cast<ConstantFPSDNode>(Op)) {
8097 APInt Val = Cst->getValueAPF().bitcastToAPInt();
8098 Val = Val.extractBits(ScalarSizeInBits, (M % Scale) * ScalarSizeInBits);
8099 if (Val == 0)
8100 KnownZero.setBit(i);
8101 }
8102 continue;
8103 }
8104
8105 // If the BUILD_VECTOR has more elements, then all the (smaller) source
8106 // elements must be UNDEF or ZERO.
8107 if ((V.getNumOperands() % Size) == 0) {
8108 int Scale = V->getNumOperands() / Size;
8109 bool AllUndef = true;
8110 bool AllZero = true;
8111 for (int j = 0; j < Scale; ++j) {
8112 SDValue Op = V.getOperand((M * Scale) + j);
8113 AllUndef &= Op.isUndef();
8114 AllZero &= X86::isZeroNode(Op);
8115 }
8116 if (AllUndef)
8117 KnownUndef.setBit(i);
8118 if (AllZero)
8119 KnownZero.setBit(i);
8120 continue;
8121 }
8122 }
8123}
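// A standalone sketch of the simplest case handled by
// computeZeroableShuffleElements above: both inputs are constant vectors with
// the same element count as the mask. std::optional models undef elements;
// the bitcast and element-scaling paths of the real code are omitted, and the
// values below are made up for illustration.
#include <cassert>
#include <cstdint>
#include <optional>
#include <vector>

int main() {
  using Elt = std::optional<uint64_t>;
  std::vector<Elt> V1 = {7, 0, std::nullopt, 3}; // element 1 zero, element 2 undef
  std::vector<Elt> V2 = {0, 0, 0, 0};            // all-zero vector
  std::vector<int> Mask = {1, 2, 5, -1};         // -1 == undef mask element

  int Size = static_cast<int>(Mask.size());
  std::vector<bool> KnownUndef(Size), KnownZero(Size);
  for (int i = 0; i < Size; ++i) {
    int M = Mask[i];
    if (M < 0) { KnownUndef[i] = true; continue; } // undef mask element
    const Elt &E = (M < Size) ? V1[M] : V2[M - Size];
    if (!E)
      KnownUndef[i] = true;      // referenced input element is undef
    else if (*E == 0)
      KnownZero[i] = true;       // referenced input element is known zero
  }
  assert((KnownZero == std::vector<bool>{true, false, true, false}));
  assert((KnownUndef == std::vector<bool>{false, true, false, true}));
  return 0;
}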
8124
8125/// Decode a target shuffle mask and inputs and see if any values are
8126/// known to be undef or zero from their inputs.
8127/// Returns true if the target shuffle mask was decoded.
8128/// FIXME: Merge this with computeZeroableShuffleElements?
8129static bool getTargetShuffleAndZeroables(SDValue N, SmallVectorImpl<int> &Mask,
8130 SmallVectorImpl<SDValue> &Ops,
8131 APInt &KnownUndef, APInt &KnownZero) {
8132 bool IsUnary;
8133 if (!isTargetShuffle(N.getOpcode()))
8134 return false;
8135
8136 MVT VT = N.getSimpleValueType();
8137 if (!getTargetShuffleMask(N.getNode(), VT, true, Ops, Mask, IsUnary))
8138 return false;
8139
8140 int Size = Mask.size();
8141 SDValue V1 = Ops[0];
8142 SDValue V2 = IsUnary ? V1 : Ops[1];
8143 KnownUndef = KnownZero = APInt::getZero(Size);
8144
8145 V1 = peekThroughBitcasts(V1);
8146 V2 = peekThroughBitcasts(V2);
8147
8148 assert((VT.getSizeInBits() % Size) == 0 &&
8149        "Illegal split of shuffle value type");
8150 unsigned EltSizeInBits = VT.getSizeInBits() / Size;
8151
8152 // Extract known constant input data.
8153 APInt UndefSrcElts[2];
8154 SmallVector<APInt, 32> SrcEltBits[2];
8155 bool IsSrcConstant[2] = {
8156 getTargetConstantBitsFromNode(V1, EltSizeInBits, UndefSrcElts[0],
8157 SrcEltBits[0], true, false),
8158 getTargetConstantBitsFromNode(V2, EltSizeInBits, UndefSrcElts[1],
8159 SrcEltBits[1], true, false)};
8160
8161 for (int i = 0; i < Size; ++i) {
8162 int M = Mask[i];
8163
8164 // Already decoded as SM_SentinelZero / SM_SentinelUndef.
8165 if (M < 0) {
8166 assert(isUndefOrZero(M) && "Unknown shuffle sentinel value!");
8167 if (SM_SentinelUndef == M)
8168 KnownUndef.setBit(i);
8169 if (SM_SentinelZero == M)
8170 KnownZero.setBit(i);
8171 continue;
8172 }
8173
8174 // Determine shuffle input and normalize the mask.
8175 unsigned SrcIdx = M / Size;
8176 SDValue V = M < Size ? V1 : V2;
8177 M %= Size;
8178
8179 // We are referencing an UNDEF input.
8180 if (V.isUndef()) {
8181 KnownUndef.setBit(i);
8182 continue;
8183 }
8184
8185 // SCALAR_TO_VECTOR - only the first element is defined, and the rest UNDEF.
8186 // TODO: We currently only set UNDEF for integer types - floats use the same
8187 // registers as vectors and many of the scalar folded loads rely on the
8188 // SCALAR_TO_VECTOR pattern.
8189 if (V.getOpcode() == ISD::SCALAR_TO_VECTOR &&
8190 (Size % V.getValueType().getVectorNumElements()) == 0) {
8191 int Scale = Size / V.getValueType().getVectorNumElements();
8192 int Idx = M / Scale;
8193 if (Idx != 0 && !VT.isFloatingPoint())
8194 KnownUndef.setBit(i);
8195 else if (Idx == 0 && X86::isZeroNode(V.getOperand(0)))
8196 KnownZero.setBit(i);
8197 continue;
8198 }
8199
8200 // INSERT_SUBVECTOR - to widen vectors we often insert them into UNDEF
8201 // base vectors.
8202 if (V.getOpcode() == ISD::INSERT_SUBVECTOR) {
8203 SDValue Vec = V.getOperand(0);
8204 int NumVecElts = Vec.getValueType().getVectorNumElements();
8205 if (Vec.isUndef() && Size == NumVecElts) {
8206 int Idx = V.getConstantOperandVal(2);
8207 int NumSubElts = V.getOperand(1).getValueType().getVectorNumElements();
8208 if (M < Idx || (Idx + NumSubElts) <= M)
8209 KnownUndef.setBit(i);
8210 }
8211 continue;
8212 }
8213
8214 // Attempt to extract from the source's constant bits.
8215 if (IsSrcConstant[SrcIdx]) {
8216 if (UndefSrcElts[SrcIdx][M])
8217 KnownUndef.setBit(i);
8218 else if (SrcEltBits[SrcIdx][M] == 0)
8219 KnownZero.setBit(i);
8220 }
8221 }
8222
8223 assert(VT.getVectorNumElements() == (unsigned)Size &&
8224        "Different mask size from vector size!");
8225 return true;
8226}
8227
8228// Replace target shuffle mask elements with known undef/zero sentinels.
8229static void resolveTargetShuffleFromZeroables(SmallVectorImpl<int> &Mask,
8230 const APInt &KnownUndef,
8231 const APInt &KnownZero,
8232 bool ResolveKnownZeros= true) {
8233 unsigned NumElts = Mask.size();
8234 assert(KnownUndef.getBitWidth() == NumElts &&
8235        KnownZero.getBitWidth() == NumElts && "Shuffle mask size mismatch");
8236
8237 for (unsigned i = 0; i != NumElts; ++i) {
8238 if (KnownUndef[i])
8239 Mask[i] = SM_SentinelUndef;
8240 else if (ResolveKnownZeros && KnownZero[i])
8241 Mask[i] = SM_SentinelZero;
8242 }
8243}
8244
8245// Extract target shuffle mask sentinel elements to known undef/zero bitmasks.
8246static void resolveZeroablesFromTargetShuffle(const SmallVectorImpl<int> &Mask,
8247 APInt &KnownUndef,
8248 APInt &KnownZero) {
8249 unsigned NumElts = Mask.size();
8250 KnownUndef = KnownZero = APInt::getZero(NumElts);
8251
8252 for (unsigned i = 0; i != NumElts; ++i) {
8253 int M = Mask[i];
8254 if (SM_SentinelUndef == M)
8255 KnownUndef.setBit(i);
8256 if (SM_SentinelZero == M)
8257 KnownZero.setBit(i);
8258 }
8259}
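// The two helpers above are inverses of each other: sentinel values inside the
// mask and the KnownUndef/KnownZero bit masks carry the same information. A
// standalone round-trip sketch; the sentinel constants are assumed to mirror
// SM_SentinelUndef == -1 and SM_SentinelZero == -2 from the X86 shuffle
// decode helpers.
#include <cassert>
#include <vector>

constexpr int SentinelUndef = -1; // assumed value of SM_SentinelUndef
constexpr int SentinelZero = -2;  // assumed value of SM_SentinelZero

int main() {
  std::vector<int> Mask = {0, SentinelUndef, 2, SentinelZero};

  // resolveZeroablesFromTargetShuffle: mask sentinels -> bit masks.
  std::vector<bool> Undef(Mask.size()), Zero(Mask.size());
  for (size_t i = 0; i != Mask.size(); ++i) {
    Undef[i] = (Mask[i] == SentinelUndef);
    Zero[i] = (Mask[i] == SentinelZero);
  }

  // resolveTargetShuffleFromZeroables: bit masks -> mask sentinels.
  std::vector<int> Rebuilt = {0, 1, 2, 3};
  for (size_t i = 0; i != Rebuilt.size(); ++i) {
    if (Undef[i])
      Rebuilt[i] = SentinelUndef;
    else if (Zero[i])
      Rebuilt[i] = SentinelZero;
  }
  assert(Rebuilt == Mask);
  return 0;
}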
8260
8261// Attempt to create a shuffle mask from a VSELECT/BLENDV condition mask.
8262static bool createShuffleMaskFromVSELECT(SmallVectorImpl<int> &Mask,
8263 SDValue Cond, bool IsBLENDV = false) {
8264 EVT CondVT = Cond.getValueType();
8265 unsigned EltSizeInBits = CondVT.getScalarSizeInBits();
8266 unsigned NumElts = CondVT.getVectorNumElements();
8267
8268 APInt UndefElts;
8269 SmallVector<APInt, 32> EltBits;
8270 if (!getTargetConstantBitsFromNode(Cond, EltSizeInBits, UndefElts, EltBits,
8271 true, false))
8272 return false;
8273
8274 Mask.resize(NumElts, SM_SentinelUndef);
8275
8276 for (int i = 0; i != (int)NumElts; ++i) {
8277 Mask[i] = i;
8278 // Arbitrarily choose from the 2nd operand if the select condition element
8279 // is undef.
8280 // TODO: Can we do better by matching patterns such as even/odd?
8281 if (UndefElts[i] || (!IsBLENDV && EltBits[i].isZero()) ||
8282 (IsBLENDV && EltBits[i].isNonNegative()))
8283 Mask[i] += NumElts;
8284 }
8285
8286 return true;
8287}
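// Worked example for createShuffleMaskFromVSELECT above (non-BLENDV case): a
// constant v4i32 condition <-1, 0, -1, 0> turns the select into the blend mask
// <0, 5, 2, 7> -- true lanes read the first value operand (indices 0..3) and
// zero-condition lanes read the second (indices 4..7). A standalone model with
// made-up values:
#include <cassert>
#include <cstdint>
#include <vector>

int main() {
  std::vector<uint32_t> Cond = {0xFFFFFFFFu, 0u, 0xFFFFFFFFu, 0u};
  int NumElts = static_cast<int>(Cond.size());
  std::vector<int> Mask(NumElts);
  for (int i = 0; i != NumElts; ++i)
    Mask[i] = (Cond[i] != 0) ? i : i + NumElts; // zero condition -> 2nd operand
  assert((Mask == std::vector<int>{0, 5, 2, 7}));
  return 0;
}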
8288
8289// Forward declaration (for getFauxShuffleMask recursive check).
8290static bool getTargetShuffleInputs(SDValue Op, const APInt &DemandedElts,
8291 SmallVectorImpl<SDValue> &Inputs,
8292 SmallVectorImpl<int> &Mask,
8293 const SelectionDAG &DAG, unsigned Depth,
8294 bool ResolveKnownElts);
8295
8296// Attempt to decode ops that could be represented as a shuffle mask.
8297// The decoded shuffle mask may contain a different number of elements than the
8298// destination value type has.
8299// TODO: Merge into getTargetShuffleInputs()
8300static bool getFauxShuffleMask(SDValue N, const APInt &DemandedElts,
8301 SmallVectorImpl<int> &Mask,
8302 SmallVectorImpl<SDValue> &Ops,
8303 const SelectionDAG &DAG, unsigned Depth,
8304 bool ResolveKnownElts) {
8305 Mask.clear();
8306 Ops.clear();
8307
8308 MVT VT = N.getSimpleValueType();
8309 unsigned NumElts = VT.getVectorNumElements();
8310 unsigned NumSizeInBits = VT.getSizeInBits();
8311 unsigned NumBitsPerElt = VT.getScalarSizeInBits();
8312 if ((NumBitsPerElt % 8) != 0 || (NumSizeInBits % 8) != 0)
8313 return false;
8314 assert(NumElts == DemandedElts.getBitWidth() && "Unexpected vector size");
8315 unsigned NumSizeInBytes = NumSizeInBits / 8;
8316 unsigned NumBytesPerElt = NumBitsPerElt / 8;
8317
8318 unsigned Opcode = N.getOpcode();
8319 switch (Opcode) {
8320 case ISD::VECTOR_SHUFFLE: {
8321 // Don't treat ISD::VECTOR_SHUFFLE as a target shuffle so decode it here.
8322 ArrayRef<int> ShuffleMask = cast<ShuffleVectorSDNode>(N)->getMask();
8323 if (isUndefOrInRange(ShuffleMask, 0, 2 * NumElts)) {
8324 Mask.append(ShuffleMask.begin(), ShuffleMask.end());
8325 Ops.push_back(N.getOperand(0));
8326 Ops.push_back(N.getOperand(1));
8327 return true;
8328 }
8329 return false;
8330 }
8331 case ISD::AND:
8332 case X86ISD::ANDNP: {
8333 // Attempt to decode as a per-byte mask.
8334 APInt UndefElts;
8335 SmallVector<APInt, 32> EltBits;
8336 SDValue N0 = N.getOperand(0);
8337 SDValue N1 = N.getOperand(1);
8338 bool IsAndN = (X86ISD::ANDNP == Opcode);
8339 uint64_t ZeroMask = IsAndN ? 255 : 0;
8340 if (!getTargetConstantBitsFromNode(IsAndN ? N0 : N1, 8, UndefElts, EltBits))
8341 return false;
8342 // We can't assume an undef src element gives an undef dst - the other src
8343 // might be zero.
8344 if (!UndefElts.isZero())
8345 return false;
8346 for (int i = 0, e = (int)EltBits.size(); i != e; ++i) {
8347 const APInt &ByteBits = EltBits[i];
8348 if (ByteBits != 0 && ByteBits != 255)
8349 return false;
8350 Mask.push_back(ByteBits == ZeroMask ? SM_SentinelZero : i);
8351 }
8352 Ops.push_back(IsAndN ? N1 : N0);
8353 return true;
8354 }
8355 case ISD::OR: {
8356 // Handle OR(SHUFFLE,SHUFFLE) case where one source is zero and the other
8357 // is a valid shuffle index.
8358 SDValue N0 = peekThroughBitcasts(N.getOperand(0));
8359 SDValue N1 = peekThroughBitcasts(N.getOperand(1));
8360 if (!N0.getValueType().isVector() || !N1.getValueType().isVector())
8361 return false;
8362
8363 SmallVector<int, 64> SrcMask0, SrcMask1;
8364 SmallVector<SDValue, 2> SrcInputs0, SrcInputs1;
8365 APInt Demand0 = APInt::getAllOnes(N0.getValueType().getVectorNumElements());
8366 APInt Demand1 = APInt::getAllOnes(N1.getValueType().getVectorNumElements());
8367 if (!getTargetShuffleInputs(N0, Demand0, SrcInputs0, SrcMask0, DAG,
8368 Depth + 1, true) ||
8369 !getTargetShuffleInputs(N1, Demand1, SrcInputs1, SrcMask1, DAG,
8370 Depth + 1, true))
8371 return false;
8372
8373 size_t MaskSize = std::max(SrcMask0.size(), SrcMask1.size());
8374 SmallVector<int, 64> Mask0, Mask1;
8375 narrowShuffleMaskElts(MaskSize / SrcMask0.size(), SrcMask0, Mask0);
8376 narrowShuffleMaskElts(MaskSize / SrcMask1.size(), SrcMask1, Mask1);
8377 for (int i = 0; i != (int)MaskSize; ++i) {
8378 // NOTE: Don't handle SM_SentinelUndef, as we can end up in infinite
8379 // loops converting between OR and BLEND shuffles due to
8380 // canWidenShuffleElements merging away undef elements, meaning we
8381 // fail to recognise the OR as the undef element isn't known zero.
8382 if (Mask0[i] == SM_SentinelZero && Mask1[i] == SM_SentinelZero)
8383 Mask.push_back(SM_SentinelZero);
8384 else if (Mask1[i] == SM_SentinelZero)
8385 Mask.push_back(i);
8386 else if (Mask0[i] == SM_SentinelZero)
8387 Mask.push_back(i + MaskSize);
8388 else
8389 return false;
8390 }
8391 Ops.push_back(N0);
8392 Ops.push_back(N1);
8393 return true;
8394 }
8395 case ISD::INSERT_SUBVECTOR: {
8396 SDValue Src = N.getOperand(0);
8397 SDValue Sub = N.getOperand(1);
8398 EVT SubVT = Sub.getValueType();
8399 unsigned NumSubElts = SubVT.getVectorNumElements();
8400 if (!N->isOnlyUserOf(Sub.getNode()))
8401 return false;
8402 uint64_t InsertIdx = N.getConstantOperandVal(2);
8403 // Handle INSERT_SUBVECTOR(SRC0, EXTRACT_SUBVECTOR(SRC1)).
8404 if (Sub.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
8405 Sub.getOperand(0).getValueType() == VT) {
8406 uint64_t ExtractIdx = Sub.getConstantOperandVal(1);
8407 for (int i = 0; i != (int)NumElts; ++i)
8408 Mask.push_back(i);
8409 for (int i = 0; i != (int)NumSubElts; ++i)
8410 Mask[InsertIdx + i] = NumElts + ExtractIdx + i;
8411 Ops.push_back(Src);
8412 Ops.push_back(Sub.getOperand(0));
8413 return true;
8414 }
8415 // Handle INSERT_SUBVECTOR(SRC0, SHUFFLE(SRC1)).
8416 SmallVector<int, 64> SubMask;
8417 SmallVector<SDValue, 2> SubInputs;
8418 SDValue SubSrc = peekThroughOneUseBitcasts(Sub);
8419 EVT SubSrcVT = SubSrc.getValueType();
8420 if (!SubSrcVT.isVector())
8421 return false;
8422
8423 APInt SubDemand = APInt::getAllOnes(SubSrcVT.getVectorNumElements());
8424 if (!getTargetShuffleInputs(SubSrc, SubDemand, SubInputs, SubMask, DAG,
8425 Depth + 1, ResolveKnownElts))
8426 return false;
8427
8428 // Subvector shuffle inputs must not be larger than the subvector.
8429 if (llvm::any_of(SubInputs, [SubVT](SDValue SubInput) {
8430 return SubVT.getFixedSizeInBits() <
8431 SubInput.getValueSizeInBits().getFixedValue();
8432 }))
8433 return false;
8434
8435 if (SubMask.size() != NumSubElts) {
8436 assert(((SubMask.size() % NumSubElts) == 0 ||
8437        (NumSubElts % SubMask.size()) == 0) && "Illegal submask scale");
8438 if ((NumSubElts % SubMask.size()) == 0) {
8439 int Scale = NumSubElts / SubMask.size();
8440 SmallVector<int,64> ScaledSubMask;
8441 narrowShuffleMaskElts(Scale, SubMask, ScaledSubMask);
8442 SubMask = ScaledSubMask;
8443 } else {
8444 int Scale = SubMask.size() / NumSubElts;
8445 NumSubElts = SubMask.size();
8446 NumElts *= Scale;
8447 InsertIdx *= Scale;
8448 }
8449 }
8450 Ops.push_back(Src);
8451 Ops.append(SubInputs.begin(), SubInputs.end());
8452 if (ISD::isBuildVectorAllZeros(Src.getNode()))
8453 Mask.append(NumElts, SM_SentinelZero);
8454 else
8455 for (int i = 0; i != (int)NumElts; ++i)
8456 Mask.push_back(i);
8457 for (int i = 0; i != (int)NumSubElts; ++i) {
8458 int M = SubMask[i];
8459 if (0 <= M) {
8460 int InputIdx = M / NumSubElts;
8461 M = (NumElts * (1 + InputIdx)) + (M % NumSubElts);
8462 }
8463 Mask[i + InsertIdx] = M;
8464 }
8465 return true;
8466 }
8467 case X86ISD::PINSRB:
8468 case X86ISD::PINSRW:
8469 case ISD::SCALAR_TO_VECTOR:
8470 case ISD::INSERT_VECTOR_ELT: {
8471 // Match against an insert_vector_elt/scalar_to_vector of an extract from a
8472 // vector, for matching src/dst vector types.
8473 SDValue Scl = N.getOperand(Opcode == ISD::SCALAR_TO_VECTOR ? 0 : 1);
8474
8475 unsigned DstIdx = 0;
8476 if (Opcode != ISD::SCALAR_TO_VECTOR) {
8477 // Check we have an in-range constant insertion index.
8478 if (!isa<ConstantSDNode>(N.getOperand(2)) ||
8479 N.getConstantOperandAPInt(2).uge(NumElts))
8480 return false;
8481 DstIdx = N.getConstantOperandVal(2);
8482
8483 // Attempt to recognise an INSERT*(VEC, 0, DstIdx) shuffle pattern.
8484 if (X86::isZeroNode(Scl)) {
8485 Ops.push_back(N.getOperand(0));
8486 for (unsigned i = 0; i != NumElts; ++i)
8487 Mask.push_back(i == DstIdx ? SM_SentinelZero : (int)i);
8488 return true;
8489 }
8490 }
8491
8492 // Peek through trunc/aext/zext.
8493 // TODO: aext shouldn't require SM_SentinelZero padding.
8494 // TODO: handle shift of scalars.
8495 unsigned MinBitsPerElt = Scl.getScalarValueSizeInBits();
8496 while (Scl.getOpcode() == ISD::TRUNCATE ||
8497 Scl.getOpcode() == ISD::ANY_EXTEND ||
8498 Scl.getOpcode() == ISD::ZERO_EXTEND) {
8499 Scl = Scl.getOperand(0);
8500 MinBitsPerElt =
8501 std::min<unsigned>(MinBitsPerElt, Scl.getScalarValueSizeInBits());
8502 }
8503 if ((MinBitsPerElt % 8) != 0)
8504 return false;
8505
8506 // Attempt to find the source vector the scalar was extracted from.
8507 SDValue SrcExtract;
8508 if ((Scl.getOpcode() == ISD::EXTRACT_VECTOR_ELT ||
8509 Scl.getOpcode() == X86ISD::PEXTRW ||
8510 Scl.getOpcode() == X86ISD::PEXTRB) &&
8511 Scl.getOperand(0).getValueSizeInBits() == NumSizeInBits) {
8512 SrcExtract = Scl;
8513 }
8514 if (!SrcExtract || !isa<ConstantSDNode>(SrcExtract.getOperand(1)))
8515 return false;
8516
8517 SDValue SrcVec = SrcExtract.getOperand(0);
8518 EVT SrcVT = SrcVec.getValueType();
8519 if (!SrcVT.getScalarType().isByteSized())
8520 return false;
8521 unsigned SrcIdx = SrcExtract.getConstantOperandVal(1);
8522 unsigned SrcByte = SrcIdx * (SrcVT.getScalarSizeInBits() / 8);
8523 unsigned DstByte = DstIdx * NumBytesPerElt;
8524 MinBitsPerElt =
8525 std::min<unsigned>(MinBitsPerElt, SrcVT.getScalarSizeInBits());
8526
8527 // Create 'identity' byte level shuffle mask and then add inserted bytes.
8528 if (Opcode == ISD::SCALAR_TO_VECTOR) {
8529 Ops.push_back(SrcVec);
8530 Mask.append(NumSizeInBytes, SM_SentinelUndef);
8531 } else {
8532 Ops.push_back(SrcVec);
8533 Ops.push_back(N.getOperand(0));
8534 for (int i = 0; i != (int)NumSizeInBytes; ++i)
8535 Mask.push_back(NumSizeInBytes + i);
8536 }
8537
8538 unsigned MinBytesPerElts = MinBitsPerElt / 8;
8539 MinBytesPerElts = std::min(MinBytesPerElts, NumBytesPerElt);
8540 for (unsigned i = 0; i != MinBytesPerElts; ++i)
8541 Mask[DstByte + i] = SrcByte + i;
8542 for (unsigned i = MinBytesPerElts; i < NumBytesPerElt; ++i)
8543 Mask[DstByte + i] = SM_SentinelZero;
8544 return true;
8545 }
8546 case X86ISD::PACKSS:
8547 case X86ISD::PACKUS: {
8548 SDValue N0 = N.getOperand(0);
8549 SDValue N1 = N.getOperand(1);
8550 assert(N0.getValueType().getVectorNumElements() == (NumElts / 2) &&
8551        N1.getValueType().getVectorNumElements() == (NumElts / 2) &&
8552        "Unexpected input value type");
8553
8554 APInt EltsLHS, EltsRHS;
8555 getPackDemandedElts(VT, DemandedElts, EltsLHS, EltsRHS);
8556
8557 // If we know input saturation won't happen (or we don't care about particular
8558 // lanes), we can treat this as a truncation shuffle.
8559 bool Offset0 = false, Offset1 = false;
8560 if (Opcode == X86ISD::PACKSS) {
8561 if ((!(N0.isUndef() || EltsLHS.isZero()) &&
8562 DAG.ComputeNumSignBits(N0, EltsLHS, Depth + 1) <= NumBitsPerElt) ||
8563 (!(N1.isUndef() || EltsRHS.isZero()) &&
8564 DAG.ComputeNumSignBits(N1, EltsRHS, Depth + 1) <= NumBitsPerElt))
8565 return false;
8566 // We can't easily fold ASHR into a shuffle, but if it was feeding a
8567 // PACKSS then it was likely being used for sign-extension for a
8568 // truncation, so just peek through and adjust the mask accordingly.
8569 if (N0.getOpcode() == X86ISD::VSRAI && N->isOnlyUserOf(N0.getNode()) &&
8570 N0.getConstantOperandAPInt(1) == NumBitsPerElt) {
8571 Offset0 = true;
8572 N0 = N0.getOperand(0);
8573 }
8574 if (N1.getOpcode() == X86ISD::VSRAI && N->isOnlyUserOf(N1.getNode()) &&
8575 N1.getConstantOperandAPInt(1) == NumBitsPerElt) {
8576 Offset1 = true;
8577 N1 = N1.getOperand(0);
8578 }
8579 } else {
8580 APInt ZeroMask = APInt::getHighBitsSet(2 * NumBitsPerElt, NumBitsPerElt);
8581 if ((!(N0.isUndef() || EltsLHS.isZero()) &&
8582 !DAG.MaskedValueIsZero(N0, ZeroMask, EltsLHS, Depth + 1)) ||
8583 (!(N1.isUndef() || EltsRHS.isZero()) &&
8584 !DAG.MaskedValueIsZero(N1, ZeroMask, EltsRHS, Depth + 1)))
8585 return false;
8586 }
8587
8588 bool IsUnary = (N0 == N1);
8589
8590 Ops.push_back(N0);
8591 if (!IsUnary)
8592 Ops.push_back(N1);
8593
8594 createPackShuffleMask(VT, Mask, IsUnary);
8595
8596 if (Offset0 || Offset1) {
8597 for (int &M : Mask)
8598 if ((Offset0 && isInRange(M, 0, NumElts)) ||
8599 (Offset1 && isInRange(M, NumElts, 2 * NumElts)))
8600 ++M;
8601 }
8602 return true;
8603 }
8604 case ISD::VSELECT:
8605 case X86ISD::BLENDV: {
8606 SDValue Cond = N.getOperand(0);
8607 if (createShuffleMaskFromVSELECT(Mask, Cond, Opcode == X86ISD::BLENDV)) {
8608 Ops.push_back(N.getOperand(1));
8609 Ops.push_back(N.getOperand(2));
8610 return true;
8611 }
8612 return false;
8613 }
8614 case X86ISD::VTRUNC: {
8615 SDValue Src = N.getOperand(0);
8616 EVT SrcVT = Src.getValueType();
8617 // Truncated source must be a simple vector.
8618 if (!SrcVT.isSimple() || (SrcVT.getSizeInBits() % 128) != 0 ||
8619 (SrcVT.getScalarSizeInBits() % 8) != 0)
8620 return false;
8621 unsigned NumSrcElts = SrcVT.getVectorNumElements();
8622 unsigned NumBitsPerSrcElt = SrcVT.getScalarSizeInBits();
8623 unsigned Scale = NumBitsPerSrcElt / NumBitsPerElt;
8624 assert((NumBitsPerSrcElt % NumBitsPerElt) == 0 && "Illegal truncation");
8625 for (unsigned i = 0; i != NumSrcElts; ++i)
8626 Mask.push_back(i * Scale);
8627 Mask.append(NumElts - NumSrcElts, SM_SentinelZero);
8628 Ops.push_back(Src);
8629 return true;
8630 }
8631 case X86ISD::VSHLI:
8632 case X86ISD::VSRLI: {
8633 uint64_t ShiftVal = N.getConstantOperandVal(1);
8634 // Out of range bit shifts are guaranteed to be zero.
8635 if (NumBitsPerElt <= ShiftVal) {
8636 Mask.append(NumElts, SM_SentinelZero);
8637 return true;
8638 }
8639
8640 // We can only decode 'whole byte' bit shifts as shuffles.
8641 if ((ShiftVal % 8) != 0)
8642 break;
8643
8644 uint64_t ByteShift = ShiftVal / 8;
8645 Ops.push_back(N.getOperand(0));
8646
8647 // Clear mask to all zeros and insert the shifted byte indices.
8648 Mask.append(NumSizeInBytes, SM_SentinelZero);
8649
8650 if (X86ISD::VSHLI == Opcode) {
8651 for (unsigned i = 0; i != NumSizeInBytes; i += NumBytesPerElt)
8652 for (unsigned j = ByteShift; j != NumBytesPerElt; ++j)
8653 Mask[i + j] = i + j - ByteShift;
8654 } else {
8655 for (unsigned i = 0; i != NumSizeInBytes; i += NumBytesPerElt)
8656 for (unsigned j = ByteShift; j != NumBytesPerElt; ++j)
8657 Mask[i + j - ByteShift] = i + j;
8658 }
8659 return true;
8660 }
8661 case X86ISD::VROTLI:
8662 case X86ISD::VROTRI: {
8663 // We can only decode 'whole byte' bit rotates as shuffles.
8664 uint64_t RotateVal = N.getConstantOperandAPInt(1).urem(NumBitsPerElt);
8665 if ((RotateVal % 8) != 0)
8666 return false;
8667 Ops.push_back(N.getOperand(0));
8668 int Offset = RotateVal / 8;
8669 Offset = (X86ISD::VROTLI == Opcode ? NumBytesPerElt - Offset : Offset);
8670 for (int i = 0; i != (int)NumElts; ++i) {
8671 int BaseIdx = i * NumBytesPerElt;
8672 for (int j = 0; j != (int)NumBytesPerElt; ++j) {
8673 Mask.push_back(BaseIdx + ((Offset + j) % NumBytesPerElt));
8674 }
8675 }
8676 return true;
8677 }
8678 case X86ISD::VBROADCAST: {
8679 SDValue Src = N.getOperand(0);
8680 if (!Src.getSimpleValueType().isVector()) {
8681 if (Src.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
8682 !isNullConstant(Src.getOperand(1)) ||
8683 Src.getOperand(0).getValueType().getScalarType() !=
8684 VT.getScalarType())
8685 return false;
8686 Src = Src.getOperand(0);
8687 }
8688 Ops.push_back(Src);
8689 Mask.append(NumElts, 0);
8690 return true;
8691 }
8692 case ISD::ZERO_EXTEND:
8693 case ISD::ANY_EXTEND:
8694 case ISD::ZERO_EXTEND_VECTOR_INREG:
8695 case ISD::ANY_EXTEND_VECTOR_INREG: {
8696 SDValue Src = N.getOperand(0);
8697 EVT SrcVT = Src.getValueType();
8698
8699 // Extended source must be a simple vector.
8700 if (!SrcVT.isSimple() || (SrcVT.getSizeInBits() % 128) != 0 ||
8701 (SrcVT.getScalarSizeInBits() % 8) != 0)
8702 return false;
8703
8704 bool IsAnyExtend =
8705 (ISD::ANY_EXTEND == Opcode || ISD::ANY_EXTEND_VECTOR_INREG == Opcode);
8706 DecodeZeroExtendMask(SrcVT.getScalarSizeInBits(), NumBitsPerElt, NumElts,
8707 IsAnyExtend, Mask);
8708 Ops.push_back(Src);
8709 return true;
8710 }
8711 }
8712
8713 return false;
8714}
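// Worked example for the X86ISD::VSHLI case of getFauxShuffleMask above: a
// whole-byte left shift of each element can be written as a byte shuffle. For
// a 128-bit vector with 8-byte elements shifted left by 16 bits
// (ByteShift == 2), each element keeps its bytes moved up by two and its low
// two bytes zeroed. Standalone model of the loop, with -2 standing in for the
// assumed SM_SentinelZero value:
#include <cassert>
#include <vector>

int main() {
  const int Zero = -2;                  // assumed SM_SentinelZero
  const unsigned NumSizeInBytes = 16, NumBytesPerElt = 8, ByteShift = 2;

  std::vector<int> Mask(NumSizeInBytes, Zero);
  for (unsigned i = 0; i != NumSizeInBytes; i += NumBytesPerElt)
    for (unsigned j = ByteShift; j != NumBytesPerElt; ++j)
      Mask[i + j] = static_cast<int>(i + j - ByteShift);

  assert((Mask == std::vector<int>{-2, -2, 0, 1, 2,  3,  4,  5,
                                   -2, -2, 8, 9, 10, 11, 12, 13}));
  return 0;
}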
8715
8716/// Removes unused/repeated shuffle source inputs and adjusts the shuffle mask.
8717static void resolveTargetShuffleInputsAndMask(SmallVectorImpl<SDValue> &Inputs,
8718 SmallVectorImpl<int> &Mask) {
8719 int MaskWidth = Mask.size();
8720 SmallVector<SDValue, 16> UsedInputs;
8721 for (int i = 0, e = Inputs.size(); i < e; ++i) {
8722 int lo = UsedInputs.size() * MaskWidth;
8723 int hi = lo + MaskWidth;
8724
8725 // Strip UNDEF input usage.
8726 if (Inputs[i].isUndef())
8727 for (int &M : Mask)
8728 if ((lo <= M) && (M < hi))
8729 M = SM_SentinelUndef;
8730
8731 // Check for unused inputs.
8732 if (none_of(Mask, [lo, hi](int i) { return (lo <= i) && (i < hi); })) {
8733 for (int &M : Mask)
8734 if (lo <= M)
8735 M -= MaskWidth;
8736 continue;
8737 }
8738
8739 // Check for repeated inputs.
8740 bool IsRepeat = false;
8741 for (int j = 0, ue = UsedInputs.size(); j != ue; ++j) {
8742 if (UsedInputs[j] != Inputs[i])
8743 continue;
8744 for (int &M : Mask)
8745 if (lo <= M)
8746 M = (M < hi) ? ((M - lo) + (j * MaskWidth)) : (M - MaskWidth);
8747 IsRepeat = true;
8748 break;
8749 }
8750 if (IsRepeat)
8751 continue;
8752
8753 UsedInputs.push_back(Inputs[i]);
8754 }
8755 Inputs = UsedInputs;
8756}
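// Worked example for the repeated-input path of
// resolveTargetShuffleInputsAndMask above: with the same input appearing twice
// and a 4-wide mask <0, 5, 2, 7>, the duplicate's index range [4, 8) is
// remapped onto the first occurrence and the input list shrinks to one entry.
// Standalone model using integer tags in place of SDValues; the undef-strip
// and unused-input checks are omitted for brevity.
#include <cassert>
#include <vector>

int main() {
  std::vector<int> Inputs = {42, 42};   // the same input node, twice
  std::vector<int> Mask = {0, 5, 2, 7};

  const int MaskWidth = static_cast<int>(Mask.size());
  std::vector<int> Used;
  for (int In : Inputs) {
    const int lo = static_cast<int>(Used.size()) * MaskWidth;
    const int hi = lo + MaskWidth;
    bool IsRepeat = false;
    for (int j = 0, e = static_cast<int>(Used.size()); j != e; ++j) {
      if (Used[j] != In)
        continue;
      // Remap the duplicate's range onto input j and shift later ranges down.
      for (int &M : Mask)
        if (lo <= M)
          M = (M < hi) ? ((M - lo) + j * MaskWidth) : (M - MaskWidth);
      IsRepeat = true;
      break;
    }
    if (!IsRepeat)
      Used.push_back(In);
  }
  Inputs = Used;

  assert(Inputs.size() == 1 && (Mask == std::vector<int>{0, 1, 2, 3}));
  return 0;
}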
8757
8758/// Calls getTargetShuffleAndZeroables to resolve a target shuffle mask's inputs
8759/// and then sets the SM_SentinelUndef and SM_SentinelZero values.
8760/// Returns true if the target shuffle mask was decoded.
8761static bool getTargetShuffleInputs(SDValue Op, const APInt &DemandedElts,
8762 SmallVectorImpl<SDValue> &Inputs,
8763 SmallVectorImpl<int> &Mask,
8764 APInt &KnownUndef, APInt &KnownZero,
8765 const SelectionDAG &DAG, unsigned Depth,
8766 bool ResolveKnownElts) {
8767 if (Depth >= SelectionDAG::MaxRecursionDepth)
8768 return false; // Limit search depth.
8769
8770 EVT VT = Op.getValueType();
8771 if (!VT.isSimple() || !VT.isVector())
8772 return false;
8773
8774 if (getTargetShuffleAndZeroables(Op, Mask, Inputs, KnownUndef, KnownZero)) {
8775 if (ResolveKnownElts)
8776 resolveTargetShuffleFromZeroables(Mask, KnownUndef, KnownZero);
8777 return true;
8778 }
8779 if (getFauxShuffleMask(Op, DemandedElts, Mask, Inputs, DAG, Depth,
8780 ResolveKnownElts)) {
8781 resolveZeroablesFromTargetShuffle(Mask, KnownUndef, KnownZero);
8782 return true;
8783 }
8784 return false;
8785}
8786
8787static bool getTargetShuffleInputs(SDValue Op, const APInt &DemandedElts,
8788 SmallVectorImpl<SDValue> &Inputs,
8789 SmallVectorImpl<int> &Mask,
8790 const SelectionDAG &DAG, unsigned Depth,
8791 bool ResolveKnownElts) {
8792 APInt KnownUndef, KnownZero;
8793 return getTargetShuffleInputs(Op, DemandedElts, Inputs, Mask, KnownUndef,
8794 KnownZero, DAG, Depth, ResolveKnownElts);
8795}
8796
8797static bool getTargetShuffleInputs(SDValue Op, SmallVectorImpl<SDValue> &Inputs,
8798 SmallVectorImpl<int> &Mask,
8799 const SelectionDAG &DAG, unsigned Depth = 0,
8800 bool ResolveKnownElts = true) {
8801 EVT VT = Op.getValueType();
8802 if (!VT.isSimple() || !VT.isVector())
8803 return false;
8804
8805 unsigned NumElts = Op.getValueType().getVectorNumElements();
8806 APInt DemandedElts = APInt::getAllOnes(NumElts);
8807 return getTargetShuffleInputs(Op, DemandedElts, Inputs, Mask, DAG, Depth,
8808 ResolveKnownElts);
8809}
8810
8811// Attempt to create a scalar/subvector broadcast from the base MemSDNode.
8812static SDValue getBROADCAST_LOAD(unsigned Opcode, const SDLoc &DL, EVT VT,
8813 EVT MemVT, MemSDNode *Mem, unsigned Offset,
8814 SelectionDAG &DAG) {
8815 assert((Opcode == X86ISD::VBROADCAST_LOAD ||
8816        Opcode == X86ISD::SUBV_BROADCAST_LOAD) &&
8817        "Unknown broadcast load type");
8818
8819 // Ensure this is a simple (non-atomic, non-volatile), temporal read memop.
8820 if (!Mem || !Mem->readMem() || !Mem->isSimple() || Mem->isNonTemporal())
8821 return SDValue();
8822
8823 SDValue Ptr =
8824 DAG.getMemBasePlusOffset(Mem->getBasePtr(), TypeSize::Fixed(Offset), DL);
8825 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
8826 SDValue Ops[] = {Mem->getChain(), Ptr};
8827 SDValue BcstLd = DAG.getMemIntrinsicNode(
8828 Opcode, DL, Tys, Ops, MemVT,
8829 DAG.getMachineFunction().getMachineMemOperand(
8830 Mem->getMemOperand(), Offset, MemVT.getStoreSize()));
8831 DAG.makeEquivalentMemoryOrdering(SDValue(Mem, 1), BcstLd.getValue(1));
8832 return BcstLd;
8833}
8834
8835/// Returns the scalar element that will make up the i'th
8836/// element of the result of the vector shuffle.
8837static SDValue getShuffleScalarElt(SDValue Op, unsigned Index,
8838 SelectionDAG &DAG, unsigned Depth) {
8839 if (Depth >= SelectionDAG::MaxRecursionDepth)
8840 return SDValue(); // Limit search depth.
8841
8842 EVT VT = Op.getValueType();
8843 unsigned Opcode = Op.getOpcode();
8844 unsigned NumElems = VT.getVectorNumElements();
8845
8846 // Recurse into ISD::VECTOR_SHUFFLE node to find scalars.
8847 if (auto *SV = dyn_cast<ShuffleVectorSDNode>(Op)) {
8848 int Elt = SV->getMaskElt(Index);
8849
8850 if (Elt < 0)
8851 return DAG.getUNDEF(VT.getVectorElementType());
8852
8853 SDValue Src = (Elt < (int)NumElems) ? SV->getOperand(0) : SV->getOperand(1);
8854 return getShuffleScalarElt(Src, Elt % NumElems, DAG, Depth + 1);
8855 }
8856
8857 // Recurse into target specific vector shuffles to find scalars.
8858 if (isTargetShuffle(Opcode)) {
8859 MVT ShufVT = VT.getSimpleVT();
8860 MVT ShufSVT = ShufVT.getVectorElementType();
8861 int NumElems = (int)ShufVT.getVectorNumElements();
8862 SmallVector<int, 16> ShuffleMask;
8863 SmallVector<SDValue, 16> ShuffleOps;
8864 if (!getTargetShuffleMask(Op.getNode(), ShufVT, true, ShuffleOps,
8865 ShuffleMask))
8866 return SDValue();
8867
8868 int Elt = ShuffleMask[Index];
8869 if (Elt == SM_SentinelZero)
8870 return ShufSVT.isInteger() ? DAG.getConstant(0, SDLoc(Op), ShufSVT)
8871 : DAG.getConstantFP(+0.0, SDLoc(Op), ShufSVT);
8872 if (Elt == SM_SentinelUndef)
8873 return DAG.getUNDEF(ShufSVT);
8874
8875 assert(0 <= Elt && Elt < (2 * NumElems) && "Shuffle index out of range");
8876 SDValue Src = (Elt < NumElems) ? ShuffleOps[0] : ShuffleOps[1];
8877 return getShuffleScalarElt(Src, Elt % NumElems, DAG, Depth + 1);
8878 }
8879
8880 // Recurse into insert_subvector base/sub vector to find scalars.
8881 if (Opcode == ISD::INSERT_SUBVECTOR) {
8882 SDValue Vec = Op.getOperand(0);
8883 SDValue Sub = Op.getOperand(1);
8884 uint64_t SubIdx = Op.getConstantOperandVal(2);
8885 unsigned NumSubElts = Sub.getValueType().getVectorNumElements();
8886
8887 if (SubIdx <= Index && Index < (SubIdx + NumSubElts))
8888 return getShuffleScalarElt(Sub, Index - SubIdx, DAG, Depth + 1);
8889 return getShuffleScalarElt(Vec, Index, DAG, Depth + 1);
8890 }
8891
8892 // Recurse into concat_vectors sub vector to find scalars.
8893 if (Opcode == ISD::CONCAT_VECTORS) {
8894 EVT SubVT = Op.getOperand(0).getValueType();
8895 unsigned NumSubElts = SubVT.getVectorNumElements();
8896 uint64_t SubIdx = Index / NumSubElts;
8897 uint64_t SubElt = Index % NumSubElts;
8898 return getShuffleScalarElt(Op.getOperand(SubIdx), SubElt, DAG, Depth + 1);
8899 }
8900
8901 // Recurse into extract_subvector src vector to find scalars.
8902 if (Opcode == ISD::EXTRACT_SUBVECTOR) {
8903 SDValue Src = Op.getOperand(0);
8904 uint64_t SrcIdx = Op.getConstantOperandVal(1);
8905 return getShuffleScalarElt(Src, Index + SrcIdx, DAG, Depth + 1);
8906 }
8907
8908 // We only peek through bitcasts of the same vector width.
8909 if (Opcode == ISD::BITCAST) {
8910 SDValue Src = Op.getOperand(0);
8911 EVT SrcVT = Src.getValueType();
8912 if (SrcVT.isVector() && SrcVT.getVectorNumElements() == NumElems)
8913 return getShuffleScalarElt(Src, Index, DAG, Depth + 1);
8914 return SDValue();
8915 }
8916
8917 // Actual nodes that may contain scalar elements
8918
8919 // For insert_vector_elt - either return the index matching scalar or recurse
8920 // into the base vector.
8921 if (Opcode == ISD::INSERT_VECTOR_ELT &&
8922 isa<ConstantSDNode>(Op.getOperand(2))) {
8923 if (Op.getConstantOperandAPInt(2) == Index)
8924 return Op.getOperand(1);
8925 return getShuffleScalarElt(Op.getOperand(0), Index, DAG, Depth + 1);
8926 }
8927
8928 if (Opcode == ISD::SCALAR_TO_VECTOR)
8929 return (Index == 0) ? Op.getOperand(0)
8930 : DAG.getUNDEF(VT.getVectorElementType());
8931
8932 if (Opcode == ISD::BUILD_VECTOR)
8933 return Op.getOperand(Index);
8934
8935 return SDValue();
8936}
8937
8938// Use PINSRB/PINSRW/PINSRD to create a build vector.
8939static SDValue LowerBuildVectorAsInsert(SDValue Op, const APInt &NonZeroMask,
8940 unsigned NumNonZero, unsigned NumZero,
8941 SelectionDAG &DAG,
8942 const X86Subtarget &Subtarget) {
8943 MVT VT = Op.getSimpleValueType();
8944 unsigned NumElts = VT.getVectorNumElements();
8945 assert(((VT == MVT::v8i16 && Subtarget.hasSSE2()) ||
8946        ((VT == MVT::v16i8 || VT == MVT::v4i32) && Subtarget.hasSSE41())) &&
8947        "Illegal vector insertion");
8948
8949 SDLoc dl(Op);
8950 SDValue V;
8951 bool First = true;
8952
8953 for (unsigned i = 0; i < NumElts; ++i) {
8954 bool IsNonZero = NonZeroMask[i];
8955 if (!IsNonZero)
8956 continue;
8957
8958 // If the build vector contains zeros or our first insertion is not the
8959 // first index, then insert into a zero vector to break any register
8960 // dependency; otherwise use SCALAR_TO_VECTOR.
8961 if (First) {
8962 First = false;
8963 if (NumZero || 0 != i)
8964 V = getZeroVector(VT, Subtarget, DAG, dl);
8965 else {
8966 assert(0 == i && "Expected insertion into zero-index")(static_cast <bool> (0 == i && "Expected insertion into zero-index"
) ? void (0) : __assert_fail ("0 == i && \"Expected insertion into zero-index\""
, "llvm/lib/Target/X86/X86ISelLowering.cpp", 8966, __extension__
__PRETTY_FUNCTION__))
;
8967 V = DAG.getAnyExtOrTrunc(Op.getOperand(i), dl, MVT::i32);
8968 V = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, V);
8969 V = DAG.getBitcast(VT, V);
8970 continue;
8971 }
8972 }
8973 V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, V, Op.getOperand(i),
8974 DAG.getIntPtrConstant(i, dl));
8975 }
8976
8977 return V;
8978}
8979
8980/// Custom lower build_vector of v16i8.
8981static SDValue LowerBuildVectorv16i8(SDValue Op, const APInt &NonZeroMask,
8982 unsigned NumNonZero, unsigned NumZero,
8983 SelectionDAG &DAG,
8984 const X86Subtarget &Subtarget) {
8985 if (NumNonZero > 8 && !Subtarget.hasSSE41())
8986 return SDValue();
8987
8988 // SSE4.1 - use PINSRB to insert each byte directly.
8989 if (Subtarget.hasSSE41())
8990 return LowerBuildVectorAsInsert(Op, NonZeroMask, NumNonZero, NumZero, DAG,
8991 Subtarget);
8992
8993 SDLoc dl(Op);
8994 SDValue V;
8995
8996 // Pre-SSE4.1 - merge byte pairs and insert with PINSRW.
8997 for (unsigned i = 0; i < 16; i += 2) {
8998 bool ThisIsNonZero = NonZeroMask[i];
8999 bool NextIsNonZero = NonZeroMask[i + 1];
9000 if (!ThisIsNonZero && !NextIsNonZero)
9001 continue;
9002
9003 // FIXME: Investigate combining the first 4 bytes as a i32 instead.
9004 SDValue Elt;
9005 if (ThisIsNonZero) {
9006 if (NumZero || NextIsNonZero)
9007 Elt = DAG.getZExtOrTrunc(Op.getOperand(i), dl, MVT::i32);
9008 else
9009 Elt = DAG.getAnyExtOrTrunc(Op.getOperand(i), dl, MVT::i32);
9010 }
9011
9012 if (NextIsNonZero) {
9013 SDValue NextElt = Op.getOperand(i + 1);
9014 if (i == 0 && NumZero)
9015 NextElt = DAG.getZExtOrTrunc(NextElt, dl, MVT::i32);
9016 else
9017 NextElt = DAG.getAnyExtOrTrunc(NextElt, dl, MVT::i32);
9018 NextElt = DAG.getNode(ISD::SHL, dl, MVT::i32, NextElt,
9019 DAG.getConstant(8, dl, MVT::i8));
9020 if (ThisIsNonZero)
9021 Elt = DAG.getNode(ISD::OR, dl, MVT::i32, NextElt, Elt);
9022 else
9023 Elt = NextElt;
9024 }
9025
9026 // If our first insertion is not the first index or zeros are needed, then
9027 // insert into zero vector. Otherwise, use SCALAR_TO_VECTOR (leaves high
9028 // elements undefined).
9029 if (!V) {
9030 if (i != 0 || NumZero)
9031 V = getZeroVector(MVT::v8i16, Subtarget, DAG, dl);
9032 else {
9033 V = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, Elt);
9034 V = DAG.getBitcast(MVT::v8i16, V);
9035 continue;
9036 }
9037 }
9038 Elt = DAG.getNode(ISD::TRUNCATE, dl, MVT::i16, Elt);
9039 V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, V, Elt,
9040 DAG.getIntPtrConstant(i / 2, dl));
9041 }
9042
9043 return DAG.getBitcast(MVT::v16i8, V);
9044}
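// The pre-SSE4.1 path above packs two adjacent bytes into one 16-bit lane so
// a single PINSRW can insert both: the odd byte is shifted left by 8 and OR'd
// with the even byte. With bytes 0x12 (even index) and 0x34 (odd index), the
// inserted 16-bit value is 0x3412. A trivial standalone check with made-up
// values:
#include <cassert>
#include <cstdint>

int main() {
  const uint8_t Lo = 0x12, Hi = 0x34;
  const uint16_t Packed = static_cast<uint16_t>((uint32_t(Hi) << 8) | Lo);
  assert(Packed == 0x3412);
  return 0;
}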
9045
9046/// Custom lower build_vector of v8i16.
9047static SDValue LowerBuildVectorv8i16(SDValue Op, const APInt &NonZeroMask,
9048 unsigned NumNonZero, unsigned NumZero,
9049 SelectionDAG &DAG,
9050 const X86Subtarget &Subtarget) {
9051 if (NumNonZero > 4 && !Subtarget.hasSSE41())
9052 return SDValue();
9053
9054 // Use PINSRW to insert each byte directly.
9055 return LowerBuildVectorAsInsert(Op, NonZeroMask, NumNonZero, NumZero, DAG,
9056 Subtarget);
9057}
9058
9059/// Custom lower build_vector of v4i32 or v4f32.
9060static SDValue LowerBuildVectorv4x32(SDValue Op, SelectionDAG &DAG,
9061 const X86Subtarget &Subtarget) {
9062 // If this is a splat of a pair of elements, use MOVDDUP (unless the target
9063 // has XOP; in that case defer lowering to potentially use VPERMIL2PS).
9064 // Because we're creating a less complicated build vector here, we may enable
9065 // further folding of the MOVDDUP via shuffle transforms.
9066 if (Subtarget.hasSSE3() && !Subtarget.hasXOP() &&
9067 Op.getOperand(0) == Op.getOperand(2) &&
9068 Op.getOperand(1) == Op.getOperand(3) &&
9069 Op.getOperand(0) != Op.getOperand(1)) {
9070 SDLoc DL(Op);
9071 MVT VT = Op.getSimpleValueType();
9072 MVT EltVT = VT.getVectorElementType();
9073 // Create a new build vector with the first 2 elements followed by undef
9074 // padding, bitcast to v2f64, duplicate, and bitcast back.
9075 SDValue Ops[4] = { Op.getOperand(0), Op.getOperand(1),
9076 DAG.getUNDEF(EltVT), DAG.getUNDEF(EltVT) };
9077 SDValue NewBV = DAG.getBitcast(MVT::v2f64, DAG.getBuildVector(VT, DL, Ops));
9078 SDValue Dup = DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v2f64, NewBV);
9079 return DAG.getBitcast(VT, Dup);
9080 }
9081
9082 // Find all zeroable elements.
9083 std::bitset<4> Zeroable, Undefs;
9084 for (int i = 0; i < 4; ++i) {
9085 SDValue Elt = Op.getOperand(i);
9086 Undefs[i] = Elt.isUndef();
9087 Zeroable[i] = (Elt.isUndef() || X86::isZeroNode(Elt));
9088 }
9089 assert(Zeroable.size() - Zeroable.count() > 1 &&
9090        "We expect at least two non-zero elements!");
9091
9092 // We only know how to deal with build_vector nodes where elements are either
9093 // zeroable or extract_vector_elt with constant index.
9094 SDValue FirstNonZero;
9095 unsigned FirstNonZeroIdx;
9096 for (unsigned i = 0; i < 4; ++i) {
9097 if (Zeroable[i])
9098 continue;
9099 SDValue Elt = Op.getOperand(i);
9100 if (Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
9101 !isa<ConstantSDNode>(Elt.getOperand(1)))
9102 return SDValue();
9103 // Make sure that this node is extracting from a 128-bit vector.
9104 MVT VT = Elt.getOperand(0).getSimpleValueType();
9105 if (!VT.is128BitVector())
9106 return SDValue();
9107 if (!FirstNonZero.getNode()) {
9108 FirstNonZero = Elt;
9109 FirstNonZeroIdx = i;
9110 }
9111 }
9112
9113 assert(FirstNonZero.getNode() && "Unexpected build vector of all zeros!");
9114 SDValue V1 = FirstNonZero.getOperand(0);
9115 MVT VT = V1.getSimpleValueType();
9116
9117 // See if this build_vector can be lowered as a blend with zero.
9118 SDValue Elt;
9119 unsigned EltMaskIdx, EltIdx;
9120 int Mask[4];
9121 for (EltIdx = 0; EltIdx < 4; ++EltIdx) {
9122 if (Zeroable[EltIdx]) {
9123 // The zero vector will be on the right hand side.
9124 Mask[EltIdx] = EltIdx+4;
9125 continue;
9126 }
9127
9128 Elt = Op->getOperand(EltIdx);
9129 // By construction, Elt is a EXTRACT_VECTOR_ELT with constant index.
9130 EltMaskIdx = Elt.getConstantOperandVal(1);
9131 if (Elt.getOperand(0) != V1 || EltMaskIdx != EltIdx)
9132 break;
9133 Mask[EltIdx] = EltIdx;
9134 }
9135
9136 if (EltIdx == 4) {
9137 // Let the shuffle legalizer deal with blend operations.
9138 SDValue VZeroOrUndef = (Zeroable == Undefs)
9139 ? DAG.getUNDEF(VT)
9140 : getZeroVector(VT, Subtarget, DAG, SDLoc(Op));
9141 if (V1.getSimpleValueType() != VT)
9142 V1 = DAG.getBitcast(VT, V1);
9143 return DAG.getVectorShuffle(VT, SDLoc(V1), V1, VZeroOrUndef, Mask);
9144 }
9145
9146 // See if we can lower this build_vector to a INSERTPS.
9147 if (!Subtarget.hasSSE41())
9148 return SDValue();
9149
9150 SDValue V2 = Elt.getOperand(0);
9151 if (Elt == FirstNonZero && EltIdx == FirstNonZeroIdx)
9152 V1 = SDValue();
9153
9154 bool CanFold = true;
9155 for (unsigned i = EltIdx + 1; i < 4 && CanFold; ++i) {
9156 if (Zeroable[i])
9157 continue;
9158
9159 SDValue Current = Op->getOperand(i);
9160 SDValue SrcVector = Current->getOperand(0);
9161 if (!V1.getNode())
9162 V1 = SrcVector;
9163 CanFold = (SrcVector == V1) && (Current.getConstantOperandAPInt(1) == i);
9164 }
9165
9166 if (!CanFold)
9167 return SDValue();
9168
9169 assert(V1.getNode() && "Expected at least two non-zero elements!");
9170 if (V1.getSimpleValueType() != MVT::v4f32)
9171 V1 = DAG.getBitcast(MVT::v4f32, V1);
9172 if (V2.getSimpleValueType() != MVT::v4f32)
9173 V2 = DAG.getBitcast(MVT::v4f32, V2);
9174
9175 // Ok, we can emit an INSERTPS instruction.
9176 unsigned ZMask = Zeroable.to_ulong();
9177
9178 unsigned InsertPSMask = EltMaskIdx << 6 | EltIdx << 4 | ZMask;
9179 assert((InsertPSMask & ~0xFFu) == 0 && "Invalid mask!");
9180 SDLoc DL(Op);
9181 SDValue Result = DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32, V1, V2,
9182 DAG.getIntPtrConstant(InsertPSMask, DL, true));
9183 return DAG.getBitcast(VT, Result);
9184}
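// The INSERTPS immediate assembled above packs three fields into one byte,
// following the SSE4.1 INSERTPS encoding: bits [7:6] select the source
// element, bits [5:4] the destination element, and bits [3:0] the zero mask.
// For instance, copying source element 2 into destination element 1 while
// zeroing destination elements 0 and 3 yields 0x99. Standalone check with
// made-up indices:
#include <cassert>

int main() {
  const unsigned EltMaskIdx = 2;  // source element (count_s)
  const unsigned EltIdx = 1;      // destination element (count_d)
  const unsigned ZMask = 0x9;     // zero destination elements 0 and 3
  const unsigned InsertPSMask = EltMaskIdx << 6 | EltIdx << 4 | ZMask;
  assert(InsertPSMask == 0x99);
  return 0;
}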
9185
9186/// Return a vector logical shift node.
9187static SDValue getVShift(bool isLeft, EVT VT, SDValue SrcOp, unsigned NumBits,
9188 SelectionDAG &DAG, const TargetLowering &TLI,
9189 const SDLoc &dl) {
9190 assert(VT.is128BitVector() && "Unknown type for VShift");
9191 MVT ShVT = MVT::v16i8;
9192 unsigned Opc = isLeft ? X86ISD::VSHLDQ : X86ISD::VSRLDQ;
9193 SrcOp = DAG.getBitcast(ShVT, SrcOp);
9194 assert(NumBits % 8 == 0 && "Only support byte sized shifts");
9195 SDValue ShiftVal = DAG.getTargetConstant(NumBits / 8, dl, MVT::i8);
9196 return DAG.getBitcast(VT, DAG.getNode(Opc, dl, ShVT, SrcOp, ShiftVal));
9197}
9198
9199static SDValue LowerAsSplatVectorLoad(SDValue SrcOp, MVT VT, const SDLoc &dl,
9200 SelectionDAG &DAG) {
9201
9202 // Check if the scalar load can be widened into a vector load, and if
9203 // the address is "base + cst", see if the cst can be "absorbed" into
9204 // the shuffle mask.
9205 if (LoadSDNode *LD = dyn_cast<LoadSDNode>(SrcOp)) {
9206 SDValue Ptr = LD->getBasePtr();
9207 if (!ISD::isNormalLoad(LD) || !LD->isSimple())
9208 return SDValue();
9209 EVT PVT = LD->getValueType(0);
9210 if (PVT != MVT::i32 && PVT != MVT::f32)
9211 return SDValue();
9212
9213 int FI = -1;
9214 int64_t Offset = 0;
9215 if (FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr)) {
9216 FI = FINode->getIndex();
9217 Offset = 0;
9218 } else if (DAG.isBaseWithConstantOffset(Ptr) &&
9219 isa<FrameIndexSDNode>(Ptr.getOperand(0))) {
9220 FI = cast<FrameIndexSDNode>(Ptr.getOperand(0))->getIndex();
9221 Offset = Ptr.getConstantOperandVal(1);
9222 Ptr = Ptr.getOperand(0);
9223 } else {
9224 return SDValue();
9225 }
9226
9227 // FIXME: 256-bit vector instructions don't require a strict alignment,
9228 // improve this code to support it better.
9229 Align RequiredAlign(VT.getSizeInBits() / 8);
9230 SDValue Chain = LD->getChain();
9231 // Make sure the stack object alignment is at least 16 or 32.
9232 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
9233 MaybeAlign InferredAlign = DAG.InferPtrAlign(Ptr);
9234 if (!InferredAlign || *InferredAlign < RequiredAlign) {
9235 if (MFI.isFixedObjectIndex(FI)) {
9236 // Can't change the alignment. FIXME: It's possible to compute
9237 // the exact stack offset and reference FI + adjust offset instead.
9239 // If someone *really* cares about this, that's the way to implement it.
9239 return SDValue();
9240 } else {
9241 MFI.setObjectAlignment(FI, RequiredAlign);
9242 }
9243 }
9244
9245 // (Offset % 16 or 32) must be a multiple of 4. The address is then
9246 // Ptr + (Offset & ~15).
9247 if (Offset < 0)
9248 return SDValue();
9249 if ((Offset % RequiredAlign.value()) & 3)
9250 return SDValue();
9251 int64_t StartOffset = Offset & ~int64_t(RequiredAlign.value() - 1);
9252 if (StartOffset) {
9253 SDLoc DL(Ptr);
9254 Ptr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr,
9255 DAG.getConstant(StartOffset, DL, Ptr.getValueType()));
9256 }
9257
9258 int EltNo = (Offset - StartOffset) >> 2;
9259 unsigned NumElems = VT.getVectorNumElements();
9260
9261 EVT NVT = EVT::getVectorVT(*DAG.getContext(), PVT, NumElems);
9262 SDValue V1 = DAG.getLoad(NVT, dl, Chain, Ptr,
9263 LD->getPointerInfo().getWithOffset(StartOffset));
9264
9265 SmallVector<int, 8> Mask(NumElems, EltNo);
9266
9267 return DAG.getVectorShuffle(NVT, dl, V1, DAG.getUNDEF(NVT), Mask);
9268 }
9269
9270 return SDValue();
9271}
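// A rough worked example (hypothetical frame index and offset): splatting an
// i32 loaded from FI+8 into a v4i32 gives RequiredAlign = 16, StartOffset =
// 8 & ~15 = 0 and EltNo = (8 - 0) >> 2 = 2, i.e. a v4i32 load from the base
// pointer followed by a <2,2,2,2> splat shuffle that absorbs the +8 offset
// into the mask.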
9272
9273 // Recurse to find a LoadSDNode source and the accumulated ByteOffset.
9274static bool findEltLoadSrc(SDValue Elt, LoadSDNode *&Ld, int64_t &ByteOffset) {
9275 if (ISD::isNON_EXTLoad(Elt.getNode())) {
9276 auto *BaseLd = cast<LoadSDNode>(Elt);
9277 if (!BaseLd->isSimple())
9278 return false;
9279 Ld = BaseLd;
9280 ByteOffset = 0;
9281 return true;
9282 }
9283
9284 switch (Elt.getOpcode()) {
9285 case ISD::BITCAST:
9286 case ISD::TRUNCATE:
9287 case ISD::SCALAR_TO_VECTOR:
9288 return findEltLoadSrc(Elt.getOperand(0), Ld, ByteOffset);
9289 case ISD::SRL:
9290 if (auto *AmtC = dyn_cast<ConstantSDNode>(Elt.getOperand(1))) {
9291 uint64_t Amt = AmtC->getZExtValue();
9292 if ((Amt % 8) == 0 && findEltLoadSrc(Elt.getOperand(0), Ld, ByteOffset)) {
9293 ByteOffset += Amt / 8;
9294 return true;
9295 }
9296 }
9297 break;
9298 case ISD::EXTRACT_VECTOR_ELT:
9299 if (auto *IdxC = dyn_cast<ConstantSDNode>(Elt.getOperand(1))) {
9300 SDValue Src = Elt.getOperand(0);
9301 unsigned SrcSizeInBits = Src.getScalarValueSizeInBits();
9302 unsigned DstSizeInBits = Elt.getScalarValueSizeInBits();
9303 if (DstSizeInBits == SrcSizeInBits && (SrcSizeInBits % 8) == 0 &&
9304 findEltLoadSrc(Src, Ld, ByteOffset)) {
9305 uint64_t Idx = IdxC->getZExtValue();
9306 ByteOffset += Idx * (SrcSizeInBits / 8);
9307 return true;
9308 }
9309 }
9310 break;
9311 }
9312
9313 return false;
9314}
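// For illustration (hypothetical node): findEltLoadSrc on
// (i32 (trunc (srl (i64 load X), 32))) looks through the TRUNCATE, adds
// 32 / 8 = 4 at the SRL, and returns the i64 load as Ld with ByteOffset = 4.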
9315
9316/// Given the initializing elements 'Elts' of a vector of type 'VT', see if the
9317/// elements can be replaced by a single large load which has the same value as
9318/// a build_vector or insert_subvector whose loaded operands are 'Elts'.
9319///
9320/// Example: <load i32 *a, load i32 *a+4, zero, undef> -> zextload a
9321static SDValue EltsFromConsecutiveLoads(EVT VT, ArrayRef<SDValue> Elts,
9322 const SDLoc &DL, SelectionDAG &DAG,
9323 const X86Subtarget &Subtarget,
9324 bool IsAfterLegalize) {
9325 if ((VT.getScalarSizeInBits() % 8) != 0)
9326 return SDValue();
9327
9328 unsigned NumElems = Elts.size();
9329
9330 int LastLoadedElt = -1;
9331 APInt LoadMask = APInt::getZero(NumElems);
9332 APInt ZeroMask = APInt::getZero(NumElems);
9333 APInt UndefMask = APInt::getZero(NumElems);
9334
9335 SmallVector<LoadSDNode*, 8> Loads(NumElems, nullptr);
9336 SmallVector<int64_t, 8> ByteOffsets(NumElems, 0);
9337
9338 // For each element in the initializer, see if we've found a load, zero or an
9339 // undef.
9340 for (unsigned i = 0; i < NumElems; ++i) {
9341 SDValue Elt = peekThroughBitcasts(Elts[i]);
9342 if (!Elt.getNode())
9343 return SDValue();
9344 if (Elt.isUndef()) {
9345 UndefMask.setBit(i);
9346 continue;
9347 }
9348 if (X86::isZeroNode(Elt) || ISD::isBuildVectorAllZeros(Elt.getNode())) {
9349 ZeroMask.setBit(i);
9350 continue;
9351 }
9352
9353 // Each loaded element must be the correct fractional portion of the
9354 // requested vector load.
9355 unsigned EltSizeInBits = Elt.getValueSizeInBits();
9356 if ((NumElems * EltSizeInBits) != VT.getSizeInBits())
9357 return SDValue();
9358
9359 if (!findEltLoadSrc(Elt, Loads[i], ByteOffsets[i]) || ByteOffsets[i] < 0)
9360 return SDValue();
9361 unsigned LoadSizeInBits = Loads[i]->getValueSizeInBits(0);
9362 if (((ByteOffsets[i] * 8) + EltSizeInBits) > LoadSizeInBits)
9363 return SDValue();
9364
9365 LoadMask.setBit(i);
9366 LastLoadedElt = i;
9367 }
9368 assert((ZeroMask.countPopulation() + UndefMask.countPopulation() +
9369 LoadMask.countPopulation()) == NumElems &&
9370 "Incomplete element masks");
9371
9372 // Handle Special Cases - all undef or undef/zero.
9373 if (UndefMask.countPopulation() == NumElems)
9374 return DAG.getUNDEF(VT);
9375 if ((ZeroMask.countPopulation() + UndefMask.countPopulation()) == NumElems)
9376 return VT.isInteger() ? DAG.getConstant(0, DL, VT)
9377 : DAG.getConstantFP(0.0, DL, VT);
9378
9379 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
9380 int FirstLoadedElt = LoadMask.countTrailingZeros();
9381 SDValue EltBase = peekThroughBitcasts(Elts[FirstLoadedElt]);
9382 EVT EltBaseVT = EltBase.getValueType();
9383 assert(EltBaseVT.getSizeInBits() == EltBaseVT.getStoreSizeInBits() &&
9384 "Register/Memory size mismatch");
9385 LoadSDNode *LDBase = Loads[FirstLoadedElt];
9386 assert(LDBase && "Did not find base load for merging consecutive loads");
9387 unsigned BaseSizeInBits = EltBaseVT.getStoreSizeInBits();
9388 unsigned BaseSizeInBytes = BaseSizeInBits / 8;
9389 int NumLoadedElts = (1 + LastLoadedElt - FirstLoadedElt);
9390 int LoadSizeInBits = NumLoadedElts * BaseSizeInBits;
9391 assert((BaseSizeInBits % 8) == 0 && "Sub-byte element loads detected");
9392
9393 // TODO: Support offsetting the base load.
9394 if (ByteOffsets[FirstLoadedElt] != 0)
9395 return SDValue();
9396
9397 // Check to see if the element's load is consecutive to the base load
9398 // or offset from a previous (already checked) load.
9399 auto CheckConsecutiveLoad = [&](LoadSDNode *Base, int EltIdx) {
9400 LoadSDNode *Ld = Loads[EltIdx];
9401 int64_t ByteOffset = ByteOffsets[EltIdx];
9402 if (ByteOffset && (ByteOffset % BaseSizeInBytes) == 0) {
9403 int64_t BaseIdx = EltIdx - (ByteOffset / BaseSizeInBytes);
9404 return (0 <= BaseIdx && BaseIdx < (int)NumElems && LoadMask[BaseIdx] &&
9405 Loads[BaseIdx] == Ld && ByteOffsets[BaseIdx] == 0);
9406 }
9407 return DAG.areNonVolatileConsecutiveLoads(Ld, Base, BaseSizeInBytes,
9408 EltIdx - FirstLoadedElt);
9409 };
9410
9411 // Consecutive loads can contain UNDEFS but not ZERO elements.
9412 // Consecutive loads with UNDEF and ZERO elements require an
9413 // additional shuffle stage to clear the ZERO elements.
9414 bool IsConsecutiveLoad = true;
9415 bool IsConsecutiveLoadWithZeros = true;
9416 for (int i = FirstLoadedElt + 1; i <= LastLoadedElt; ++i) {
9417 if (LoadMask[i]) {
9418 if (!CheckConsecutiveLoad(LDBase, i)) {
9419 IsConsecutiveLoad = false;
9420 IsConsecutiveLoadWithZeros = false;
9421 break;
9422 }
9423 } else if (ZeroMask[i]) {
9424 IsConsecutiveLoad = false;
9425 }
9426 }
9427
9428 auto CreateLoad = [&DAG, &DL, &Loads](EVT VT, LoadSDNode *LDBase) {
9429 auto MMOFlags = LDBase->getMemOperand()->getFlags();
9430 assert(LDBase->isSimple() &&
9431 "Cannot merge volatile or atomic loads.");
9432 SDValue NewLd =
9433 DAG.getLoad(VT, DL, LDBase->getChain(), LDBase->getBasePtr(),
9434 LDBase->getPointerInfo(), LDBase->getOriginalAlign(),
9435 MMOFlags);
9436 for (auto *LD : Loads)
9437 if (LD)
9438 DAG.makeEquivalentMemoryOrdering(LD, NewLd);
9439 return NewLd;
9440 };
9441
9442 // Check if the base load is entirely dereferenceable.
9443 bool IsDereferenceable = LDBase->getPointerInfo().isDereferenceable(
9444 VT.getSizeInBits() / 8, *DAG.getContext(), DAG.getDataLayout());
9445
9446 // LOAD - all consecutive load/undefs (must start/end with a load or be
9447 // entirely dereferenceable). If we have found an entire vector of loads and
9448 // undefs, then return a large load of the entire vector width starting at the
9449 // base pointer. If the vector contains zeros, then attempt to shuffle those
9450 // elements.
9451 if (FirstLoadedElt == 0 &&
9452 (NumLoadedElts == (int)NumElems || IsDereferenceable) &&
9453 (IsConsecutiveLoad || IsConsecutiveLoadWithZeros)) {
9454 if (IsAfterLegalize && !TLI.isOperationLegal(ISD::LOAD, VT))
9455 return SDValue();
9456
9457 // Don't create 256-bit non-temporal aligned loads without AVX2 as these
9458 // will lower to regular temporal loads and use the cache.
9459 if (LDBase->isNonTemporal() && LDBase->getAlign() >= Align(32) &&
9460 VT.is256BitVector() && !Subtarget.hasInt256())
9461 return SDValue();
9462
9463 if (NumElems == 1)
9464 return DAG.getBitcast(VT, Elts[FirstLoadedElt]);
9465
9466 if (!ZeroMask)
9467 return CreateLoad(VT, LDBase);
9468
9469 // IsConsecutiveLoadWithZeros - we need to create a shuffle of the loaded
9470 // vector and a zero vector to clear out the zero elements.
9471 if (!IsAfterLegalize && VT.isVector()) {
9472 unsigned NumMaskElts = VT.getVectorNumElements();
9473 if ((NumMaskElts % NumElems) == 0) {
9474 unsigned Scale = NumMaskElts / NumElems;
9475 SmallVector<int, 4> ClearMask(NumMaskElts, -1);
9476 for (unsigned i = 0; i < NumElems; ++i) {
9477 if (UndefMask[i])
9478 continue;
9479 int Offset = ZeroMask[i] ? NumMaskElts : 0;
9480 for (unsigned j = 0; j != Scale; ++j)
9481 ClearMask[(i * Scale) + j] = (i * Scale) + j + Offset;
9482 }
9483 SDValue V = CreateLoad(VT, LDBase);
9484 SDValue Z = VT.isInteger() ? DAG.getConstant(0, DL, VT)
9485 : DAG.getConstantFP(0.0, DL, VT);
9486 return DAG.getVectorShuffle(VT, DL, V, Z, ClearMask);
9487 }
9488 }
9489 }
9490
9491 // If the upper half of a ymm/zmm load is undef then just load the lower half.
9492 if (VT.is256BitVector() || VT.is512BitVector()) {
9493 unsigned HalfNumElems = NumElems / 2;
9494 if (UndefMask.extractBits(HalfNumElems, HalfNumElems).isAllOnes()) {
9495 EVT HalfVT =
9496 EVT::getVectorVT(*DAG.getContext(), VT.getScalarType(), HalfNumElems);
9497 SDValue HalfLD =
9498 EltsFromConsecutiveLoads(HalfVT, Elts.drop_back(HalfNumElems), DL,
9499 DAG, Subtarget, IsAfterLegalize);
9500 if (HalfLD)
9501 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT),
9502 HalfLD, DAG.getIntPtrConstant(0, DL));
9503 }
9504 }
9505
9506 // VZEXT_LOAD - consecutive 32/64-bit load/undefs followed by zeros/undefs.
9507 if (IsConsecutiveLoad && FirstLoadedElt == 0 &&
9508 ((LoadSizeInBits == 16 && Subtarget.hasFP16()) || LoadSizeInBits == 32 ||
9509 LoadSizeInBits == 64) &&
9510 ((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()))) {
9511 MVT VecSVT = VT.isFloatingPoint() ? MVT::getFloatingPointVT(LoadSizeInBits)
9512 : MVT::getIntegerVT(LoadSizeInBits);
9513 MVT VecVT = MVT::getVectorVT(VecSVT, VT.getSizeInBits() / LoadSizeInBits);
9514 // Allow v4f32 on SSE1 only targets.
9515 // FIXME: Add more isel patterns so we can just use VT directly.
9516 if (!Subtarget.hasSSE2() && VT == MVT::v4f32)
9517 VecVT = MVT::v4f32;
9518 if (TLI.isTypeLegal(VecVT)) {
9519 SDVTList Tys = DAG.getVTList(VecVT, MVT::Other);
9520 SDValue Ops[] = { LDBase->getChain(), LDBase->getBasePtr() };
9521 SDValue ResNode = DAG.getMemIntrinsicNode(
9522 X86ISD::VZEXT_LOAD, DL, Tys, Ops, VecSVT, LDBase->getPointerInfo(),
9523 LDBase->getOriginalAlign(), MachineMemOperand::MOLoad);
9524 for (auto *LD : Loads)
9525 if (LD)
9526 DAG.makeEquivalentMemoryOrdering(LD, ResNode);
9527 return DAG.getBitcast(VT, ResNode);
9528 }
9529 }
9530
9531 // BROADCAST - match the smallest possible repetition pattern, load that
9532 // scalar/subvector element and then broadcast to the entire vector.
9533 if (ZeroMask.isZero() && isPowerOf2_32(NumElems) && Subtarget.hasAVX() &&
9534 (VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector())) {
9535 for (unsigned SubElems = 1; SubElems < NumElems; SubElems *= 2) {
9536 unsigned RepeatSize = SubElems * BaseSizeInBits;
9537 unsigned ScalarSize = std::min(RepeatSize, 64u);
9538 if (!Subtarget.hasAVX2() && ScalarSize < 32)
9539 continue;
9540
9541 // Don't attempt a 1:N subvector broadcast - it should be caught by
9542 // combineConcatVectorOps, else it will cause infinite loops.
9543 if (RepeatSize > ScalarSize && SubElems == 1)
9544 continue;
9545
9546 bool Match = true;
9547 SmallVector<SDValue, 8> RepeatedLoads(SubElems, DAG.getUNDEF(EltBaseVT));
9548 for (unsigned i = 0; i != NumElems && Match; ++i) {
9549 if (!LoadMask[i])
9550 continue;
9551 SDValue Elt = peekThroughBitcasts(Elts[i]);
9552 if (RepeatedLoads[i % SubElems].isUndef())
9553 RepeatedLoads[i % SubElems] = Elt;
9554 else
9555 Match &= (RepeatedLoads[i % SubElems] == Elt);
9556 }
9557
9558 // We must have loads at both ends of the repetition.
9559 Match &= !RepeatedLoads.front().isUndef();
9560 Match &= !RepeatedLoads.back().isUndef();
9561 if (!Match)
9562 continue;
9563
9564 EVT RepeatVT =
9565 VT.isInteger() && (RepeatSize != 64 || TLI.isTypeLegal(MVT::i64))
9566 ? EVT::getIntegerVT(*DAG.getContext(), ScalarSize)
9567 : EVT::getFloatingPointVT(ScalarSize);
9568 if (RepeatSize > ScalarSize)
9569 RepeatVT = EVT::getVectorVT(*DAG.getContext(), RepeatVT,
9570 RepeatSize / ScalarSize);
9571 EVT BroadcastVT =
9572 EVT::getVectorVT(*DAG.getContext(), RepeatVT.getScalarType(),
9573 VT.getSizeInBits() / ScalarSize);
9574 if (TLI.isTypeLegal(BroadcastVT)) {
9575 if (SDValue RepeatLoad = EltsFromConsecutiveLoads(
9576 RepeatVT, RepeatedLoads, DL, DAG, Subtarget, IsAfterLegalize)) {
9577 SDValue Broadcast = RepeatLoad;
9578 if (RepeatSize > ScalarSize) {
9579 while (Broadcast.getValueSizeInBits() < VT.getSizeInBits())
9580 Broadcast = concatSubVectors(Broadcast, Broadcast, DAG, DL);
9581 } else {
9582 if (!Subtarget.hasAVX2() &&
9583 !X86::mayFoldLoadIntoBroadcastFromMem(
9584 RepeatLoad, RepeatVT.getScalarType().getSimpleVT(),
9585 Subtarget,
9586 /*AssumeSingleUse=*/true))
9587 return SDValue();
9588 Broadcast =
9589 DAG.getNode(X86ISD::VBROADCAST, DL, BroadcastVT, RepeatLoad);
9590 }
9591 return DAG.getBitcast(VT, Broadcast);
9592 }
9593 }
9594 }
9595 }
9596
9597 return SDValue();
9598}
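// A rough worked example (hypothetical loads): for a v4i32 build_vector of
// (load i32 *a), (load i32 *a+4), zero, zero where the full 16-byte load is not
// provably dereferenceable, the function falls through to the VZEXT_LOAD path:
// LoadSizeInBits = 2 * 32 = 64, so it emits an X86ISD::VZEXT_LOAD of i64 as
// v2i64 and bitcasts the result back to v4i32.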
9599
9600 // Combine a vector op (shuffles etc.) that is equal to build_vector load1,
9601// load2, load3, load4, <0, 1, 2, 3> into a vector load if the load addresses
9602// are consecutive, non-overlapping, and in the right order.
9603static SDValue combineToConsecutiveLoads(EVT VT, SDValue Op, const SDLoc &DL,
9604 SelectionDAG &DAG,
9605 const X86Subtarget &Subtarget,
9606 bool IsAfterLegalize) {
9607 SmallVector<SDValue, 64> Elts;
9608 for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; ++i) {
9609 if (SDValue Elt = getShuffleScalarElt(Op, i, DAG, 0)) {
9610 Elts.push_back(Elt);
9611 continue;
9612 }
9613 return SDValue();
9614 }
9615 assert(Elts.size() == VT.getVectorNumElements());
9616 return EltsFromConsecutiveLoads(VT, Elts, DL, DAG, Subtarget,
9617 IsAfterLegalize);
9618}
9619
9620static Constant *getConstantVector(MVT VT, const APInt &SplatValue,
9621 unsigned SplatBitSize, LLVMContext &C) {
9622 unsigned ScalarSize = VT.getScalarSizeInBits();
9623 unsigned NumElm = SplatBitSize / ScalarSize;
9624
9625 SmallVector<Constant *, 32> ConstantVec;
9626 for (unsigned i = 0; i < NumElm; i++) {
9627 APInt Val = SplatValue.extractBits(ScalarSize, ScalarSize * i);
9628 Constant *Const;
9629 if (VT.isFloatingPoint()) {
9630 if (ScalarSize == 16) {
9631 Const = ConstantFP::get(C, APFloat(APFloat::IEEEhalf(), Val));
9632 } else if (ScalarSize == 32) {
9633 Const = ConstantFP::get(C, APFloat(APFloat::IEEEsingle(), Val));
9634 } else {
9635 assert(ScalarSize == 64 && "Unsupported floating point scalar size");
9636 Const = ConstantFP::get(C, APFloat(APFloat::IEEEdouble(), Val));
9637 }
9638 } else
9639 Const = Constant::getIntegerValue(Type::getIntNTy(C, ScalarSize), Val);
9640 ConstantVec.push_back(Const);
9641 }
9642 return ConstantVector::get(ArrayRef<Constant *>(ConstantVec));
9643}
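// A rough worked example (hypothetical splat): for a v8i32 type with
// SplatValue = 0x0000000100000002 and SplatBitSize = 64, ScalarSize = 32 and
// NumElm = 2, so the low bits are extracted first and the result is the
// two-element constant <i32 2, i32 1> that later gets broadcast.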
9644
9645static bool isFoldableUseOfShuffle(SDNode *N) {
9646 for (auto *U : N->uses()) {
9647 unsigned Opc = U->getOpcode();
9648 // VPERMV/VPERMV3 shuffles can never fold their index operands.
9649 if (Opc == X86ISD::VPERMV && U->getOperand(0).getNode() == N)
9650 return false;
9651 if (Opc == X86ISD::VPERMV3 && U->getOperand(1).getNode() == N)
9652 return false;
9653 if (isTargetShuffle(Opc))
9654 return true;
9655 if (Opc == ISD::BITCAST) // Ignore bitcasts
9656 return isFoldableUseOfShuffle(U);
9657 if (N->hasOneUse()) {
9658 // TODO: There may be some general way to know if an SDNode can
9659 // be folded. We now only know whether an MI is foldable.
9660 if (Opc == X86ISD::VPDPBUSD && U->getOperand(2).getNode() != N)
9661 return false;
9662 return true;
9663 }
9664 }
9665 return false;
9666}
9667
9668/// Attempt to use the vbroadcast instruction to generate a splat value
9669/// from a splat BUILD_VECTOR which uses:
9670/// a. A single scalar load, or a constant.
9671/// b. Repeated pattern of constants (e.g. <0,1,0,1> or <0,1,2,3,0,1,2,3>).
9672///
9673/// The VBROADCAST node is returned when a pattern is found,
9674/// or SDValue() otherwise.
9675static SDValue lowerBuildVectorAsBroadcast(BuildVectorSDNode *BVOp,
9676 const X86Subtarget &Subtarget,
9677 SelectionDAG &DAG) {
9678 // VBROADCAST requires AVX.
9679 // TODO: Splats could be generated for non-AVX CPUs using SSE
9680 // instructions, but there's less potential gain for only 128-bit vectors.
9681 if (!Subtarget.hasAVX())
9682 return SDValue();
9683
9684 MVT VT = BVOp->getSimpleValueType(0);
9685 unsigned NumElts = VT.getVectorNumElements();
9686 SDLoc dl(BVOp);
9687
9688 assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()) &&
9689 "Unsupported vector type for broadcast.");
9690
9691 // See if the build vector is a repeating sequence of scalars (inc. splat).
9692 SDValue Ld;
9693 BitVector UndefElements;
9694 SmallVector<SDValue, 16> Sequence;
9695 if (BVOp->getRepeatedSequence(Sequence, &UndefElements)) {
9696 assert((NumElts % Sequence.size()) == 0 && "Sequence doesn't fit.");
9697 if (Sequence.size() == 1)
9698 Ld = Sequence[0];
9699 }
9700
9701 // Attempt to use VBROADCASTM
9702 // From this pattern:
9703 // a. t0 = (zext_i64 (bitcast_i8 v2i1 X))
9704 // b. t1 = (build_vector t0 t0)
9705 //
9706 // Create (VBROADCASTM v2i1 X)
9707 if (!Sequence.empty() && Subtarget.hasCDI()) {
9708 // If not a splat, are the upper sequence values zeroable?
9709 unsigned SeqLen = Sequence.size();
9710 bool UpperZeroOrUndef =
9711 SeqLen == 1 ||
9712 llvm::all_of(ArrayRef(Sequence).drop_front(), [](SDValue V) {
9713 return !V || V.isUndef() || isNullConstant(V);
9714 });
9715 SDValue Op0 = Sequence[0];
9716 if (UpperZeroOrUndef && ((Op0.getOpcode() == ISD::BITCAST) ||
9717 (Op0.getOpcode() == ISD::ZERO_EXTEND &&
9718 Op0.getOperand(0).getOpcode() == ISD::BITCAST))) {
9719 SDValue BOperand = Op0.getOpcode() == ISD::BITCAST
9720 ? Op0.getOperand(0)
9721 : Op0.getOperand(0).getOperand(0);
9722 MVT MaskVT = BOperand.getSimpleValueType();
9723 MVT EltType = MVT::getIntegerVT(VT.getScalarSizeInBits() * SeqLen);
9724 if ((EltType == MVT::i64 && MaskVT == MVT::v8i1) || // for broadcastmb2q
9725 (EltType == MVT::i32 && MaskVT == MVT::v16i1)) { // for broadcastmw2d
9726 MVT BcstVT = MVT::getVectorVT(EltType, NumElts / SeqLen);
9727 if (!VT.is512BitVector() && !Subtarget.hasVLX()) {
9728 unsigned Scale = 512 / VT.getSizeInBits();
9729 BcstVT = MVT::getVectorVT(EltType, Scale * (NumElts / SeqLen));
9730 }
9731 SDValue Bcst = DAG.getNode(X86ISD::VBROADCASTM, dl, BcstVT, BOperand);
9732 if (BcstVT.getSizeInBits() != VT.getSizeInBits())
9733 Bcst = extractSubVector(Bcst, 0, DAG, dl, VT.getSizeInBits());
9734 return DAG.getBitcast(VT, Bcst);
9735 }
9736 }
9737 }
9738
9739 unsigned NumUndefElts = UndefElements.count();
9740 if (!Ld || (NumElts - NumUndefElts) <= 1) {
9741 APInt SplatValue, Undef;
9742 unsigned SplatBitSize;
9743 bool HasUndef;
9744 // Check if this is a repeated constant pattern suitable for broadcasting.
9745 if (BVOp->isConstantSplat(SplatValue, Undef, SplatBitSize, HasUndef) &&
9746 SplatBitSize > VT.getScalarSizeInBits() &&
9747 SplatBitSize < VT.getSizeInBits()) {
9748 // Avoid replacing with broadcast when it's a use of a shuffle
9749 // instruction to preserve the present custom lowering of shuffles.
9750 if (isFoldableUseOfShuffle(BVOp))
9751 return SDValue();
9752 // replace BUILD_VECTOR with broadcast of the repeated constants.
9753 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
9754 LLVMContext *Ctx = DAG.getContext();
9755 MVT PVT = TLI.getPointerTy(DAG.getDataLayout());
9756 if (Subtarget.hasAVX()) {
9757 if (SplatBitSize == 32 || SplatBitSize == 64 ||
9758 (SplatBitSize < 32 && Subtarget.hasAVX2())) {
9759 // Splatted value can fit in one INTEGER constant in constant pool.
9760 // Load the constant and broadcast it.
9761 MVT CVT = MVT::getIntegerVT(SplatBitSize);
9762 Type *ScalarTy = Type::getIntNTy(*Ctx, SplatBitSize);
9763 Constant *C = Constant::getIntegerValue(ScalarTy, SplatValue);
9764 SDValue CP = DAG.getConstantPool(C, PVT);
9765 unsigned Repeat = VT.getSizeInBits() / SplatBitSize;
9766
9767 Align Alignment = cast<ConstantPoolSDNode>(CP)->getAlign();
9768 SDVTList Tys =
9769 DAG.getVTList(MVT::getVectorVT(CVT, Repeat), MVT::Other);
9770 SDValue Ops[] = {DAG.getEntryNode(), CP};
9771 MachinePointerInfo MPI =
9772 MachinePointerInfo::getConstantPool(DAG.getMachineFunction());
9773 SDValue Brdcst = DAG.getMemIntrinsicNode(
9774 X86ISD::VBROADCAST_LOAD, dl, Tys, Ops, CVT, MPI, Alignment,
9775 MachineMemOperand::MOLoad);
9776 return DAG.getBitcast(VT, Brdcst);
9777 }
9778 if (SplatBitSize > 64) {
9779 // Load the vector of constants and broadcast it.
9780 Constant *VecC = getConstantVector(VT, SplatValue, SplatBitSize,
9781 *Ctx);
9782 SDValue VCP = DAG.getConstantPool(VecC, PVT);
9783 unsigned NumElm = SplatBitSize / VT.getScalarSizeInBits();
9784 MVT VVT = MVT::getVectorVT(VT.getScalarType(), NumElm);
9785 Align Alignment = cast<ConstantPoolSDNode>(VCP)->getAlign();
9786 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
9787 SDValue Ops[] = {DAG.getEntryNode(), VCP};
9788 MachinePointerInfo MPI =
9789 MachinePointerInfo::getConstantPool(DAG.getMachineFunction());
9790 return DAG.getMemIntrinsicNode(
9791 X86ISD::SUBV_BROADCAST_LOAD, dl, Tys, Ops, VVT, MPI, Alignment,
9792 MachineMemOperand::MOLoad);
9793 }
9794 }
9795 }
9796
9797 // If we are moving a scalar into a vector (Ld must be set and all elements
9798 // but 1 are undef) and that operation is not obviously supported by
9799 // vmovd/vmovq/vmovss/vmovsd, then keep trying to form a broadcast.
9800 // That's better than general shuffling and may eliminate a load to GPR and
9801 // move from scalar to vector register.
9802 if (!Ld || NumElts - NumUndefElts != 1)
9803 return SDValue();
9804 unsigned ScalarSize = Ld.getValueSizeInBits();
9805 if (!(UndefElements[0] || (ScalarSize != 32 && ScalarSize != 64)))
9806 return SDValue();
9807 }
9808
9809 bool ConstSplatVal =
9810 (Ld.getOpcode() == ISD::Constant || Ld.getOpcode() == ISD::ConstantFP);
9811 bool IsLoad = ISD::isNormalLoad(Ld.getNode());
9812
9813 // TODO: Handle broadcasts of non-constant sequences.
9814
9815 // Make sure that all of the users of a non-constant load are from the
9816 // BUILD_VECTOR node.
9817 // FIXME: Is the use count needed for non-constant, non-load case?
9818 if (!ConstSplatVal && !IsLoad && !BVOp->isOnlyUserOf(Ld.getNode()))
9819 return SDValue();
9820
9821 unsigned ScalarSize = Ld.getValueSizeInBits();
9822 bool IsGE256 = (VT.getSizeInBits() >= 256);
9823
9824 // When optimizing for size, generate up to 5 extra bytes for a broadcast
9825 // instruction to save 8 or more bytes of constant pool data.
9826 // TODO: If multiple splats are generated to load the same constant,
9827 // it may be detrimental to overall size. There needs to be a way to detect
9828 // that condition to know if this is truly a size win.
9829 bool OptForSize = DAG.shouldOptForSize();
9830
9831 // Handle broadcasting a single constant scalar from the constant pool
9832 // into a vector.
9833 // On Sandybridge (no AVX2), it is still better to load a constant vector
9834 // from the constant pool and not to broadcast it from a scalar.
9835 // But override that restriction when optimizing for size.
9836 // TODO: Check if splatting is recommended for other AVX-capable CPUs.
9837 if (ConstSplatVal && (Subtarget.hasAVX2() || OptForSize)) {
9838 EVT CVT = Ld.getValueType();
9839 assert(!CVT.isVector() && "Must not broadcast a vector type");
9840
9841 // Splat f16, f32, i32, v4f64, v4i64 in all cases with AVX2.
9842 // For size optimization, also splat v2f64 and v2i64, and for size opt
9843 // with AVX2, also splat i8 and i16.
9844 // With pattern matching, the VBROADCAST node may become a VMOVDDUP.
9845 if (ScalarSize == 32 ||
9846 (ScalarSize == 64 && (IsGE256 || Subtarget.hasVLX())) ||
9847 CVT == MVT::f16 ||
9848 (OptForSize && (ScalarSize == 64 || Subtarget.hasAVX2()))) {
9849 const Constant *C = nullptr;
9850 if (ConstantSDNode *CI = dyn_cast<ConstantSDNode>(Ld))
9851 C = CI->getConstantIntValue();
9852 else if (ConstantFPSDNode *CF = dyn_cast<ConstantFPSDNode>(Ld))
9853 C = CF->getConstantFPValue();
9854
9855 assert(C && "Invalid constant type");
9856
9857 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
9858 SDValue CP =
9859 DAG.getConstantPool(C, TLI.getPointerTy(DAG.getDataLayout()));
9860 Align Alignment = cast<ConstantPoolSDNode>(CP)->getAlign();
9861
9862 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
9863 SDValue Ops[] = {DAG.getEntryNode(), CP};
9864 MachinePointerInfo MPI =
9865 MachinePointerInfo::getConstantPool(DAG.getMachineFunction());
9866 return DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, dl, Tys, Ops, CVT,
9867 MPI, Alignment, MachineMemOperand::MOLoad);
9868 }
9869 }
9870
9871 // Handle AVX2 in-register broadcasts.
9872 if (!IsLoad && Subtarget.hasInt256() &&
9873 (ScalarSize == 32 || (IsGE256 && ScalarSize == 64)))
9874 return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
9875
9876 // The scalar source must be a normal load.
9877 if (!IsLoad)
9878 return SDValue();
9879
9880 // Make sure the non-chain result is only used by this build vector.
9881 if (!Ld->hasNUsesOfValue(NumElts - NumUndefElts, 0))
9882 return SDValue();
9883
9884 if (ScalarSize == 32 || (IsGE256 && ScalarSize == 64) ||
9885 (Subtarget.hasVLX() && ScalarSize == 64)) {
9886 auto *LN = cast<LoadSDNode>(Ld);
9887 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
9888 SDValue Ops[] = {LN->getChain(), LN->getBasePtr()};
9889 SDValue BCast =
9890 DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, dl, Tys, Ops,
9891 LN->getMemoryVT(), LN->getMemOperand());
9892 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BCast.getValue(1));
9893 return BCast;
9894 }
9895
9896 // The integer check is needed for the 64-bit into 128-bit case so it doesn't
9897 // match double, since there is no vbroadcastsd xmm.
9898 if (Subtarget.hasInt256() && Ld.getValueType().isInteger() &&
9899 (ScalarSize == 8 || ScalarSize == 16 || ScalarSize == 64)) {
9900 auto *LN = cast<LoadSDNode>(Ld);
9901 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
9902 SDValue Ops[] = {LN->getChain(), LN->getBasePtr()};
9903 SDValue BCast =
9904 DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, dl, Tys, Ops,
9905 LN->getMemoryVT(), LN->getMemOperand());
9906 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BCast.getValue(1));
9907 return BCast;
9908 }
9909
9910 if (ScalarSize == 16 && Subtarget.hasFP16() && IsGE256)
9911 return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
9912
9913 // Unsupported broadcast.
9914 return SDValue();
9915}
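// For illustration (hypothetical constant): with AVX2, a v8i32 build_vector
// splatting the constant 42 takes the ConstSplatVal path above; the scalar
// i32 42 is placed in the constant pool and the node is lowered to an
// X86ISD::VBROADCAST_LOAD of i32, saving constant pool space compared with a
// full 32-byte constant vector.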
9916
9917/// For an EXTRACT_VECTOR_ELT with a constant index return the real
9918/// underlying vector and index.
9919///
9920/// Modifies \p ExtractedFromVec to the real vector and returns the real
9921/// index.
9922static int getUnderlyingExtractedFromVec(SDValue &ExtractedFromVec,
9923 SDValue ExtIdx) {
9924 int Idx = cast<ConstantSDNode>(ExtIdx)->getZExtValue();
9925 if (!isa<ShuffleVectorSDNode>(ExtractedFromVec))
9926 return Idx;
9927
9928 // For 256-bit vectors, LowerEXTRACT_VECTOR_ELT_SSE4 may have already
9929 // lowered this:
9930 // (extract_vector_elt (v8f32 %1), Constant<6>)
9931 // to:
9932 // (extract_vector_elt (vector_shuffle<2,u,u,u>
9933 // (extract_subvector (v8f32 %0), Constant<4>),
9934 // undef)
9935 // Constant<0>)
9936 // In this case the vector is the extract_subvector expression and the index
9937 // is 2, as specified by the shuffle.
9938 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(ExtractedFromVec);
9939 SDValue ShuffleVec = SVOp->getOperand(0);
9940 MVT ShuffleVecVT = ShuffleVec.getSimpleValueType();
9941 assert(ShuffleVecVT.getVectorElementType() ==
9942 ExtractedFromVec.getSimpleValueType().getVectorElementType());
9943
9944 int ShuffleIdx = SVOp->getMaskElt(Idx);
9945 if (isUndefOrInRange(ShuffleIdx, 0, ShuffleVecVT.getVectorNumElements())) {
9946 ExtractedFromVec = ShuffleVec;
9947 return ShuffleIdx;
9948 }
9949 return Idx;
9950}
9951
9952static SDValue buildFromShuffleMostly(SDValue Op, SelectionDAG &DAG) {
9953 MVT VT = Op.getSimpleValueType();
9954
9955 // Skip if insert_vec_elt is not supported.
9956 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
9957 if (!TLI.isOperationLegalOrCustom(ISD::INSERT_VECTOR_ELT, VT))
9958 return SDValue();
9959
9960 SDLoc DL(Op);
9961 unsigned NumElems = Op.getNumOperands();
9962
9963 SDValue VecIn1;
9964 SDValue VecIn2;
9965 SmallVector<unsigned, 4> InsertIndices;
9966 SmallVector<int, 8> Mask(NumElems, -1);
9967
9968 for (unsigned i = 0; i != NumElems; ++i) {
9969 unsigned Opc = Op.getOperand(i).getOpcode();
9970
9971 if (Opc == ISD::UNDEF)
9972 continue;
9973
9974 if (Opc != ISD::EXTRACT_VECTOR_ELT) {
9975 // Quit if more than 1 element needs inserting.
9976 if (InsertIndices.size() > 1)
9977 return SDValue();
9978
9979 InsertIndices.push_back(i);
9980 continue;
9981 }
9982
9983 SDValue ExtractedFromVec = Op.getOperand(i).getOperand(0);
9984 SDValue ExtIdx = Op.getOperand(i).getOperand(1);
9985
9986 // Quit if non-constant index.
9987 if (!isa<ConstantSDNode>(ExtIdx))
9988 return SDValue();
9989 int Idx = getUnderlyingExtractedFromVec(ExtractedFromVec, ExtIdx);
9990
9991 // Quit if extracted from vector of different type.
9992 if (ExtractedFromVec.getValueType() != VT)
9993 return SDValue();
9994
9995 if (!VecIn1.getNode())
9996 VecIn1 = ExtractedFromVec;
9997 else if (VecIn1 != ExtractedFromVec) {
9998 if (!VecIn2.getNode())
9999 VecIn2 = ExtractedFromVec;
10000 else if (VecIn2 != ExtractedFromVec)
10001 // Quit if more than 2 vectors to shuffle
10002 return SDValue();
10003 }
10004
10005 if (ExtractedFromVec == VecIn1)
10006 Mask[i] = Idx;
10007 else if (ExtractedFromVec == VecIn2)
10008 Mask[i] = Idx + NumElems;
10009 }
10010
10011 if (!VecIn1.getNode())
10012 return SDValue();
10013
10014 VecIn2 = VecIn2.getNode() ? VecIn2 : DAG.getUNDEF(VT);
10015 SDValue NV = DAG.getVectorShuffle(VT, DL, VecIn1, VecIn2, Mask);
10016
10017 for (unsigned Idx : InsertIndices)
10018 NV = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VT, NV, Op.getOperand(Idx),
10019 DAG.getIntPtrConstant(Idx, DL));
10020
10021 return NV;
10022}
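// For illustration (hypothetical operands): a v4i32 build_vector of
// (extract_elt %A, 0), (extract_elt %A, 1), (extract_elt %A, 2), %s becomes a
// vector_shuffle<0,1,2,u> of %A followed by a single INSERT_VECTOR_ELT of %s
// at index 3.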
10023
10024// Lower BUILD_VECTOR operation for v8bf16, v16bf16 and v32bf16 types.
10025static SDValue LowerBUILD_VECTORvXbf16(SDValue Op, SelectionDAG &DAG,
10026 const X86Subtarget &Subtarget) {
10027 MVT VT = Op.getSimpleValueType();
10028 MVT IVT = VT.changeVectorElementTypeToInteger();
10029 SmallVector<SDValue, 16> NewOps;
10030 for (unsigned I = 0, E = Op.getNumOperands(); I != E; ++I)
10031 NewOps.push_back(DAG.getBitcast(MVT::i16, Op.getOperand(I)));
10032 SDValue Res = DAG.getNode(ISD::BUILD_VECTOR, SDLoc(), IVT, NewOps);
10033 return DAG.getBitcast(VT, Res);
10034}
10035
10036// Lower BUILD_VECTOR operation for v8i1 and v16i1 types.
10037static SDValue LowerBUILD_VECTORvXi1(SDValue Op, SelectionDAG &DAG,
10038 const X86Subtarget &Subtarget) {
10039
10040 MVT VT = Op.getSimpleValueType();
10041 assert((VT.getVectorElementType() == MVT::i1) &&
10042 "Unexpected type in LowerBUILD_VECTORvXi1!");
10043
10044 SDLoc dl(Op);
10045 if (ISD::isBuildVectorAllZeros(Op.getNode()) ||
10046 ISD::isBuildVectorAllOnes(Op.getNode()))
10047 return Op;
10048
10049 uint64_t Immediate = 0;
10050 SmallVector<unsigned, 16> NonConstIdx;
10051 bool IsSplat = true;
10052 bool HasConstElts = false;
10053 int SplatIdx = -1;
10054 for (unsigned idx = 0, e = Op.getNumOperands(); idx < e; ++idx) {
10055 SDValue In = Op.getOperand(idx);
10056 if (In.isUndef())
10057 continue;
10058 if (auto *InC = dyn_cast<ConstantSDNode>(In)) {
10059 Immediate |= (InC->getZExtValue() & 0x1) << idx;
10060 HasConstElts = true;
10061 } else {
10062 NonConstIdx.push_back(idx);
10063 }
10064 if (SplatIdx < 0)
10065 SplatIdx = idx;
10066 else if (In != Op.getOperand(SplatIdx))
10067 IsSplat = false;
10068 }
10069
10070 // For a splat, use "(select i1 splat_elt, all-ones, all-zeroes)".
10071 if (IsSplat) {
10072 // The build_vector allows the scalar element to be larger than the vector
10073 // element type. We need to mask it to use as a condition unless we know
10074 // the upper bits are zero.
10075 // FIXME: Use computeKnownBits instead of checking specific opcode?
10076 SDValue Cond = Op.getOperand(SplatIdx);
10077 assert(Cond.getValueType() == MVT::i8 && "Unexpected VT!");
10078 if (Cond.getOpcode() != ISD::SETCC)
10079 Cond = DAG.getNode(ISD::AND, dl, MVT::i8, Cond,
10080 DAG.getConstant(1, dl, MVT::i8));
10081
10082 // Perform the select in the scalar domain so we can use cmov.
10083 if (VT == MVT::v64i1 && !Subtarget.is64Bit()) {
10084 SDValue Select = DAG.getSelect(dl, MVT::i32, Cond,
10085 DAG.getAllOnesConstant(dl, MVT::i32),
10086 DAG.getConstant(0, dl, MVT::i32));
10087 Select = DAG.getBitcast(MVT::v32i1, Select);
10088 return DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v64i1, Select, Select);
10089 } else {
10090 MVT ImmVT = MVT::getIntegerVT(std::max((unsigned)VT.getSizeInBits(), 8U));
10091 SDValue Select = DAG.getSelect(dl, ImmVT, Cond,
10092 DAG.getAllOnesConstant(dl, ImmVT),
10093 DAG.getConstant(0, dl, ImmVT));
10094 MVT VecVT = VT.getSizeInBits() >= 8 ? VT : MVT::v8i1;
10095 Select = DAG.getBitcast(VecVT, Select);
10096 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, Select,
10097 DAG.getIntPtrConstant(0, dl));
10098 }
10099 }
10100
10101 // insert elements one by one
10102 SDValue DstVec;
10103 if (HasConstElts) {
10104 if (VT == MVT::v64i1 && !Subtarget.is64Bit()) {
10105 SDValue ImmL = DAG.getConstant(Lo_32(Immediate), dl, MVT::i32);
10106 SDValue ImmH = DAG.getConstant(Hi_32(Immediate), dl, MVT::i32);
10107 ImmL = DAG.getBitcast(MVT::v32i1, ImmL);
10108 ImmH = DAG.getBitcast(MVT::v32i1, ImmH);
10109 DstVec = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v64i1, ImmL, ImmH);
10110 } else {
10111 MVT ImmVT = MVT::getIntegerVT(std::max((unsigned)VT.getSizeInBits(), 8U));
10112 SDValue Imm = DAG.getConstant(Immediate, dl, ImmVT);
10113 MVT VecVT = VT.getSizeInBits() >= 8 ? VT : MVT::v8i1;
10114 DstVec = DAG.getBitcast(VecVT, Imm);
10115 DstVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, DstVec,
10116 DAG.getIntPtrConstant(0, dl));
10117 }
10118 } else
10119 DstVec = DAG.getUNDEF(VT);
10120
10121 for (unsigned i = 0, e = NonConstIdx.size(); i != e; ++i) {
10122 unsigned InsertIdx = NonConstIdx[i];
10123 DstVec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, DstVec,
10124 Op.getOperand(InsertIdx),
10125 DAG.getIntPtrConstant(InsertIdx, dl));
10126 }
10127 return DstVec;
10128}
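// A rough worked example (hypothetical constants): an all-constant v8i1
// build_vector of <1,0,1,1,0,0,0,0> yields Immediate = 0b00001101 = 0xD, which
// is materialized as an i8 constant and bitcast to v8i1 with no per-element
// inserts.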
10129
10130 LLVM_ATTRIBUTE_UNUSED static bool isHorizOp(unsigned Opcode) {
10131 switch (Opcode) {
10132 case X86ISD::PACKSS:
10133 case X86ISD::PACKUS:
10134 case X86ISD::FHADD:
10135 case X86ISD::FHSUB:
10136 case X86ISD::HADD:
10137 case X86ISD::HSUB:
10138 return true;
10139 }
10140 return false;
10141}
10142
10143/// This is a helper function of LowerToHorizontalOp().
10144/// This function checks that the build_vector \p N in input implements a
10145/// 128-bit partial horizontal operation on a 256-bit vector, but that operation
10146/// may not match the layout of an x86 256-bit horizontal instruction.
10147/// In other words, if this returns true, then some extraction/insertion will
10148/// be required to produce a valid horizontal instruction.
10149///
10150/// Parameter \p Opcode defines the kind of horizontal operation to match.
10151/// For example, if \p Opcode is equal to ISD::ADD, then this function
10152/// checks if \p N implements a horizontal arithmetic add; if instead \p Opcode
10153/// is equal to ISD::SUB, then this function checks if this is a horizontal
10154/// arithmetic sub.
10155///
10156/// This function only analyzes elements of \p N whose indices are
10157/// in range [BaseIdx, LastIdx).
10158///
10159/// TODO: This function was originally used to match both real and fake partial
10160/// horizontal operations, but the index-matching logic is incorrect for that.
10161/// See the corrected implementation in isHopBuildVector(). Can we reduce this
10162/// code because it is only used for partial h-op matching now?
10163static bool isHorizontalBinOpPart(const BuildVectorSDNode *N, unsigned Opcode,
10164 SelectionDAG &DAG,
10165 unsigned BaseIdx, unsigned LastIdx,
10166 SDValue &V0, SDValue &V1) {
10167 EVT VT = N->getValueType(0);
10168 assert(VT.is256BitVector() && "Only use for matching partial 256-bit h-ops");
10169 assert(BaseIdx * 2 <= LastIdx && "Invalid Indices in input!");
10170 assert(VT.isVector() && VT.getVectorNumElements() >= LastIdx &&
10171 "Invalid Vector in input!");
10172
10173 bool IsCommutable = (Opcode == ISD::ADD || Opcode == ISD::FADD);
10174 bool CanFold = true;
10175 unsigned ExpectedVExtractIdx = BaseIdx;
10176 unsigned NumElts = LastIdx - BaseIdx;
10177 V0 = DAG.getUNDEF(VT);
10178 V1 = DAG.getUNDEF(VT);
10179
10180 // Check if N implements a horizontal binop.
10181 for (unsigned i = 0, e = NumElts; i != e && CanFold; ++i) {
10182 SDValue Op = N->getOperand(i + BaseIdx);
10183
10184 // Skip UNDEFs.
10185 if (Op->isUndef()) {
10186 // Update the expected vector extract index.
10187 if (i * 2 == NumElts)
10188 ExpectedVExtractIdx = BaseIdx;
10189 ExpectedVExtractIdx += 2;
10190 continue;
10191 }
10192
10193 CanFold = Op->getOpcode() == Opcode && Op->hasOneUse();
10194
10195 if (!CanFold)
10196 break;
10197
10198 SDValue Op0 = Op.getOperand(0);
10199 SDValue Op1 = Op.getOperand(1);
10200
10201 // Try to match the following pattern:
10202 // (BINOP (extract_vector_elt A, I), (extract_vector_elt A, I+1))
10203 CanFold = (Op0.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
10204 Op1.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
10205 Op0.getOperand(0) == Op1.getOperand(0) &&
10206 isa<ConstantSDNode>(Op0.getOperand(1)) &&
10207 isa<ConstantSDNode>(Op1.getOperand(1)));
10208 if (!CanFold)
10209 break;
10210
10211 unsigned I0 = Op0.getConstantOperandVal(1);
10212 unsigned I1 = Op1.getConstantOperandVal(1);
10213
10214 if (i * 2 < NumElts) {
10215 if (V0.isUndef()) {
10216 V0 = Op0.getOperand(0);
10217 if (V0.getValueType() != VT)
10218 return false;
10219 }
10220 } else {
10221 if (V1.isUndef()) {
10222 V1 = Op0.getOperand(0);
10223 if (V1.getValueType() != VT)
10224 return false;
10225 }
10226 if (i * 2 == NumElts)
10227 ExpectedVExtractIdx = BaseIdx;
10228 }
10229
10230 SDValue Expected = (i * 2 < NumElts) ? V0 : V1;
10231 if (I0 == ExpectedVExtractIdx)
10232 CanFold = I1 == I0 + 1 && Op0.getOperand(0) == Expected;
10233 else if (IsCommutable && I1 == ExpectedVExtractIdx) {
10234 // Try to match the following dag sequence:
10235 // (BINOP (extract_vector_elt A, I+1), (extract_vector_elt A, I))
10236 CanFold = I0 == I1 + 1 && Op1.getOperand(0) == Expected;
10237 } else
10238 CanFold = false;
10239
10240 ExpectedVExtractIdx += 2;
10241 }
10242
10243 return CanFold;
10244}
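// For illustration (hypothetical input): with BaseIdx = 0 and LastIdx = 4 on a
// v8f32 build_vector, element i is expected to be
// (fadd (extract_elt A, 2*i), (extract_elt A, 2*i+1)), i.e. consecutive
// even/odd lanes of a single source vector, the 128-bit half of an x86 HADDPS.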
10245
10246/// Emit a sequence of two 128-bit horizontal add/sub followed by
10247/// a concat_vector.
10248///
10249/// This is a helper function of LowerToHorizontalOp().
10250/// This function expects two 256-bit vectors called V0 and V1.
10251/// At first, each vector is split into two separate 128-bit vectors.
10252/// Then, the resulting 128-bit vectors are used to implement two
10253/// horizontal binary operations.
10254///
10255/// The kind of horizontal binary operation is defined by \p X86Opcode.
10256///
10257/// \p Mode specifies how the 128-bit parts of V0 and V1 are passed in input to
10258/// the two new horizontal binop.
10259/// When Mode is set, the first horizontal binop dag node would take as input
10260/// the lower 128-bit of V0 and the upper 128-bit of V0. The second
10261/// horizontal binop dag node would take as input the lower 128-bit of V1
10262/// and the upper 128-bit of V1.
10263/// Example:
10264/// HADD V0_LO, V0_HI
10265/// HADD V1_LO, V1_HI
10266///
10267/// Otherwise, the first horizontal binop dag node takes as input the lower
10268/// 128-bit of V0 and the lower 128-bit of V1, and the second horizontal binop
10269/// dag node takes the upper 128-bit of V0 and the upper 128-bit of V1.
10270/// Example:
10271/// HADD V0_LO, V1_LO
10272/// HADD V0_HI, V1_HI
10273///
10274/// If \p isUndefLO is set, then the algorithm propagates UNDEF to the lower
10275/// 128-bits of the result. If \p isUndefHI is set, then UNDEF is propagated to
10276/// the upper 128-bits of the result.
10277static SDValue ExpandHorizontalBinOp(const SDValue &V0, const SDValue &V1,
10278 const SDLoc &DL, SelectionDAG &DAG,
10279 unsigned X86Opcode, bool Mode,
10280 bool isUndefLO, bool isUndefHI) {
10281 MVT VT = V0.getSimpleValueType();
10282 assert(VT.is256BitVector() && VT == V1.getSimpleValueType() &&
10283 "Invalid nodes in input!");
10284
10285 unsigned NumElts = VT.getVectorNumElements();
10286 SDValue V0_LO = extract128BitVector(V0, 0, DAG, DL);
10287 SDValue V0_HI = extract128BitVector(V0, NumElts/2, DAG, DL);
10288 SDValue V1_LO = extract128BitVector(V1, 0, DAG, DL);
10289 SDValue V1_HI = extract128BitVector(V1, NumElts/2, DAG, DL);
10290 MVT NewVT = V0_LO.getSimpleValueType();
10291
10292 SDValue LO = DAG.getUNDEF(NewVT);
10293 SDValue HI = DAG.getUNDEF(NewVT);
10294
10295 if (Mode) {
10296 // Don't emit a horizontal binop if the result is expected to be UNDEF.
10297 if (!isUndefLO && !V0->isUndef())
10298 LO = DAG.getNode(X86Opcode, DL, NewVT, V0_LO, V0_HI);
10299 if (!isUndefHI && !V1->isUndef())
10300 HI = DAG.getNode(X86Opcode, DL, NewVT, V1_LO, V1_HI);
10301 } else {
10302 // Don't emit a horizontal binop if the result is expected to be UNDEF.
10303 if (!isUndefLO && (!V0_LO->isUndef() || !V1_LO->isUndef()))
10304 LO = DAG.getNode(X86Opcode, DL, NewVT, V0_LO, V1_LO);
10305
10306 if (!isUndefHI && (!V0_HI->isUndef() || !V1_HI->isUndef()))
10307 HI = DAG.getNode(X86Opcode, DL, NewVT, V0_HI, V1_HI);
10308 }
10309
10310 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LO, HI);
10311}
10312
10313/// Returns true iff \p BV builds a vector with the result equivalent to
10314/// the result of ADDSUB/SUBADD operation.
10315/// If true is returned then the operands of ADDSUB = Opnd0 +- Opnd1
10316/// (SUBADD = Opnd0 -+ Opnd1) operation are written to the parameters
10317/// \p Opnd0 and \p Opnd1.
10318static bool isAddSubOrSubAdd(const BuildVectorSDNode *BV,
10319 const X86Subtarget &Subtarget, SelectionDAG &DAG,
10320 SDValue &Opnd0, SDValue &Opnd1,
10321 unsigned &NumExtracts,
10322 bool &IsSubAdd) {
10323
10324 MVT VT = BV->getSimpleValueType(0);
10325 if (!Subtarget.hasSSE3() || !VT.isFloatingPoint())
10326 return false;
10327
10328 unsigned NumElts = VT.getVectorNumElements();
10329 SDValue InVec0 = DAG.getUNDEF(VT);
10330 SDValue InVec1 = DAG.getUNDEF(VT);
10331
10332 NumExtracts = 0;
10333
10334 // Odd-numbered elements in the input build vector are obtained from
10335 // adding/subtracting two integer/float elements.
10336 // Even-numbered elements in the input build vector are obtained from
10337 // subtracting/adding two integer/float elements.
10338 unsigned Opc[2] = {0, 0};
10339 for (unsigned i = 0, e = NumElts; i != e; ++i) {
10340 SDValue Op = BV->getOperand(i);
10341
10342 // Skip 'undef' values.
10343 unsigned Opcode = Op.getOpcode();
10344 if (Opcode == ISD::UNDEF)
10345 continue;
10346
10347 // Early exit if we found an unexpected opcode.
10348 if (Opcode != ISD::FADD && Opcode != ISD::FSUB)
10349 return false;
10350
10351 SDValue Op0 = Op.getOperand(0);
10352 SDValue Op1 = Op.getOperand(1);
10353
10354 // Try to match the following pattern:
10355 // (BINOP (extract_vector_elt A, i), (extract_vector_elt B, i))
10356 // Early exit if we cannot match that sequence.
10357 if (Op0.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
10358 Op1.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
10359 !isa<ConstantSDNode>(Op0.getOperand(1)) ||
10360 Op0.getOperand(1) != Op1.getOperand(1))
10361 return false;
10362
10363 unsigned I0 = Op0.getConstantOperandVal(1);
10364 if (I0 != i)
10365 return false;
10366
10367 // We found a valid add/sub node; make sure it's the same opcode as previous
10368 // elements for this parity.
10369 if (Opc[i % 2] != 0 && Opc[i % 2] != Opcode)
10370 return false;
10371 Opc[i % 2] = Opcode;
10372
10373 // Update InVec0 and InVec1.
10374 if (InVec0.isUndef()) {
10375 InVec0 = Op0.getOperand(0);
10376 if (InVec0.getSimpleValueType() != VT)
10377 return false;
10378 }
10379 if (InVec1.isUndef()) {
10380 InVec1 = Op1.getOperand(0);
10381 if (InVec1.getSimpleValueType() != VT)
10382 return false;
10383 }
10384
10385 // Make sure that the operands of each add/sub node always
10386 // come from the same pair of vectors.
10387 if (InVec0 != Op0.getOperand(0)) {
10388 if (Opcode == ISD::FSUB)
10389 return false;
10390
10391 // FADD is commutable. Try to commute the operands
10392 // and then test again.
10393 std::swap(Op0, Op1);
10394 if (InVec0 != Op0.getOperand(0))
10395 return false;
10396 }
10397
10398 if (InVec1 != Op1.getOperand(0))
10399 return false;
10400
10401 // Increment the number of extractions done.
10402 ++NumExtracts;
10403 }
10404
10405 // Ensure we have found an opcode for both parities and that they are
10406 // different. Don't try to fold this build_vector into an ADDSUB/SUBADD if the
10407 // inputs are undef.
10408 if (!Opc[0] || !Opc[1] || Opc[0] == Opc[1] ||
10409 InVec0.isUndef() || InVec1.isUndef())
10410 return false;
10411
10412 IsSubAdd = Opc[0] == ISD::FADD;
10413
10414 Opnd0 = InVec0;
10415 Opnd1 = InVec1;
10416 return true;
10417}
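// A rough worked example (hypothetical operands): a v2f64 build_vector of
// (fsub (extract_elt %A, 0), (extract_elt %B, 0)) and
// (fadd (extract_elt %A, 1), (extract_elt %B, 1)) matches with
// Opc = {FSUB, FADD}, so IsSubAdd is false and the caller can form
// X86ISD::ADDSUB %A, %B.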
10418
10419 /// Returns true if it is possible to fold MUL and an idiom that has already been
10420/// recognized as ADDSUB/SUBADD(\p Opnd0, \p Opnd1) into
10421/// FMADDSUB/FMSUBADD(x, y, \p Opnd1). If (and only if) true is returned, the
10422/// operands of FMADDSUB/FMSUBADD are written to parameters \p Opnd0, \p Opnd1, \p Opnd2.
10423///
10424/// Prior to calling this function it should be known that there is some
10425/// SDNode that potentially can be replaced with an X86ISD::ADDSUB operation
10426/// using \p Opnd0 and \p Opnd1 as operands. Also, this method is called
10427/// before replacement of such SDNode with ADDSUB operation. Thus the number
10428/// of \p Opnd0 uses is expected to be equal to 2.
10429/// For example, this function may be called for the following IR:
10430/// %AB = fmul fast <2 x double> %A, %B
10431/// %Sub = fsub fast <2 x double> %AB, %C
10432/// %Add = fadd fast <2 x double> %AB, %C
10433/// %Addsub = shufflevector <2 x double> %Sub, <2 x double> %Add,
10434/// <2 x i32> <i32 0, i32 3>
10435/// There is a def for %Addsub here, which potentially can be replaced by
10436/// X86ISD::ADDSUB operation:
10437/// %Addsub = X86ISD::ADDSUB %AB, %C
10438/// and such ADDSUB can further be replaced with FMADDSUB:
10439/// %Addsub = FMADDSUB %A, %B, %C.
10440///
10441/// The main reason why this method is called before the replacement of the
10442/// recognized ADDSUB idiom with an ADDSUB operation is that such a replacement
10443/// is sometimes illegal. E.g. 512-bit ADDSUB is not available, while 512-bit
10444/// FMADDSUB is.
10445static bool isFMAddSubOrFMSubAdd(const X86Subtarget &Subtarget,
10446 SelectionDAG &DAG,
10447 SDValue &Opnd0, SDValue &Opnd1, SDValue &Opnd2,
10448 unsigned ExpectedUses) {
10449 if (Opnd0.getOpcode() != ISD::FMUL ||
10450 !Opnd0->hasNUsesOfValue(ExpectedUses, 0) || !Subtarget.hasAnyFMA())
10451 return false;
10452
10453 // FIXME: These checks must match the similar ones in
10454 // DAGCombiner::visitFADDForFMACombine. It would be good to have one
10455 // function that would answer if it is Ok to fuse MUL + ADD to FMADD
10456 // or MUL + ADDSUB to FMADDSUB.
10457 const TargetOptions &Options = DAG.getTarget().Options;
10458 bool AllowFusion =
10459 (Options.AllowFPOpFusion == FPOpFusion::Fast || Options.UnsafeFPMath);
10460 if (!AllowFusion)
10461 return false;
10462
10463 Opnd2 = Opnd1;
10464 Opnd1 = Opnd0.getOperand(1);
10465 Opnd0 = Opnd0.getOperand(0);
10466
10467 return true;
10468}
10469
10470/// Try to fold a build_vector that performs an 'addsub', 'fmaddsub', or
10471/// 'fsubadd' operation into the corresponding X86ISD::ADDSUB, X86ISD::FMADDSUB,
10472/// or X86ISD::FMSUBADD node.
10473static SDValue lowerToAddSubOrFMAddSub(const BuildVectorSDNode *BV,
10474 const X86Subtarget &Subtarget,
10475 SelectionDAG &DAG) {
10476 SDValue Opnd0, Opnd1;
10477 unsigned NumExtracts;
10478 bool IsSubAdd;
10479 if (!isAddSubOrSubAdd(BV, Subtarget, DAG, Opnd0, Opnd1, NumExtracts,
10480 IsSubAdd))
10481 return SDValue();
10482
10483 MVT VT = BV->getSimpleValueType(0);
10484 SDLoc DL(BV);
10485
10486 // Try to generate X86ISD::FMADDSUB node here.
10487 SDValue Opnd2;
10488 if (isFMAddSubOrFMSubAdd(Subtarget, DAG, Opnd0, Opnd1, Opnd2, NumExtracts)) {
10489 unsigned Opc = IsSubAdd ? X86ISD::FMSUBADD : X86ISD::FMADDSUB;
10490 return DAG.getNode(Opc, DL, VT, Opnd0, Opnd1, Opnd2);
10491 }
10492
10493 // We only support ADDSUB.
10494 if (IsSubAdd)
10495 return SDValue();
10496
10497 // There are no known X86 targets with 512-bit ADDSUB instructions!
10498 // Convert to blend(fsub,fadd).
10499 if (VT.is512BitVector()) {
10500 SmallVector<int> Mask;
10501 for (int I = 0, E = VT.getVectorNumElements(); I != E; I += 2) {
10502 Mask.push_back(I);
10503 Mask.push_back(I + E + 1);
10504 }
10505 SDValue Sub = DAG.getNode(ISD::FSUB, DL, VT, Opnd0, Opnd1);
10506 SDValue Add = DAG.getNode(ISD::FADD, DL, VT, Opnd0, Opnd1);
10507 return DAG.getVectorShuffle(VT, DL, Sub, Add, Mask);
10508 }
10509
10510 return DAG.getNode(X86ISD::ADDSUB, DL, VT, Opnd0, Opnd1);
10511}
10512
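// Illustrative sketch, not part of X86ISelLowering.cpp: the blend mask built
// by the 512-bit blend(fsub,fadd) fallback above, checked standalone for
// v8f64 (E = 8). Even result lanes come from the FSUB node and odd lanes from
// the FADD node, so the shuffle mask is {0, 9, 2, 11, 4, 13, 6, 15}. Helper
// names are invented for this example.
#include <cassert>
#include <vector>
namespace {
std::vector<int> addsubBlendMask(int E) {
  std::vector<int> Mask;
  for (int I = 0; I != E; I += 2) {
    Mask.push_back(I);         // lane I     <- FSUB result, element I
    Mask.push_back(I + E + 1); // lane I + 1 <- FADD result, element I + 1
  }
  return Mask;
}
void checkAddsubBlendMask() {
  assert((addsubBlendMask(8) == std::vector<int>{0, 9, 2, 11, 4, 13, 6, 15}));
}
} // namespace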
10513static bool isHopBuildVector(const BuildVectorSDNode *BV, SelectionDAG &DAG,
10514 unsigned &HOpcode, SDValue &V0, SDValue &V1) {
10515 // Initialize outputs to known values.
10516 MVT VT = BV->getSimpleValueType(0);
10517 HOpcode = ISD::DELETED_NODE;
10518 V0 = DAG.getUNDEF(VT);
10519 V1 = DAG.getUNDEF(VT);
10520
10521 // x86 256-bit horizontal ops are defined in a non-obvious way. Each 128-bit
10522 // half of the result is calculated independently from the 128-bit halves of
10523 // the inputs, so that makes the index-checking logic below more complicated.
10524 unsigned NumElts = VT.getVectorNumElements();
10525 unsigned GenericOpcode = ISD::DELETED_NODE;
10526 unsigned Num128BitChunks = VT.is256BitVector() ? 2 : 1;
10527 unsigned NumEltsIn128Bits = NumElts / Num128BitChunks;
10528 unsigned NumEltsIn64Bits = NumEltsIn128Bits / 2;
10529 for (unsigned i = 0; i != Num128BitChunks; ++i) {
10530 for (unsigned j = 0; j != NumEltsIn128Bits; ++j) {
10531 // Ignore undef elements.
10532 SDValue Op = BV->getOperand(i * NumEltsIn128Bits + j);
10533 if (Op.isUndef())
10534 continue;
10535
10536 // If there's an opcode mismatch, we're done.
10537 if (HOpcode != ISD::DELETED_NODE && Op.getOpcode() != GenericOpcode)
10538 return false;
10539
10540 // Initialize horizontal opcode.
10541 if (HOpcode == ISD::DELETED_NODE) {
10542 GenericOpcode = Op.getOpcode();
10543 switch (GenericOpcode) {
10544 case ISD::ADD: HOpcode = X86ISD::HADD; break;
10545 case ISD::SUB: HOpcode = X86ISD::HSUB; break;
10546 case ISD::FADD: HOpcode = X86ISD::FHADD; break;
10547 case ISD::FSUB: HOpcode = X86ISD::FHSUB; break;
10548 default: return false;
10549 }
10550 }
10551
10552 SDValue Op0 = Op.getOperand(0);
10553 SDValue Op1 = Op.getOperand(1);
10554 if (Op0.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
10555 Op1.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
10556 Op0.getOperand(0) != Op1.getOperand(0) ||
10557 !isa<ConstantSDNode>(Op0.getOperand(1)) ||
10558 !isa<ConstantSDNode>(Op1.getOperand(1)) || !Op.hasOneUse())
10559 return false;
10560
10561 // The source vector is chosen based on which 64-bit half of the
10562 // destination vector is being calculated.
10563 if (j < NumEltsIn64Bits) {
10564 if (V0.isUndef())
10565 V0 = Op0.getOperand(0);
10566 } else {
10567 if (V1.isUndef())
10568 V1 = Op0.getOperand(0);
10569 }
10570
10571 SDValue SourceVec = (j < NumEltsIn64Bits) ? V0 : V1;
10572 if (SourceVec != Op0.getOperand(0))
10573 return false;
10574
10575 // op (extract_vector_elt A, I), (extract_vector_elt A, I+1)
10576 unsigned ExtIndex0 = Op0.getConstantOperandVal(1);
10577 unsigned ExtIndex1 = Op1.getConstantOperandVal(1);
10578 unsigned ExpectedIndex = i * NumEltsIn128Bits +
10579 (j % NumEltsIn64Bits) * 2;
10580 if (ExpectedIndex == ExtIndex0 && ExtIndex1 == ExtIndex0 + 1)
10581 continue;
10582
10583 // If this is not a commutative op, this does not match.
10584 if (GenericOpcode != ISD::ADD && GenericOpcode != ISD::FADD)
10585 return false;
10586
10587 // Addition is commutative, so try swapping the extract indexes.
10588 // op (extract_vector_elt A, I+1), (extract_vector_elt A, I)
10589 if (ExpectedIndex == ExtIndex1 && ExtIndex0 == ExtIndex1 + 1)
10590 continue;
10591
10592 // Extract indexes do not match horizontal requirement.
10593 return false;
10594 }
10595 }
10596 // We matched. Opcode and operands are returned by reference as arguments.
10597 return true;
10598}
10599
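// Illustrative sketch, not part of X86ISelLowering.cpp: the extract indices
// the matcher above expects for a 256-bit v8f32 build_vector, i.e.
// Num128BitChunks = 2, NumEltsIn128Bits = 4, NumEltsIn64Bits = 2. Each result
// lane must be built from source elements (ExpectedIndex, ExpectedIndex + 1),
// where the first 64-bit half of each 128-bit chunk reads from V0 and the
// second half reads from V1. For v8f32 this enumerates: chunk 0 -> V0(0,1),
// V0(2,3), V1(0,1), V1(2,3); chunk 1 -> V0(4,5), V0(6,7), V1(4,5), V1(6,7).
// Printing and the helper name are just for the demo.
#include <cstdio>
namespace {
void dumpHopIndexPattern() {
  const unsigned Num128BitChunks = 2, NumEltsIn128Bits = 4, NumEltsIn64Bits = 2;
  for (unsigned i = 0; i != Num128BitChunks; ++i)
    for (unsigned j = 0; j != NumEltsIn128Bits; ++j) {
      unsigned ExpectedIndex = i * NumEltsIn128Bits + (j % NumEltsIn64Bits) * 2;
      std::printf("chunk %u, lane %u: %s elements (%u, %u)\n", i, j,
                  j < NumEltsIn64Bits ? "V0" : "V1", ExpectedIndex,
                  ExpectedIndex + 1);
    }
}
} // namespace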
10600static SDValue getHopForBuildVector(const BuildVectorSDNode *BV,
10601 SelectionDAG &DAG, unsigned HOpcode,
10602 SDValue V0, SDValue V1) {
10603 // If either input vector is not the same size as the build vector,
10604 // extract/insert the low bits to the correct size.
10605 // This is free (examples: zmm --> xmm, xmm --> ymm).
10606 MVT VT = BV->getSimpleValueType(0);
10607 unsigned Width = VT.getSizeInBits();
10608 if (V0.getValueSizeInBits() > Width)
10609 V0 = extractSubVector(V0, 0, DAG, SDLoc(BV), Width);
10610 else if (V0.getValueSizeInBits() < Width)
10611 V0 = insertSubVector(DAG.getUNDEF(VT), V0, 0, DAG, SDLoc(BV), Width);
10612
10613 if (V1.getValueSizeInBits() > Width)
10614 V1 = extractSubVector(V1, 0, DAG, SDLoc(BV), Width);
10615 else if (V1.getValueSizeInBits() < Width)
10616 V1 = insertSubVector(DAG.getUNDEF(VT), V1, 0, DAG, SDLoc(BV), Width);
10617
10618 unsigned NumElts = VT.getVectorNumElements();
10619 APInt DemandedElts = APInt::getAllOnes(NumElts);
10620 for (unsigned i = 0; i != NumElts; ++i)
10621 if (BV->getOperand(i).isUndef())
10622 DemandedElts.clearBit(i);
10623
10624 // If we don't need the upper xmm, then perform as a xmm hop.
10625 unsigned HalfNumElts = NumElts / 2;
10626 if (VT.is256BitVector() && DemandedElts.lshr(HalfNumElts) == 0) {
10627 MVT HalfVT = VT.getHalfNumVectorElementsVT();
10628 V0 = extractSubVector(V0, 0, DAG, SDLoc(BV), 128);
10629 V1 = extractSubVector(V1, 0, DAG, SDLoc(BV), 128);
10630 SDValue Half = DAG.getNode(HOpcode, SDLoc(BV), HalfVT, V0, V1);
10631 return insertSubVector(DAG.getUNDEF(VT), Half, 0, DAG, SDLoc(BV), 256);
10632 }
10633
10634 return DAG.getNode(HOpcode, SDLoc(BV), VT, V0, V1);
10635}
10636
10637/// Lower BUILD_VECTOR to a horizontal add/sub operation if possible.
10638static SDValue LowerToHorizontalOp(const BuildVectorSDNode *BV,
10639 const X86Subtarget &Subtarget,
10640 SelectionDAG &DAG) {
10641 // We need at least 2 non-undef elements to make this worthwhile by default.
10642 unsigned NumNonUndefs =
10643 count_if(BV->op_values(), [](SDValue V) { return !V.isUndef(); });
10644 if (NumNonUndefs < 2)
10645 return SDValue();
10646
10647 // There are 4 sets of horizontal math operations distinguished by type:
10648 // int/FP at 128-bit/256-bit. Each type was introduced with a different
10649 // subtarget feature. Try to match those "native" patterns first.
10650 MVT VT = BV->getSimpleValueType(0);
10651 if (((VT == MVT::v4f32 || VT == MVT::v2f64) && Subtarget.hasSSE3()) ||
10652 ((VT == MVT::v8i16 || VT == MVT::v4i32) && Subtarget.hasSSSE3()) ||
10653 ((VT == MVT::v8f32 || VT == MVT::v4f64) && Subtarget.hasAVX()) ||
10654 ((VT == MVT::v16i16 || VT == MVT::v8i32) && Subtarget.hasAVX2())) {
10655 unsigned HOpcode;
10656 SDValue V0, V1;
10657 if (isHopBuildVector(BV, DAG, HOpcode, V0, V1))
10658 return getHopForBuildVector(BV, DAG, HOpcode, V0, V1);
10659 }
10660
10661 // Try harder to match 256-bit ops by using extract/concat.
10662 if (!Subtarget.hasAVX() || !VT.is256BitVector())
10663 return SDValue();
10664
10665  // Count the number of UNDEF operands in the input build_vector.
10666 unsigned NumElts = VT.getVectorNumElements();
10667 unsigned Half = NumElts / 2;
10668 unsigned NumUndefsLO = 0;
10669 unsigned NumUndefsHI = 0;
10670 for (unsigned i = 0, e = Half; i != e; ++i)
10671 if (BV->getOperand(i)->isUndef())
10672 NumUndefsLO++;
10673
10674 for (unsigned i = Half, e = NumElts; i != e; ++i)
10675 if (BV->getOperand(i)->isUndef())
10676 NumUndefsHI++;
10677
10678 SDLoc DL(BV);
10679 SDValue InVec0, InVec1;
10680 if (VT == MVT::v8i32 || VT == MVT::v16i16) {
10681 SDValue InVec2, InVec3;
10682 unsigned X86Opcode;
10683 bool CanFold = true;
10684
10685 if (isHorizontalBinOpPart(BV, ISD::ADD, DAG, 0, Half, InVec0, InVec1) &&
10686 isHorizontalBinOpPart(BV, ISD::ADD, DAG, Half, NumElts, InVec2,
10687 InVec3) &&
10688 ((InVec0.isUndef() || InVec2.isUndef()) || InVec0 == InVec2) &&
10689 ((InVec1.isUndef() || InVec3.isUndef()) || InVec1 == InVec3))
10690 X86Opcode = X86ISD::HADD;
10691 else if (isHorizontalBinOpPart(BV, ISD::SUB, DAG, 0, Half, InVec0,
10692 InVec1) &&
10693 isHorizontalBinOpPart(BV, ISD::SUB, DAG, Half, NumElts, InVec2,
10694 InVec3) &&
10695 ((InVec0.isUndef() || InVec2.isUndef()) || InVec0 == InVec2) &&
10696 ((InVec1.isUndef() || InVec3.isUndef()) || InVec1 == InVec3))
10697 X86Opcode = X86ISD::HSUB;
10698 else
10699 CanFold = false;
10700
10701 if (CanFold) {
10702 // Do not try to expand this build_vector into a pair of horizontal
10703 // add/sub if we can emit a pair of scalar add/sub.
10704 if (NumUndefsLO + 1 == Half || NumUndefsHI + 1 == Half)
10705 return SDValue();
10706
10707 // Convert this build_vector into a pair of horizontal binops followed by
10708 // a concat vector. We must adjust the outputs from the partial horizontal
10709 // matching calls above to account for undefined vector halves.
10710 SDValue V0 = InVec0.isUndef() ? InVec2 : InVec0;
10711 SDValue V1 = InVec1.isUndef() ? InVec3 : InVec1;
10712      assert((!V0.isUndef() || !V1.isUndef()) && "Horizontal-op of undefs?");
10713 bool isUndefLO = NumUndefsLO == Half;
10714 bool isUndefHI = NumUndefsHI == Half;
10715 return ExpandHorizontalBinOp(V0, V1, DL, DAG, X86Opcode, false, isUndefLO,
10716 isUndefHI);
10717 }
10718 }
10719
10720 if (VT == MVT::v8f32 || VT == MVT::v4f64 || VT == MVT::v8i32 ||
10721 VT == MVT::v16i16) {
10722 unsigned X86Opcode;
10723 if (isHorizontalBinOpPart(BV, ISD::ADD, DAG, 0, NumElts, InVec0, InVec1))
10724 X86Opcode = X86ISD::HADD;
10725 else if (isHorizontalBinOpPart(BV, ISD::SUB, DAG, 0, NumElts, InVec0,
10726 InVec1))
10727 X86Opcode = X86ISD::HSUB;
10728 else if (isHorizontalBinOpPart(BV, ISD::FADD, DAG, 0, NumElts, InVec0,
10729 InVec1))
10730 X86Opcode = X86ISD::FHADD;
10731 else if (isHorizontalBinOpPart(BV, ISD::FSUB, DAG, 0, NumElts, InVec0,
10732 InVec1))
10733 X86Opcode = X86ISD::FHSUB;
10734 else
10735 return SDValue();
10736
10737 // Don't try to expand this build_vector into a pair of horizontal add/sub
10738 // if we can simply emit a pair of scalar add/sub.
10739 if (NumUndefsLO + 1 == Half || NumUndefsHI + 1 == Half)
10740 return SDValue();
10741
10742 // Convert this build_vector into two horizontal add/sub followed by
10743 // a concat vector.
10744 bool isUndefLO = NumUndefsLO == Half;
10745 bool isUndefHI = NumUndefsHI == Half;
10746 return ExpandHorizontalBinOp(InVec0, InVec1, DL, DAG, X86Opcode, true,
10747 isUndefLO, isUndefHI);
10748 }
10749
10750 return SDValue();
10751}
10752
10753static SDValue LowerShift(SDValue Op, const X86Subtarget &Subtarget,
10754 SelectionDAG &DAG);
10755
10756/// If a BUILD_VECTOR's source elements all apply the same bit operation and
10757/// one of their operands is constant, lower to a pair of BUILD_VECTORs and
10758/// just apply the bit operation to the vectors.
10759/// NOTE: It's not in our interest to start making a general-purpose vectorizer
10760/// from this, but enough scalar bit operations are created from the later
10761/// legalization + scalarization stages to need basic support.
10762static SDValue lowerBuildVectorToBitOp(BuildVectorSDNode *Op,
10763 const X86Subtarget &Subtarget,
10764 SelectionDAG &DAG) {
10765 SDLoc DL(Op);
10766 MVT VT = Op->getSimpleValueType(0);
10767 unsigned NumElems = VT.getVectorNumElements();
10768 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
10769
10770 // Check that all elements have the same opcode.
10771 // TODO: Should we allow UNDEFS and if so how many?
10772 unsigned Opcode = Op->getOperand(0).getOpcode();
10773 for (unsigned i = 1; i < NumElems; ++i)
10774 if (Opcode != Op->getOperand(i).getOpcode())
10775 return SDValue();
10776
10777 // TODO: We may be able to add support for other Ops (ADD/SUB + shifts).
10778 bool IsShift = false;
10779 switch (Opcode) {
10780 default:
10781 return SDValue();
10782 case ISD::SHL:
10783 case ISD::SRL:
10784 case ISD::SRA:
10785 IsShift = true;
10786 break;
10787 case ISD::AND:
10788 case ISD::XOR:
10789 case ISD::OR:
10790 // Don't do this if the buildvector is a splat - we'd replace one
10791 // constant with an entire vector.
10792 if (Op->getSplatValue())
10793 return SDValue();
10794 if (!TLI.isOperationLegalOrPromote(Opcode, VT))
10795 return SDValue();
10796 break;
10797 }
10798
10799 SmallVector<SDValue, 4> LHSElts, RHSElts;
10800 for (SDValue Elt : Op->ops()) {
10801 SDValue LHS = Elt.getOperand(0);
10802 SDValue RHS = Elt.getOperand(1);
10803
10804 // We expect the canonicalized RHS operand to be the constant.
10805 if (!isa<ConstantSDNode>(RHS))
10806 return SDValue();
10807
10808 // Extend shift amounts.
10809 if (RHS.getValueSizeInBits() != VT.getScalarSizeInBits()) {
10810 if (!IsShift)
10811 return SDValue();
10812 RHS = DAG.getZExtOrTrunc(RHS, DL, VT.getScalarType());
10813 }
10814
10815 LHSElts.push_back(LHS);
10816 RHSElts.push_back(RHS);
10817 }
10818
10819 // Limit to shifts by uniform immediates.
10820 // TODO: Only accept vXi8/vXi64 special cases?
10821 // TODO: Permit non-uniform XOP/AVX2/MULLO cases?
10822 if (IsShift && any_of(RHSElts, [&](SDValue V) { return RHSElts[0] != V; }))
10823 return SDValue();
10824
10825 SDValue LHS = DAG.getBuildVector(VT, DL, LHSElts);
10826 SDValue RHS = DAG.getBuildVector(VT, DL, RHSElts);
10827 SDValue Res = DAG.getNode(Opcode, DL, VT, LHS, RHS);
10828
10829 if (!IsShift)
10830 return Res;
10831
10832 // Immediately lower the shift to ensure the constant build vector doesn't
10833 // get converted to a constant pool before the shift is lowered.
10834 return LowerShift(Res, Subtarget, DAG);
10835}
10836
10837/// Create a vector constant without a load. SSE/AVX provide the bare minimum
10838/// functionality to do this, so it's all zeros, all ones, or some derivation
10839/// that is cheap to calculate.
10840static SDValue materializeVectorConstant(SDValue Op, SelectionDAG &DAG,
10841 const X86Subtarget &Subtarget) {
10842 SDLoc DL(Op);
10843 MVT VT = Op.getSimpleValueType();
10844
10845 // Vectors containing all zeros can be matched by pxor and xorps.
10846 if (ISD::isBuildVectorAllZeros(Op.getNode()))
10847 return Op;
10848
10849 // Vectors containing all ones can be matched by pcmpeqd on 128-bit width
10850 // vectors or broken into v4i32 operations on 256-bit vectors. AVX2 can use
10851 // vpcmpeqd on 256-bit vectors.
10852 if (Subtarget.hasSSE2() && ISD::isBuildVectorAllOnes(Op.getNode())) {
10853 if (VT == MVT::v4i32 || VT == MVT::v8i32 || VT == MVT::v16i32)
10854 return Op;
10855
10856 return getOnesVector(VT, DAG, DL);
10857 }
10858
10859 return SDValue();
10860}
10861
10862/// Look for opportunities to create a VPERMV/VPERMILPV/PSHUFB variable permute
10863/// from a vector of source values and a vector of extraction indices.
10864/// The vectors might be manipulated to match the type of the permute op.
10865static SDValue createVariablePermute(MVT VT, SDValue SrcVec, SDValue IndicesVec,
10866 SDLoc &DL, SelectionDAG &DAG,
10867 const X86Subtarget &Subtarget) {
10868 MVT ShuffleVT = VT;
10869 EVT IndicesVT = EVT(VT).changeVectorElementTypeToInteger();
10870 unsigned NumElts = VT.getVectorNumElements();
10871 unsigned SizeInBits = VT.getSizeInBits();
10872
10873 // Adjust IndicesVec to match VT size.
10874  assert(IndicesVec.getValueType().getVectorNumElements() >= NumElts &&
10875         "Illegal variable permute mask size");
10876 if (IndicesVec.getValueType().getVectorNumElements() > NumElts) {
10877 // Narrow/widen the indices vector to the correct size.
10878 if (IndicesVec.getValueSizeInBits() > SizeInBits)
10879 IndicesVec = extractSubVector(IndicesVec, 0, DAG, SDLoc(IndicesVec),
10880 NumElts * VT.getScalarSizeInBits());
10881 else if (IndicesVec.getValueSizeInBits() < SizeInBits)
10882 IndicesVec = widenSubVector(IndicesVec, false, Subtarget, DAG,
10883 SDLoc(IndicesVec), SizeInBits);
10884 // Zero-extend the index elements within the vector.
10885 if (IndicesVec.getValueType().getVectorNumElements() > NumElts)
10886 IndicesVec = DAG.getNode(ISD::ZERO_EXTEND_VECTOR_INREG, SDLoc(IndicesVec),
10887 IndicesVT, IndicesVec);
10888 }
10889 IndicesVec = DAG.getZExtOrTrunc(IndicesVec, SDLoc(IndicesVec), IndicesVT);
10890
10891  // Handle a SrcVec whose type doesn't match VT.
10892 if (SrcVec.getValueSizeInBits() != SizeInBits) {
10893 if ((SrcVec.getValueSizeInBits() % SizeInBits) == 0) {
10894 // Handle larger SrcVec by treating it as a larger permute.
10895 unsigned Scale = SrcVec.getValueSizeInBits() / SizeInBits;
10896 VT = MVT::getVectorVT(VT.getScalarType(), Scale * NumElts);
10897 IndicesVT = EVT(VT).changeVectorElementTypeToInteger();
10898 IndicesVec = widenSubVector(IndicesVT.getSimpleVT(), IndicesVec, false,
10899 Subtarget, DAG, SDLoc(IndicesVec));
10900 SDValue NewSrcVec =
10901 createVariablePermute(VT, SrcVec, IndicesVec, DL, DAG, Subtarget);
10902 if (NewSrcVec)
10903 return extractSubVector(NewSrcVec, 0, DAG, DL, SizeInBits);
10904 return SDValue();
10905 } else if (SrcVec.getValueSizeInBits() < SizeInBits) {
10906 // Widen smaller SrcVec to match VT.
10907 SrcVec = widenSubVector(VT, SrcVec, false, Subtarget, DAG, SDLoc(SrcVec));
10908 } else
10909 return SDValue();
10910 }
10911
10912 auto ScaleIndices = [&DAG](SDValue Idx, uint64_t Scale) {
10913    assert(isPowerOf2_64(Scale) && "Illegal variable permute shuffle scale");
10914 EVT SrcVT = Idx.getValueType();
10915 unsigned NumDstBits = SrcVT.getScalarSizeInBits() / Scale;
10916 uint64_t IndexScale = 0;
10917 uint64_t IndexOffset = 0;
10918
10919 // If we're scaling a smaller permute op, then we need to repeat the
10920 // indices, scaling and offsetting them as well.
10921 // e.g. v4i32 -> v16i8 (Scale = 4)
10922 // IndexScale = v4i32 Splat(4 << 24 | 4 << 16 | 4 << 8 | 4)
10923 // IndexOffset = v4i32 Splat(3 << 24 | 2 << 16 | 1 << 8 | 0)
10924 for (uint64_t i = 0; i != Scale; ++i) {
10925 IndexScale |= Scale << (i * NumDstBits);
10926 IndexOffset |= i << (i * NumDstBits);
10927 }
10928
10929 Idx = DAG.getNode(ISD::MUL, SDLoc(Idx), SrcVT, Idx,
10930 DAG.getConstant(IndexScale, SDLoc(Idx), SrcVT));
10931 Idx = DAG.getNode(ISD::ADD, SDLoc(Idx), SrcVT, Idx,
10932 DAG.getConstant(IndexOffset, SDLoc(Idx), SrcVT));
10933 return Idx;
10934 };
10935
10936 unsigned Opcode = 0;
10937 switch (VT.SimpleTy) {
10938 default:
10939 break;
10940 case MVT::v16i8:
10941 if (Subtarget.hasSSSE3())
10942 Opcode = X86ISD::PSHUFB;
10943 break;
10944 case MVT::v8i16:
10945 if (Subtarget.hasVLX() && Subtarget.hasBWI())
10946 Opcode = X86ISD::VPERMV;
10947 else if (Subtarget.hasSSSE3()) {
10948 Opcode = X86ISD::PSHUFB;
10949 ShuffleVT = MVT::v16i8;
10950 }
10951 break;
10952 case MVT::v4f32:
10953 case MVT::v4i32:
10954 if (Subtarget.hasAVX()) {
10955 Opcode = X86ISD::VPERMILPV;
10956 ShuffleVT = MVT::v4f32;
10957 } else if (Subtarget.hasSSSE3()) {
10958 Opcode = X86ISD::PSHUFB;
10959 ShuffleVT = MVT::v16i8;
10960 }
10961 break;
10962 case MVT::v2f64:
10963 case MVT::v2i64:
10964 if (Subtarget.hasAVX()) {
10965 // VPERMILPD selects using bit#1 of the index vector, so scale IndicesVec.
10966 IndicesVec = DAG.getNode(ISD::ADD, DL, IndicesVT, IndicesVec, IndicesVec);
10967 Opcode = X86ISD::VPERMILPV;
10968 ShuffleVT = MVT::v2f64;
10969 } else if (Subtarget.hasSSE41()) {
10970 // SSE41 can compare v2i64 - select between indices 0 and 1.
10971 return DAG.getSelectCC(
10972 DL, IndicesVec,
10973 getZeroVector(IndicesVT.getSimpleVT(), Subtarget, DAG, DL),
10974 DAG.getVectorShuffle(VT, DL, SrcVec, SrcVec, {0, 0}),
10975 DAG.getVectorShuffle(VT, DL, SrcVec, SrcVec, {1, 1}),
10976 ISD::CondCode::SETEQ);
10977 }
10978 break;
10979 case MVT::v32i8:
10980 if (Subtarget.hasVLX() && Subtarget.hasVBMI())
10981 Opcode = X86ISD::VPERMV;
10982 else if (Subtarget.hasXOP()) {
10983 SDValue LoSrc = extract128BitVector(SrcVec, 0, DAG, DL);
10984 SDValue HiSrc = extract128BitVector(SrcVec, 16, DAG, DL);
10985 SDValue LoIdx = extract128BitVector(IndicesVec, 0, DAG, DL);
10986 SDValue HiIdx = extract128BitVector(IndicesVec, 16, DAG, DL);
10987 return DAG.getNode(
10988 ISD::CONCAT_VECTORS, DL, VT,
10989 DAG.getNode(X86ISD::VPPERM, DL, MVT::v16i8, LoSrc, HiSrc, LoIdx),
10990 DAG.getNode(X86ISD::VPPERM, DL, MVT::v16i8, LoSrc, HiSrc, HiIdx));
10991 } else if (Subtarget.hasAVX()) {
10992 SDValue Lo = extract128BitVector(SrcVec, 0, DAG, DL);
10993 SDValue Hi = extract128BitVector(SrcVec, 16, DAG, DL);
10994 SDValue LoLo = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Lo);
10995 SDValue HiHi = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Hi, Hi);
10996 auto PSHUFBBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
10997 ArrayRef<SDValue> Ops) {
10998 // Permute Lo and Hi and then select based on index range.
10999 // This works as SHUFB uses bits[3:0] to permute elements and we don't
11000 // care about the bit[7] as its just an index vector.
11001 SDValue Idx = Ops[2];
11002 EVT VT = Idx.getValueType();
11003 return DAG.getSelectCC(DL, Idx, DAG.getConstant(15, DL, VT),
11004 DAG.getNode(X86ISD::PSHUFB, DL, VT, Ops[1], Idx),
11005 DAG.getNode(X86ISD::PSHUFB, DL, VT, Ops[0], Idx),
11006 ISD::CondCode::SETGT);
11007 };
11008 SDValue Ops[] = {LoLo, HiHi, IndicesVec};
11009 return SplitOpsAndApply(DAG, Subtarget, DL, MVT::v32i8, Ops,
11010 PSHUFBBuilder);
11011 }
11012 break;
11013 case MVT::v16i16:
11014 if (Subtarget.hasVLX() && Subtarget.hasBWI())
11015 Opcode = X86ISD::VPERMV;
11016 else if (Subtarget.hasAVX()) {
11017 // Scale to v32i8 and perform as v32i8.
11018 IndicesVec = ScaleIndices(IndicesVec, 2);
11019 return DAG.getBitcast(
11020 VT, createVariablePermute(
11021 MVT::v32i8, DAG.getBitcast(MVT::v32i8, SrcVec),
11022 DAG.getBitcast(MVT::v32i8, IndicesVec), DL, DAG, Subtarget));
11023 }
11024 break;
11025 case MVT::v8f32:
11026 case MVT::v8i32:
11027 if (Subtarget.hasAVX2())
11028 Opcode = X86ISD::VPERMV;
11029 else if (Subtarget.hasAVX()) {
11030 SrcVec = DAG.getBitcast(MVT::v8f32, SrcVec);
11031 SDValue LoLo = DAG.getVectorShuffle(MVT::v8f32, DL, SrcVec, SrcVec,
11032 {0, 1, 2, 3, 0, 1, 2, 3});
11033 SDValue HiHi = DAG.getVectorShuffle(MVT::v8f32, DL, SrcVec, SrcVec,
11034 {4, 5, 6, 7, 4, 5, 6, 7});
11035 if (Subtarget.hasXOP())
11036 return DAG.getBitcast(
11037 VT, DAG.getNode(X86ISD::VPERMIL2, DL, MVT::v8f32, LoLo, HiHi,
11038 IndicesVec, DAG.getTargetConstant(0, DL, MVT::i8)));
11039 // Permute Lo and Hi and then select based on index range.
11040 // This works as VPERMILPS only uses index bits[0:1] to permute elements.
11041 SDValue Res = DAG.getSelectCC(
11042 DL, IndicesVec, DAG.getConstant(3, DL, MVT::v8i32),
11043 DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v8f32, HiHi, IndicesVec),
11044 DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v8f32, LoLo, IndicesVec),
11045 ISD::CondCode::SETGT);
11046 return DAG.getBitcast(VT, Res);
11047 }
11048 break;
11049 case MVT::v4i64:
11050 case MVT::v4f64:
11051 if (Subtarget.hasAVX512()) {
11052 if (!Subtarget.hasVLX()) {
11053 MVT WidenSrcVT = MVT::getVectorVT(VT.getScalarType(), 8);
11054 SrcVec = widenSubVector(WidenSrcVT, SrcVec, false, Subtarget, DAG,
11055 SDLoc(SrcVec));
11056 IndicesVec = widenSubVector(MVT::v8i64, IndicesVec, false, Subtarget,
11057 DAG, SDLoc(IndicesVec));
11058 SDValue Res = createVariablePermute(WidenSrcVT, SrcVec, IndicesVec, DL,
11059 DAG, Subtarget);
11060 return extract256BitVector(Res, 0, DAG, DL);
11061 }
11062 Opcode = X86ISD::VPERMV;
11063 } else if (Subtarget.hasAVX()) {
11064 SrcVec = DAG.getBitcast(MVT::v4f64, SrcVec);
11065 SDValue LoLo =
11066 DAG.getVectorShuffle(MVT::v4f64, DL, SrcVec, SrcVec, {0, 1, 0, 1});
11067 SDValue HiHi =
11068 DAG.getVectorShuffle(MVT::v4f64, DL, SrcVec, SrcVec, {2, 3, 2, 3});
11069 // VPERMIL2PD selects with bit#1 of the index vector, so scale IndicesVec.
11070 IndicesVec = DAG.getNode(ISD::ADD, DL, IndicesVT, IndicesVec, IndicesVec);
11071 if (Subtarget.hasXOP())
11072 return DAG.getBitcast(
11073 VT, DAG.getNode(X86ISD::VPERMIL2, DL, MVT::v4f64, LoLo, HiHi,
11074 IndicesVec, DAG.getTargetConstant(0, DL, MVT::i8)));
11075 // Permute Lo and Hi and then select based on index range.
11076 // This works as VPERMILPD only uses index bit[1] to permute elements.
11077 SDValue Res = DAG.getSelectCC(
11078 DL, IndicesVec, DAG.getConstant(2, DL, MVT::v4i64),
11079 DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v4f64, HiHi, IndicesVec),
11080 DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v4f64, LoLo, IndicesVec),
11081 ISD::CondCode::SETGT);
11082 return DAG.getBitcast(VT, Res);
11083 }
11084 break;
11085 case MVT::v64i8:
11086 if (Subtarget.hasVBMI())
11087 Opcode = X86ISD::VPERMV;
11088 break;
11089 case MVT::v32i16:
11090 if (Subtarget.hasBWI())
11091 Opcode = X86ISD::VPERMV;
11092 break;
11093 case MVT::v16f32:
11094 case MVT::v16i32:
11095 case MVT::v8f64:
11096 case MVT::v8i64:
11097 if (Subtarget.hasAVX512())
11098 Opcode = X86ISD::VPERMV;
11099 break;
11100 }
11101 if (!Opcode)
11102 return SDValue();
11103
11104  assert((VT.getSizeInBits() == ShuffleVT.getSizeInBits()) &&
11105         (VT.getScalarSizeInBits() % ShuffleVT.getScalarSizeInBits()) == 0 &&
11106         "Illegal variable permute shuffle type");
11107
11108 uint64_t Scale = VT.getScalarSizeInBits() / ShuffleVT.getScalarSizeInBits();
11109 if (Scale > 1)
11110 IndicesVec = ScaleIndices(IndicesVec, Scale);
11111
11112 EVT ShuffleIdxVT = EVT(ShuffleVT).changeVectorElementTypeToInteger();
11113 IndicesVec = DAG.getBitcast(ShuffleIdxVT, IndicesVec);
11114
11115 SrcVec = DAG.getBitcast(ShuffleVT, SrcVec);
11116 SDValue Res = Opcode == X86ISD::VPERMV
11117 ? DAG.getNode(Opcode, DL, ShuffleVT, IndicesVec, SrcVec)
11118 : DAG.getNode(Opcode, DL, ShuffleVT, SrcVec, IndicesVec);
11119 return DAG.getBitcast(VT, Res);
11120}
11121
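// Illustrative sketch, not part of X86ISelLowering.cpp: a standalone check of
// the IndexScale/IndexOffset packing used by the ScaleIndices lambda in
// createVariablePermute above, for the v4i32 -> v16i8 case (Scale = 4,
// NumDstBits = 8). The helper name is invented for this example.
#include <cassert>
#include <cstdint>
namespace {
void checkScaleIndicesPacking() {
  const uint64_t Scale = 4, NumDstBits = 8;
  uint64_t IndexScale = 0, IndexOffset = 0;
  for (uint64_t i = 0; i != Scale; ++i) {
    IndexScale |= Scale << (i * NumDstBits); // 4 in every byte
    IndexOffset |= i << (i * NumDstBits);    // 0, 1, 2, 3 in successive bytes
  }
  // Matches the comment above: Splat(4<<24|4<<16|4<<8|4) and
  // Splat(3<<24|2<<16|1<<8|0) per 32-bit index element.
  assert(IndexScale == 0x04040404u);
  assert(IndexOffset == 0x03020100u);
}
} // namespace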
11122// Tries to lower a BUILD_VECTOR composed of extract-extract chains that can be
11123// reasoned to be a permutation of a vector by indices in a non-constant vector.
11124// (build_vector (extract_elt V, (extract_elt I, 0)),
11125// (extract_elt V, (extract_elt I, 1)),
11126// ...
11127// ->
11128// (vpermv I, V)
11129//
11130// TODO: Handle undefs
11131// TODO: Utilize pshufb and zero mask blending to support more efficient
11132// construction of vectors with constant-0 elements.
11133static SDValue
11134LowerBUILD_VECTORAsVariablePermute(SDValue V, SelectionDAG &DAG,
11135 const X86Subtarget &Subtarget) {
11136 SDValue SrcVec, IndicesVec;
11137 // Check for a match of the permute source vector and permute index elements.
11138 // This is done by checking that the i-th build_vector operand is of the form:
11139 // (extract_elt SrcVec, (extract_elt IndicesVec, i)).
11140 for (unsigned Idx = 0, E = V.getNumOperands(); Idx != E; ++Idx) {
11141 SDValue Op = V.getOperand(Idx);
11142 if (Op.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
11143 return SDValue();
11144
11145 // If this is the first extract encountered in V, set the source vector,
11146 // otherwise verify the extract is from the previously defined source
11147 // vector.
11148 if (!SrcVec)
11149 SrcVec = Op.getOperand(0);
11150 else if (SrcVec != Op.getOperand(0))
11151 return SDValue();
11152 SDValue ExtractedIndex = Op->getOperand(1);
11153 // Peek through extends.
11154 if (ExtractedIndex.getOpcode() == ISD::ZERO_EXTEND ||
11155 ExtractedIndex.getOpcode() == ISD::SIGN_EXTEND)
11156 ExtractedIndex = ExtractedIndex.getOperand(0);
11157 if (ExtractedIndex.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
11158 return SDValue();
11159
11160 // If this is the first extract from the index vector candidate, set the
11161 // indices vector, otherwise verify the extract is from the previously
11162 // defined indices vector.
11163 if (!IndicesVec)
11164 IndicesVec = ExtractedIndex.getOperand(0);
11165 else if (IndicesVec != ExtractedIndex.getOperand(0))
11166 return SDValue();
11167
11168 auto *PermIdx = dyn_cast<ConstantSDNode>(ExtractedIndex.getOperand(1));
11169 if (!PermIdx || PermIdx->getAPIntValue() != Idx)
11170 return SDValue();
11171 }
11172
11173 SDLoc DL(V);
11174 MVT VT = V.getSimpleValueType();
11175 return createVariablePermute(VT, SrcVec, IndicesVec, DL, DAG, Subtarget);
11176}
11177
11178SDValue
11179X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
11180 SDLoc dl(Op);
11181
11182 MVT VT = Op.getSimpleValueType();
11183 MVT EltVT = VT.getVectorElementType();
11184 MVT OpEltVT = Op.getOperand(0).getSimpleValueType();
11185 unsigned NumElems = Op.getNumOperands();
11186
11187 // Generate vectors for predicate vectors.
11188 if (VT.getVectorElementType() == MVT::i1 && Subtarget.hasAVX512())
11189 return LowerBUILD_VECTORvXi1(Op, DAG, Subtarget);
11190
11191 if (VT.getVectorElementType() == MVT::bf16 && Subtarget.hasBF16())
11192 return LowerBUILD_VECTORvXbf16(Op, DAG, Subtarget);
11193
11194 if (SDValue VectorConstant = materializeVectorConstant(Op, DAG, Subtarget))
11195 return VectorConstant;
11196
11197 unsigned EVTBits = EltVT.getSizeInBits();
11198 APInt UndefMask = APInt::getZero(NumElems);
11199 APInt FrozenUndefMask = APInt::getZero(NumElems);
11200 APInt ZeroMask = APInt::getZero(NumElems);
11201 APInt NonZeroMask = APInt::getZero(NumElems);
11202 bool IsAllConstants = true;
11203 SmallSet<SDValue, 8> Values;
11204 unsigned NumConstants = NumElems;
11205 for (unsigned i = 0; i < NumElems; ++i) {
11206 SDValue Elt = Op.getOperand(i);
11207 if (Elt.isUndef()) {
11208 UndefMask.setBit(i);
11209 continue;
11210 }
11211 if (Elt.getOpcode() == ISD::FREEZE && Elt.getOperand(0).isUndef()) {
11212 FrozenUndefMask.setBit(i);
11213 continue;
11214 }
11215 Values.insert(Elt);
11216 if (!isa<ConstantSDNode>(Elt) && !isa<ConstantFPSDNode>(Elt)) {
11217 IsAllConstants = false;
11218 NumConstants--;
11219 }
11220 if (X86::isZeroNode(Elt)) {
11221 ZeroMask.setBit(i);
11222 } else {
11223 NonZeroMask.setBit(i);
11224 }
11225 }
11226
11227 // All undef vector. Return an UNDEF.
11228 if (UndefMask.isAllOnes())
11229 return DAG.getUNDEF(VT);
11230
11231 // If we have multiple FREEZE-UNDEF operands, we are likely going to end up
11232 // lowering into a suboptimal insertion sequence. Instead, thaw the UNDEF in
11233 // our source BUILD_VECTOR, create another FREEZE-UNDEF splat BUILD_VECTOR,
11234 // and blend the FREEZE-UNDEF operands back in.
11235 // FIXME: is this worthwhile even for a single FREEZE-UNDEF operand?
11236 if (unsigned NumFrozenUndefElts = FrozenUndefMask.countPopulation();
11237 NumFrozenUndefElts >= 2 && NumFrozenUndefElts < NumElems) {
11238 SmallVector<int, 16> BlendMask(NumElems, -1);
11239 SmallVector<SDValue, 16> Elts(NumElems, DAG.getUNDEF(OpEltVT));
11240 for (unsigned i = 0; i < NumElems; ++i) {
11241 if (UndefMask[i]) {
11242 BlendMask[i] = -1;
11243 continue;
11244 }
11245 BlendMask[i] = i;
11246 if (!FrozenUndefMask[i])
11247 Elts[i] = Op.getOperand(i);
11248 else
11249 BlendMask[i] += NumElems;
11250 }
11251 SDValue EltsBV = DAG.getBuildVector(VT, dl, Elts);
11252 SDValue FrozenUndefElt = DAG.getFreeze(DAG.getUNDEF(OpEltVT));
11253 SDValue FrozenUndefBV = DAG.getSplatBuildVector(VT, dl, FrozenUndefElt);
11254 return DAG.getVectorShuffle(VT, dl, EltsBV, FrozenUndefBV, BlendMask);
11255 }
11256
11257 BuildVectorSDNode *BV = cast<BuildVectorSDNode>(Op.getNode());
11258
11259 // If the upper elts of a ymm/zmm are undef/zero then we might be better off
11260 // lowering to a smaller build vector and padding with undef/zero.
11261 if ((VT.is256BitVector() || VT.is512BitVector()) &&
11262 !isFoldableUseOfShuffle(BV)) {
11263 unsigned UpperElems = NumElems / 2;
11264 APInt UndefOrZeroMask = UndefMask | ZeroMask;
11265 unsigned NumUpperUndefsOrZeros = UndefOrZeroMask.countLeadingOnes();
11266 if (NumUpperUndefsOrZeros >= UpperElems) {
11267 if (VT.is512BitVector() &&
11268 NumUpperUndefsOrZeros >= (NumElems - (NumElems / 4)))
11269 UpperElems = NumElems - (NumElems / 4);
11270 bool UndefUpper = UndefMask.countLeadingOnes() >= UpperElems;
11271 MVT LowerVT = MVT::getVectorVT(EltVT, NumElems - UpperElems);
11272 SDValue NewBV =
11273 DAG.getBuildVector(LowerVT, dl, Op->ops().drop_back(UpperElems));
11274 return widenSubVector(VT, NewBV, !UndefUpper, Subtarget, DAG, dl);
11275 }
11276 }
11277
11278 if (SDValue AddSub = lowerToAddSubOrFMAddSub(BV, Subtarget, DAG))
11279 return AddSub;
11280 if (SDValue HorizontalOp = LowerToHorizontalOp(BV, Subtarget, DAG))
11281 return HorizontalOp;
11282 if (SDValue Broadcast = lowerBuildVectorAsBroadcast(BV, Subtarget, DAG))
11283 return Broadcast;
11284 if (SDValue BitOp = lowerBuildVectorToBitOp(BV, Subtarget, DAG))
11285 return BitOp;
11286
11287 unsigned NumZero = ZeroMask.countPopulation();
11288 unsigned NumNonZero = NonZeroMask.countPopulation();
11289
11290 // If we are inserting one variable into a vector of non-zero constants, try
11291 // to avoid loading each constant element as a scalar. Load the constants as a
11292 // vector and then insert the variable scalar element. If insertion is not
11293 // supported, fall back to a shuffle to get the scalar blended with the
11294 // constants. Insertion into a zero vector is handled as a special-case
11295 // somewhere below here.
11296 if (NumConstants == NumElems - 1 && NumNonZero != 1 &&
11297 (isOperationLegalOrCustom(ISD::INSERT_VECTOR_ELT, VT) ||
11298 isOperationLegalOrCustom(ISD::VECTOR_SHUFFLE, VT))) {
11299 // Create an all-constant vector. The variable element in the old
11300 // build vector is replaced by undef in the constant vector. Save the
11301 // variable scalar element and its index for use in the insertelement.
11302 LLVMContext &Context = *DAG.getContext();
11303 Type *EltType = Op.getValueType().getScalarType().getTypeForEVT(Context);
11304 SmallVector<Constant *, 16> ConstVecOps(NumElems, UndefValue::get(EltType));
11305 SDValue VarElt;
11306 SDValue InsIndex;
11307 for (unsigned i = 0; i != NumElems; ++i) {
11308 SDValue Elt = Op.getOperand(i);
11309 if (auto *C = dyn_cast<ConstantSDNode>(Elt))
11310 ConstVecOps[i] = ConstantInt::get(Context, C->getAPIntValue());
11311 else if (auto *C = dyn_cast<ConstantFPSDNode>(Elt))
11312 ConstVecOps[i] = ConstantFP::get(Context, C->getValueAPF());
11313 else if (!Elt.isUndef()) {
11314        assert(!VarElt.getNode() && !InsIndex.getNode() &&
11315               "Expected one variable element in this vector");
11316 VarElt = Elt;
11317 InsIndex = DAG.getVectorIdxConstant(i, dl);
11318 }
11319 }
11320 Constant *CV = ConstantVector::get(ConstVecOps);
11321 SDValue DAGConstVec = DAG.getConstantPool(CV, VT);
11322
11323    // The constants we just created may not be legal (e.g., floating point). We
11324    // must lower the vector right here because we cannot guarantee that we'll
11325 // legalize it before loading it. This is also why we could not just create
11326 // a new build vector here. If the build vector contains illegal constants,
11327 // it could get split back up into a series of insert elements.
11328 // TODO: Improve this by using shorter loads with broadcast/VZEXT_LOAD.
11329 SDValue LegalDAGConstVec = LowerConstantPool(DAGConstVec, DAG);
11330 MachineFunction &MF = DAG.getMachineFunction();
11331 MachinePointerInfo MPI = MachinePointerInfo::getConstantPool(MF);
11332 SDValue Ld = DAG.getLoad(VT, dl, DAG.getEntryNode(), LegalDAGConstVec, MPI);
11333 unsigned InsertC = cast<ConstantSDNode>(InsIndex)->getZExtValue();
11334 unsigned NumEltsInLow128Bits = 128 / VT.getScalarSizeInBits();
11335 if (InsertC < NumEltsInLow128Bits)
11336 return DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Ld, VarElt, InsIndex);
11337
11338 // There's no good way to insert into the high elements of a >128-bit
11339 // vector, so use shuffles to avoid an extract/insert sequence.
11340    assert(VT.getSizeInBits() > 128 && "Invalid insertion index?");
11341    assert(Subtarget.hasAVX() && "Must have AVX with >16-byte vector");
11342 SmallVector<int, 8> ShuffleMask;
11343 unsigned NumElts = VT.getVectorNumElements();
11344 for (unsigned i = 0; i != NumElts; ++i)
11345 ShuffleMask.push_back(i == InsertC ? NumElts : i);
11346 SDValue S2V = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, VarElt);
11347 return DAG.getVectorShuffle(VT, dl, Ld, S2V, ShuffleMask);
11348 }
11349
11350 // Special case for single non-zero, non-undef, element.
11351 if (NumNonZero == 1) {
11352 unsigned Idx = NonZeroMask.countTrailingZeros();
11353 SDValue Item = Op.getOperand(Idx);
11354
11355 // If we have a constant or non-constant insertion into the low element of
11356 // a vector, we can do this with SCALAR_TO_VECTOR + shuffle of zero into
11357 // the rest of the elements. This will be matched as movd/movq/movss/movsd
11358 // depending on what the source datatype is.
11359 if (Idx == 0) {
11360 if (NumZero == 0)
11361 return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
11362
11363 if (EltVT == MVT::i32 || EltVT == MVT::f16 || EltVT == MVT::f32 ||
11364 EltVT == MVT::f64 || (EltVT == MVT::i64 && Subtarget.is64Bit()) ||
11365 (EltVT == MVT::i16 && Subtarget.hasFP16())) {
11366        assert((VT.is128BitVector() || VT.is256BitVector() ||
11367                VT.is512BitVector()) &&
11368               "Expected an SSE value type!");
11369 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
11370 // Turn it into a MOVL (i.e. movsh, movss, movsd, movw or movd) to a
11371 // zero vector.
11372 return getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
11373 }
11374
11375 // We can't directly insert an i8 or i16 into a vector, so zero extend
11376 // it to i32 first.
11377 if (EltVT == MVT::i16 || EltVT == MVT::i8) {
11378 Item = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, Item);
11379 MVT ShufVT = MVT::getVectorVT(MVT::i32, VT.getSizeInBits() / 32);
11380 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, ShufVT, Item);
11381 Item = getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
11382 return DAG.getBitcast(VT, Item);
11383 }
11384 }
11385
11386 // Is it a vector logical left shift?
11387 if (NumElems == 2 && Idx == 1 &&
11388 X86::isZeroNode(Op.getOperand(0)) &&
11389 !X86::isZeroNode(Op.getOperand(1))) {
11390 unsigned NumBits = VT.getSizeInBits();
11391 return getVShift(true, VT,
11392 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
11393 VT, Op.getOperand(1)),
11394 NumBits/2, DAG, *this, dl);
11395 }
11396
11397 if (IsAllConstants) // Otherwise, it's better to do a constpool load.
11398 return SDValue();
11399
11400 // Otherwise, if this is a vector with i32 or f32 elements, and the element
11401 // is a non-constant being inserted into an element other than the low one,
11402 // we can't use a constant pool load. Instead, use SCALAR_TO_VECTOR (aka
11403 // movd/movss) to move this into the low element, then shuffle it into
11404 // place.
11405 if (EVTBits == 32) {
11406 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
11407 return getShuffleVectorZeroOrUndef(Item, Idx, NumZero > 0, Subtarget, DAG);
11408 }
11409 }
11410
11411 // Splat is obviously ok. Let legalizer expand it to a shuffle.
11412 if (Values.size() == 1) {
11413 if (EVTBits == 32) {
11414 // Instead of a shuffle like this:
11415 // shuffle (scalar_to_vector (load (ptr + 4))), undef, <0, 0, 0, 0>
11416 // Check if it's possible to issue this instead.
11417      // shuffle (vload ptr), undef, <1, 1, 1, 1>
11418 unsigned Idx = NonZeroMask.countTrailingZeros();
11419 SDValue Item = Op.getOperand(Idx);
11420 if (Op.getNode()->isOnlyUserOf(Item.getNode()))
11421 return LowerAsSplatVectorLoad(Item, VT, dl, DAG);
11422 }
11423 return SDValue();
11424 }
11425
11426 // A vector full of immediates; various special cases are already
11427 // handled, so this is best done with a single constant-pool load.
11428 if (IsAllConstants)
11429 return SDValue();
11430
11431 if (SDValue V = LowerBUILD_VECTORAsVariablePermute(Op, DAG, Subtarget))
11432 return V;
11433
11434 // See if we can use a vector load to get all of the elements.
11435 {
11436 SmallVector<SDValue, 64> Ops(Op->op_begin(), Op->op_begin() + NumElems);
11437 if (SDValue LD =
11438 EltsFromConsecutiveLoads(VT, Ops, dl, DAG, Subtarget, false))
11439 return LD;
11440 }
11441
11442 // If this is a splat of pairs of 32-bit elements, we can use a narrower
11443 // build_vector and broadcast it.
11444 // TODO: We could probably generalize this more.
11445 if (Subtarget.hasAVX2() && EVTBits == 32 && Values.size() == 2) {
11446 SDValue Ops[4] = { Op.getOperand(0), Op.getOperand(1),
11447 DAG.getUNDEF(EltVT), DAG.getUNDEF(EltVT) };
11448 auto CanSplat = [](SDValue Op, unsigned NumElems, ArrayRef<SDValue> Ops) {
11449 // Make sure all the even/odd operands match.
11450 for (unsigned i = 2; i != NumElems; ++i)
11451 if (Ops[i % 2] != Op.getOperand(i))
11452 return false;
11453 return true;
11454 };
11455 if (CanSplat(Op, NumElems, Ops)) {
11456 MVT WideEltVT = VT.isFloatingPoint() ? MVT::f64 : MVT::i64;
11457 MVT NarrowVT = MVT::getVectorVT(EltVT, 4);
11458 // Create a new build vector and cast to v2i64/v2f64.
11459 SDValue NewBV = DAG.getBitcast(MVT::getVectorVT(WideEltVT, 2),
11460 DAG.getBuildVector(NarrowVT, dl, Ops));
11461 // Broadcast from v2i64/v2f64 and cast to final VT.
11462 MVT BcastVT = MVT::getVectorVT(WideEltVT, NumElems / 2);
11463 return DAG.getBitcast(VT, DAG.getNode(X86ISD::VBROADCAST, dl, BcastVT,
11464 NewBV));
11465 }
11466 }
11467
11468 // For AVX-length vectors, build the individual 128-bit pieces and use
11469 // shuffles to put them in place.
11470 if (VT.getSizeInBits() > 128) {
11471 MVT HVT = MVT::getVectorVT(EltVT, NumElems / 2);
11472
11473 // Build both the lower and upper subvector.
11474 SDValue Lower =
11475 DAG.getBuildVector(HVT, dl, Op->ops().slice(0, NumElems / 2));
11476 SDValue Upper = DAG.getBuildVector(
11477 HVT, dl, Op->ops().slice(NumElems / 2, NumElems /2));
11478
11479 // Recreate the wider vector with the lower and upper part.
11480 return concatSubVectors(Lower, Upper, DAG, dl);
11481 }
11482
11483 // Let legalizer expand 2-wide build_vectors.
11484 if (EVTBits == 64) {
11485 if (NumNonZero == 1) {
11486 // One half is zero or undef.
11487 unsigned Idx = NonZeroMask.countTrailingZeros();
11488 SDValue V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT,
11489 Op.getOperand(Idx));
11490 return getShuffleVectorZeroOrUndef(V2, Idx, true, Subtarget, DAG);
11491 }
11492 return SDValue();
11493 }
11494
11495 // If element VT is < 32 bits, convert it to inserts into a zero vector.
11496 if (EVTBits == 8 && NumElems == 16)
11497 if (SDValue V = LowerBuildVectorv16i8(Op, NonZeroMask, NumNonZero, NumZero,
11498 DAG, Subtarget))
11499 return V;
11500
11501 if (EltVT == MVT::i16 && NumElems == 8)
11502 if (SDValue V = LowerBuildVectorv8i16(Op, NonZeroMask, NumNonZero, NumZero,
11503 DAG, Subtarget))
11504 return V;
11505
11506 // If element VT is == 32 bits and has 4 elems, try to generate an INSERTPS
11507 if (EVTBits == 32 && NumElems == 4)
11508 if (SDValue V = LowerBuildVectorv4x32(Op, DAG, Subtarget))
11509 return V;
11510
11511 // If element VT is == 32 bits, turn it into a number of shuffles.
11512 if (NumElems == 4 && NumZero > 0) {
11513 SmallVector<SDValue, 8> Ops(NumElems);
11514 for (unsigned i = 0; i < 4; ++i) {
11515 bool isZero = !NonZeroMask[i];
11516 if (isZero)
11517 Ops[i] = getZeroVector(VT, Subtarget, DAG, dl);
11518 else
11519 Ops[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i));
11520 }
11521
11522 for (unsigned i = 0; i < 2; ++i) {
11523 switch (NonZeroMask.extractBitsAsZExtValue(2, i * 2)) {
11524      default: llvm_unreachable("Unexpected NonZero count");
11525 case 0:
11526 Ops[i] = Ops[i*2]; // Must be a zero vector.
11527 break;
11528 case 1:
11529 Ops[i] = getMOVL(DAG, dl, VT, Ops[i*2+1], Ops[i*2]);
11530 break;
11531 case 2:
11532 Ops[i] = getMOVL(DAG, dl, VT, Ops[i*2], Ops[i*2+1]);
11533 break;
11534 case 3:
11535 Ops[i] = getUnpackl(DAG, dl, VT, Ops[i*2], Ops[i*2+1]);
11536 break;
11537 }
11538 }
11539
11540 bool Reverse1 = NonZeroMask.extractBitsAsZExtValue(2, 0) == 2;
11541 bool Reverse2 = NonZeroMask.extractBitsAsZExtValue(2, 2) == 2;
11542 int MaskVec[] = {
11543 Reverse1 ? 1 : 0,
11544 Reverse1 ? 0 : 1,
11545 static_cast<int>(Reverse2 ? NumElems+1 : NumElems),
11546 static_cast<int>(Reverse2 ? NumElems : NumElems+1)
11547 };
11548 return DAG.getVectorShuffle(VT, dl, Ops[0], Ops[1], MaskVec);
11549 }
11550
11551  assert(Values.size() > 1 && "Expected non-undef and non-splat vector");
11552
11553  // Check for a build vector built mostly from a shuffle plus a few insertions.
11554 if (SDValue Sh = buildFromShuffleMostly(Op, DAG))
11555 return Sh;
11556
11557 // For SSE 4.1, use insertps to put the high elements into the low element.
11558 if (Subtarget.hasSSE41() && EltVT != MVT::f16) {
11559 SDValue Result;
11560 if (!Op.getOperand(0).isUndef())
11561 Result = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(0));
11562 else
11563 Result = DAG.getUNDEF(VT);
11564
11565 for (unsigned i = 1; i < NumElems; ++i) {
11566 if (Op.getOperand(i).isUndef()) continue;
11567 Result = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Result,
11568 Op.getOperand(i), DAG.getIntPtrConstant(i, dl));
11569 }
11570 return Result;
11571 }
11572
11573  // Otherwise, expand into a number of unpckl*; start by extending each of
11574 // our (non-undef) elements to the full vector width with the element in the
11575 // bottom slot of the vector (which generates no code for SSE).
11576 SmallVector<SDValue, 8> Ops(NumElems);
11577 for (unsigned i = 0; i < NumElems; ++i) {
11578 if (!Op.getOperand(i).isUndef())
11579 Ops[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i));
11580 else
11581 Ops[i] = DAG.getUNDEF(VT);
11582 }
11583
11584 // Next, we iteratively mix elements, e.g. for v4f32:
11585 // Step 1: unpcklps 0, 1 ==> X: <?, ?, 1, 0>
11586 // : unpcklps 2, 3 ==> Y: <?, ?, 3, 2>
11587 // Step 2: unpcklpd X, Y ==> <3, 2, 1, 0>
11588 for (unsigned Scale = 1; Scale < NumElems; Scale *= 2) {
11589 // Generate scaled UNPCKL shuffle mask.
11590 SmallVector<int, 16> Mask;
11591 for(unsigned i = 0; i != Scale; ++i)
11592 Mask.push_back(i);
11593 for (unsigned i = 0; i != Scale; ++i)
11594 Mask.push_back(NumElems+i);
11595 Mask.append(NumElems - Mask.size(), SM_SentinelUndef);
11596
11597 for (unsigned i = 0, e = NumElems / (2 * Scale); i != e; ++i)
11598 Ops[i] = DAG.getVectorShuffle(VT, dl, Ops[2*i], Ops[(2*i)+1], Mask);
11599 }
11600 return Ops[0];
11601}
11602
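// Illustrative sketch, not part of X86ISelLowering.cpp: the scaled UNPCKL
// masks produced by the final loop of LowerBUILD_VECTOR above for a v4f32
// build_vector (NumElems = 4), using -1 for SM_SentinelUndef. Step 1
// (Scale = 1) yields {0, 4, -1, -1}; step 2 (Scale = 2) yields {0, 1, 4, 5},
// which interleaves the two partial unpcklps results into <3, 2, 1, 0>.
// The helper name is invented for this example.
#include <cassert>
#include <vector>
namespace {
void checkUnpcklMaskSchedule() {
  const unsigned NumElems = 4;
  std::vector<std::vector<int>> Steps;
  for (unsigned Scale = 1; Scale < NumElems; Scale *= 2) {
    std::vector<int> Mask;
    for (unsigned i = 0; i != Scale; ++i)
      Mask.push_back(i);
    for (unsigned i = 0; i != Scale; ++i)
      Mask.push_back(NumElems + i);
    Mask.resize(NumElems, -1); // pad with undef sentinels
    Steps.push_back(Mask);
  }
  assert((Steps == std::vector<std::vector<int>>{{0, 4, -1, -1}, {0, 1, 4, 5}}));
}
} // namespace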
11603// 256-bit AVX can use the vinsertf128 instruction
11604// to create 256-bit vectors from two other 128-bit ones.
11605// TODO: Detect subvector broadcast here instead of DAG combine?
11606static SDValue LowerAVXCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG,
11607 const X86Subtarget &Subtarget) {
11608 SDLoc dl(Op);
11609 MVT ResVT = Op.getSimpleValueType();
11610
11611  assert((ResVT.is256BitVector() ||
11612          ResVT.is512BitVector()) && "Value type must be 256-/512-bit wide");
11613
11614 unsigned NumOperands = Op.getNumOperands();
11615 unsigned NumFreezeUndef = 0;
11616 unsigned NumZero = 0;
11617 unsigned NumNonZero = 0;
11618 unsigned NonZeros = 0;
11619 for (unsigned i = 0; i != NumOperands; ++i) {
11620 SDValue SubVec = Op.getOperand(i);
11621 if (SubVec.isUndef())
11622 continue;
11623 if (ISD::isFreezeUndef(SubVec.getNode()) && SubVec.hasOneUse())
11624 ++NumFreezeUndef;
11625 else if (ISD::isBuildVectorAllZeros(SubVec.getNode()))
11626 ++NumZero;
11627 else {
11628      assert(i < sizeof(NonZeros) * CHAR_BIT); // Ensure the shift is in range.
11629 NonZeros |= 1 << i;
11630 ++NumNonZero;
11631 }
11632 }
11633
11634 // If we have more than 2 non-zeros, build each half separately.
11635 if (NumNonZero > 2) {
11636 MVT HalfVT = ResVT.getHalfNumVectorElementsVT();
11637 ArrayRef<SDUse> Ops = Op->ops();
11638 SDValue Lo = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT,
11639 Ops.slice(0, NumOperands/2));
11640 SDValue Hi = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT,
11641 Ops.slice(NumOperands/2));
11642 return DAG.getNode(ISD::CONCAT_VECTORS, dl, ResVT, Lo, Hi);
11643 }
11644
11645 // Otherwise, build it up through insert_subvectors.
11646 SDValue Vec = NumZero ? getZeroVector(ResVT, Subtarget, DAG, dl)
11647 : (NumFreezeUndef ? DAG.getFreeze(DAG.getUNDEF(ResVT))
11648 : DAG.getUNDEF(ResVT));
11649
11650 MVT SubVT = Op.getOperand(0).getSimpleValueType();
11651 unsigned NumSubElems = SubVT.getVectorNumElements();
11652 for (unsigned i = 0; i != NumOperands; ++i) {
11653 if ((NonZeros & (1 << i)) == 0)
11654 continue;
11655
11656 Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Vec,
11657 Op.getOperand(i),
11658 DAG.getIntPtrConstant(i * NumSubElems, dl));
11659 }
11660
11661 return Vec;
11662}
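To make the NonZeros bookkeeping above concrete, here is a standalone sketch in plain C++, not LLVM code; the SubVecInfo struct and the concrete values are illustrative only. It classifies the concatenated operands and computes the element offset i * NumSubElems used for each insert_subvector.

#include <cassert>
#include <vector>

struct SubVecInfo { bool IsUndef, IsZero; };

int main() {
  // v16i32 = concat_vectors(v4i32 x 4): operand 0 is zero, operand 2 is undef,
  // operands 1 and 3 are non-zero, so the insert_subvector path is taken.
  std::vector<SubVecInfo> Ops = {{false, true}, {false, false},
                                 {true, false}, {false, false}};
  const unsigned NumSubElems = 4;
  unsigned NonZeros = 0, NumNonZero = 0, NumZero = 0;
  for (unsigned i = 0; i != Ops.size(); ++i) {
    if (Ops[i].IsUndef)
      continue;
    if (Ops[i].IsZero)
      ++NumZero;
    else {
      NonZeros |= 1u << i;
      ++NumNonZero;
    }
  }
  assert(NumNonZero == 2 && NumZero == 1);   // not more than 2 non-zeros
  // Operand 1 is inserted at element 4, operand 3 at element 12.
  assert((NonZeros & (1u << 1)) && 1 * NumSubElems == 4);
  assert((NonZeros & (1u << 3)) && 3 * NumSubElems == 12);
}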
11663
11664// Lower a CONCAT_VECTORS of vXi1 vectors, e.g. a type promotion (by
11665// concatenating i1 zeros) of the result of a node that already zeros all
11666// upper bits of a k-register.
11667// TODO: Merge this with LowerAVXCONCAT_VECTORS?
11668static SDValue LowerCONCAT_VECTORSvXi1(SDValue Op,
11669 const X86Subtarget &Subtarget,
11670 SelectionDAG & DAG) {
11671 SDLoc dl(Op);
11672 MVT ResVT = Op.getSimpleValueType();
11673 unsigned NumOperands = Op.getNumOperands();
11674
11675 assert(NumOperands > 1 && isPowerOf2_32(NumOperands) &&
11676 "Unexpected number of operands in CONCAT_VECTORS");
11677
11678 uint64_t Zeros = 0;
11679 uint64_t NonZeros = 0;
11680 for (unsigned i = 0; i != NumOperands; ++i) {
11681 SDValue SubVec = Op.getOperand(i);
11682 if (SubVec.isUndef())
11683 continue;
11684 assert(i < sizeof(NonZeros) * CHAR_BIT); // Ensure the shift is in range.
11685 if (ISD::isBuildVectorAllZeros(SubVec.getNode()))
11686 Zeros |= (uint64_t)1 << i;
11687 else
11688 NonZeros |= (uint64_t)1 << i;
11689 }
11690
11691 unsigned NumElems = ResVT.getVectorNumElements();
11692
11693 // If we are inserting a non-zero vector and there are zeros in the LSBs and
11694 // undef in the MSBs, we need to emit a KSHIFTL. The generic lowering to
11695 // insert_subvector would give us two kshifts.
11696 if (isPowerOf2_64(NonZeros) && Zeros != 0 && NonZeros > Zeros &&
11697 Log2_64(NonZeros) != NumOperands - 1) {
11698 MVT ShiftVT = ResVT;
11699 if ((!Subtarget.hasDQI() && NumElems == 8) || NumElems < 8)
11700 ShiftVT = Subtarget.hasDQI() ? MVT::v8i1 : MVT::v16i1;
11701 unsigned Idx = Log2_64(NonZeros);
11702 SDValue SubVec = Op.getOperand(Idx);
11703 unsigned SubVecNumElts = SubVec.getSimpleValueType().getVectorNumElements();
11704 SubVec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ShiftVT,
11705 DAG.getUNDEF(ShiftVT), SubVec,
11706 DAG.getIntPtrConstant(0, dl));
11707 Op = DAG.getNode(X86ISD::KSHIFTL, dl, ShiftVT, SubVec,
11708 DAG.getTargetConstant(Idx * SubVecNumElts, dl, MVT::i8));
11709 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, ResVT, Op,
11710 DAG.getIntPtrConstant(0, dl));
11711 }
11712
11713 // If there are zero or one non-zeros we can handle this very simply.
11714 if (NonZeros == 0 || isPowerOf2_64(NonZeros)) {
11715 SDValue Vec = Zeros ? DAG.getConstant(0, dl, ResVT) : DAG.getUNDEF(ResVT);
11716 if (!NonZeros)
11717 return Vec;
11718 unsigned Idx = Log2_64(NonZeros);
11719 SDValue SubVec = Op.getOperand(Idx);
11720 unsigned SubVecNumElts = SubVec.getSimpleValueType().getVectorNumElements();
11721 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Vec, SubVec,
11722 DAG.getIntPtrConstant(Idx * SubVecNumElts, dl));
11723 }
11724
11725 if (NumOperands > 2) {
11726 MVT HalfVT = ResVT.getHalfNumVectorElementsVT();
11727 ArrayRef<SDUse> Ops = Op->ops();
11728 SDValue Lo = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT,
11729 Ops.slice(0, NumOperands/2));
11730 SDValue Hi = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT,
11731 Ops.slice(NumOperands/2));
11732 return DAG.getNode(ISD::CONCAT_VECTORS, dl, ResVT, Lo, Hi);
11733 }
11734
11735 assert(llvm::popcount(NonZeros) == 2 && "Simple cases not handled?");
11736
11737 if (ResVT.getVectorNumElements() >= 16)
11738 return Op; // The operation is legal with KUNPCK
11739
11740 SDValue Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT,
11741 DAG.getUNDEF(ResVT), Op.getOperand(0),
11742 DAG.getIntPtrConstant(0, dl));
11743 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Vec, Op.getOperand(1),
11744 DAG.getIntPtrConstant(NumElems/2, dl));
11745}
11746
11747static SDValue LowerCONCAT_VECTORS(SDValue Op,
11748 const X86Subtarget &Subtarget,
11749 SelectionDAG &DAG) {
11750 MVT VT = Op.getSimpleValueType();
11751 if (VT.getVectorElementType() == MVT::i1)
11752 return LowerCONCAT_VECTORSvXi1(Op, Subtarget, DAG);
11753
11754 assert((VT.is256BitVector() && Op.getNumOperands() == 2) ||
11755 (VT.is512BitVector() && (Op.getNumOperands() == 2 ||
11756 Op.getNumOperands() == 4)));
11757
11758 // AVX can use the vinsertf128 instruction to create 256-bit vectors
11759 // from two other 128-bit ones.
11760
11761 // 512-bit vector may contain 2 256-bit vectors or 4 128-bit vectors
11762 return LowerAVXCONCAT_VECTORS(Op, DAG, Subtarget);
11763}
11764
11765//===----------------------------------------------------------------------===//
11766// Vector shuffle lowering
11767//
11768// This is an experimental code path for lowering vector shuffles on x86. It is
11769// designed to handle arbitrary vector shuffles and blends, gracefully
11770// degrading performance as necessary. It works hard to recognize idiomatic
11771// shuffles and lower them to optimal instruction patterns without leaving
11772// a framework that allows reasonably efficient handling of all vector shuffle
11773// patterns.
11774//===----------------------------------------------------------------------===//
11775
11776/// Tiny helper function to identify a no-op mask.
11777///
11778/// This is a somewhat boring predicate function. It checks whether the mask
11779/// array input, which is assumed to be a single-input shuffle mask of the kind
11780/// used by the X86 shuffle instructions (not a fully general
11781/// ShuffleVectorSDNode mask) requires any shuffles to occur. Both undef and an
11782/// in-place shuffle are 'no-op's.
11783static bool isNoopShuffleMask(ArrayRef<int> Mask) {
11784 for (int i = 0, Size = Mask.size(); i < Size; ++i) {
11785 assert(Mask[i] >= -1 && "Out of bound mask element!");
11786 if (Mask[i] >= 0 && Mask[i] != i)
11787 return false;
11788 }
11789 return true;
11790}
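A minimal standalone illustration of the predicate above, written in plain C++ with std::vector; isNoop is just a stand-in name for this sketch.

#include <cassert>
#include <vector>

static bool isNoop(const std::vector<int> &Mask) {
  for (int i = 0, Size = (int)Mask.size(); i < Size; ++i)
    if (Mask[i] >= 0 && Mask[i] != i)   // -1 (undef) never forces a shuffle
      return false;
  return true;
}

int main() {
  assert(isNoop({0, 1, 2, 3}));      // in-place shuffle
  assert(isNoop({-1, 1, -1, 3}));    // undef elements stay in place
  assert(!isNoop({1, 0, 2, 3}));     // elements 0 and 1 actually move
}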
11791
11792/// Test whether there are elements crossing LaneSizeInBits lanes in this
11793/// shuffle mask.
11794///
11795/// X86 divides up its shuffles into in-lane and cross-lane shuffle operations
11796/// and we routinely test for these.
11797static bool isLaneCrossingShuffleMask(unsigned LaneSizeInBits,
11798 unsigned ScalarSizeInBits,
11799 ArrayRef<int> Mask) {
11800 assert(LaneSizeInBits && ScalarSizeInBits &&
11801 (LaneSizeInBits % ScalarSizeInBits) == 0 &&
11802 "Illegal shuffle lane size");
11803 int LaneSize = LaneSizeInBits / ScalarSizeInBits;
11804 int Size = Mask.size();
11805 for (int i = 0; i < Size; ++i)
11806 if (Mask[i] >= 0 && (Mask[i] % Size) / LaneSize != i / LaneSize)
11807 return true;
11808 return false;
11809}
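As a concrete illustration, here is a standalone sketch of the same check in plain C++ for a v8i32-like mask with 128-bit lanes; crossesLanes is an illustrative stand-in, not an LLVM helper.

#include <cassert>
#include <vector>

static bool crossesLanes(unsigned LaneBits, unsigned EltBits,
                         const std::vector<int> &Mask) {
  int LaneSize = (int)(LaneBits / EltBits);
  int Size = (int)Mask.size();
  for (int i = 0; i < Size; ++i)
    if (Mask[i] >= 0 && (Mask[i] % Size) / LaneSize != i / LaneSize)
      return true;
  return false;
}

int main() {
  // 8 x 32-bit elements, two 128-bit lanes of four elements each.
  assert(!crossesLanes(128, 32, {0, 1, 2, 3, 4, 5, 6, 7})); // identity, in-lane
  assert(crossesLanes(128, 32, {5, 1, 2, 3, 4, 5, 6, 7}));  // elt 5 pulled into lane 0
}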
11810
11811/// Test whether there are elements crossing 128-bit lanes in this
11812/// shuffle mask.
11813static bool is128BitLaneCrossingShuffleMask(MVT VT, ArrayRef<int> Mask) {
11814 return isLaneCrossingShuffleMask(128, VT.getScalarSizeInBits(), Mask);
11815}
11816
11817/// Test whether elements in each LaneSizeInBits lane in this shuffle mask come
11818/// from multiple lanes - this is different from isLaneCrossingShuffleMask to
11819/// better support 'repeated mask + lane permute' style shuffles.
11820static bool isMultiLaneShuffleMask(unsigned LaneSizeInBits,
11821 unsigned ScalarSizeInBits,
11822 ArrayRef<int> Mask) {
11823 assert(LaneSizeInBits && ScalarSizeInBits &&
11824 (LaneSizeInBits % ScalarSizeInBits) == 0 &&
11825 "Illegal shuffle lane size");
11826 int NumElts = Mask.size();
11827 int NumEltsPerLane = LaneSizeInBits / ScalarSizeInBits;
11828 int NumLanes = NumElts / NumEltsPerLane;
11829 if (NumLanes > 1) {
11830 for (int i = 0; i != NumLanes; ++i) {
11831 int SrcLane = -1;
11832 for (int j = 0; j != NumEltsPerLane; ++j) {
11833 int M = Mask[(i * NumEltsPerLane) + j];
11834 if (M < 0)
11835 continue;
11836 int Lane = (M % NumElts) / NumEltsPerLane;
11837 if (SrcLane >= 0 && SrcLane != Lane)
11838 return true;
11839 SrcLane = Lane;
11840 }
11841 }
11842 }
11843 return false;
11844}
11845
11846/// Test whether a shuffle mask is equivalent within each sub-lane.
11847///
11848/// This checks a shuffle mask to see if it is performing the same
11849/// lane-relative shuffle in each sub-lane. This trivially implies
11850/// that it is also not lane-crossing. It may however involve a blend from the
11851/// same lane of a second vector.
11852///
11853/// The specific repeated shuffle mask is populated in \p RepeatedMask, as it is
11854/// non-trivial to compute in the face of undef lanes. The representation is
11855/// suitable for use with existing 128-bit shuffles as entries from the second
11856/// vector have been remapped to [LaneSize, 2*LaneSize).
11857static bool isRepeatedShuffleMask(unsigned LaneSizeInBits, MVT VT,
11858 ArrayRef<int> Mask,
11859 SmallVectorImpl<int> &RepeatedMask) {
11860 auto LaneSize = LaneSizeInBits / VT.getScalarSizeInBits();
11861 RepeatedMask.assign(LaneSize, -1);
11862 int Size = Mask.size();
11863 for (int i = 0; i < Size; ++i) {
11864 assert(Mask[i] == SM_SentinelUndef || Mask[i] >= 0);
11865 if (Mask[i] < 0)
11866 continue;
11867 if ((Mask[i] % Size) / LaneSize != i / LaneSize)
11868 // This entry crosses lanes, so there is no way to model this shuffle.
11869 return false;
11870
11871 // Ok, handle the in-lane shuffles by detecting if and when they repeat.
11872 // Adjust second vector indices to start at LaneSize instead of Size.
11873 int LocalM = Mask[i] < Size ? Mask[i] % LaneSize
11874 : Mask[i] % LaneSize + LaneSize;
11875 if (RepeatedMask[i % LaneSize] < 0)
11876 // This is the first non-undef entry in this slot of a 128-bit lane.
11877 RepeatedMask[i % LaneSize] = LocalM;
11878 else if (RepeatedMask[i % LaneSize] != LocalM)
11879 // Found a mismatch with the repeated mask.
11880 return false;
11881 }
11882 return true;
11883}
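The following standalone sketch shows the repetition check and the remapping of second-vector entries for a two-lane example; it is plain C++, repeats128 is an illustrative stand-in, and -1 stands for SM_SentinelUndef.

#include <cassert>
#include <vector>

static bool repeats128(const std::vector<int> &Mask, int LaneSize,
                       std::vector<int> &Repeated) {
  int Size = (int)Mask.size();
  Repeated.assign(LaneSize, -1);
  for (int i = 0; i < Size; ++i) {
    if (Mask[i] < 0)
      continue;
    if ((Mask[i] % Size) / LaneSize != i / LaneSize)
      return false;                                    // crosses lanes
    int LocalM = Mask[i] < Size ? Mask[i] % LaneSize   // first input
                                : Mask[i] % LaneSize + LaneSize; // second input
    if (Repeated[i % LaneSize] < 0)
      Repeated[i % LaneSize] = LocalM;
    else if (Repeated[i % LaneSize] != LocalM)
      return false;                                    // mismatch between lanes
  }
  return true;
}

int main() {
  std::vector<int> Rep;
  // 8 x 32-bit blend that repeats per 128-bit lane: RepeatedMask is <0,5,2,7>.
  assert(repeats128({0, 9, 2, 11, 4, 13, 6, 15}, 4, Rep));
  assert(Rep == (std::vector<int>{0, 5, 2, 7}));
  // The two lanes disagree here, so the mask is not repeated.
  assert(!repeats128({0, 1, 2, 3, 7, 6, 5, 4}, 4, Rep));
}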
11884
11885/// Test whether a shuffle mask is equivalent within each 128-bit lane.
11886static bool
11887is128BitLaneRepeatedShuffleMask(MVT VT, ArrayRef<int> Mask,
11888 SmallVectorImpl<int> &RepeatedMask) {
11889 return isRepeatedShuffleMask(128, VT, Mask, RepeatedMask);
11890}
11891
11892static bool
11893is128BitLaneRepeatedShuffleMask(MVT VT, ArrayRef<int> Mask) {
11894 SmallVector<int, 32> RepeatedMask;
11895 return isRepeatedShuffleMask(128, VT, Mask, RepeatedMask);
11896}
11897
11898/// Test whether a shuffle mask is equivalent within each 256-bit lane.
11899static bool
11900is256BitLaneRepeatedShuffleMask(MVT VT, ArrayRef<int> Mask,
11901 SmallVectorImpl<int> &RepeatedMask) {
11902 return isRepeatedShuffleMask(256, VT, Mask, RepeatedMask);
11903}
11904
11905/// Test whether a target shuffle mask is equivalent within each sub-lane.
11906/// Unlike isRepeatedShuffleMask we must respect SM_SentinelZero.
11907static bool isRepeatedTargetShuffleMask(unsigned LaneSizeInBits,
11908 unsigned EltSizeInBits,
11909 ArrayRef<int> Mask,
11910 SmallVectorImpl<int> &RepeatedMask) {
11911 int LaneSize = LaneSizeInBits / EltSizeInBits;
11912 RepeatedMask.assign(LaneSize, SM_SentinelUndef);
11913 int Size = Mask.size();
11914 for (int i = 0; i < Size; ++i) {
11915 assert(isUndefOrZero(Mask[i]) || (Mask[i] >= 0));
11916 if (Mask[i] == SM_SentinelUndef)
11917 continue;
11918 if (Mask[i] == SM_SentinelZero) {
11919 if (!isUndefOrZero(RepeatedMask[i % LaneSize]))
11920 return false;
11921 RepeatedMask[i % LaneSize] = SM_SentinelZero;
11922 continue;
11923 }
11924 if ((Mask[i] % Size) / LaneSize != i / LaneSize)
11925 // This entry crosses lanes, so there is no way to model this shuffle.
11926 return false;
11927
11928 // Handle the in-lane shuffles by detecting if and when they repeat. Adjust
11929 // later vector indices to start at multiples of LaneSize instead of Size.
11930 int LaneM = Mask[i] / Size;
11931 int LocalM = (Mask[i] % LaneSize) + (LaneM * LaneSize);
11932 if (RepeatedMask[i % LaneSize] == SM_SentinelUndef)
11933 // This is the first non-undef entry in this slot of a 128-bit lane.
11934 RepeatedMask[i % LaneSize] = LocalM;
11935 else if (RepeatedMask[i % LaneSize] != LocalM)
11936 // Found a mismatch with the repeated mask.
11937 return false;
11938 }
11939 return true;
11940}
11941
11942/// Test whether a target shuffle mask is equivalent within each sub-lane.
11943/// Unlike isRepeatedShuffleMask we must respect SM_SentinelZero.
11944static bool isRepeatedTargetShuffleMask(unsigned LaneSizeInBits, MVT VT,
11945 ArrayRef<int> Mask,
11946 SmallVectorImpl<int> &RepeatedMask) {
11947 return isRepeatedTargetShuffleMask(LaneSizeInBits, VT.getScalarSizeInBits(),
11948 Mask, RepeatedMask);
11949}
11950
11951/// Checks whether the vector elements referenced by two shuffle masks are
11952/// equivalent.
11953static bool IsElementEquivalent(int MaskSize, SDValue Op, SDValue ExpectedOp,
11954 int Idx, int ExpectedIdx) {
11955 assert(0 <= Idx && Idx < MaskSize && 0 <= ExpectedIdx &&
11956 ExpectedIdx < MaskSize && "Out of range element index");
11957 if (!Op || !ExpectedOp || Op.getOpcode() != ExpectedOp.getOpcode())
11958 return false;
11959
11960 switch (Op.getOpcode()) {
11961 case ISD::BUILD_VECTOR:
11962 // If the values are build vectors, we can look through them to find
11963 // equivalent inputs that make the shuffles equivalent.
11964 // TODO: Handle MaskSize != Op.getNumOperands()?
11965 if (MaskSize == (int)Op.getNumOperands() &&
11966 MaskSize == (int)ExpectedOp.getNumOperands())
11967 return Op.getOperand(Idx) == ExpectedOp.getOperand(ExpectedIdx);
11968 break;
11969 case X86ISD::VBROADCAST:
11970 case X86ISD::VBROADCAST_LOAD:
11971 // TODO: Handle MaskSize != Op.getValueType().getVectorNumElements()?
11972 return (Op == ExpectedOp &&
11973 (int)Op.getValueType().getVectorNumElements() == MaskSize);
11974 case X86ISD::HADD:
11975 case X86ISD::HSUB:
11976 case X86ISD::FHADD:
11977 case X86ISD::FHSUB:
11978 case X86ISD::PACKSS:
11979 case X86ISD::PACKUS:
11980 // HOP(X,X) can refer to the elt from the lower/upper half of a lane.
11981 // TODO: Handle MaskSize != NumElts?
11982 // TODO: Handle HOP(X,Y) vs HOP(Y,X) equivalence cases.
11983 if (Op == ExpectedOp && Op.getOperand(0) == Op.getOperand(1)) {
11984 MVT VT = Op.getSimpleValueType();
11985 int NumElts = VT.getVectorNumElements();
11986 if (MaskSize == NumElts) {
11987 int NumLanes = VT.getSizeInBits() / 128;
11988 int NumEltsPerLane = NumElts / NumLanes;
11989 int NumHalfEltsPerLane = NumEltsPerLane / 2;
11990 bool SameLane =
11991 (Idx / NumEltsPerLane) == (ExpectedIdx / NumEltsPerLane);
11992 bool SameElt =
11993 (Idx % NumHalfEltsPerLane) == (ExpectedIdx % NumHalfEltsPerLane);
11994 return SameLane && SameElt;
11995 }
11996 }
11997 break;
11998 }
11999
12000 return false;
12001}
12002
12003/// Checks whether a shuffle mask is equivalent to an explicit list of
12004/// arguments.
12005///
12006/// This is a fast way to test a shuffle mask against a fixed pattern:
12007///
12008/// if (isShuffleEquivalent(Mask, {3, 2, 1, 0})) { ... }
12009///
12010/// It returns true if the mask is exactly as wide as the argument list, and
12011/// each element of the mask is either -1 (signifying undef) or the value given
12012/// in the argument.
12013static bool isShuffleEquivalent(ArrayRef<int> Mask, ArrayRef<int> ExpectedMask,
12014 SDValue V1 = SDValue(),
12015 SDValue V2 = SDValue()) {
12016 int Size = Mask.size();
12017 if (Size != (int)ExpectedMask.size())
12018 return false;
12019
12020 for (int i = 0; i < Size; ++i) {
12021 assert(Mask[i] >= -1 && "Out of bound mask element!");
12022 int MaskIdx = Mask[i];
12023 int ExpectedIdx = ExpectedMask[i];
12024 if (0 <= MaskIdx && MaskIdx != ExpectedIdx) {
12025 SDValue MaskV = MaskIdx < Size ? V1 : V2;
12026 SDValue ExpectedV = ExpectedIdx < Size ? V1 : V2;
12027 MaskIdx = MaskIdx < Size ? MaskIdx : (MaskIdx - Size);
12028 ExpectedIdx = ExpectedIdx < Size ? ExpectedIdx : (ExpectedIdx - Size);
12029 if (!IsElementEquivalent(Size, MaskV, ExpectedV, MaskIdx, ExpectedIdx))
12030 return false;
12031 }
12032 }
12033 return true;
12034}
12035
12036/// Checks whether a target shuffle mask is equivalent to an explicit pattern.
12037///
12038/// The masks must be exactly the same width.
12039///
12040/// If an element in Mask matches SM_SentinelUndef (-1) then the corresponding
12041/// value in ExpectedMask is always accepted. Otherwise the indices must match.
12042///
12043/// SM_SentinelZero is accepted as a valid negative index but must match in
12044/// both, or via a known bits test.
12045static bool isTargetShuffleEquivalent(MVT VT, ArrayRef<int> Mask,
12046 ArrayRef<int> ExpectedMask,
12047 const SelectionDAG &DAG,
12048 SDValue V1 = SDValue(),
12049 SDValue V2 = SDValue()) {
12050 int Size = Mask.size();
12051 if (Size != (int)ExpectedMask.size())
12052 return false;
12053 assert(llvm::all_of(ExpectedMask,
12054 [Size](int M) { return isInRange(M, 0, 2 * Size); }) &&
12055 "Illegal target shuffle mask");
12056
12057 // Check for out-of-range target shuffle mask indices.
12058 if (!isUndefOrZeroOrInRange(Mask, 0, 2 * Size))
12059 return false;
12060
12061 // Don't use V1/V2 if they're not the same size as the shuffle mask type.
12062 if (V1 && V1.getValueSizeInBits() != VT.getSizeInBits())
12063 V1 = SDValue();
12064 if (V2 && V2.getValueSizeInBits() != VT.getSizeInBits())
12065 V2 = SDValue();
12066
12067 APInt ZeroV1 = APInt::getNullValue(Size);
12068 APInt ZeroV2 = APInt::getNullValue(Size);
12069
12070 for (int i = 0; i < Size; ++i) {
12071 int MaskIdx = Mask[i];
12072 int ExpectedIdx = ExpectedMask[i];
12073 if (MaskIdx == SM_SentinelUndef || MaskIdx == ExpectedIdx)
12074 continue;
12075 if (MaskIdx == SM_SentinelZero) {
12076 // If we need this expected index to be a zero element, then update the
12077 // relevant zero mask and perform the known bits at the end to minimize
12078 // repeated computes.
12079 SDValue ExpectedV = ExpectedIdx < Size ? V1 : V2;
12080 if (ExpectedV &&
12081 Size == (int)ExpectedV.getValueType().getVectorNumElements()) {
12082 int BitIdx = ExpectedIdx < Size ? ExpectedIdx : (ExpectedIdx - Size);
12083 APInt &ZeroMask = ExpectedIdx < Size ? ZeroV1 : ZeroV2;
12084 ZeroMask.setBit(BitIdx);
12085 continue;
12086 }
12087 }
12088 if (MaskIdx >= 0) {
12089 SDValue MaskV = MaskIdx < Size ? V1 : V2;
12090 SDValue ExpectedV = ExpectedIdx < Size ? V1 : V2;
12091 MaskIdx = MaskIdx < Size ? MaskIdx : (MaskIdx - Size);
12092 ExpectedIdx = ExpectedIdx < Size ? ExpectedIdx : (ExpectedIdx - Size);
12093 if (IsElementEquivalent(Size, MaskV, ExpectedV, MaskIdx, ExpectedIdx))
12094 continue;
12095 }
12096 return false;
12097 }
12098 return (ZeroV1.isNullValue() || DAG.MaskedVectorIsZero(V1, ZeroV1)) &&
12099 (ZeroV2.isNullValue() || DAG.MaskedVectorIsZero(V2, ZeroV2));
12100}
12101
12102// Check if the shuffle mask is suitable for the AVX vpunpcklwd or vpunpckhwd
12103// instructions.
12104static bool isUnpackWdShuffleMask(ArrayRef<int> Mask, MVT VT,
12105 const SelectionDAG &DAG) {
12106 if (VT != MVT::v8i32 && VT != MVT::v8f32)
12107 return false;
12108
12109 SmallVector<int, 8> Unpcklwd;
12110 createUnpackShuffleMask(MVT::v8i16, Unpcklwd, /* Lo = */ true,
12111 /* Unary = */ false);
12112 SmallVector<int, 8> Unpckhwd;
12113 createUnpackShuffleMask(MVT::v8i16, Unpckhwd, /* Lo = */ false,
12114 /* Unary = */ false);
12115 bool IsUnpackwdMask = (isTargetShuffleEquivalent(VT, Mask, Unpcklwd, DAG) ||
12116 isTargetShuffleEquivalent(VT, Mask, Unpckhwd, DAG));
12117 return IsUnpackwdMask;
12118}
12119
12120static bool is128BitUnpackShuffleMask(ArrayRef<int> Mask,
12121 const SelectionDAG &DAG) {
12122 // Create 128-bit vector type based on mask size.
12123 MVT EltVT = MVT::getIntegerVT(128 / Mask.size());
12124 MVT VT = MVT::getVectorVT(EltVT, Mask.size());
12125
12126 // We can't assume a canonical shuffle mask, so try the commuted version too.
12127 SmallVector<int, 4> CommutedMask(Mask);
12128 ShuffleVectorSDNode::commuteMask(CommutedMask);
12129
12130 // Match any of unary/binary or low/high.
12131 for (unsigned i = 0; i != 4; ++i) {
12132 SmallVector<int, 16> UnpackMask;
12133 createUnpackShuffleMask(VT, UnpackMask, (i >> 1) % 2, i % 2);
12134 if (isTargetShuffleEquivalent(VT, Mask, UnpackMask, DAG) ||
12135 isTargetShuffleEquivalent(VT, CommutedMask, UnpackMask, DAG))
12136 return true;
12137 }
12138 return false;
12139}
12140
12141/// Return true if a shuffle mask chooses elements identically in its top and
12142/// bottom halves. For example, any splat mask has the same top and bottom
12143/// halves. If an element is undefined in only one half of the mask, the halves
12144/// are not considered identical.
12145static bool hasIdenticalHalvesShuffleMask(ArrayRef<int> Mask) {
12146 assert(Mask.size() % 2 == 0 && "Expecting even number of elements in mask");
12147 unsigned HalfSize = Mask.size() / 2;
12148 for (unsigned i = 0; i != HalfSize; ++i) {
12149 if (Mask[i] != Mask[i + HalfSize])
12150 return false;
12151 }
12152 return true;
12153}
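A short standalone illustration of the halves check, in plain C++; identicalHalves is a stand-in name for this sketch only.

#include <cassert>
#include <vector>

static bool identicalHalves(const std::vector<int> &Mask) {
  unsigned Half = Mask.size() / 2;
  for (unsigned i = 0; i != Half; ++i)
    if (Mask[i] != Mask[i + Half])
      return false;
  return true;
}

int main() {
  assert(identicalHalves({2, 2, 2, 2, 2, 2, 2, 2}));   // splat mask
  assert(identicalHalves({0, 1, 2, 3, 0, 1, 2, 3}));   // low half repeated
  assert(!identicalHalves({-1, 1, 2, 3, 0, 1, 2, 3})); // undef in only one half
}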
12154
12155/// Get a 4-lane 8-bit shuffle immediate for a mask.
12156///
12157/// This helper function produces an 8-bit shuffle immediate corresponding to
12158/// the ubiquitous shuffle encoding scheme used in x86 instructions for
12159/// shuffling 4 lanes. It can be used with most of the PSHUF instructions for
12160/// example.
12161///
12162/// NB: We rely heavily on "undef" masks preserving the input lane.
12163static unsigned getV4X86ShuffleImm(ArrayRef<int> Mask) {
12164 assert(Mask.size() == 4 && "Only 4-lane shuffle masks");
12165 assert(Mask[0] >= -1 && Mask[0] < 4 && "Out of bound mask element!");
12166 assert(Mask[1] >= -1 && Mask[1] < 4 && "Out of bound mask element!");
12167 assert(Mask[2] >= -1 && Mask[2] < 4 && "Out of bound mask element!");
12168 assert(Mask[3] >= -1 && Mask[3] < 4 && "Out of bound mask element!");
12169
12170 // If the mask only uses one non-undef element, then fully 'splat' it to
12171 // improve later broadcast matching.
12172 int FirstIndex = find_if(Mask, [](int M) { return M >= 0; }) - Mask.begin();
12173 assert(0 <= FirstIndex && FirstIndex < 4 && "All undef shuffle mask");
12174
12175 int FirstElt = Mask[FirstIndex];
12176 if (all_of(Mask, [FirstElt](int M) { return M < 0 || M == FirstElt; }))
12177 return (FirstElt << 6) | (FirstElt << 4) | (FirstElt << 2) | FirstElt;
12178
12179 unsigned Imm = 0;
12180 Imm |= (Mask[0] < 0 ? 0 : Mask[0]) << 0;
12181 Imm |= (Mask[1] < 0 ? 1 : Mask[1]) << 2;
12182 Imm |= (Mask[2] < 0 ? 2 : Mask[2]) << 4;
12183 Imm |= (Mask[3] < 0 ? 3 : Mask[3]) << 6;
12184 return Imm;
12185}
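A standalone sketch of the 2-bits-per-lane immediate encoding above, in plain C++; encodeV4ShuffleImm is an illustrative stand-in that omits the single-element splat canonicalization performed by the real helper.

#include <array>
#include <cassert>

static unsigned encodeV4ShuffleImm(const std::array<int, 4> &Mask) {
  unsigned Imm = 0;
  for (unsigned i = 0; i != 4; ++i) {
    int M = Mask[i] < 0 ? (int)i : Mask[i]; // undef keeps the input lane
    Imm |= (unsigned)M << (2 * i);
  }
  return Imm;
}

int main() {
  assert(encodeV4ShuffleImm({3, 2, 1, 0}) == 0x1B);   // full reverse
  assert(encodeV4ShuffleImm({0, 1, 2, 3}) == 0xE4);   // identity
  assert(encodeV4ShuffleImm({0, -1, 2, -1}) == 0xE4); // undefs default in place
}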
12186
12187static SDValue getV4X86ShuffleImm8ForMask(ArrayRef<int> Mask, const SDLoc &DL,
12188 SelectionDAG &DAG) {
12189 return DAG.getTargetConstant(getV4X86ShuffleImm(Mask), DL, MVT::i8);
12190}
12191
12192// The shuffle result is a sequence of zeros interleaved with elements
12193// a[0], a[1], ..., a[n], n >= 0, where the a[] elements appear in ascending
12194// order. Each element of Zeroable corresponds to a particular Mask element,
12195// as described in computeZeroableShuffleElements.
12196//
12197// The function looks for a sub-mask whose nonzero elements are in
12198// increasing order; if such a sub-mask exists, it returns true.
12199static bool isNonZeroElementsInOrder(const APInt &Zeroable,
12200 ArrayRef<int> Mask, const EVT &VectorType,
12201 bool &IsZeroSideLeft) {
12202 int NextElement = -1;
12203 // Check if the Mask's nonzero elements are in increasing order.
12204 for (int i = 0, e = Mask.size(); i < e; i++) {
12205 // Checks if the mask's zeros elements are built from only zeros.
12206 assert(Mask[i] >= -1 && "Out of bound mask element!");
12207 if (Mask[i] < 0)
12208 return false;
12209 if (Zeroable[i])
12210 continue;
12211 // Find the lowest non zero element
12212 if (NextElement < 0) {
12213 NextElement = Mask[i] != 0 ? VectorType.getVectorNumElements() : 0;
12214 IsZeroSideLeft = NextElement != 0;
12215 }
12216 // Exit if the mask's non zero elements are not in increasing order.
12217 if (NextElement != Mask[i])
12218 return false;
12219 NextElement++;
12220 }
12221 return true;
12222}
12223
12224/// Try to lower a shuffle with a single PSHUFB of V1 or V2.
12225static SDValue lowerShuffleWithPSHUFB(const SDLoc &DL, MVT VT,
12226 ArrayRef<int> Mask, SDValue V1,
12227 SDValue V2, const APInt &Zeroable,
12228 const X86Subtarget &Subtarget,
12229 SelectionDAG &DAG) {
12230 int Size = Mask.size();
12231 int LaneSize = 128 / VT.getScalarSizeInBits();
12232 const int NumBytes = VT.getSizeInBits() / 8;
12233 const int NumEltBytes = VT.getScalarSizeInBits() / 8;
12234
12235 assert((Subtarget.hasSSSE3() && VT.is128BitVector()) ||
12236 (Subtarget.hasAVX2() && VT.is256BitVector()) ||
12237 (Subtarget.hasBWI() && VT.is512BitVector()));
12238
12239 SmallVector<SDValue, 64> PSHUFBMask(NumBytes);
12240 // Sign bit set in i8 mask means zero element.
12241 SDValue ZeroMask = DAG.getConstant(0x80, DL, MVT::i8);
12242
12243 SDValue V;
12244 for (int i = 0; i < NumBytes; ++i) {
12245 int M = Mask[i / NumEltBytes];
12246 if (M < 0) {
12247 PSHUFBMask[i] = DAG.getUNDEF(MVT::i8);
12248 continue;
12249 }
12250 if (Zeroable[i / NumEltBytes]) {
12251 PSHUFBMask[i] = ZeroMask;
12252 continue;
12253 }
12254
12255 // We can only use a single input of V1 or V2.
12256 SDValue SrcV = (M >= Size ? V2 : V1);
12257 if (V && V != SrcV)
12258 return SDValue();
12259 V = SrcV;
12260 M %= Size;
12261
12262 // PSHUFB can't cross lanes, ensure this doesn't happen.
12263 if ((M / LaneSize) != ((i / NumEltBytes) / LaneSize))
12264 return SDValue();
12265
12266 M = M % LaneSize;
12267 M = M * NumEltBytes + (i % NumEltBytes);
12268 PSHUFBMask[i] = DAG.getConstant(M, DL, MVT::i8);
12269 }
12270 assert(V && "Failed to find a source input");
12271
12272 MVT I8VT = MVT::getVectorVT(MVT::i8, NumBytes);
12273 return DAG.getBitcast(
12274 VT, DAG.getNode(X86ISD::PSHUFB, DL, I8VT, DAG.getBitcast(I8VT, V),
12275 DAG.getBuildVector(I8VT, DL, PSHUFBMask)));
12276}
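To illustrate the byte-mask construction above, here is a standalone sketch in plain C++ restricted to a single-input, single 128-bit lane, no-zeroable v8i16-style shuffle; the names are illustrative only.

#include <array>
#include <cassert>

int main() {
  const int NumEltBytes = 2;                       // v8i16: 2 bytes per element
  std::array<int, 8> Mask = {4, 5, 6, 7, 0, 1, 2, 3};
  std::array<unsigned char, 16> PSHUFB{};
  for (int i = 0; i < 16; ++i) {
    int M = Mask[i / NumEltBytes];                 // element selector
    PSHUFB[i] = (unsigned char)(M * NumEltBytes + (i % NumEltBytes));
  }
  // Result element 0 reads source bytes 8 and 9; element 7 reads bytes 6 and 7.
  assert(PSHUFB[0] == 8 && PSHUFB[1] == 9);
  assert(PSHUFB[14] == 6 && PSHUFB[15] == 7);
}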
12277
12278static SDValue getMaskNode(SDValue Mask, MVT MaskVT,
12279 const X86Subtarget &Subtarget, SelectionDAG &DAG,
12280 const SDLoc &dl);
12281
12282// X86 has dedicated shuffle that can be lowered to VEXPAND
12283static SDValue lowerShuffleToEXPAND(const SDLoc &DL, MVT VT,
12284 const APInt &Zeroable,
12285 ArrayRef<int> Mask, SDValue &V1,
12286 SDValue &V2, SelectionDAG &DAG,
12287 const X86Subtarget &Subtarget) {
12288 bool IsLeftZeroSide = true;
12289 if (!isNonZeroElementsInOrder(Zeroable, Mask, V1.getValueType(),
12290 IsLeftZeroSide))
12291 return SDValue();
12292 unsigned VEXPANDMask = (~Zeroable).getZExtValue();
12293 MVT IntegerType =
12294 MVT::getIntegerVT(std::max((int)VT.getVectorNumElements(), 8));
12295 SDValue MaskNode = DAG.getConstant(VEXPANDMask, DL, IntegerType);
12296 unsigned NumElts = VT.getVectorNumElements();
12297 assert((NumElts == 4 || NumElts == 8 || NumElts == 16) &&
12298 "Unexpected number of vector elements");
12299 SDValue VMask = getMaskNode(MaskNode, MVT::getVectorVT(MVT::i1, NumElts),
12300 Subtarget, DAG, DL);
12301 SDValue ZeroVector = getZeroVector(VT, Subtarget, DAG, DL);
12302 SDValue ExpandedVector = IsLeftZeroSide ? V2 : V1;
12303 return DAG.getNode(X86ISD::EXPAND, DL, VT, ExpandedVector, ZeroVector, VMask);
12304}
12305
12306static bool matchShuffleWithUNPCK(MVT VT, SDValue &V1, SDValue &V2,
12307 unsigned &UnpackOpcode, bool IsUnary,
12308 ArrayRef<int> TargetMask, const SDLoc &DL,
12309 SelectionDAG &DAG,
12310 const X86Subtarget &Subtarget) {
12311 int NumElts = VT.getVectorNumElements();
12312
12313 bool Undef1 = true, Undef2 = true, Zero1 = true, Zero2 = true;
12314 for (int i = 0; i != NumElts; i += 2) {
12315 int M1 = TargetMask[i + 0];
12316 int M2 = TargetMask[i + 1];
12317 Undef1 &= (SM_SentinelUndef == M1);
12318 Undef2 &= (SM_SentinelUndef == M2);
12319 Zero1 &= isUndefOrZero(M1);
12320 Zero2 &= isUndefOrZero(M2);
12321 }
12322 assert(!((Undef1 || Zero1) && (Undef2 || Zero2)) &&
12323 "Zeroable shuffle detected");
12324
12325 // Attempt to match the target mask against the unpack lo/hi mask patterns.
12326 SmallVector<int, 64> Unpckl, Unpckh;
12327 createUnpackShuffleMask(VT, Unpckl, /* Lo = */ true, IsUnary);
12328 if (isTargetShuffleEquivalent(VT, TargetMask, Unpckl, DAG, V1,
12329 (IsUnary ? V1 : V2))) {
12330 UnpackOpcode = X86ISD::UNPCKL;
12331 V2 = (Undef2 ? DAG.getUNDEF(VT) : (IsUnary ? V1 : V2));
12332 V1 = (Undef1 ? DAG.getUNDEF(VT) : V1);
12333 return true;
12334 }
12335
12336 createUnpackShuffleMask(VT, Unpckh, /* Lo = */ false, IsUnary);
12337 if (isTargetShuffleEquivalent(VT, TargetMask, Unpckh, DAG, V1,
12338 (IsUnary ? V1 : V2))) {
12339 UnpackOpcode = X86ISD::UNPCKH;
12340 V2 = (Undef2 ? DAG.getUNDEF(VT) : (IsUnary ? V1 : V2));
12341 V1 = (Undef1 ? DAG.getUNDEF(VT) : V1);
12342 return true;
12343 }
12344
12345 // If an unary shuffle, attempt to match as an unpack lo/hi with zero.
12346 if (IsUnary && (Zero1 || Zero2)) {
12347 // Don't bother if we can blend instead.
12348 if ((Subtarget.hasSSE41() || VT == MVT::v2i64 || VT == MVT::v2f64) &&
12349 isSequentialOrUndefOrZeroInRange(TargetMask, 0, NumElts, 0))
12350 return false;
12351
12352 bool MatchLo = true, MatchHi = true;
12353 for (int i = 0; (i != NumElts) && (MatchLo || MatchHi); ++i) {
12354 int M = TargetMask[i];
12355
12356 // Ignore if the input is known to be zero or the index is undef.
12357 if ((((i & 1) == 0) && Zero1) || (((i & 1) == 1) && Zero2) ||
12358 (M == SM_SentinelUndef))
12359 continue;
12360
12361 MatchLo &= (M == Unpckl[i]);
12362 MatchHi &= (M == Unpckh[i]);
12363 }
12364
12365 if (MatchLo || MatchHi) {
12366 UnpackOpcode = MatchLo ? X86ISD::UNPCKL : X86ISD::UNPCKH;
12367 V2 = Zero2 ? getZeroVector(VT, Subtarget, DAG, DL) : V1;
12368 V1 = Zero1 ? getZeroVector(VT, Subtarget, DAG, DL) : V1;
12369 return true;
12370 }
12371 }
12372
12373 // If a binary shuffle, commute and try again.
12374 if (!IsUnary) {
12375 ShuffleVectorSDNode::commuteMask(Unpckl);
12376 if (isTargetShuffleEquivalent(VT, TargetMask, Unpckl, DAG)) {
12377 UnpackOpcode = X86ISD::UNPCKL;
12378 std::swap(V1, V2);
12379 return true;
12380 }
12381
12382 ShuffleVectorSDNode::commuteMask(Unpckh);
12383 if (isTargetShuffleEquivalent(VT, TargetMask, Unpckh, DAG)) {
12384 UnpackOpcode = X86ISD::UNPCKH;
12385 std::swap(V1, V2);
12386 return true;
12387 }
12388 }
12389
12390 return false;
12391}
12392
12393// X86 has dedicated unpack instructions that can handle specific blend
12394// operations: UNPCKH and UNPCKL.
12395static SDValue lowerShuffleWithUNPCK(const SDLoc &DL, MVT VT,
12396 ArrayRef<int> Mask, SDValue V1, SDValue V2,
12397 SelectionDAG &DAG) {
12398 SmallVector<int, 8> Unpckl;
12399 createUnpackShuffleMask(VT, Unpckl, /* Lo = */ true, /* Unary = */ false);
12400 if (isShuffleEquivalent(Mask, Unpckl, V1, V2))
12401 return DAG.getNode(X86ISD::UNPCKL, DL, VT, V1, V2);
12402
12403 SmallVector<int, 8> Unpckh;
12404 createUnpackShuffleMask(VT, Unpckh, /* Lo = */ false, /* Unary = */ false);
12405 if (isShuffleEquivalent(Mask, Unpckh, V1, V2))
12406 return DAG.getNode(X86ISD::UNPCKH, DL, VT, V1, V2);
12407
12408 // Commute and try again.
12409 ShuffleVectorSDNode::commuteMask(Unpckl);
12410 if (isShuffleEquivalent(Mask, Unpckl, V1, V2))
12411 return DAG.getNode(X86ISD::UNPCKL, DL, VT, V2, V1);
12412
12413 ShuffleVectorSDNode::commuteMask(Unpckh);
12414 if (isShuffleEquivalent(Mask, Unpckh, V1, V2))
12415 return DAG.getNode(X86ISD::UNPCKH, DL, VT, V2, V1);
12416
12417 return SDValue();
12418}
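For reference, here is a standalone sketch of the binary unpack masks being matched, in plain C++; unpackMask is an illustrative stand-in that covers only the single 128-bit lane case (wider types repeat the pattern per lane).

#include <cassert>
#include <vector>

static std::vector<int> unpackMask(unsigned NumElts, bool Lo) {
  std::vector<int> Mask;
  unsigned Offset = Lo ? 0 : NumElts / 2;
  for (unsigned i = 0; i != NumElts / 2; ++i) {
    Mask.push_back((int)(Offset + i));           // element from V1
    Mask.push_back((int)(Offset + i + NumElts)); // element from V2
  }
  return Mask;
}

int main() {
  // 4 x 32-bit: UNPCKL interleaves the low halves, UNPCKH the high halves.
  assert(unpackMask(4, /*Lo=*/true) == (std::vector<int>{0, 4, 1, 5}));
  assert(unpackMask(4, /*Lo=*/false) == (std::vector<int>{2, 6, 3, 7}));
}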
12419
12420/// Check if the mask can be mapped to a preliminary shuffle (vperm 64-bit)
12421/// followed by unpack 256-bit.
12422static SDValue lowerShuffleWithUNPCK256(const SDLoc &DL, MVT VT,
12423 ArrayRef<int> Mask, SDValue V1,
12424 SDValue V2, SelectionDAG &DAG) {
12425 SmallVector<int, 32> Unpckl, Unpckh;
12426 createSplat2ShuffleMask(VT, Unpckl, /* Lo */ true);
12427 createSplat2ShuffleMask(VT, Unpckh, /* Lo */ false);
12428
12429 unsigned UnpackOpcode;
12430 if (isShuffleEquivalent(Mask, Unpckl, V1, V2))
12431 UnpackOpcode = X86ISD::UNPCKL;
12432 else if (isShuffleEquivalent(Mask, Unpckh, V1, V2))
12433 UnpackOpcode = X86ISD::UNPCKH;
12434 else
12435 return SDValue();
12436
12437 // This is a "natural" unpack operation (rather than the 128-bit sectored
12438 // operation implemented by AVX). We need to rearrange 64-bit chunks of the
12439 // input in order to use the x86 instruction.
12440 V1 = DAG.getVectorShuffle(MVT::v4f64, DL, DAG.getBitcast(MVT::v4f64, V1),
12441 DAG.getUNDEF(MVT::v4f64), {0, 2, 1, 3});
12442 V1 = DAG.getBitcast(VT, V1);
12443 return DAG.getNode(UnpackOpcode, DL, VT, V1, V1);
12444}
12445
12446// Check if the mask can be mapped to a TRUNCATE or VTRUNC, truncating the
12447// source into the lower elements and zeroing the upper elements.
12448static bool matchShuffleAsVTRUNC(MVT &SrcVT, MVT &DstVT, MVT VT,
12449 ArrayRef<int> Mask, const APInt &Zeroable,
12450 const X86Subtarget &Subtarget) {
12451 if (!VT.is512BitVector() && !Subtarget.hasVLX())
12452 return false;
12453
12454 unsigned NumElts = Mask.size();
12455 unsigned EltSizeInBits = VT.getScalarSizeInBits();
12456 unsigned MaxScale = 64 / EltSizeInBits;
12457
12458 for (unsigned Scale = 2; Scale <= MaxScale; Scale += Scale) {
12459 unsigned SrcEltBits = EltSizeInBits * Scale;
12460 if (SrcEltBits < 32 && !Subtarget.hasBWI())
12461 continue;
12462 unsigned NumSrcElts = NumElts / Scale;
12463 if (!isSequentialOrUndefInRange(Mask, 0, NumSrcElts, 0, Scale))
12464 continue;
12465 unsigned UpperElts = NumElts - NumSrcElts;
12466 if (!Zeroable.extractBits(UpperElts, NumSrcElts).isAllOnes())
12467 continue;
12468 SrcVT = MVT::getIntegerVT(EltSizeInBits * Scale);
12469 SrcVT = MVT::getVectorVT(SrcVT, NumSrcElts);
12470 DstVT = MVT::getIntegerVT(EltSizeInBits);
12471 if ((NumSrcElts * EltSizeInBits) >= 128) {
12472 // ISD::TRUNCATE
12473 DstVT = MVT::getVectorVT(DstVT, NumSrcElts);
12474 } else {
12475 // X86ISD::VTRUNC
12476 DstVT = MVT::getVectorVT(DstVT, 128 / EltSizeInBits);
12477 }
12478 return true;
12479 }
12480
12481 return false;
12482}
12483
12484// Helper to create TRUNCATE/VTRUNC nodes, optionally with zero/undef upper
12485// element padding to the final DstVT.
12486static SDValue getAVX512TruncNode(const SDLoc &DL, MVT DstVT, SDValue Src,
12487 const X86Subtarget &Subtarget,
12488 SelectionDAG &DAG, bool ZeroUppers) {
12489 MVT SrcVT = Src.getSimpleValueType();
12490 MVT DstSVT = DstVT.getScalarType();
12491 unsigned NumDstElts = DstVT.getVectorNumElements();
12492 unsigned NumSrcElts = SrcVT.getVectorNumElements();
12493 unsigned DstEltSizeInBits = DstVT.getScalarSizeInBits();
12494
12495 if (!DAG.getTargetLoweringInfo().isTypeLegal(SrcVT))
12496 return SDValue();
12497
12498 // Perform a direct ISD::TRUNCATE if possible.
12499 if (NumSrcElts == NumDstElts)
12500 return DAG.getNode(ISD::TRUNCATE, DL, DstVT, Src);
12501
12502 if (NumSrcElts > NumDstElts) {
12503 MVT TruncVT = MVT::getVectorVT(DstSVT, NumSrcElts);
12504 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, TruncVT, Src);
12505 return extractSubVector(Trunc, 0, DAG, DL, DstVT.getSizeInBits());
12506 }
12507
12508 if ((NumSrcElts * DstEltSizeInBits) >= 128) {
12509 MVT TruncVT = MVT::getVectorVT(DstSVT, NumSrcElts);
12510 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, TruncVT, Src);
12511 return widenSubVector(Trunc, ZeroUppers, Subtarget, DAG, DL,
12512 DstVT.getSizeInBits());
12513 }
12514
12515 // Non-VLX targets must truncate from a 512-bit type, so we need to
12516 // widen, truncate and then possibly extract the original subvector.
12517 if (!Subtarget.hasVLX() && !SrcVT.is512BitVector()) {
12518 SDValue NewSrc = widenSubVector(Src, ZeroUppers, Subtarget, DAG, DL, 512);
12519 return getAVX512TruncNode(DL, DstVT, NewSrc, Subtarget, DAG, ZeroUppers);
12520 }
12521
12522 // Fallback to a X86ISD::VTRUNC, padding if necessary.
12523 MVT TruncVT = MVT::getVectorVT(DstSVT, 128 / DstEltSizeInBits);
12524 SDValue Trunc = DAG.getNode(X86ISD::VTRUNC, DL, TruncVT, Src);
12525 if (DstVT != TruncVT)
12526 Trunc = widenSubVector(Trunc, ZeroUppers, Subtarget, DAG, DL,
12527 DstVT.getSizeInBits());
12528 return Trunc;
12529}
12530
12531// Try to lower trunc+vector_shuffle to a vpmovdb or a vpmovdw instruction.
12532//
12533// An example is the following:
12534//
12535// t0: ch = EntryToken
12536// t2: v4i64,ch = CopyFromReg t0, Register:v4i64 %0
12537// t25: v4i32 = truncate t2
12538// t41: v8i16 = bitcast t25
12539// t21: v8i16 = BUILD_VECTOR undef:i16, undef:i16, undef:i16, undef:i16,
12540// Constant:i16<0>, Constant:i16<0>, Constant:i16<0>, Constant:i16<0>
12541// t51: v8i16 = vector_shuffle<0,2,4,6,12,13,14,15> t41, t21
12542// t18: v2i64 = bitcast t51
12543//
12544// One can just use a single vpmovdw instruction; without avx512vl we need to
12545// use the zmm variant and extract the lower subvector, padding with zeroes.
12546// TODO: Merge with lowerShuffleAsVTRUNC.
12547static SDValue lowerShuffleWithVPMOV(const SDLoc &DL, MVT VT, SDValue V1,
12548 SDValue V2, ArrayRef<int> Mask,
12549 const APInt &Zeroable,
12550 const X86Subtarget &Subtarget,
12551 SelectionDAG &DAG) {
12552 assert((VT == MVT::v16i8 || VT == MVT::v8i16) && "Unexpected VTRUNC type");
12553 if (!Subtarget.hasAVX512())
12554 return SDValue();
12555
12556 unsigned NumElts = VT.getVectorNumElements();
12557 unsigned EltSizeInBits = VT.getScalarSizeInBits();
12558 unsigned MaxScale = 64 / EltSizeInBits;
12559 for (unsigned Scale = 2; Scale <= MaxScale; Scale += Scale) {
12560 unsigned SrcEltBits = EltSizeInBits * Scale;
12561 unsigned NumSrcElts = NumElts / Scale;
12562 unsigned UpperElts = NumElts - NumSrcElts;
12563 if (!isSequentialOrUndefInRange(Mask, 0, NumSrcElts, 0, Scale) ||
12564 !Zeroable.extractBits(UpperElts, NumSrcElts).isAllOnes())
12565 continue;
12566
12567 // Attempt to find a matching source truncation, but as a fall back VLX
12568 // cases can use the VPMOV directly.
12569 SDValue Src = peekThroughBitcasts(V1);
12570 if (Src.getOpcode() == ISD::TRUNCATE &&
12571 Src.getScalarValueSizeInBits() == SrcEltBits) {
12572 Src = Src.getOperand(0);
12573 } else if (Subtarget.hasVLX()) {
12574 MVT SrcSVT = MVT::getIntegerVT(SrcEltBits);
12575 MVT SrcVT = MVT::getVectorVT(SrcSVT, NumSrcElts);
12576 Src = DAG.getBitcast(SrcVT, Src);
12577 // Don't do this if PACKSS/PACKUS could perform it cheaper.
12578 if (Scale == 2 &&
12579 ((DAG.ComputeNumSignBits(Src) > EltSizeInBits) ||
12580 (DAG.computeKnownBits(Src).countMinLeadingZeros() >= EltSizeInBits)))
12581 return SDValue();
12582 } else
12583 return SDValue();
12584
12585 // VPMOVWB is only available with avx512bw.
12586 if (!Subtarget.hasBWI() && Src.getScalarValueSizeInBits() < 32)
12587 return SDValue();
12588
12589 bool UndefUppers = isUndefInRange(Mask, NumSrcElts, UpperElts);
12590 return getAVX512TruncNode(DL, VT, Src, Subtarget, DAG, !UndefUppers);
12591 }
12592
12593 return SDValue();
12594}
12595
12596// Attempt to match binary shuffle patterns as a truncate.
12597static SDValue lowerShuffleAsVTRUNC(const SDLoc &DL, MVT VT, SDValue V1,
12598 SDValue V2, ArrayRef<int> Mask,
12599 const APInt &Zeroable,
12600 const X86Subtarget &Subtarget,
12601 SelectionDAG &DAG) {
12602 assert((VT.is128BitVector() || VT.is256BitVector()) &&
12603 "Unexpected VTRUNC type");
12604 if (!Subtarget.hasAVX512())
12605 return SDValue();
12606
12607 unsigned NumElts = VT.getVectorNumElements();
12608 unsigned EltSizeInBits = VT.getScalarSizeInBits();
12609 unsigned MaxScale = 64 / EltSizeInBits;
12610 for (unsigned Scale = 2; Scale <= MaxScale; Scale += Scale) {
12611 // TODO: Support non-BWI VPMOVWB truncations?
12612 unsigned SrcEltBits = EltSizeInBits * Scale;
12613 if (SrcEltBits < 32 && !Subtarget.hasBWI())
12614 continue;
12615
12616 // Match shuffle <Ofs,Ofs+Scale,Ofs+2*Scale,..,undef_or_zero,undef_or_zero>
12617 // Bail if the V2 elements are undef.
12618 unsigned NumHalfSrcElts = NumElts / Scale;
12619 unsigned NumSrcElts = 2 * NumHalfSrcElts;
12620 for (unsigned Offset = 0; Offset != Scale; ++Offset) {
12621 if (!isSequentialOrUndefInRange(Mask, 0, NumSrcElts, Offset, Scale) ||
12622 isUndefInRange(Mask, NumHalfSrcElts, NumHalfSrcElts))
12623 continue;
12624
12625 // The elements beyond the truncation must be undef/zero.
12626 unsigned UpperElts = NumElts - NumSrcElts;
12627 if (UpperElts > 0 &&
12628 !Zeroable.extractBits(UpperElts, NumSrcElts).isAllOnes())
12629 continue;
12630 bool UndefUppers =
12631 UpperElts > 0 && isUndefInRange(Mask, NumSrcElts, UpperElts);
12632
12633 // For offset truncations, ensure that the concat is cheap.
12634 if (Offset) {
12635 auto IsCheapConcat = [&](SDValue Lo, SDValue Hi) {
12636 if (Lo.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
12637 Hi.getOpcode() == ISD::EXTRACT_SUBVECTOR)
12638 return Lo.getOperand(0) == Hi.getOperand(0);
12639 if (ISD::isNormalLoad(Lo.getNode()) &&
12640 ISD::isNormalLoad(Hi.getNode())) {
12641 auto *LDLo = cast<LoadSDNode>(Lo);
12642 auto *LDHi = cast<LoadSDNode>(Hi);
12643 return DAG.areNonVolatileConsecutiveLoads(
12644 LDHi, LDLo, Lo.getValueType().getStoreSize(), 1);
12645 }
12646 return false;
12647 };
12648 if (!IsCheapConcat(V1, V2))
12649 continue;
12650 }
12651
12652 // As we're using both sources, we need to concat them together
12653 // and truncate from the double-sized src.
12654 MVT ConcatVT = MVT::getVectorVT(VT.getScalarType(), NumElts * 2);
12655 SDValue Src = DAG.getNode(ISD::CONCAT_VECTORS, DL, ConcatVT, V1, V2);
12656
12657 MVT SrcSVT = MVT::getIntegerVT(SrcEltBits);
12658 MVT SrcVT = MVT::getVectorVT(SrcSVT, NumSrcElts);
12659 Src = DAG.getBitcast(SrcVT, Src);
12660
12661 // Shift the offset'd elements into place for the truncation.
12662 // TODO: Use getTargetVShiftByConstNode.
12663 if (Offset)
12664 Src = DAG.getNode(
12665 X86ISD::VSRLI, DL, SrcVT, Src,
12666 DAG.getTargetConstant(Offset * EltSizeInBits, DL, MVT::i8));
12667
12668 return getAVX512TruncNode(DL, VT, Src, Subtarget, DAG, !UndefUppers);
12669 }
12670 }
12671
12672 return SDValue();
12673}
12674
12675/// Check whether a compaction lowering can be done by dropping even/odd
12676/// elements and compute how many times even/odd elements must be dropped.
12677///
12678/// This handles shuffles which take every Nth element where N is a power of
12679/// two. Example shuffle masks:
12680///
12681/// (even)
12682/// N = 1: 0, 2, 4, 6, 8, 10, 12, 14, 0, 2, 4, 6, 8, 10, 12, 14
12683/// N = 1: 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30
12684/// N = 2: 0, 4, 8, 12, 0, 4, 8, 12, 0, 4, 8, 12, 0, 4, 8, 12
12685/// N = 2: 0, 4, 8, 12, 16, 20, 24, 28, 0, 4, 8, 12, 16, 20, 24, 28
12686/// N = 3: 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8
12687/// N = 3: 0, 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24
12688///
12689/// (odd)
12690/// N = 1: 1, 3, 5, 7, 9, 11, 13, 15, 0, 2, 4, 6, 8, 10, 12, 14
12691/// N = 1: 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31
12692///
12693/// Any of these lanes can of course be undef.
12694///
12695/// This routine only supports N <= 3.
12696/// FIXME: Evaluate whether either AVX or AVX-512 have any opportunities here
12697/// for larger N.
12698///
12699/// \returns N above, or the number of times even/odd elements must be dropped
12700/// if there is such a number. Otherwise returns zero.
12701static int canLowerByDroppingElements(ArrayRef<int> Mask, bool MatchEven,
12702 bool IsSingleInput) {
12703 // The modulus for the shuffle vector entries is based on whether this is
12704 // a single input or not.
12705 int ShuffleModulus = Mask.size() * (IsSingleInput ? 1 : 2);
12706 assert(isPowerOf2_32((uint32_t)ShuffleModulus) &&
12707 "We should only be called with masks with a power-of-2 size!");
12708
12709 uint64_t ModMask = (uint64_t)ShuffleModulus - 1;
12710 int Offset = MatchEven ? 0 : 1;
12711
12712 // We track whether the input is viable for all power-of-2 strides 2^1, 2^2,
12713 // and 2^3 simultaneously. This is because we may have ambiguity with
12714 // partially undef inputs.
12715 bool ViableForN[3] = {true, true, true};
12716
12717 for (int i = 0, e = Mask.size(); i < e; ++i) {
12718 // Ignore undef lanes, we'll optimistically collapse them to the pattern we
12719 // want.
12720 if (Mask[i] < 0)
12721 continue;
12722
12723 bool IsAnyViable = false;
12724 for (unsigned j = 0; j != std::size(ViableForN); ++j)
12725 if (ViableForN[j]) {
12726 uint64_t N = j + 1;
12727
12728 // The shuffle mask must be equal to (i * 2^N) % M.
12729 if ((uint64_t)(Mask[i] - Offset) == (((uint64_t)i << N) & ModMask))
12730 IsAnyViable = true;
12731 else
12732 ViableForN[j] = false;
12733 }
12734 // Early exit if we exhaust the possible powers of two.
12735 if (!IsAnyViable)
12736 break;
12737 }
12738
12739 for (unsigned j = 0; j != std::size(ViableForN); ++j)
12740 if (ViableForN[j])
12741 return j + 1;
12742
12743 // Return 0 as there is no viable power of two.
12744 return 0;
12745}
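To make the stride test above concrete, here is a minimal standalone sketch (plain C++, no LLVM types; the helper name droppingFactor is hypothetical) that applies the same (Mask[i] - Offset) == ((i << N) & ModMask) check to a 16-element single-input mask and reports which N survives:

#include <cstdint>
#include <cstdio>
#include <vector>

// Returns the stride exponent N in [1, 3] that Mask matches, or 0 if none does.
static int droppingFactor(const std::vector<int> &Mask, bool MatchEven,
                          bool IsSingleInput) {
  uint64_t ModMask = Mask.size() * (IsSingleInput ? 1 : 2) - 1;
  int Offset = MatchEven ? 0 : 1;
  bool Viable[3] = {true, true, true};
  for (size_t i = 0; i != Mask.size(); ++i) {
    if (Mask[i] < 0)
      continue; // undef lanes match every stride
    for (unsigned j = 0; j != 3; ++j)
      if (Viable[j] &&
          (uint64_t)(Mask[i] - Offset) != (((uint64_t)i << (j + 1)) & ModMask))
        Viable[j] = false;
  }
  for (unsigned j = 0; j != 3; ++j)
    if (Viable[j])
      return j + 1;
  return 0;
}

int main() {
  // "Take every even element" of a single 16-element input: expect N = 1.
  std::vector<int> Even = {0, 2, 4, 6, 8, 10, 12, 14,
                           0, 2, 4, 6, 8, 10, 12, 14};
  std::printf("N = %d\n", droppingFactor(Even, /*MatchEven=*/true,
                                         /*IsSingleInput=*/true));
}

The N = 2 and N = 3 candidates are eliminated at index 1 (the mask wants element 2 there, while those strides would want 4 and 8), so only N = 1 remains, matching the first example mask in the comment above.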
12746
12747// X86 has dedicated pack instructions that can handle specific truncation
12748// operations: PACKSS and PACKUS.
12749// Checks for compaction shuffle masks if MaxStages > 1.
12750// TODO: Add support for matching multiple PACKSS/PACKUS stages.
12751static bool matchShuffleWithPACK(MVT VT, MVT &SrcVT, SDValue &V1, SDValue &V2,
12752 unsigned &PackOpcode, ArrayRef<int> TargetMask,
12753 const SelectionDAG &DAG,
12754 const X86Subtarget &Subtarget,
12755 unsigned MaxStages = 1) {
12756 unsigned NumElts = VT.getVectorNumElements();
12757 unsigned BitSize = VT.getScalarSizeInBits();
12758 assert(0 < MaxStages && MaxStages <= 3 && (BitSize << MaxStages) <= 64 &&
12759 "Illegal maximum compaction");
12760
12761 auto MatchPACK = [&](SDValue N1, SDValue N2, MVT PackVT) {
12762 unsigned NumSrcBits = PackVT.getScalarSizeInBits();
12763 unsigned NumPackedBits = NumSrcBits - BitSize;
12764 N1 = peekThroughBitcasts(N1);
12765 N2 = peekThroughBitcasts(N2);
12766 unsigned NumBits1 = N1.getScalarValueSizeInBits();
12767 unsigned NumBits2 = N2.getScalarValueSizeInBits();
12768 bool IsZero1 = llvm::isNullOrNullSplat(N1, /*AllowUndefs*/ false);
12769 bool IsZero2 = llvm::isNullOrNullSplat(N2, /*AllowUndefs*/ false);
12770 if ((!N1.isUndef() && !IsZero1 && NumBits1 != NumSrcBits) ||
12771 (!N2.isUndef() && !IsZero2 && NumBits2 != NumSrcBits))
12772 return false;
12773 if (Subtarget.hasSSE41() || BitSize == 8) {
12774 APInt ZeroMask = APInt::getHighBitsSet(NumSrcBits, NumPackedBits);
12775 if ((N1.isUndef() || IsZero1 || DAG.MaskedValueIsZero(N1, ZeroMask)) &&
12776 (N2.isUndef() || IsZero2 || DAG.MaskedValueIsZero(N2, ZeroMask))) {
12777 V1 = N1;
12778 V2 = N2;
12779 SrcVT = PackVT;
12780 PackOpcode = X86ISD::PACKUS;
12781 return true;
12782 }
12783 }
12784 bool IsAllOnes1 = llvm::isAllOnesOrAllOnesSplat(N1, /*AllowUndefs*/ false);
12785 bool IsAllOnes2 = llvm::isAllOnesOrAllOnesSplat(N2, /*AllowUndefs*/ false);
12786 if ((N1.isUndef() || IsZero1 || IsAllOnes1 ||
12787 DAG.ComputeNumSignBits(N1) > NumPackedBits) &&
12788 (N2.isUndef() || IsZero2 || IsAllOnes2 ||
12789 DAG.ComputeNumSignBits(N2) > NumPackedBits)) {
12790 V1 = N1;
12791 V2 = N2;
12792 SrcVT = PackVT;
12793 PackOpcode = X86ISD::PACKSS;
12794 return true;
12795 }
12796 return false;
12797 };
12798
12799 // Attempt to match against wider and wider compaction patterns.
12800 for (unsigned NumStages = 1; NumStages <= MaxStages; ++NumStages) {
12801 MVT PackSVT = MVT::getIntegerVT(BitSize << NumStages);
12802 MVT PackVT = MVT::getVectorVT(PackSVT, NumElts >> NumStages);
12803
12804 // Try binary shuffle.
12805 SmallVector<int, 32> BinaryMask;
12806 createPackShuffleMask(VT, BinaryMask, false, NumStages);
12807 if (isTargetShuffleEquivalent(VT, TargetMask, BinaryMask, DAG, V1, V2))
12808 if (MatchPACK(V1, V2, PackVT))
12809 return true;
12810
12811 // Try unary shuffle.
12812 SmallVector<int, 32> UnaryMask;
12813 createPackShuffleMask(VT, UnaryMask, true, NumStages);
12814 if (isTargetShuffleEquivalent(VT, TargetMask, UnaryMask, DAG, V1))
12815 if (MatchPACK(V1, V1, PackVT))
12816 return true;
12817 }
12818
12819 return false;
12820}
12821
12822static SDValue lowerShuffleWithPACK(const SDLoc &DL, MVT VT, ArrayRef<int> Mask,
12823 SDValue V1, SDValue V2, SelectionDAG &DAG,
12824 const X86Subtarget &Subtarget) {
12825 MVT PackVT;
12826 unsigned PackOpcode;
12827 unsigned SizeBits = VT.getSizeInBits();
12828 unsigned EltBits = VT.getScalarSizeInBits();
12829 unsigned MaxStages = Log2_32(64 / EltBits);
12830 if (!matchShuffleWithPACK(VT, PackVT, V1, V2, PackOpcode, Mask, DAG,
12831 Subtarget, MaxStages))
12832 return SDValue();
12833
12834 unsigned CurrentEltBits = PackVT.getScalarSizeInBits();
12835 unsigned NumStages = Log2_32(CurrentEltBits / EltBits);
12836
12837 // Don't lower multi-stage packs on AVX512; truncation is better.
12838 if (NumStages != 1 && SizeBits == 128 && Subtarget.hasVLX())
12839 return SDValue();
12840
12841 // Pack to the largest type possible:
12842 // vXi64/vXi32 -> PACK*SDW and vXi16 -> PACK*SWB.
12843 unsigned MaxPackBits = 16;
12844 if (CurrentEltBits > 16 &&
12845 (PackOpcode == X86ISD::PACKSS || Subtarget.hasSSE41()))
12846 MaxPackBits = 32;
12847
12848 // Repeatedly pack down to the target size.
12849 SDValue Res;
12850 for (unsigned i = 0; i != NumStages; ++i) {
12851 unsigned SrcEltBits = std::min(MaxPackBits, CurrentEltBits);
12852 unsigned NumSrcElts = SizeBits / SrcEltBits;
12853 MVT SrcSVT = MVT::getIntegerVT(SrcEltBits);
12854 MVT DstSVT = MVT::getIntegerVT(SrcEltBits / 2);
12855 MVT SrcVT = MVT::getVectorVT(SrcSVT, NumSrcElts);
12856 MVT DstVT = MVT::getVectorVT(DstSVT, NumSrcElts * 2);
12857 Res = DAG.getNode(PackOpcode, DL, DstVT, DAG.getBitcast(SrcVT, V1),
12858 DAG.getBitcast(SrcVT, V2));
12859 V1 = V2 = Res;
12860 CurrentEltBits /= 2;
12861 }
12862 assert(Res && Res.getValueType() == VT &&
12863 "Failed to lower compaction shuffle");
12864 return Res;
12865}
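As a rough scalar illustration of why a PACK node can serve as the truncation here (this is a model of the instruction's lane behaviour only, not the LLVM lowering): PACKUSWB clamps each signed 16-bit lane into the unsigned 8-bit range, so once MaskedValueIsZero has proven the bits above the low byte are zero, the saturation degenerates into an exact truncation:

#include <algorithm>
#include <cstdint>
#include <cstdio>

// Scalar model of one PACKUSWB lane: signed 16-bit -> unsigned 8-bit,
// saturating to [0, 255].
static uint8_t packusLane(int16_t X) {
  return (uint8_t)std::clamp<int>(X, 0, 255);
}

int main() {
  // With the upper byte of every lane known to be zero, packing equals
  // simply dropping the high byte.
  int16_t Lanes[4] = {0x0012, 0x0034, 0x0056, 0x00FF};
  for (int16_t L : Lanes)
    std::printf("%#06x -> %#04x (plain truncate: %#04x)\n",
                (unsigned)(uint16_t)L, (unsigned)packusLane(L),
                (unsigned)(uint8_t)L);
}

The PACKSS path plays the same trick with signed saturation, which is why it only needs ComputeNumSignBits to prove the value already fits in the narrower signed range.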
12866
12867/// Try to emit a bitmask instruction for a shuffle.
12868///
12869/// This handles cases where we can model a blend exactly as a bitmask due to
12870/// one of the inputs being zeroable.
12871static SDValue lowerShuffleAsBitMask(const SDLoc &DL, MVT VT, SDValue V1,
12872 SDValue V2, ArrayRef<int> Mask,
12873 const APInt &Zeroable,
12874 const X86Subtarget &Subtarget,
12875 SelectionDAG &DAG) {
12876 MVT MaskVT = VT;
12877 MVT EltVT = VT.getVectorElementType();
12878 SDValue Zero, AllOnes;
12879 // Use f64 if i64 isn't legal.
12880 if (EltVT == MVT::i64 && !Subtarget.is64Bit()) {
12881 EltVT = MVT::f64;
12882 MaskVT = MVT::getVectorVT(EltVT, Mask.size());
12883 }
12884
12885 MVT LogicVT = VT;
12886 if (EltVT == MVT::f32 || EltVT == MVT::f64) {
12887 Zero = DAG.getConstantFP(0.0, DL, EltVT);
12888 APFloat AllOnesValue =
12889 APFloat::getAllOnesValue(SelectionDAG::EVTToAPFloatSemantics(EltVT));
12890 AllOnes = DAG.getConstantFP(AllOnesValue, DL, EltVT);
12891 LogicVT =
12892 MVT::getVectorVT(EltVT == MVT::f64 ? MVT::i64 : MVT::i32, Mask.size());
12893 } else {
12894 Zero = DAG.getConstant(0, DL, EltVT);
12895 AllOnes = DAG.getAllOnesConstant(DL, EltVT);
12896 }
12897
12898 SmallVector<SDValue, 16> VMaskOps(Mask.size(), Zero);
12899 SDValue V;
12900 for (int i = 0, Size = Mask.size(); i < Size; ++i) {
12901 if (Zeroable[i])
12902 continue;
12903 if (Mask[i] % Size != i)
12904 return SDValue(); // Not a blend.
12905 if (!V)
12906 V = Mask[i] < Size ? V1 : V2;
12907 else if (V != (Mask[i] < Size ? V1 : V2))
12908 return SDValue(); // Can only let one input through the mask.
12909
12910 VMaskOps[i] = AllOnes;
12911 }
12912 if (!V)
12913 return SDValue(); // No non-zeroable elements!
12914
12915 SDValue VMask = DAG.getBuildVector(MaskVT, DL, VMaskOps);
12916 VMask = DAG.getBitcast(LogicVT, VMask);
12917 V = DAG.getBitcast(LogicVT, V);
12918 SDValue And = DAG.getNode(ISD::AND, DL, LogicVT, V, VMask);
12919 return DAG.getBitcast(VT, And);
12920}
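A scalar sketch of the idea (hypothetical values, 4 x i32 lanes, plain arrays instead of SDValues): every non-zeroable lane gets an all-ones mask lane, every zeroable lane gets zero, and a single AND produces the blend-with-zero:

#include <cstdint>
#include <cstdio>

int main() {
  // The one surviving input and the zeroable pattern from the shuffle.
  uint32_t V[4] = {0x11111111, 0x22222222, 0x33333333, 0x44444444};
  bool Zeroable[4] = {false, true, false, true};

  for (int i = 0; i != 4; ++i) {
    uint32_t MaskLane = Zeroable[i] ? 0u : 0xFFFFFFFFu; // AllOnes keeps the lane
    uint32_t Out = V[i] & MaskLane;                     // one VPAND, no blend
    std::printf("lane %d: %#010x\n", i, (unsigned)Out);
  }
}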
12921
12922/// Try to emit a blend instruction for a shuffle using bit math.
12923///
12924/// This is used as a fallback approach when first class blend instructions are
12925/// unavailable. Currently it is only suitable for integer vectors, but could
12926/// be generalized for floating point vectors if desirable.
12927static SDValue lowerShuffleAsBitBlend(const SDLoc &DL, MVT VT, SDValue V1,
12928 SDValue V2, ArrayRef<int> Mask,
12929 SelectionDAG &DAG) {
12930 assert(VT.isInteger() && "Only supports integer vector types!");
12931 MVT EltVT = VT.getVectorElementType();
12932 SDValue Zero = DAG.getConstant(0, DL, EltVT);
12933 SDValue AllOnes = DAG.getAllOnesConstant(DL, EltVT);
12934 SmallVector<SDValue, 16> MaskOps;
12935 for (int i = 0, Size = Mask.size(); i < Size; ++i) {
12936 if (Mask[i] >= 0 && Mask[i] != i && Mask[i] != i + Size)
12937 return SDValue(); // Shuffled input!
12938 MaskOps.push_back(Mask[i] < Size ? AllOnes : Zero);
12939 }
12940
12941 SDValue V1Mask = DAG.getBuildVector(VT, DL, MaskOps);
12942 V1 = DAG.getNode(ISD::AND, DL, VT, V1, V1Mask);
12943 V2 = DAG.getNode(X86ISD::ANDNP, DL, VT, V1Mask, V2);
12944 return DAG.getNode(ISD::OR, DL, VT, V1, V2);
12945}
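For reference, the bit blend computes (V1 & M) | (~M & V2) per bit, where M is all-ones in lanes taken from V1; the X86ISD::ANDNP node supplies the ~M & V2 half. A minimal scalar model with a hypothetical 4-element mask:

#include <cstdint>
#include <cstdio>

int main() {
  uint32_t V1[4] = {0xAAAAAAAA, 0xAAAAAAAA, 0xAAAAAAAA, 0xAAAAAAAA};
  uint32_t V2[4] = {0x55555555, 0x55555555, 0x55555555, 0x55555555};
  // Unshuffled blend mask: lanes 0 and 2 from V1, lanes 1 and 3 from V2
  // (Mask[i] is either i or i + Size, as the routine above requires).
  int Mask[4] = {0, 5, 2, 7};

  for (int i = 0; i != 4; ++i) {
    uint32_t M = (Mask[i] < 4) ? 0xFFFFFFFFu : 0u; // AllOnes selects V1
    uint32_t Out = (V1[i] & M) | (~M & V2[i]);     // AND + ANDN + OR
    std::printf("lane %d: %#010x\n", i, (unsigned)Out);
  }
}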
12946
12947static SDValue getVectorMaskingNode(SDValue Op, SDValue Mask,
12948 SDValue PreservedSrc,
12949 const X86Subtarget &Subtarget,
12950 SelectionDAG &DAG);
12951
12952static bool matchShuffleAsBlend(SDValue V1, SDValue V2,
12953 MutableArrayRef<int> Mask,
12954 const APInt &Zeroable, bool &ForceV1Zero,
12955 bool &ForceV2Zero, uint64_t &BlendMask) {
12956 bool V1IsZeroOrUndef =
12957 V1.isUndef() || ISD::isBuildVectorAllZeros(V1.getNode());
12958 bool V2IsZeroOrUndef =
12959 V2.isUndef() || ISD::isBuildVectorAllZeros(V2.getNode());
12960
12961 BlendMask = 0;
12962 ForceV1Zero = false, ForceV2Zero = false;
12963 assert(Mask.size() <= 64 && "Shuffle mask too big for blend mask");
12964
12965 // Attempt to generate the binary blend mask. If an input is zero then
12966 // we can use any lane.
12967 for (int i = 0, Size = Mask.size(); i < Size; ++i) {
12968 int M = Mask[i];
12969 if (M == SM_SentinelUndef)
12970 continue;
12971 if (M == i ||
12972 (0 <= M && M < Size && IsElementEquivalent(Size, V1, V1, M, i))) {
12973 Mask[i] = i;
12974 continue;
12975 }
12976 if (M == (i + Size) ||
12977 (Size <= M && IsElementEquivalent(Size, V2, V2, M - Size, i))) {
12978 BlendMask |= 1ull << i;
12979 Mask[i] = i + Size;
12980 continue;
12981 }
12982 if (Zeroable[i]) {
12983 if (V1IsZeroOrUndef) {
12984 ForceV1Zero = true;
12985 Mask[i] = i;
12986 continue;
12987 }
12988 if (V2IsZeroOrUndef) {
12989 ForceV2Zero = true;
12990 BlendMask |= 1ull << i;
12991 Mask[i] = i + Size;
12992 continue;
12993 }
12994 }
12995 return false;
12996 }
12997 return true;
12998}
12999
13000static uint64_t scaleVectorShuffleBlendMask(uint64_t BlendMask, int Size,
13001 int Scale) {
13002 uint64_t ScaledMask = 0;
13003 for (int i = 0; i != Size; ++i)
13004 if (BlendMask & (1ull << i))
13005 ScaledMask |= ((1ull << Scale) - 1) << (i * Scale);
13006 return ScaledMask;
13007}
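For instance, widening a 4-lane blend mask to byte granularity with Scale = 2 turns each set bit into two adjacent set bits, so 0b0101 becomes 0b00110011. A tiny standalone check of that arithmetic (mirroring the helper above):

#include <cassert>
#include <cstdint>

static uint64_t scaleBlendMask(uint64_t BlendMask, int Size, int Scale) {
  uint64_t Scaled = 0;
  for (int i = 0; i != Size; ++i)
    if (BlendMask & (1ull << i))
      Scaled |= ((1ull << Scale) - 1) << (i * Scale);
  return Scaled;
}

int main() {
  // Lanes 0 and 2 selected, each lane widening to 2 bytes.
  assert(scaleBlendMask(0b0101, /*Size=*/4, /*Scale=*/2) == 0b00110011);
  return 0;
}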
13008
13009/// Try to emit a blend instruction for a shuffle.
13010///
13011/// This doesn't do any checks for the availability of instructions for blending
13012/// these values. It relies on the availability of the X86ISD::BLENDI pattern to
13013/// be matched in the backend with the type given. What it does check for is
13014/// that the shuffle mask is a blend, or convertible into a blend with zero.
13015static SDValue lowerShuffleAsBlend(const SDLoc &DL, MVT VT, SDValue V1,
13016 SDValue V2, ArrayRef<int> Original,
13017 const APInt &Zeroable,
13018 const X86Subtarget &Subtarget,
13019 SelectionDAG &DAG) {
13020 uint64_t BlendMask = 0;
13021 bool ForceV1Zero = false, ForceV2Zero = false;
13022 SmallVector<int, 64> Mask(Original);
13023 if (!matchShuffleAsBlend(V1, V2, Mask, Zeroable, ForceV1Zero, ForceV2Zero,
13024 BlendMask))
13025 return SDValue();
13026
13027 // Create a REAL zero vector - ISD::isBuildVectorAllZeros allows UNDEFs.
13028 if (ForceV1Zero)
13029 V1 = getZeroVector(VT, Subtarget, DAG, DL);
13030 if (ForceV2Zero)
13031 V2 = getZeroVector(VT, Subtarget, DAG, DL);
13032
13033 unsigned NumElts = VT.getVectorNumElements();
13034
13035 switch (VT.SimpleTy) {
13036 case MVT::v4i64:
13037 case MVT::v8i32:
13038 assert(Subtarget.hasAVX2() && "256-bit integer blends require AVX2!");
13039 [[fallthrough]];
13040 case MVT::v4f64:
13041 case MVT::v8f32:
13042 assert(Subtarget.hasAVX() && "256-bit float blends require AVX!");
13043 [[fallthrough]];
13044 case MVT::v2f64:
13045 case MVT::v2i64:
13046 case MVT::v4f32:
13047 case MVT::v4i32:
13048 case MVT::v8i16:
13049 assert(Subtarget.hasSSE41() && "128-bit blends require SSE41!");
13050 return DAG.getNode(X86ISD::BLENDI, DL, VT, V1, V2,
13051 DAG.getTargetConstant(BlendMask, DL, MVT::i8));
13052 case MVT::v16i16: {
13053 assert(Subtarget.hasAVX2() && "v16i16 blends require AVX2!");
13054 SmallVector<int, 8> RepeatedMask;
13055 if (is128BitLaneRepeatedShuffleMask(MVT::v16i16, Mask, RepeatedMask)) {
13056 // We can lower these with PBLENDW which is mirrored across 128-bit lanes.
13057 assert(RepeatedMask.size() == 8 && "Repeated mask size doesn't match!");
13058 BlendMask = 0;
13059 for (int i = 0; i < 8; ++i)
13060 if (RepeatedMask[i] >= 8)
13061 BlendMask |= 1ull << i;
13062 return DAG.getNode(X86ISD::BLENDI, DL, MVT::v16i16, V1, V2,
13063 DAG.getTargetConstant(BlendMask, DL, MVT::i8));
13064 }
13065 // Use PBLENDW for lower/upper lanes and then blend lanes.
13066 // TODO - we should allow 2 PBLENDW here and leave shuffle combine to
13067 // merge to VSELECT where useful.
13068 uint64_t LoMask = BlendMask & 0xFF;
13069 uint64_t HiMask = (BlendMask >> 8) & 0xFF;
13070 if (LoMask == 0 || LoMask == 255 || HiMask == 0 || HiMask == 255) {
13071 SDValue Lo = DAG.getNode(X86ISD::BLENDI, DL, MVT::v16i16, V1, V2,
13072 DAG.getTargetConstant(LoMask, DL, MVT::i8));
13073 SDValue Hi = DAG.getNode(X86ISD::BLENDI, DL, MVT::v16i16, V1, V2,
13074 DAG.getTargetConstant(HiMask, DL, MVT::i8));
13075 return DAG.getVectorShuffle(
13076 MVT::v16i16, DL, Lo, Hi,
13077 {0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31});
13078 }
13079 [[fallthrough]];
13080 }
13081 case MVT::v32i8:
13082 assert(Subtarget.hasAVX2() && "256-bit byte-blends require AVX2!");
13083 [[fallthrough]];
13084 case MVT::v16i8: {
13085 assert(Subtarget.hasSSE41() && "128-bit byte-blends require SSE41!");
13086
13087 // Attempt to lower to a bitmask if we can. VPAND is faster than VPBLENDVB.
13088 if (SDValue Masked = lowerShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable,
13089 Subtarget, DAG))
13090 return Masked;
13091
13092 if (Subtarget.hasBWI() && Subtarget.hasVLX()) {
13093 MVT IntegerType = MVT::getIntegerVT(std::max<unsigned>(NumElts, 8));
13094 SDValue MaskNode = DAG.getConstant(BlendMask, DL, IntegerType);
13095 return getVectorMaskingNode(V2, MaskNode, V1, Subtarget, DAG);
13096 }
13097
13098 // If we have VPTERNLOG, we can use that as a bit blend.
13099 if (Subtarget.hasVLX())
13100 if (SDValue BitBlend =
13101 lowerShuffleAsBitBlend(DL, VT, V1, V2, Mask, DAG))
13102 return BitBlend;
13103
13104 // Scale the blend by the number of bytes per element.
13105 int Scale = VT.getScalarSizeInBits() / 8;
13106
13107 // This form of blend is always done on bytes. Compute the byte vector
13108 // type.
13109 MVT BlendVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8);
13110
13111 // x86 allows load folding with blendvb from the 2nd source operand. But
13112 // we are still using LLVM select here (see comment below), so that's V1.
13113 // If V2 can be load-folded and V1 cannot be load-folded, then commute to
13114 // allow that load-folding possibility.
13115 if (!ISD::isNormalLoad(V1.getNode()) && ISD::isNormalLoad(V2.getNode())) {
13116 ShuffleVectorSDNode::commuteMask(Mask);
13117 std::swap(V1, V2);
13118 }
13119
13120 // Compute the VSELECT mask. Note that VSELECT is really confusing in the
13121 // mix of LLVM's code generator and the x86 backend. We tell the code
13122 // generator that boolean values in the elements of an x86 vector register
13123 // are -1 for true and 0 for false. We then use the LLVM semantics of 'true'
13124 // mapping a select to operand #1, and 'false' mapping to operand #2. The
13125 // reality in x86 is that vector masks (pre-AVX-512) use only the high bit
13126 // of the element (the remaining are ignored) and 0 in that high bit would
13127 // mean operand #1 while 1 in the high bit would mean operand #2. So while
13128 // the LLVM model for boolean values in vector elements gets the relevant
13129 // bit set, it is set backwards and over constrained relative to x86's
13130 // actual model.
13131 SmallVector<SDValue, 32> VSELECTMask;
13132 for (int i = 0, Size = Mask.size(); i < Size; ++i)
13133 for (int j = 0; j < Scale; ++j)
13134 VSELECTMask.push_back(
13135 Mask[i] < 0 ? DAG.getUNDEF(MVT::i8)
13136 : DAG.getConstant(Mask[i] < Size ? -1 : 0, DL,
13137 MVT::i8));
13138
13139 V1 = DAG.getBitcast(BlendVT, V1);
13140 V2 = DAG.getBitcast(BlendVT, V2);
13141 return DAG.getBitcast(
13142 VT,
13143 DAG.getSelect(DL, BlendVT, DAG.getBuildVector(BlendVT, DL, VSELECTMask),
13144 V1, V2));
13145 }
13146 case MVT::v16f32:
13147 case MVT::v8f64:
13148 case MVT::v8i64:
13149 case MVT::v16i32:
13150 case MVT::v32i16:
13151 case MVT::v64i8: {
13152 // Attempt to lower to a bitmask if we can. Only if not optimizing for size.
13153 bool OptForSize = DAG.shouldOptForSize();
13154 if (!OptForSize) {
13155 if (SDValue Masked = lowerShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable,
13156 Subtarget, DAG))
13157 return Masked;
13158 }
13159
13160 // Otherwise load an immediate into a GPR, cast to k-register, and use a
13161 // masked move.
13162 MVT IntegerType = MVT::getIntegerVT(std::max<unsigned>(NumElts, 8));
13163 SDValue MaskNode = DAG.getConstant(BlendMask, DL, IntegerType);
13164 return getVectorMaskingNode(V2, MaskNode, V1, Subtarget, DAG);
13165 }
13166 default:
13167 llvm_unreachable("Not a supported integer vector type!");
13168 }
13169}
13170
13171/// Try to lower as a blend of elements from two inputs followed by
13172/// a single-input permutation.
13173///
13174/// This matches the pattern where we can blend elements from two inputs and
13175/// then reduce the shuffle to a single-input permutation.
13176static SDValue lowerShuffleAsBlendAndPermute(const SDLoc &DL, MVT VT,
13177 SDValue V1, SDValue V2,
13178 ArrayRef<int> Mask,
13179 SelectionDAG &DAG,
13180 bool ImmBlends = false) {
13181 // We build up the blend mask while checking whether a blend is a viable way
13182 // to reduce the shuffle.
13183 SmallVector<int, 32> BlendMask(Mask.size(), -1);
13184 SmallVector<int, 32> PermuteMask(Mask.size(), -1);
13185
13186 for (int i = 0, Size = Mask.size(); i < Size; ++i) {
13187 if (Mask[i] < 0)
13188 continue;
13189
13190 assert(Mask[i] < Size * 2 && "Shuffle input is out of bounds.");
13191
13192 if (BlendMask[Mask[i] % Size] < 0)
13193 BlendMask[Mask[i] % Size] = Mask[i];
13194 else if (BlendMask[Mask[i] % Size] != Mask[i])
13195 return SDValue(); // Can't blend in the needed input!
13196
13197 PermuteMask[i] = Mask[i] % Size;
13198 }
13199
13200 // If only immediate blends, then bail if the blend mask can't be widened to
13201 // i16.
13202 unsigned EltSize = VT.getScalarSizeInBits();
13203 if (ImmBlends && EltSize == 8 && !canWidenShuffleElements(BlendMask))
13204 return SDValue();
13205
13206 SDValue V = DAG.getVectorShuffle(VT, DL, V1, V2, BlendMask);
13207 return DAG.getVectorShuffle(VT, DL, V, DAG.getUNDEF(VT), PermuteMask);
13208}
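A worked (hypothetical) instance of that decomposition in plain C++: the 4-element mask {2, 5, 0, 7} first becomes the per-position blend {0, 5, 2, 7}, where each position keeps its own index and only chooses between V1 and V2, and then the single-input permute {2, 1, 0, 3} moves the blended elements into their final order:

#include <cstdio>
#include <vector>

int main() {
  const int Size = 4;
  std::vector<int> Mask = {2, 5, 0, 7};
  std::vector<int> Blend(Size, -1), Permute(Size, -1);

  for (int i = 0; i != Size; ++i) {
    if (Mask[i] < 0)
      continue;
    int Slot = Mask[i] % Size; // blend position that must hold element Mask[i]
    if (Blend[Slot] >= 0 && Blend[Slot] != Mask[i]) {
      std::puts("not lowerable as blend + permute");
      return 0;
    }
    Blend[Slot] = Mask[i];
    Permute[i] = Slot;
  }

  std::printf("blend:  ");
  for (int M : Blend)
    std::printf(" %d", M);
  std::printf("\npermute:");
  for (int M : Permute)
    std::printf(" %d", M);
  std::printf("\n");
}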
13209
13210/// Try to lower as an unpack of elements from two inputs followed by
13211/// a single-input permutation.
13212///
13213/// This matches the pattern where we can unpack elements from two inputs and
13214/// then reduce the shuffle to a single-input (wider) permutation.
13215static SDValue lowerShuffleAsUNPCKAndPermute(const SDLoc &DL, MVT VT,
13216 SDValue V1, SDValue V2,
13217 ArrayRef<int> Mask,
13218 SelectionDAG &DAG) {
13219 int NumElts = Mask.size();
13220 int NumLanes = VT.getSizeInBits() / 128;
13221 int NumLaneElts = NumElts / NumLanes;
13222 int NumHalfLaneElts = NumLaneElts / 2;
13223
13224 bool MatchLo = true, MatchHi = true;
13225 SDValue Ops[2] = {DAG.getUNDEF(VT), DAG.getUNDEF(VT)};
13226
13227 // Determine UNPCKL/UNPCKH type and operand order.
13228 for (int Lane = 0; Lane != NumElts; Lane += NumLaneElts) {
13229 for (int Elt = 0; Elt != NumLaneElts; ++Elt) {
13230 int M = Mask[Lane + Elt];
13231 if (M < 0)
13232 continue;
13233
13234 SDValue &Op = Ops[Elt & 1];
13235 if (M < NumElts && (Op.isUndef() || Op == V1))
13236 Op = V1;
13237 else if (NumElts <= M && (Op.isUndef() || Op == V2))
13238 Op = V2;
13239 else
13240 return SDValue();
13241
13242 int Lo = Lane, Mid = Lane + NumHalfLaneElts, Hi = Lane + NumLaneElts;
13243 MatchLo &= isUndefOrInRange(M, Lo, Mid) ||
13244 isUndefOrInRange(M, NumElts + Lo, NumElts + Mid);
13245 MatchHi &= isUndefOrInRange(M, Mid, Hi) ||
13246 isUndefOrInRange(M, NumElts + Mid, NumElts + Hi);
13247 if (!MatchLo && !MatchHi)
13248 return SDValue();
13249 }
13250 }
13251 assert((MatchLo ^ MatchHi) && "Failed to match UNPCKLO/UNPCKHI");
13252
13253 // Now check that each pair of elts come from the same unpack pair
13254 // and set the permute mask based on each pair.
13255 // TODO - Investigate cases where we permute individual elements.
13256 SmallVector<int, 32> PermuteMask(NumElts, -1);
13257 for (int Lane = 0; Lane != NumElts; Lane += NumLaneElts) {
13258 for (int Elt = 0; Elt != NumLaneElts; Elt += 2) {
13259 int M0 = Mask[Lane + Elt + 0];
13260 int M1 = Mask[Lane + Elt + 1];
13261 if (0 <= M0 && 0 <= M1 &&
13262 (M0 % NumHalfLaneElts) != (M1 % NumHalfLaneElts))
13263 return SDValue();
13264 if (0 <= M0)
13265 PermuteMask[Lane + Elt + 0] = Lane + (2 * (M0 % NumHalfLaneElts));
13266 if (0 <= M1)
13267 PermuteMask[Lane + Elt + 1] = Lane + (2 * (M1 % NumHalfLaneElts)) + 1;
13268 }
13269 }
13270
13271 unsigned UnpckOp = MatchLo ? X86ISD::UNPCKL : X86ISD::UNPCKH;
13272 SDValue Unpck = DAG.getNode(UnpckOp, DL, VT, Ops);
13273 return DAG.getVectorShuffle(VT, DL, Unpck, DAG.getUNDEF(VT), PermuteMask);
13274}
13275
13276/// Try to lower a shuffle as a permute of the inputs followed by an
13277/// UNPCK instruction.
13278///
13279 /// This specifically targets cases where we end up alternating between
13280/// the two inputs, and so can permute them into something that feeds a single
13281/// UNPCK instruction. Note that this routine only targets integer vectors
13282/// because for floating point vectors we have a generalized SHUFPS lowering
13283/// strategy that handles everything that doesn't *exactly* match an unpack,
13284/// making this clever lowering unnecessary.
13285static SDValue lowerShuffleAsPermuteAndUnpack(const SDLoc &DL, MVT VT,
13286 SDValue V1, SDValue V2,
13287 ArrayRef<int> Mask,
13288 const X86Subtarget &Subtarget,
13289 SelectionDAG &DAG) {
13290 int Size = Mask.size();
13291 assert(Mask.size() >= 2 && "Single element masks are invalid.");
13292
13293 // This routine only supports 128-bit integer dual input vectors.
13294 if (VT.isFloatingPoint() || !VT.is128BitVector() || V2.isUndef())
13295 return SDValue();
13296
13297 int NumLoInputs =
13298 count_if(Mask, [Size](int M) { return M >= 0 && M % Size < Size / 2; });
13299 int NumHiInputs =
13300 count_if(Mask, [Size](int M) { return M % Size >= Size / 2; });
13301
13302 bool UnpackLo = NumLoInputs >= NumHiInputs;
13303
13304 auto TryUnpack = [&](int ScalarSize, int Scale) {
13305 SmallVector<int, 16> V1Mask((unsigned)Size, -1);
13306 SmallVector<int, 16> V2Mask((unsigned)Size, -1);
13307
13308 for (int i = 0; i < Size; ++i) {
13309 if (Mask[i] < 0)
13310 continue;
13311
13312 // Each element of the unpack contains Scale elements from this mask.
13313 int UnpackIdx = i / Scale;
13314
13315 // We only handle the case where V1 feeds the first slots of the unpack.
13316 // We rely on canonicalization to ensure this is the case.
13317 if ((UnpackIdx % 2 == 0) != (Mask[i] < Size))
13318 return SDValue();
13319
13320 // Setup the mask for this input. The indexing is tricky as we have to
13321 // handle the unpack stride.
13322 SmallVectorImpl<int> &VMask = (UnpackIdx % 2 == 0) ? V1Mask : V2Mask;
13323 VMask[(UnpackIdx / 2) * Scale + i % Scale + (UnpackLo ? 0 : Size / 2)] =
13324 Mask[i] % Size;
13325 }
13326
13327 // If we will have to shuffle both inputs to use the unpack, check whether
13328 // we can just unpack first and shuffle the result. If so, skip this unpack.
13329 if ((NumLoInputs == 0 || NumHiInputs == 0) && !isNoopShuffleMask(V1Mask) &&
13330 !isNoopShuffleMask(V2Mask))
13331 return SDValue();
13332
13333 // Shuffle the inputs into place.
13334 V1 = DAG.getVectorShuffle(VT, DL, V1, DAG.getUNDEF(VT), V1Mask);
13335 V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Mask);
13336
13337 // Cast the inputs to the type we will use to unpack them.
13338 MVT UnpackVT =
13339 MVT::getVectorVT(MVT::getIntegerVT(ScalarSize), Size / Scale);
13340 V1 = DAG.getBitcast(UnpackVT, V1);
13341 V2 = DAG.getBitcast(UnpackVT, V2);
13342
13343 // Unpack the inputs and cast the result back to the desired type.
13344 return DAG.getBitcast(
13345 VT, DAG.getNode(UnpackLo ? X86ISD::UNPCKL : X86ISD::UNPCKH, DL,
13346 UnpackVT, V1, V2));
13347 };
13348
13349 // We try each unpack from the largest to the smallest to try and find one
13350 // that fits this mask.
13351 int OrigScalarSize = VT.getScalarSizeInBits();
13352 for (int ScalarSize = 64; ScalarSize >= OrigScalarSize; ScalarSize /= 2)
13353 if (SDValue Unpack = TryUnpack(ScalarSize, ScalarSize / OrigScalarSize))
13354 return Unpack;
13355
13356 // If we're shuffling with a zero vector then we're better off not doing
13357 // VECTOR_SHUFFLE(UNPCK()) as we lose track of those zero elements.
13358 if (ISD::isBuildVectorAllZeros(V1.getNode()) ||
13359 ISD::isBuildVectorAllZeros(V2.getNode()))
13360 return SDValue();
13361
13362 // If none of the unpack-rooted lowerings worked (or were profitable) try an
13363 // initial unpack.
13364 if (NumLoInputs == 0 || NumHiInputs == 0) {
13365 assert((NumLoInputs > 0 || NumHiInputs > 0) &&
13366 "We have to have *some* inputs!");
13367 int HalfOffset = NumLoInputs == 0 ? Size / 2 : 0;
13368
13369 // FIXME: We could consider the total complexity of the permute of each
13370 // possible unpacking. Or at the least we should consider how many
13371 // half-crossings are created.
13372 // FIXME: We could consider commuting the unpacks.
13373
13374 SmallVector<int, 32> PermMask((unsigned)Size, -1);
13375 for (int i = 0; i < Size; ++i) {
13376 if (Mask[i] < 0)
13377 continue;
13378
13379 assert(Mask[i] % Size >= HalfOffset && "Found input from wrong half!");
13380
13381 PermMask[i] =
13382 2 * ((Mask[i] % Size) - HalfOffset) + (Mask[i] < Size ? 0 : 1);
13383 }
13384 return DAG.getVectorShuffle(
13385 VT, DL,
13386 DAG.getNode(NumLoInputs == 0 ? X86ISD::UNPCKH : X86ISD::UNPCKL, DL, VT,
13387 V1, V2),
13388 DAG.getUNDEF(VT), PermMask);
13389 }
13390
13391 return SDValue();
13392}
13393
13394/// Helper to form a PALIGNR-based rotate+permute, merging 2 inputs and then
13395/// permuting the elements of the result in place.
13396static SDValue lowerShuffleAsByteRotateAndPermute(
13397 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
13398 const X86Subtarget &Subtarget, SelectionDAG &DAG) {
13399 if ((VT.is128BitVector() && !Subtarget.hasSSSE3()) ||
13400 (VT.is256BitVector() && !Subtarget.hasAVX2()) ||
13401 (VT.is512BitVector() && !Subtarget.hasBWI()))
13402 return SDValue();
13403
13404 // We don't currently support lane crossing permutes.
13405 if (is128BitLaneCrossingShuffleMask(VT, Mask))
13406 return SDValue();
13407
13408 int Scale = VT.getScalarSizeInBits() / 8;
13409 int NumLanes = VT.getSizeInBits() / 128;
13410 int NumElts = VT.getVectorNumElements();
13411 int NumEltsPerLane = NumElts / NumLanes;
13412
13413 // Determine range of mask elts.
13414 bool Blend1 = true;
13415 bool Blend2 = true;
13416 std::pair<int, int> Range1 = std::make_pair(INT_MAX, INT_MIN);
13417 std::pair<int, int> Range2 = std::make_pair(INT_MAX, INT_MIN);
13418 for (int Lane = 0; Lane != NumElts; Lane += NumEltsPerLane) {
13419 for (int Elt = 0; Elt != NumEltsPerLane; ++Elt) {
13420 int M = Mask[Lane + Elt];
13421 if (M < 0)
13422 continue;
13423 if (M < NumElts) {
13424 Blend1 &= (M == (Lane + Elt));
13425 assert(Lane <= M && M < (Lane + NumEltsPerLane) && "Out of range mask");
13426 M = M % NumEltsPerLane;
13427 Range1.first = std::min(Range1.first, M);
13428 Range1.second = std::max(Range1.second, M);
13429 } else {
13430 M -= NumElts;
13431 Blend2 &= (M == (Lane + Elt));
13432 assert(Lane <= M && M < (Lane + NumEltsPerLane) && "Out of range mask");
13433 M = M % NumEltsPerLane;
13434 Range2.first = std::min(Range2.first, M);
13435 Range2.second = std::max(Range2.second, M);
13436 }
13437 }
13438 }
13439
13440 // Bail if we don't need both elements.
13441 // TODO - it might be worth doing this for unary shuffles if the permute
13442 // can be widened.
13443 if (!(0 <= Range1.first && Range1.second < NumEltsPerLane) ||
13444 !(0 <= Range2.first && Range2.second < NumEltsPerLane))
13445 return SDValue();
13446
13447 if (VT.getSizeInBits() > 128 && (Blend1 || Blend2))
13448 return SDValue();
13449
13450 // Rotate the 2 ops so we can access both ranges, then permute the result.
13451 auto RotateAndPermute = [&](SDValue Lo, SDValue Hi, int RotAmt, int Ofs) {
13452 MVT ByteVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8);
13453 SDValue Rotate = DAG.getBitcast(
13454 VT, DAG.getNode(X86ISD::PALIGNR, DL, ByteVT, DAG.getBitcast(ByteVT, Hi),
13455 DAG.getBitcast(ByteVT, Lo),
13456 DAG.getTargetConstant(Scale * RotAmt, DL, MVT::i8)));
13457 SmallVector<int, 64> PermMask(NumElts, SM_SentinelUndef);
13458 for (int Lane = 0; Lane != NumElts; Lane += NumEltsPerLane) {
13459 for (int Elt = 0; Elt != NumEltsPerLane; ++Elt) {
13460 int M = Mask[Lane + Elt];
13461 if (M < 0)
13462 continue;
13463 if (M < NumElts)
13464 PermMask[Lane + Elt] = Lane + ((M + Ofs - RotAmt) % NumEltsPerLane);
13465 else
13466 PermMask[Lane + Elt] = Lane + ((M - Ofs - RotAmt) % NumEltsPerLane);
13467 }
13468 }
13469 return DAG.getVectorShuffle(VT, DL, Rotate, DAG.getUNDEF(VT), PermMask);
13470 };
13471
13472 // Check if the ranges are small enough to rotate from either direction.
13473 if (Range2.second < Range1.first)
13474 return RotateAndPermute(V1, V2, Range1.first, 0);
13475 if (Range1.second < Range2.first)
13476 return RotateAndPermute(V2, V1, Range2.first, NumElts);
13477 return SDValue();
13478}
13479
13480static bool isBroadcastShuffleMask(ArrayRef<int> Mask) {
13481 return isUndefOrEqual(Mask, 0);
13482}
13483
13484static bool isNoopOrBroadcastShuffleMask(ArrayRef<int> Mask) {
13485 return isNoopShuffleMask(Mask) || isBroadcastShuffleMask(Mask);
13486}
13487
13488/// Generic routine to decompose a shuffle and blend into independent
13489/// blends and permutes.
13490///
13491/// This matches the extremely common pattern for handling combined
13492/// shuffle+blend operations on newer X86 ISAs where we have very fast blend
13493/// operations. It will try to pick the best arrangement of shuffles and
13494/// blends. For vXi8/vXi16 shuffles we may use unpack instead of blend.
13495static SDValue lowerShuffleAsDecomposedShuffleMerge(
13496 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
13497 const X86Subtarget &Subtarget, SelectionDAG &DAG) {
13498 int NumElts = Mask.size();
13499 int NumLanes = VT.getSizeInBits() / 128;
13500 int NumEltsPerLane = NumElts / NumLanes;
13501
13502 // Shuffle the input elements into the desired positions in V1 and V2 and
13503 // unpack/blend them together.
13504 bool IsAlternating = true;
13505 SmallVector<int, 32> V1Mask(NumElts, -1);
13506 SmallVector<int, 32> V2Mask(NumElts, -1);
13507 SmallVector<int, 32> FinalMask(NumElts, -1);
13508 for (int i = 0; i < NumElts; ++i) {
13509 int M = Mask[i];
13510 if (M >= 0 && M < NumElts) {
13511 V1Mask[i] = M;
13512 FinalMask[i] = i;
13513 IsAlternating &= (i & 1) == 0;
13514 } else if (M >= NumElts) {
13515 V2Mask[i] = M - NumElts;
13516 FinalMask[i] = i + NumElts;
13517 IsAlternating &= (i & 1) == 1;
13518 }
13519 }
13520
13521 // If we effectively demand only the 0'th element of \p Input (though not
13522 // necessarily only in the 0'th position), then broadcast said input
13523 // and change \p InputMask to be a no-op (identity) mask.
13524 auto canonicalizeBroadcastableInput = [DL, VT, &Subtarget,
13525 &DAG](SDValue &Input,
13526 MutableArrayRef<int> InputMask) {
13527 unsigned EltSizeInBits = Input.getScalarValueSizeInBits();
13528 if (!Subtarget.hasAVX2() && (!Subtarget.hasAVX() || EltSizeInBits < 32 ||
13529 !X86::mayFoldLoad(Input, Subtarget)))
13530 return;
13531 if (isNoopShuffleMask(InputMask))
13532 return;
13533 assert(isBroadcastShuffleMask(InputMask) &&
13534 "Expected to demand only the 0'th element.");
13535 Input = DAG.getNode(X86ISD::VBROADCAST, DL, VT, Input);
13536 for (auto I : enumerate(InputMask)) {
13537 int &InputMaskElt = I.value();
13538 if (InputMaskElt >= 0)
13539 InputMaskElt = I.index();
13540 }
13541 };
13542
13543 // Currently, we may need to produce one shuffle per input, and blend results.
13544 // It is possible that the shuffle for one of the inputs is already a no-op.
13545 // See if we can simplify non-no-op shuffles into broadcasts,
13546 // which we consider to be strictly better than an arbitrary shuffle.
13547 if (isNoopOrBroadcastShuffleMask(V1Mask) &&
13548 isNoopOrBroadcastShuffleMask(V2Mask)) {
13549 canonicalizeBroadcastableInput(V1, V1Mask);
13550 canonicalizeBroadcastableInput(V2, V2Mask);
13551 }
13552
13553 // Try to lower with the simpler initial blend/unpack/rotate strategies unless
13554 // one of the input shuffles would be a no-op. We prefer to shuffle inputs as
13555 // the shuffle may be able to fold with a load or other benefit. However, when
13556 // we'll have to do 2x as many shuffles in order to achieve this, a 2-input
13557 // pre-shuffle first is a better strategy.
13558 if (!isNoopShuffleMask(V1Mask) && !isNoopShuffleMask(V2Mask)) {
13559 // Only prefer immediate blends to unpack/rotate.
13560 if (SDValue BlendPerm = lowerShuffleAsBlendAndPermute(DL, VT, V1, V2, Mask,
13561 DAG, true))
13562 return BlendPerm;
13563 if (SDValue UnpackPerm = lowerShuffleAsUNPCKAndPermute(DL, VT, V1, V2, Mask,
13564 DAG))
13565 return UnpackPerm;
13566 if (SDValue RotatePerm = lowerShuffleAsByteRotateAndPermute(
13567 DL, VT, V1, V2, Mask, Subtarget, DAG))
13568 return RotatePerm;
13569 // Unpack/rotate failed - try again with variable blends.
13570 if (SDValue BlendPerm = lowerShuffleAsBlendAndPermute(DL, VT, V1, V2, Mask,
13571 DAG))
13572 return BlendPerm;
13573 if (VT.getScalarSizeInBits() >= 32)
13574 if (SDValue PermUnpack = lowerShuffleAsPermuteAndUnpack(
13575 DL, VT, V1, V2, Mask, Subtarget, DAG))
13576 return PermUnpack;
13577 }
13578
13579 // If the final mask is an alternating blend of vXi8/vXi16, convert to an
13580 // UNPCKL(SHUFFLE, SHUFFLE) pattern.
13581 // TODO: It doesn't have to be alternating - but each lane mustn't have more
13582 // than half the elements coming from each source.
13583 if (IsAlternating && VT.getScalarSizeInBits() < 32) {
13584 V1Mask.assign(NumElts, -1);
13585 V2Mask.assign(NumElts, -1);
13586 FinalMask.assign(NumElts, -1);
13587 for (int i = 0; i != NumElts; i += NumEltsPerLane)
13588 for (int j = 0; j != NumEltsPerLane; ++j) {
13589 int M = Mask[i + j];
13590 if (M >= 0 && M < NumElts) {
13591 V1Mask[i + (j / 2)] = M;
13592 FinalMask[i + j] = i + (j / 2);
13593 } else if (M >= NumElts) {
13594 V2Mask[i + (j / 2)] = M - NumElts;
13595 FinalMask[i + j] = i + (j / 2) + NumElts;
13596 }
13597 }
13598 }
13599
13600 V1 = DAG.getVectorShuffle(VT, DL, V1, DAG.getUNDEF(VT), V1Mask);
13601 V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Mask);
13602 return DAG.getVectorShuffle(VT, DL, V1, V2, FinalMask);
13603}
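As a concrete (hypothetical) example of the decomposition: for a 4-element mask {5, 1, 6, 2} the bookkeeping above yields V1Mask = {-1, 1, -1, 2}, V2Mask = {1, -1, 2, -1} and FinalMask = {4, 1, 6, 3}; each input is shuffled into place on its own and the two results are then blended positionally. A short standalone sketch of that loop:

#include <cstdio>
#include <vector>

int main() {
  const int NumElts = 4;
  std::vector<int> Mask = {5, 1, 6, 2};
  std::vector<int> V1Mask(NumElts, -1), V2Mask(NumElts, -1),
      FinalMask(NumElts, -1);

  for (int i = 0; i != NumElts; ++i) {
    int M = Mask[i];
    if (M >= 0 && M < NumElts) {
      V1Mask[i] = M;              // pull V1's element M into position i
      FinalMask[i] = i;           // then take position i from shuffled V1
    } else if (M >= NumElts) {
      V2Mask[i] = M - NumElts;    // pull V2's element into position i
      FinalMask[i] = i + NumElts; // then take position i from shuffled V2
    }
  }

  auto Dump = [](const char *Name, const std::vector<int> &V) {
    std::printf("%s:", Name);
    for (int X : V)
      std::printf(" %d", X);
    std::printf("\n");
  };
  Dump("V1Mask", V1Mask);       // -1 1 -1 2
  Dump("V2Mask", V2Mask);       //  1 -1 2 -1
  Dump("FinalMask", FinalMask); //  4 1 6 3
}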
13604
13605/// Try to lower a vector shuffle as a bit rotation.
13606///
13607/// Look for a repeated rotation pattern in each sub group.
13608 /// Returns an ISD::ROTL element rotation amount, or -1 on failure.
13609static int matchShuffleAsBitRotate(ArrayRef<int> Mask, int NumSubElts) {
13610 int NumElts = Mask.size();
13611 assert((NumElts % NumSubElts) == 0 && "Illegal shuffle mask");
13612
13613 int RotateAmt = -1;
13614 for (int i = 0; i != NumElts; i += NumSubElts) {
13615 for (int j = 0; j != NumSubElts; ++j) {
13616 int M = Mask[i + j];
13617 if (M < 0)
13618 continue;
13619 if (!isInRange(M, i, i + NumSubElts))
13620 return -1;
13621 int Offset = (NumSubElts - (M - (i + j))) % NumSubElts;
13622 if (0 <= RotateAmt && Offset != RotateAmt)
13623 return -1;
13624 RotateAmt = Offset;
13625 }
13626 }
13627 return RotateAmt;
13628}
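For example (a standalone sketch, not the LLVM helper itself): a v16i8 shuffle that rotates every 4-byte group left by one byte has the per-group pattern {3, 0, 1, 2}; every group agrees on a rotate amount of 1 sub-element, which the caller scales to an 8-bit rotate of each i32 lane:

#include <cstdio>
#include <vector>

// Returns the sub-element rotate amount shared by every group, or -1.
static int matchBitRotate(const std::vector<int> &Mask, int NumSubElts) {
  int RotateAmt = -1;
  for (int i = 0, e = (int)Mask.size(); i != e; i += NumSubElts)
    for (int j = 0; j != NumSubElts; ++j) {
      int M = Mask[i + j];
      if (M < 0)
        continue;
      if (M < i || M >= i + NumSubElts)
        return -1; // crosses a group boundary
      int Offset = (NumSubElts - (M - (i + j))) % NumSubElts;
      if (RotateAmt >= 0 && Offset != RotateAmt)
        return -1; // groups disagree on the rotation
      RotateAmt = Offset;
    }
  return RotateAmt;
}

int main() {
  std::vector<int> Mask = {3, 0, 1, 2,  7, 4, 5, 6,
                           11, 8, 9, 10, 15, 12, 13, 14};
  int Amt = matchBitRotate(Mask, /*NumSubElts=*/4);
  std::printf("rotate %d sub-elements = %d bits per i32 lane\n", Amt, Amt * 8);
}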
13629
13630static int matchShuffleAsBitRotate(MVT &RotateVT, int EltSizeInBits,
13631 const X86Subtarget &Subtarget,
13632 ArrayRef<int> Mask) {
13633 assert(!isNoopShuffleMask(Mask) && "We shouldn't lower no-op shuffles!");
13634 assert(EltSizeInBits < 64 && "Can't rotate 64-bit integers");
13635
13636 // AVX512 only has vXi32/vXi64 rotates, so limit the rotation sub group size.
13637 int MinSubElts = Subtarget.hasAVX512() ? std::max(32 / EltSizeInBits, 2) : 2;
13638 int MaxSubElts = 64 / EltSizeInBits;
13639 for (int NumSubElts = MinSubElts; NumSubElts <= MaxSubElts; NumSubElts *= 2) {
13640 int RotateAmt = matchShuffleAsBitRotate(Mask, NumSubElts);
13641 if (RotateAmt < 0)
13642 continue;
13643
13644 int NumElts = Mask.size();
13645 MVT RotateSVT = MVT::getIntegerVT(EltSizeInBits * NumSubElts);
13646 RotateVT = MVT::getVectorVT(RotateSVT, NumElts / NumSubElts);
13647 return RotateAmt * EltSizeInBits;
13648 }
13649
13650 return -1;
13651}
13652
13653/// Lower shuffle using X86ISD::VROTLI rotations.
13654static SDValue lowerShuffleAsBitRotate(const SDLoc &DL, MVT VT, SDValue V1,
13655 ArrayRef<int> Mask,
13656 const X86Subtarget &Subtarget,
13657 SelectionDAG &DAG) {
13658 // Only XOP + AVX512 targets have bit rotation instructions.
13659 // If we at least have SSSE3 (PSHUFB) then we shouldn't attempt to use this.
13660 bool IsLegal =
13661 (VT.is128BitVector() && Subtarget.hasXOP()) || Subtarget.hasAVX512();
13662 if (!IsLegal && Subtarget.hasSSE3())
13663 return SDValue();
13664
13665 MVT RotateVT;
13666 int RotateAmt = matchShuffleAsBitRotate(RotateVT, VT.getScalarSizeInBits(),
13667 Subtarget, Mask);
13668 if (RotateAmt < 0)
13669 return SDValue();
13670
13671 // For pre-SSSE3 targets, if we are shuffling vXi8 elts then ISD::ROTL,
13672 // expanded to OR(SRL,SHL), will be more efficient, but if they can
13673 // widen to vXi16 or more then the existing lowering will be better.
13674 if (!IsLegal) {
13675 if ((RotateAmt % 16) == 0)
13676 return SDValue();
13677 // TODO: Use getTargetVShiftByConstNode.
13678 unsigned ShlAmt = RotateAmt;
13679 unsigned SrlAmt = RotateVT.getScalarSizeInBits() - RotateAmt;
13680 V1 = DAG.getBitcast(RotateVT, V1);
13681 SDValue SHL = DAG.getNode(X86ISD::VSHLI, DL, RotateVT, V1,
13682 DAG.getTargetConstant(ShlAmt, DL, MVT::i8));
13683 SDValue SRL = DAG.getNode(X86ISD::VSRLI, DL, RotateVT, V1,
13684 DAG.getTargetConstant(SrlAmt, DL, MVT::i8));
13685 SDValue Rot = DAG.getNode(ISD::OR, DL, RotateVT, SHL, SRL);
13686 return DAG.getBitcast(VT, Rot);
13687 }
13688
13689 SDValue Rot =
13690 DAG.getNode(X86ISD::VROTLI, DL, RotateVT, DAG.getBitcast(RotateVT, V1),
13691 DAG.getTargetConstant(RotateAmt, DL, MVT::i8));
13692 return DAG.getBitcast(VT, Rot);
13693}
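The pre-SSSE3 fallback above is the standard rotate-by-shifts identity, rotl(x, s) = (x << s) | (x >> (w - s)), applied to every lane of the widened RotateVT. A scalar sketch for one hypothetical 16-bit lane:

#include <cstdint>
#include <cstdio>

// Rotate a 16-bit lane left by Amt bits via the SHL/SRL/OR expansion.
static uint16_t rotl16(uint16_t X, unsigned Amt) {
  Amt %= 16;
  if (Amt == 0)
    return X;
  return (uint16_t)((X << Amt) | (X >> (16 - Amt)));
}

int main() {
  // A vXi8 shuffle that swaps the two bytes of each i16 lane is rotl by 8.
  std::printf("%#06x\n", (unsigned)rotl16(0x1234, 8)); // prints 0x3412
}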
13694
13695/// Try to match a vector shuffle as an element rotation.
13696///
13697 /// This is used to support PALIGNR for SSSE3 or VALIGND/Q for AVX512.
13698static int matchShuffleAsElementRotate(SDValue &V1, SDValue &V2,
13699 ArrayRef<int> Mask) {
13700 int NumElts = Mask.size();
13701
13702 // We need to detect various ways of spelling a rotation:
13703 // [11, 12, 13, 14, 15, 0, 1, 2]
13704 // [-1, 12, 13, 14, -1, -1, 1, -1]
13705 // [-1, -1, -1, -1, -1, -1, 1, 2]
13706 // [ 3, 4, 5, 6, 7, 8, 9, 10]
13707 // [-1, 4, 5, 6, -1, -1, 9, -1]
13708 // [-1, 4, 5, 6, -1, -1, -1, -1]
13709 int Rotation = 0;
13710 SDValue Lo, Hi;
13711 for (int i = 0; i < NumElts; ++i) {
13712 int M = Mask[i];
13713 assert((M == SM_SentinelUndef || (0 <= M && M < (2*NumElts))) &&
13714 "Unexpected mask index.");
13715 if (M < 0)
13716 continue;
13717
13718 // Determine where a rotated vector would have started.
13719 int StartIdx = i - (M % NumElts);
13720 if (StartIdx == 0)
13721 // The identity rotation isn't interesting, stop.
13722 return -1;
13723
13724 // If we found the tail of a vector the rotation must be the missing
13725 // front. If we found the head of a vector, it must be how much of the
13726 // head.
13727 int CandidateRotation = StartIdx < 0 ? -StartIdx : NumElts - StartIdx;
13728
13729 if (Rotation == 0)
13730 Rotation = CandidateRotation;
13731 else if (Rotation != CandidateRotation)
13732 // The rotations don't match, so we can't match this mask.
13733 return -1;
13734
13735 // Compute which value this mask is pointing at.
13736 SDValue MaskV = M < NumElts ? V1 : V2;
13737
13738 // Compute which of the two target values this index should be assigned
13739 // to. This reflects whether the high elements are remaining or the low
13740 // elements are remaining.
13741 SDValue &TargetV = StartIdx < 0 ? Hi : Lo;
13742
13743 // Either set up this value if we've not encountered it before, or check
13744 // that it remains consistent.
13745 if (!TargetV)
13746 TargetV = MaskV;
13747 else if (TargetV != MaskV)
13748 // This may be a rotation, but it pulls from the inputs in some
13749 // unsupported interleaving.
13750 return -1;
13751 }
13752
13753 // Check that we successfully analyzed the mask, and normalize the results.
13754 assert(Rotation != 0 && "Failed to locate a viable rotation!");
13755 assert((Lo || Hi) && "Failed to find a rotated input vector!");
13756 if (!Lo)
13757 Lo = Hi;
13758 else if (!Hi)
13759 Hi = Lo;
13760
13761 V1 = Lo;
13762 V2 = Hi;
13763
13764 return Rotation;
13765}
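A worked example of the matching above (standalone sketch that omits the Lo/Hi input bookkeeping): for the 8-element mask [11, 12, 13, 14, 15, 0, 1, 2] from the comment, every defined lane produces the same candidate rotation, 3, so the mask is a valid element rotation:

#include <cstdio>
#include <vector>

// Returns the element rotation amount implied by Mask, or -1.
static int matchElementRotate(const std::vector<int> &Mask, int NumElts) {
  int Rotation = 0;
  for (int i = 0; i != NumElts; ++i) {
    int M = Mask[i];
    if (M < 0)
      continue;
    int StartIdx = i - (M % NumElts); // where a rotated vector would start
    if (StartIdx == 0)
      return -1; // the identity rotation isn't interesting
    int Candidate = StartIdx < 0 ? -StartIdx : NumElts - StartIdx;
    if (Rotation != 0 && Candidate != Rotation)
      return -1; // lanes disagree, not a single rotation
    Rotation = Candidate;
  }
  return Rotation ? Rotation : -1;
}

int main() {
  std::vector<int> Mask = {11, 12, 13, 14, 15, 0, 1, 2};
  std::printf("rotation = %d\n", matchElementRotate(Mask, 8)); // prints 3
}

For the byte-rotate lowering below, this amount is then scaled by the lane byte width (16 / NumElts), so the v8i16 mask above becomes a PALIGNR by 6 bytes.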
13766
13767/// Try to lower a vector shuffle as a byte rotation.
13768///
13769/// SSSE3 has a generic PALIGNR instruction in x86 that will do an arbitrary
13770/// byte-rotation of the concatenation of two vectors; pre-SSSE3 can use
13771/// a PSRLDQ/PSLLDQ/POR pattern to get a similar effect. This routine will
13772 /// try to generically lower a vector shuffle through such a pattern. It
13773/// does not check for the profitability of lowering either as PALIGNR or
13774/// PSRLDQ/PSLLDQ/POR, only whether the mask is valid to lower in that form.
13775/// This matches shuffle vectors that look like:
13776///
13777/// v8i16 [11, 12, 13, 14, 15, 0, 1, 2]
13778///
13779/// Essentially it concatenates V1 and V2, shifts right by some number of
13780/// elements, and takes the low elements as the result. Note that while this is
13781/// specified as a *right shift* because x86 is little-endian, it is a *left
13782/// rotate* of the vector lanes.
13783static int matchShuffleAsByteRotate(MVT VT, SDValue &V1, SDValue &V2,
13784 ArrayRef<int> Mask) {
13785 // Don't accept any shuffles with zero elements.
13786 if (isAnyZero(Mask))
13787 return -1;
13788
13789 // PALIGNR works on 128-bit lanes.
13790 SmallVector<int, 16> RepeatedMask;
13791 if (!is128BitLaneRepeatedShuffleMask(VT, Mask, RepeatedMask))
13792 return -1;
13793
13794 int Rotation = matchShuffleAsElementRotate(V1, V2, RepeatedMask);
13795 if (Rotation <= 0)
13796 return -1;
13797
13798 // PALIGNR rotates bytes, so we need to scale the
13799 // rotation based on how many bytes are in the vector lane.
13800 int NumElts = RepeatedMask.size();
13801 int Scale = 16 / NumElts;
13802 return Rotation * Scale;
13803}
13804
13805static SDValue lowerShuffleAsByteRotate(const SDLoc &DL, MVT VT, SDValue V1,
13806 SDValue V2, ArrayRef<int> Mask,
13807 const X86Subtarget &Subtarget,
13808 SelectionDAG &DAG) {
13809 assert(!isNoopShuffleMask(Mask) && "We shouldn't lower no-op shuffles!");
13810
13811 SDValue Lo = V1, Hi = V2;
13812 int ByteRotation = matchShuffleAsByteRotate(VT, Lo, Hi, Mask);
13813 if (ByteRotation <= 0)
13814 return SDValue();
13815
13816 // Cast the inputs to i8 vector of correct length to match PALIGNR or
13817 // PSLLDQ/PSRLDQ.
13818 MVT ByteVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8);
13819 Lo = DAG.getBitcast(ByteVT, Lo);
13820 Hi = DAG.getBitcast(ByteVT, Hi);
13821
13822 // SSSE3 targets can use the palignr instruction.
13823 if (Subtarget.hasSSSE3()) {
13824 assert((!VT.is512BitVector() || Subtarget.hasBWI()) &&
13825 "512-bit PALIGNR requires BWI instructions");
13826 return DAG.getBitcast(
13827 VT, DAG.getNode(X86ISD::PALIGNR, DL, ByteVT, Lo, Hi,
13828 DAG.getTargetConstant(ByteRotation, DL, MVT::i8)));
13829 }
13830
13831 assert(VT.is128BitVector() &&
13832 "Rotate-based lowering only supports 128-bit lowering!");
13833 assert(Mask.size() <= 16 &&
13834 "Can shuffle at most 16 bytes in a 128-bit vector!");
13835 assert(ByteVT == MVT::v16i8 &&
13836 "SSE2 rotate lowering only needed for v16i8!");
13837
13838 // Default SSE2 implementation
13839 int LoByteShift = 16 - ByteRotation;
13840 int HiByteShift = ByteRotation;
13841
13842 SDValue LoShift =
13843 DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Lo,
13844 DAG.getTargetConstant(LoByteShift, DL, MVT::i8));
13845 SDValue HiShift =
13846 DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v16i8, Hi,
13847 DAG.getTargetConstant(HiByteShift, DL, MVT::i8));
13848 return DAG.getBitcast(VT,
13849 DAG.getNode(ISD::OR, DL, MVT::v16i8, LoShift, HiShift));
13850}
13851
13852/// Try to lower a vector shuffle as a dword/qword rotation.
13853///
13854/// AVX512 has VALIGND/VALIGNQ instructions that will do an arbitrary
13855/// rotation of the concatenation of two vectors; this routine will
13856/// try to generically lower a vector shuffle through such a pattern.
13857///
13858/// Essentially it concatenates V1 and V2, shifts right by some number of
13859/// elements, and takes the low elements as the result. Note that while this is
13860/// specified as a *right shift* because x86 is little-endian, it is a *left
13861/// rotate* of the vector lanes.
13862static SDValue lowerShuffleAsVALIGN(const SDLoc &DL, MVT VT, SDValue V1,
13863 SDValue V2, ArrayRef<int> Mask,
13864 const X86Subtarget &Subtarget,
13865 SelectionDAG &DAG) {
13866  assert((VT.getScalarType() == MVT::i32 || VT.getScalarType() == MVT::i64) &&
13867         "Only 32-bit and 64-bit elements are supported!");
13868
13869 // 128/256-bit vectors are only supported with VLX.
13870  assert((Subtarget.hasVLX() || (!VT.is128BitVector() && !VT.is256BitVector()))
13871         && "VLX required for 128/256-bit vectors");
13872
13873 SDValue Lo = V1, Hi = V2;
13874 int Rotation = matchShuffleAsElementRotate(Lo, Hi, Mask);
13875 if (Rotation <= 0)
13876 return SDValue();
13877
13878 return DAG.getNode(X86ISD::VALIGN, DL, VT, Lo, Hi,
13879 DAG.getTargetConstant(Rotation, DL, MVT::i8));
13880}
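// Sketch of the semantics described in the comment above, on plain arrays:
// concatenate the two N-element inputs, shift right by Rotation elements and
// keep the low N. Which shuffle operand ends up as which half of the
// concatenation is decided by matchShuffleAsElementRotate; the helper below
// just takes the two halves as given and is illustrative only.
#include <cstddef>
#include <vector>

template <typename T>
std::vector<T> alignElements(const std::vector<T> &ConcatLo,
                             const std::vector<T> &ConcatHi,
                             std::size_t Rotation) {
  std::size_t N = ConcatLo.size();
  std::vector<T> Result(N);
  for (std::size_t i = 0; i != N; ++i)
    Result[i] = (i + Rotation < N) ? ConcatLo[i + Rotation]
                                   : ConcatHi[i + Rotation - N];
  return Result;
}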
13881
13882/// Try to lower a vector shuffle as a byte shift sequence.
13883static SDValue lowerShuffleAsByteShiftMask(const SDLoc &DL, MVT VT, SDValue V1,
13884 SDValue V2, ArrayRef<int> Mask,
13885 const APInt &Zeroable,
13886 const X86Subtarget &Subtarget,
13887 SelectionDAG &DAG) {
13888  assert(!isNoopShuffleMask(Mask) && "We shouldn't lower no-op shuffles!");
13889  assert(VT.is128BitVector() && "Only 128-bit vectors supported");
13890
13891 // We need a shuffle that has zeros at one/both ends and a sequential
13892 // shuffle from one source within.
13893 unsigned ZeroLo = Zeroable.countTrailingOnes();
13894 unsigned ZeroHi = Zeroable.countLeadingOnes();
13895 if (!ZeroLo && !ZeroHi)
13896 return SDValue();
13897
13898 unsigned NumElts = Mask.size();
13899 unsigned Len = NumElts - (ZeroLo + ZeroHi);
13900 if (!isSequentialOrUndefInRange(Mask, ZeroLo, Len, Mask[ZeroLo]))
13901 return SDValue();
13902
13903 unsigned Scale = VT.getScalarSizeInBits() / 8;
13904 ArrayRef<int> StubMask = Mask.slice(ZeroLo, Len);
13905 if (!isUndefOrInRange(StubMask, 0, NumElts) &&
13906 !isUndefOrInRange(StubMask, NumElts, 2 * NumElts))
13907 return SDValue();
13908
13909 SDValue Res = Mask[ZeroLo] < (int)NumElts ? V1 : V2;
13910 Res = DAG.getBitcast(MVT::v16i8, Res);
13911
13912 // Use VSHLDQ/VSRLDQ ops to zero the ends of a vector and leave an
13913 // inner sequential set of elements, possibly offset:
13914 // 01234567 --> zzzzzz01 --> 1zzzzzzz
13915 // 01234567 --> 4567zzzz --> zzzzz456
13916 // 01234567 --> z0123456 --> 3456zzzz --> zz3456zz
13917 if (ZeroLo == 0) {
13918 unsigned Shift = (NumElts - 1) - (Mask[ZeroLo + Len - 1] % NumElts);
13919 Res = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Res,
13920 DAG.getTargetConstant(Scale * Shift, DL, MVT::i8));
13921 Res = DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v16i8, Res,
13922 DAG.getTargetConstant(Scale * ZeroHi, DL, MVT::i8));
13923 } else if (ZeroHi == 0) {
13924 unsigned Shift = Mask[ZeroLo] % NumElts;
13925 Res = DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v16i8, Res,
13926 DAG.getTargetConstant(Scale * Shift, DL, MVT::i8));
13927 Res = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Res,
13928 DAG.getTargetConstant(Scale * ZeroLo, DL, MVT::i8));
13929 } else if (!Subtarget.hasSSSE3()) {
13930    // If we don't have PSHUFB then it's worth avoiding an AND constant mask
13931 // by performing 3 byte shifts. Shuffle combining can kick in above that.
13932 // TODO: There may be some cases where VSH{LR}DQ+PAND is still better.
13933 unsigned Shift = (NumElts - 1) - (Mask[ZeroLo + Len - 1] % NumElts);
13934 Res = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Res,
13935 DAG.getTargetConstant(Scale * Shift, DL, MVT::i8));
13936 Shift += Mask[ZeroLo] % NumElts;
13937 Res = DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v16i8, Res,
13938 DAG.getTargetConstant(Scale * Shift, DL, MVT::i8));
13939 Res = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Res,
13940 DAG.getTargetConstant(Scale * ZeroLo, DL, MVT::i8));
13941 } else
13942 return SDValue();
13943
13944 return DAG.getBitcast(VT, Res);
13945}
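// Byte-level model of the two shift steps used above (little-endian byte
// order, index 0 is the least significant byte). A minimal sketch; the helper
// names are made up for the example.
#include <array>
#include <cstdint>

static std::array<uint8_t, 16> pslldqBytes(std::array<uint8_t, 16> V, unsigned N) {
  std::array<uint8_t, 16> R{}; // shifted-in bytes are zero
  for (unsigned i = N; i < 16; ++i)
    R[i] = V[i - N];
  return R;
}
static std::array<uint8_t, 16> psrldqBytes(std::array<uint8_t, 16> V, unsigned N) {
  std::array<uint8_t, 16> R{};
  for (unsigned i = 0; i + N < 16; ++i)
    R[i] = V[i + N];
  return R;
}
// e.g. psrldqBytes(pslldqBytes(V, 12), 4) zeroes the top 4 and bottom 8 bytes
// while keeping V[0..3] at positions 8..11, the "zero both ends, keep an
// inner run" shape that the ZeroLo/ZeroHi branches above produce.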
13946
13947/// Try to lower a vector shuffle as a bit shift (shifts in zeros).
13948///
13949/// Attempts to match a shuffle mask against the PSLL(W/D/Q/DQ) and
13950/// PSRL(W/D/Q/DQ) SSE2 and AVX2 logical bit-shift instructions. The function
13951/// matches elements from one of the input vectors shuffled to the left or
13952/// right with zeroable elements 'shifted in'. It handles both the strictly
13953/// bit-wise element shifts and the byte shift across an entire 128-bit double
13954/// quad word lane.
13955///
13956/// PSHL : (little-endian) left bit shift.
13957/// [ zz, 0, zz, 2 ]
13958/// [ -1, 4, zz, -1 ]
13959/// PSRL : (little-endian) right bit shift.
13960/// [ 1, zz, 3, zz]
13961/// [ -1, -1, 7, zz]
13962/// PSLLDQ : (little-endian) left byte shift
13963/// [ zz, 0, 1, 2, 3, 4, 5, 6]
13964/// [ zz, zz, -1, -1, 2, 3, 4, -1]
13965/// [ zz, zz, zz, zz, zz, zz, -1, 1]
13966/// PSRLDQ : (little-endian) right byte shift
13967/// [ 5, 6, 7, zz, zz, zz, zz, zz]
13968/// [ -1, 5, 6, 7, zz, zz, zz, zz]
13969/// [ 1, 2, -1, -1, -1, -1, zz, zz]
13970static int matchShuffleAsShift(MVT &ShiftVT, unsigned &Opcode,
13971 unsigned ScalarSizeInBits, ArrayRef<int> Mask,
13972 int MaskOffset, const APInt &Zeroable,
13973 const X86Subtarget &Subtarget) {
13974 int Size = Mask.size();
13975 unsigned SizeInBits = Size * ScalarSizeInBits;
13976
13977 auto CheckZeros = [&](int Shift, int Scale, bool Left) {
13978 for (int i = 0; i < Size; i += Scale)
13979 for (int j = 0; j < Shift; ++j)
13980 if (!Zeroable[i + j + (Left ? 0 : (Scale - Shift))])
13981 return false;
13982
13983 return true;
13984 };
13985
13986 auto MatchShift = [&](int Shift, int Scale, bool Left) {
13987 for (int i = 0; i != Size; i += Scale) {
13988 unsigned Pos = Left ? i + Shift : i;
13989 unsigned Low = Left ? i : i + Shift;
13990 unsigned Len = Scale - Shift;
13991 if (!isSequentialOrUndefInRange(Mask, Pos, Len, Low + MaskOffset))
13992 return -1;
13993 }
13994
13995 int ShiftEltBits = ScalarSizeInBits * Scale;
13996 bool ByteShift = ShiftEltBits > 64;
13997 Opcode = Left ? (ByteShift ? X86ISD::VSHLDQ : X86ISD::VSHLI)
13998 : (ByteShift ? X86ISD::VSRLDQ : X86ISD::VSRLI);
13999 int ShiftAmt = Shift * ScalarSizeInBits / (ByteShift ? 8 : 1);
14000
14001 // Normalize the scale for byte shifts to still produce an i64 element
14002 // type.
14003 Scale = ByteShift ? Scale / 2 : Scale;
14004
14005 // We need to round trip through the appropriate type for the shift.
14006 MVT ShiftSVT = MVT::getIntegerVT(ScalarSizeInBits * Scale);
14007 ShiftVT = ByteShift ? MVT::getVectorVT(MVT::i8, SizeInBits / 8)
14008 : MVT::getVectorVT(ShiftSVT, Size / Scale);
14009 return (int)ShiftAmt;
14010 };
14011
14012 // SSE/AVX supports logical shifts up to 64-bit integers - so we can just
14013 // keep doubling the size of the integer elements up to that. We can
14014 // then shift the elements of the integer vector by whole multiples of
14015 // their width within the elements of the larger integer vector. Test each
14016 // multiple to see if we can find a match with the moved element indices
14017 // and that the shifted in elements are all zeroable.
14018 unsigned MaxWidth = ((SizeInBits == 512) && !Subtarget.hasBWI() ? 64 : 128);
14019 for (int Scale = 2; Scale * ScalarSizeInBits <= MaxWidth; Scale *= 2)
14020 for (int Shift = 1; Shift != Scale; ++Shift)
14021 for (bool Left : {true, false})
14022 if (CheckZeros(Shift, Scale, Left)) {
14023 int ShiftAmt = MatchShift(Shift, Scale, Left);
14024 if (0 < ShiftAmt)
14025 return ShiftAmt;
14026 }
14027
14028 // no match
14029 return -1;
14030}
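// Worked example of what the MatchShift lambda above accepts. The helper name
// and the plain-vector form are made up for the sketch; -1 marks a position
// that must instead be zeroable.
#include <vector>

static std::vector<int> expectedShiftMask(int Size, int Scale, int Shift,
                                          bool Left) {
  std::vector<int> Mask(Size, -1);
  for (int i = 0; i < Size; i += Scale) {
    int Pos = Left ? i + Shift : i;   // where the surviving run lands
    int Low = Left ? i : i + Shift;   // which source elements survive
    for (int j = 0; j != Scale - Shift; ++j)
      Mask[Pos + j] = Low + j;
  }
  return Mask;
}
// expectedShiftMask(4, 2, 1, /*Left=*/true)  == { -1, 0, -1, 2 }  (the PSHL
// example in the comment above); swapping Left gives { 1, -1, 3, -1 } (PSRL).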
14031
14032static SDValue lowerShuffleAsShift(const SDLoc &DL, MVT VT, SDValue V1,
14033 SDValue V2, ArrayRef<int> Mask,
14034 const APInt &Zeroable,
14035 const X86Subtarget &Subtarget,
14036 SelectionDAG &DAG) {
14037 int Size = Mask.size();
14038  assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size");
14039
14040 MVT ShiftVT;
14041 SDValue V = V1;
14042 unsigned Opcode;
14043
14044 // Try to match shuffle against V1 shift.
14045 int ShiftAmt = matchShuffleAsShift(ShiftVT, Opcode, VT.getScalarSizeInBits(),
14046 Mask, 0, Zeroable, Subtarget);
14047
14048 // If V1 failed, try to match shuffle against V2 shift.
14049 if (ShiftAmt < 0) {
14050 ShiftAmt = matchShuffleAsShift(ShiftVT, Opcode, VT.getScalarSizeInBits(),
14051 Mask, Size, Zeroable, Subtarget);
14052 V = V2;
14053 }
14054
14055 if (ShiftAmt < 0)
14056 return SDValue();
14057
14058  assert(DAG.getTargetLoweringInfo().isTypeLegal(ShiftVT) &&
14059         "Illegal integer vector type");
14060 V = DAG.getBitcast(ShiftVT, V);
14061 V = DAG.getNode(Opcode, DL, ShiftVT, V,
14062 DAG.getTargetConstant(ShiftAmt, DL, MVT::i8));
14063 return DAG.getBitcast(VT, V);
14064}
14065
14066// EXTRQ: Extract Len elements from lower half of source, starting at Idx.
14067// Remainder of lower half result is zero and upper half is all undef.
14068static bool matchShuffleAsEXTRQ(MVT VT, SDValue &V1, SDValue &V2,
14069 ArrayRef<int> Mask, uint64_t &BitLen,
14070 uint64_t &BitIdx, const APInt &Zeroable) {
14071 int Size = Mask.size();
14072 int HalfSize = Size / 2;
14073  assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size");
14074  assert(!Zeroable.isAllOnes() && "Fully zeroable shuffle mask");
14075
14076 // Upper half must be undefined.
14077 if (!isUndefUpperHalf(Mask))
14078 return false;
14079
14080 // Determine the extraction length from the part of the
14081 // lower half that isn't zeroable.
14082 int Len = HalfSize;
14083 for (; Len > 0; --Len)
14084 if (!Zeroable[Len - 1])
14085 break;
14086  assert(Len > 0 && "Zeroable shuffle mask");
14087
14088 // Attempt to match first Len sequential elements from the lower half.
14089 SDValue Src;
14090 int Idx = -1;
14091 for (int i = 0; i != Len; ++i) {
14092 int M = Mask[i];
14093 if (M == SM_SentinelUndef)
14094 continue;
14095 SDValue &V = (M < Size ? V1 : V2);
14096 M = M % Size;
14097
14098 // The extracted elements must start at a valid index and all mask
14099 // elements must be in the lower half.
14100 if (i > M || M >= HalfSize)
14101 return false;
14102
14103 if (Idx < 0 || (Src == V && Idx == (M - i))) {
14104 Src = V;
14105 Idx = M - i;
14106 continue;
14107 }
14108 return false;
14109 }
14110
14111 if (!Src || Idx < 0)
14112 return false;
14113
14114  assert((Idx + Len) <= HalfSize && "Illegal extraction mask");
14115 BitLen = (Len * VT.getScalarSizeInBits()) & 0x3f;
14116 BitIdx = (Idx * VT.getScalarSizeInBits()) & 0x3f;
14117 V1 = Src;
14118 return true;
14119}
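// Bit-level model of the extraction this sets up, on the low 64 bits of the
// source. A simplified sketch: an encoded BitLen of 0 stands for a full
// 64-bit field, which is why the code above masks the width with 0x3f.
#include <cstdint>

static uint64_t extrqLo64(uint64_t SrcLo64, unsigned BitLen, unsigned BitIdx) {
  uint64_t Field = SrcLo64 >> BitIdx;
  if (BitLen == 0) // width of 0 means "all 64 bits"
    return Field;
  return Field & ((UINT64_C(1) << BitLen) - 1);
}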
14120
14121// INSERTQ: Extract lowest Len elements from lower half of second source and
14122// insert over first source, starting at Idx.
14123// { A[0], .., A[Idx-1], B[0], .., B[Len-1], A[Idx+Len], .., UNDEF, ... }
14124static bool matchShuffleAsINSERTQ(MVT VT, SDValue &V1, SDValue &V2,
14125 ArrayRef<int> Mask, uint64_t &BitLen,
14126 uint64_t &BitIdx) {
14127 int Size = Mask.size();
14128 int HalfSize = Size / 2;
14129  assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size");
14130
14131 // Upper half must be undefined.
14132 if (!isUndefUpperHalf(Mask))
14133 return false;
14134
14135 for (int Idx = 0; Idx != HalfSize; ++Idx) {
14136 SDValue Base;
14137
14138 // Attempt to match first source from mask before insertion point.
14139 if (isUndefInRange(Mask, 0, Idx)) {
14140 /* EMPTY */
14141 } else if (isSequentialOrUndefInRange(Mask, 0, Idx, 0)) {
14142 Base = V1;
14143 } else if (isSequentialOrUndefInRange(Mask, 0, Idx, Size)) {
14144 Base = V2;
14145 } else {
14146 continue;
14147 }
14148
14149 // Extend the extraction length looking to match both the insertion of
14150 // the second source and the remaining elements of the first.
14151 for (int Hi = Idx + 1; Hi <= HalfSize; ++Hi) {
14152 SDValue Insert;
14153 int Len = Hi - Idx;
14154
14155 // Match insertion.
14156 if (isSequentialOrUndefInRange(Mask, Idx, Len, 0)) {
14157 Insert = V1;
14158 } else if (isSequentialOrUndefInRange(Mask, Idx, Len, Size)) {
14159 Insert = V2;
14160 } else {
14161 continue;
14162 }
14163
14164 // Match the remaining elements of the lower half.
14165 if (isUndefInRange(Mask, Hi, HalfSize - Hi)) {
14166 /* EMPTY */
14167 } else if ((!Base || (Base == V1)) &&
14168 isSequentialOrUndefInRange(Mask, Hi, HalfSize - Hi, Hi)) {
14169 Base = V1;
14170 } else if ((!Base || (Base == V2)) &&
14171 isSequentialOrUndefInRange(Mask, Hi, HalfSize - Hi,
14172 Size + Hi)) {
14173 Base = V2;
14174 } else {
14175 continue;
14176 }
14177
14178 BitLen = (Len * VT.getScalarSizeInBits()) & 0x3f;
14179 BitIdx = (Idx * VT.getScalarSizeInBits()) & 0x3f;
14180 V1 = Base;
14181 V2 = Insert;
14182 return true;
14183 }
14184 }
14185
14186 return false;
14187}
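// Element-level model of the pattern being matched, following the
// { A[0..Idx-1], B[0..Len-1], A[Idx+Len..] } comment above. Only the lower
// half is meaningful; the upper half stays undefined. Illustrative only.
#include <vector>

template <typename T>
std::vector<T> insertqLowerHalf(std::vector<T> A, const std::vector<T> &B,
                                unsigned Idx, unsigned Len) {
  for (unsigned i = 0; i != Len; ++i)
    A[Idx + i] = B[i]; // drop Len leading elements of B into A at Idx
  return A;
}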
14188
14189/// Try to lower a vector shuffle using SSE4a EXTRQ/INSERTQ.
14190static SDValue lowerShuffleWithSSE4A(const SDLoc &DL, MVT VT, SDValue V1,
14191 SDValue V2, ArrayRef<int> Mask,
14192 const APInt &Zeroable, SelectionDAG &DAG) {
14193 uint64_t BitLen, BitIdx;
14194 if (matchShuffleAsEXTRQ(VT, V1, V2, Mask, BitLen, BitIdx, Zeroable))
14195 return DAG.getNode(X86ISD::EXTRQI, DL, VT, V1,
14196 DAG.getTargetConstant(BitLen, DL, MVT::i8),
14197 DAG.getTargetConstant(BitIdx, DL, MVT::i8));
14198
14199 if (matchShuffleAsINSERTQ(VT, V1, V2, Mask, BitLen, BitIdx))
14200 return DAG.getNode(X86ISD::INSERTQI, DL, VT, V1 ? V1 : DAG.getUNDEF(VT),
14201 V2 ? V2 : DAG.getUNDEF(VT),
14202 DAG.getTargetConstant(BitLen, DL, MVT::i8),
14203 DAG.getTargetConstant(BitIdx, DL, MVT::i8));
14204
14205 return SDValue();
14206}
14207
14208/// Lower a vector shuffle as a zero or any extension.
14209///
14210/// Given a specific number of elements, element bit width, and extension
14211/// stride, produce either a zero or any extension based on the available
14212/// features of the subtarget. The extended elements are consecutive and
14213/// can start at an offset element index in the input; to avoid excess
14214/// shuffling, the offset must either be in the bottom lane or at the start
14215/// of a higher lane. All extended elements must be from
14216/// the same lane.
14217static SDValue lowerShuffleAsSpecificZeroOrAnyExtend(
14218 const SDLoc &DL, MVT VT, int Scale, int Offset, bool AnyExt, SDValue InputV,
14219 ArrayRef<int> Mask, const X86Subtarget &Subtarget, SelectionDAG &DAG) {
14220  assert(Scale > 1 && "Need a scale to extend.");
14221 int EltBits = VT.getScalarSizeInBits();
14222 int NumElements = VT.getVectorNumElements();
14223 int NumEltsPerLane = 128 / EltBits;
14224 int OffsetLane = Offset / NumEltsPerLane;
14225  assert((EltBits == 8 || EltBits == 16 || EltBits == 32) &&
14226         "Only 8, 16, and 32 bit elements can be extended.");
14227  assert(Scale * EltBits <= 64 && "Cannot zero extend past 64 bits.");
14228  assert(0 <= Offset && "Extension offset must be positive.");
14229  assert((Offset < NumEltsPerLane || Offset % NumEltsPerLane == 0) &&
14230         "Extension offset must be in the first lane or start an upper lane.");
14231
14232 // Check that an index is in same lane as the base offset.
14233 auto SafeOffset = [&](int Idx) {
14234 return OffsetLane == (Idx / NumEltsPerLane);
14235 };
14236
14237 // Shift along an input so that the offset base moves to the first element.
14238 auto ShuffleOffset = [&](SDValue V) {
14239 if (!Offset)
14240 return V;
14241
14242 SmallVector<int, 8> ShMask((unsigned)NumElements, -1);
14243 for (int i = 0; i * Scale < NumElements; ++i) {
14244 int SrcIdx = i + Offset;
14245 ShMask[i] = SafeOffset(SrcIdx) ? SrcIdx : -1;
14246 }
14247 return DAG.getVectorShuffle(VT, DL, V, DAG.getUNDEF(VT), ShMask);
14248 };
14249
14250  // Found a valid any/zero-extend mask! Try various lowering strategies based
14251 // input type and available ISA extensions.
14252 if (Subtarget.hasSSE41()) {
14253 // Not worth offsetting 128-bit vectors if scale == 2, a pattern using
14254 // PUNPCK will catch this in a later shuffle match.
14255 if (Offset && Scale == 2 && VT.is128BitVector())
14256 return SDValue();
14257 MVT ExtVT = MVT::getVectorVT(MVT::getIntegerVT(EltBits * Scale),
14258 NumElements / Scale);
14259 InputV = ShuffleOffset(InputV);
14260 InputV = getEXTEND_VECTOR_INREG(AnyExt ? ISD::ANY_EXTEND : ISD::ZERO_EXTEND,
14261 DL, ExtVT, InputV, DAG);
14262 return DAG.getBitcast(VT, InputV);
14263 }
14264
14265  assert(VT.is128BitVector() && "Only 128-bit vectors can be extended.");
14266
14267  // For any-extends we can cheat for larger element sizes and use shuffle
14268 // instructions that can fold with a load and/or copy.
14269 if (AnyExt && EltBits == 32) {
14270 int PSHUFDMask[4] = {Offset, -1, SafeOffset(Offset + 1) ? Offset + 1 : -1,
14271 -1};
14272 return DAG.getBitcast(
14273 VT, DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32,
14274 DAG.getBitcast(MVT::v4i32, InputV),
14275 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
14276 }
14277 if (AnyExt && EltBits == 16 && Scale > 2) {
14278 int PSHUFDMask[4] = {Offset / 2, -1,
14279 SafeOffset(Offset + 1) ? (Offset + 1) / 2 : -1, -1};
14280 InputV = DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32,
14281 DAG.getBitcast(MVT::v4i32, InputV),
14282 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG));
14283 int PSHUFWMask[4] = {1, -1, -1, -1};
14284 unsigned OddEvenOp = (Offset & 1) ? X86ISD::PSHUFLW : X86ISD::PSHUFHW;
14285 return DAG.getBitcast(
14286 VT, DAG.getNode(OddEvenOp, DL, MVT::v8i16,
14287 DAG.getBitcast(MVT::v8i16, InputV),
14288 getV4X86ShuffleImm8ForMask(PSHUFWMask, DL, DAG)));
14289 }
14290
14291 // The SSE4A EXTRQ instruction can efficiently extend the first 2 lanes
14292 // to 64-bits.
14293 if ((Scale * EltBits) == 64 && EltBits < 32 && Subtarget.hasSSE4A()) {
14294    assert(NumElements == (int)Mask.size() && "Unexpected shuffle mask size!");
14295    assert(VT.is128BitVector() && "Unexpected vector width!");
14296
14297 int LoIdx = Offset * EltBits;
14298 SDValue Lo = DAG.getBitcast(
14299 MVT::v2i64, DAG.getNode(X86ISD::EXTRQI, DL, VT, InputV,
14300 DAG.getTargetConstant(EltBits, DL, MVT::i8),
14301 DAG.getTargetConstant(LoIdx, DL, MVT::i8)));
14302
14303 if (isUndefUpperHalf(Mask) || !SafeOffset(Offset + 1))
14304 return DAG.getBitcast(VT, Lo);
14305
14306 int HiIdx = (Offset + 1) * EltBits;
14307 SDValue Hi = DAG.getBitcast(
14308 MVT::v2i64, DAG.getNode(X86ISD::EXTRQI, DL, VT, InputV,
14309 DAG.getTargetConstant(EltBits, DL, MVT::i8),
14310 DAG.getTargetConstant(HiIdx, DL, MVT::i8)));
14311 return DAG.getBitcast(VT,
14312 DAG.getNode(X86ISD::UNPCKL, DL, MVT::v2i64, Lo, Hi));
14313 }
14314
14315 // If this would require more than 2 unpack instructions to expand, use
14316 // pshufb when available. We can only use more than 2 unpack instructions
14317 // when zero extending i8 elements which also makes it easier to use pshufb.
14318 if (Scale > 4 && EltBits == 8 && Subtarget.hasSSSE3()) {
14319    assert(NumElements == 16 && "Unexpected byte vector width!");
14320 SDValue PSHUFBMask[16];
14321 for (int i = 0; i < 16; ++i) {
14322 int Idx = Offset + (i / Scale);
14323 if ((i % Scale == 0 && SafeOffset(Idx))) {
14324 PSHUFBMask[i] = DAG.getConstant(Idx, DL, MVT::i8);
14325 continue;
14326 }
14327 PSHUFBMask[i] =
14328 AnyExt ? DAG.getUNDEF(MVT::i8) : DAG.getConstant(0x80, DL, MVT::i8);
14329 }
14330 InputV = DAG.getBitcast(MVT::v16i8, InputV);
14331 return DAG.getBitcast(
14332 VT, DAG.getNode(X86ISD::PSHUFB, DL, MVT::v16i8, InputV,
14333 DAG.getBuildVector(MVT::v16i8, DL, PSHUFBMask)));
14334 }
14335
14336 // If we are extending from an offset, ensure we start on a boundary that
14337 // we can unpack from.
14338 int AlignToUnpack = Offset % (NumElements / Scale);
14339 if (AlignToUnpack) {
14340 SmallVector<int, 8> ShMask((unsigned)NumElements, -1);
14341 for (int i = AlignToUnpack; i < NumElements; ++i)
14342 ShMask[i - AlignToUnpack] = i;
14343 InputV = DAG.getVectorShuffle(VT, DL, InputV, DAG.getUNDEF(VT), ShMask);
14344 Offset -= AlignToUnpack;
14345 }
14346
14347 // Otherwise emit a sequence of unpacks.
14348 do {
14349 unsigned UnpackLoHi = X86ISD::UNPCKL;
14350 if (Offset >= (NumElements / 2)) {
14351 UnpackLoHi = X86ISD::UNPCKH;
14352 Offset -= (NumElements / 2);
14353 }
14354
14355 MVT InputVT = MVT::getVectorVT(MVT::getIntegerVT(EltBits), NumElements);
14356 SDValue Ext = AnyExt ? DAG.getUNDEF(InputVT)
14357 : getZeroVector(InputVT, Subtarget, DAG, DL);
14358 InputV = DAG.getBitcast(InputVT, InputV);
14359 InputV = DAG.getNode(UnpackLoHi, DL, InputVT, InputV, Ext);
14360 Scale /= 2;
14361 EltBits *= 2;
14362 NumElements /= 2;
14363 } while (Scale > 1);
14364 return DAG.getBitcast(VT, InputV);
14365}
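// Sketch of the unpack fallback above at byte granularity: each round
// interleaves the low half of the vector with zeros, doubling the element
// width. Applied twice to a 16-byte vector it yields
// { b0,0,0,0, b1,0,0,0, b2,0,0,0, b3,0,0,0 }, i.e. b0..b3 zero-extended to
// 32 bits in little-endian order. The real loop doubles EltBits each round
// instead of staying at byte granularity, but the resulting bytes are the
// same; the helper is illustrative only.
#include <cstddef>
#include <cstdint>
#include <vector>

static std::vector<uint8_t> unpackLoWithZero(const std::vector<uint8_t> &V) {
  std::vector<uint8_t> R;
  for (std::size_t i = 0; i != V.size() / 2; ++i) {
    R.push_back(V[i]); // low-half element
    R.push_back(0);    // interleaved zero
  }
  return R;
}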
14366
14367/// Try to lower a vector shuffle as a zero extension on any microarch.
14368///
14369/// This routine will try to do everything in its power to cleverly lower
14370/// a shuffle which happens to match the pattern of a zero extend. It doesn't
14371/// check for the profitability of this lowering, it tries to aggressively
14372/// match this pattern. It will use all of the micro-architectural details it
14373/// can to emit an efficient lowering. It handles both blends with all-zero
14374/// inputs to explicitly zero-extend and undef-lanes (sometimes undef due to
14375/// masking out later).
14376///
14377/// The reason we have dedicated lowering for zext-style shuffles is that they
14378/// are both incredibly common and often quite performance sensitive.
14379static SDValue lowerShuffleAsZeroOrAnyExtend(
14380 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
14381 const APInt &Zeroable, const X86Subtarget &Subtarget,
14382 SelectionDAG &DAG) {
14383 int Bits = VT.getSizeInBits();
14384 int NumLanes = Bits / 128;
14385 int NumElements = VT.getVectorNumElements();
14386 int NumEltsPerLane = NumElements / NumLanes;
14387  assert(VT.getScalarSizeInBits() <= 32 &&
14388         "Exceeds 32-bit integer zero extension limit");
14389  assert((int)Mask.size() == NumElements && "Unexpected shuffle mask size");
14390
14391 // Define a helper function to check a particular ext-scale and lower to it if
14392 // valid.
14393 auto Lower = [&](int Scale) -> SDValue {
14394 SDValue InputV;
14395 bool AnyExt = true;
14396 int Offset = 0;
14397 int Matches = 0;
14398 for (int i = 0; i < NumElements; ++i) {
14399 int M = Mask[i];
14400 if (M < 0)
14401 continue; // Valid anywhere but doesn't tell us anything.
14402 if (i % Scale != 0) {
14403 // Each of the extended elements need to be zeroable.
14404 if (!Zeroable[i])
14405 return SDValue();
14406
14407 // We no longer are in the anyext case.
14408 AnyExt = false;
14409 continue;
14410 }
14411
14412 // Each of the base elements needs to be consecutive indices into the
14413 // same input vector.
14414 SDValue V = M < NumElements ? V1 : V2;
14415 M = M % NumElements;
14416 if (!InputV) {
14417 InputV = V;
14418 Offset = M - (i / Scale);
14419 } else if (InputV != V)
14420 return SDValue(); // Flip-flopping inputs.
14421
14422 // Offset must start in the lowest 128-bit lane or at the start of an
14423 // upper lane.
14424 // FIXME: Is it ever worth allowing a negative base offset?
14425 if (!((0 <= Offset && Offset < NumEltsPerLane) ||
14426 (Offset % NumEltsPerLane) == 0))
14427 return SDValue();
14428
14429 // If we are offsetting, all referenced entries must come from the same
14430 // lane.
14431 if (Offset && (Offset / NumEltsPerLane) != (M / NumEltsPerLane))
14432 return SDValue();
14433
14434 if ((M % NumElements) != (Offset + (i / Scale)))
14435 return SDValue(); // Non-consecutive strided elements.
14436 Matches++;
14437 }
14438
14439 // If we fail to find an input, we have a zero-shuffle which should always
14440 // have already been handled.
14441 // FIXME: Maybe handle this here in case during blending we end up with one?
14442 if (!InputV)
14443 return SDValue();
14444
14445 // If we are offsetting, don't extend if we only match a single input, we
14446 // can always do better by using a basic PSHUF or PUNPCK.
14447 if (Offset != 0 && Matches < 2)
14448 return SDValue();
14449
14450 return lowerShuffleAsSpecificZeroOrAnyExtend(DL, VT, Scale, Offset, AnyExt,
14451 InputV, Mask, Subtarget, DAG);
14452 };
14453
14454 // The widest scale possible for extending is to a 64-bit integer.
14455  assert(Bits % 64 == 0 &&
14456         "The number of bits in a vector must be divisible by 64 on x86!");
14457 int NumExtElements = Bits / 64;
14458
14459 // Each iteration, try extending the elements half as much, but into twice as
14460 // many elements.
14461 for (; NumExtElements < NumElements; NumExtElements *= 2) {
14462    assert(NumElements % NumExtElements == 0 &&
14463           "The input vector size must be divisible by the extended size.");
14464 if (SDValue V = Lower(NumElements / NumExtElements))
14465 return V;
14466 }
14467
14468 // General extends failed, but 128-bit vectors may be able to use MOVQ.
14469 if (Bits != 128)
14470 return SDValue();
14471
14472 // Returns one of the source operands if the shuffle can be reduced to a
14473 // MOVQ, copying the lower 64-bits and zero-extending to the upper 64-bits.
14474 auto CanZExtLowHalf = [&]() {
14475 for (int i = NumElements / 2; i != NumElements; ++i)
14476 if (!Zeroable[i])
14477 return SDValue();
14478 if (isSequentialOrUndefInRange(Mask, 0, NumElements / 2, 0))
14479 return V1;
14480 if (isSequentialOrUndefInRange(Mask, 0, NumElements / 2, NumElements))
14481 return V2;
14482 return SDValue();
14483 };
14484
14485 if (SDValue V = CanZExtLowHalf()) {
14486 V = DAG.getBitcast(MVT::v2i64, V);
14487 V = DAG.getNode(X86ISD::VZEXT_MOVL, DL, MVT::v2i64, V);
14488 return DAG.getBitcast(VT, V);
14489 }
14490
14491 // No viable ext lowering found.
14492 return SDValue();
14493}
14494
14495/// Try to get a scalar value for a specific element of a vector.
14496///
14497/// Looks through BUILD_VECTOR and SCALAR_TO_VECTOR nodes to find a scalar.
14498static SDValue getScalarValueForVectorElement(SDValue V, int Idx,
14499 SelectionDAG &DAG) {
14500 MVT VT = V.getSimpleValueType();
14501 MVT EltVT = VT.getVectorElementType();
14502 V = peekThroughBitcasts(V);
14503
14504 // If the bitcasts shift the element size, we can't extract an equivalent
14505 // element from it.
14506 MVT NewVT = V.getSimpleValueType();
14507 if (!NewVT.isVector() || NewVT.getScalarSizeInBits() != VT.getScalarSizeInBits())
14508 return SDValue();
14509
14510 if (V.getOpcode() == ISD::BUILD_VECTOR ||
14511 (Idx == 0 && V.getOpcode() == ISD::SCALAR_TO_VECTOR)) {
14512 // Ensure the scalar operand is the same size as the destination.
14513 // FIXME: Add support for scalar truncation where possible.
14514 SDValue S = V.getOperand(Idx);
14515 if (EltVT.getSizeInBits() == S.getSimpleValueType().getSizeInBits())
14516 return DAG.getBitcast(EltVT, S);
14517 }
14518
14519 return SDValue();
14520}
14521
14522/// Helper to test for a load that can be folded with x86 shuffles.
14523///
14524/// This is particularly important because the set of instructions varies
14525/// significantly based on whether the operand is a load or not.
14526static bool isShuffleFoldableLoad(SDValue V) {
14527 return V->hasOneUse() &&
14528 ISD::isNON_EXTLoad(peekThroughOneUseBitcasts(V).getNode());
14529}
14530
14531template<typename T>
14532static bool isSoftFP16(T VT, const X86Subtarget &Subtarget) {
14533 return VT.getScalarType() == MVT::f16 && !Subtarget.hasFP16();
14534}
14535
14536template<typename T>
14537bool X86TargetLowering::isSoftFP16(T VT) const {
14538 return ::isSoftFP16(VT, Subtarget);
14539}
14540
14541/// Try to lower insertion of a single element into a zero vector.
14542///
14543/// This is a common pattern for which we have especially efficient lowerings
14544/// across all subtarget feature sets.
14545static SDValue lowerShuffleAsElementInsertion(
14546 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
14547 const APInt &Zeroable, const X86Subtarget &Subtarget,
14548 SelectionDAG &DAG) {
14549 MVT ExtVT = VT;
14550 MVT EltVT = VT.getVectorElementType();
14551
14552 if (isSoftFP16(EltVT, Subtarget))
14553 return SDValue();
14554
14555 int V2Index =
14556 find_if(Mask, [&Mask](int M) { return M >= (int)Mask.size(); }) -
14557 Mask.begin();
14558 bool IsV1Zeroable = true;
14559 for (int i = 0, Size = Mask.size(); i < Size; ++i)
14560 if (i != V2Index && !Zeroable[i]) {
14561 IsV1Zeroable = false;
14562 break;
14563 }
14564
14565 // Check for a single input from a SCALAR_TO_VECTOR node.
14566 // FIXME: All of this should be canonicalized into INSERT_VECTOR_ELT and
14567 // all the smarts here sunk into that routine. However, the current
14568 // lowering of BUILD_VECTOR makes that nearly impossible until the old
14569 // vector shuffle lowering is dead.
14570 SDValue V2S = getScalarValueForVectorElement(V2, Mask[V2Index] - Mask.size(),
14571 DAG);
14572 if (V2S && DAG.getTargetLoweringInfo().isTypeLegal(V2S.getValueType())) {
14573 // We need to zext the scalar if it is smaller than an i32.
14574 V2S = DAG.getBitcast(EltVT, V2S);
14575 if (EltVT == MVT::i8 || (EltVT == MVT::i16 && !Subtarget.hasFP16())) {
14576 // Using zext to expand a narrow element won't work for non-zero
14577 // insertions.
14578 if (!IsV1Zeroable)
14579 return SDValue();
14580
14581 // Zero-extend directly to i32.
14582 ExtVT = MVT::getVectorVT(MVT::i32, ExtVT.getSizeInBits() / 32);
14583 V2S = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, V2S);
14584 }
14585 V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, ExtVT, V2S);
14586 } else if (Mask[V2Index] != (int)Mask.size() || EltVT == MVT::i8 ||
14587 EltVT == MVT::i16) {
14588 // Either not inserting from the low element of the input or the input
14589 // element size is too small to use VZEXT_MOVL to clear the high bits.
14590 return SDValue();
14591 }
14592
14593 if (!IsV1Zeroable) {
14594 // If V1 can't be treated as a zero vector we have fewer options to lower
14595 // this. We can't support integer vectors or non-zero targets cheaply, and
14596 // the V1 elements can't be permuted in any way.
14597    assert(VT == ExtVT && "Cannot change extended type when non-zeroable!");
14598 if (!VT.isFloatingPoint() || V2Index != 0)
14599 return SDValue();
14600 SmallVector<int, 8> V1Mask(Mask);
14601 V1Mask[V2Index] = -1;
14602 if (!isNoopShuffleMask(V1Mask))
14603 return SDValue();
14604 if (!VT.is128BitVector())
14605 return SDValue();
14606
14607 // Otherwise, use MOVSD, MOVSS or MOVSH.
14608 unsigned MovOpc = 0;
14609 if (EltVT == MVT::f16)
14610 MovOpc = X86ISD::MOVSH;
14611 else if (EltVT == MVT::f32)
14612 MovOpc = X86ISD::MOVSS;
14613 else if (EltVT == MVT::f64)
14614 MovOpc = X86ISD::MOVSD;
14615 else
14616      llvm_unreachable("Unsupported floating point element type to handle!");
14617 return DAG.getNode(MovOpc, DL, ExtVT, V1, V2);
14618 }
14619
14620 // This lowering only works for the low element with floating point vectors.
14621 if (VT.isFloatingPoint() && V2Index != 0)
14622 return SDValue();
14623
14624 V2 = DAG.getNode(X86ISD::VZEXT_MOVL, DL, ExtVT, V2);
14625 if (ExtVT != VT)
14626 V2 = DAG.getBitcast(VT, V2);
14627
14628 if (V2Index != 0) {
14629 // If we have 4 or fewer lanes we can cheaply shuffle the element into
14630 // the desired position. Otherwise it is more efficient to do a vector
14631 // shift left. We know that we can do a vector shift left because all
14632 // the inputs are zero.
14633 if (VT.isFloatingPoint() || VT.getVectorNumElements() <= 4) {
14634 SmallVector<int, 4> V2Shuffle(Mask.size(), 1);
14635 V2Shuffle[V2Index] = 0;
14636 V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Shuffle);
14637 } else {
14638 V2 = DAG.getBitcast(MVT::v16i8, V2);
14639 V2 = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, V2,
14640 DAG.getTargetConstant(
14641 V2Index * EltVT.getSizeInBits() / 8, DL, MVT::i8));
14642 V2 = DAG.getBitcast(VT, V2);
14643 }
14644 }
14645 return V2;
14646}
14647
14648/// Try to lower a broadcast of a single truncated integer element coming
14649/// from a scalar_to_vector/build_vector node \p V0 with larger elements.
14650///
14651/// This assumes we have AVX2.
14652static SDValue lowerShuffleAsTruncBroadcast(const SDLoc &DL, MVT VT, SDValue V0,
14653 int BroadcastIdx,
14654 const X86Subtarget &Subtarget,
14655 SelectionDAG &DAG) {
14656  assert(Subtarget.hasAVX2() &&
14657         "We can only lower integer broadcasts with AVX2!");
14658
14659 MVT EltVT = VT.getVectorElementType();
14660 MVT V0VT = V0.getSimpleValueType();
14661
14662  assert(VT.isInteger() && "Unexpected non-integer trunc broadcast!");
14663  assert(V0VT.isVector() && "Unexpected non-vector vector-sized value!");
14664
14665 MVT V0EltVT = V0VT.getVectorElementType();
14666 if (!V0EltVT.isInteger())
14667 return SDValue();
14668
14669 const unsigned EltSize = EltVT.getSizeInBits();
14670 const unsigned V0EltSize = V0EltVT.getSizeInBits();
14671
14672 // This is only a truncation if the original element type is larger.
14673 if (V0EltSize <= EltSize)
14674 return SDValue();
14675
14676  assert(((V0EltSize % EltSize) == 0) &&
14677         "Scalar type sizes must all be powers of 2 on x86!");
14678
14679 const unsigned V0Opc = V0.getOpcode();
14680 const unsigned Scale = V0EltSize / EltSize;
14681 const unsigned V0BroadcastIdx = BroadcastIdx / Scale;
14682
14683 if ((V0Opc != ISD::SCALAR_TO_VECTOR || V0BroadcastIdx != 0) &&
14684 V0Opc != ISD::BUILD_VECTOR)
14685 return SDValue();
14686
14687 SDValue Scalar = V0.getOperand(V0BroadcastIdx);
14688
14689 // If we're extracting non-least-significant bits, shift so we can truncate.
14690 // Hopefully, we can fold away the trunc/srl/load into the broadcast.
14691 // Even if we can't (and !isShuffleFoldableLoad(Scalar)), prefer
14692 // vpbroadcast+vmovd+shr to vpshufb(m)+vmovd.
14693 if (const int OffsetIdx = BroadcastIdx % Scale)
14694 Scalar = DAG.getNode(ISD::SRL, DL, Scalar.getValueType(), Scalar,
14695 DAG.getConstant(OffsetIdx * EltSize, DL, MVT::i8));
14696
14697 return DAG.getNode(X86ISD::VBROADCAST, DL, VT,
14698 DAG.getNode(ISD::TRUNCATE, DL, EltVT, Scalar));
14699}
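// Scalar model of the SRL + TRUNCATE rework above, with the element width
// fixed to 8 bits for the example: broadcasting piece OffsetIdx of a wider
// scalar is a right shift by OffsetIdx * EltSize followed by a truncate.
// Helper name and fixed width are assumptions for the sketch.
#include <cstdint>

static uint8_t truncatedPiece(uint64_t WideScalar, unsigned OffsetIdx) {
  return static_cast<uint8_t>(WideScalar >> (OffsetIdx * 8));
}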
14700
14701/// Test whether this can be lowered with a single SHUFPS instruction.
14702///
14703/// This is used to disable more specialized lowerings when the shufps lowering
14704/// will happen to be efficient.
14705static bool isSingleSHUFPSMask(ArrayRef<int> Mask) {
14706 // This routine only handles 128-bit shufps.
14707  assert(Mask.size() == 4 && "Unsupported mask size!");
14708  assert(Mask[0] >= -1 && Mask[0] < 8 && "Out of bound mask element!");
14709  assert(Mask[1] >= -1 && Mask[1] < 8 && "Out of bound mask element!");
14710  assert(Mask[2] >= -1 && Mask[2] < 8 && "Out of bound mask element!");
14711  assert(Mask[3] >= -1 && Mask[3] < 8 && "Out of bound mask element!");
14712
14713 // To lower with a single SHUFPS we need to have the low half and high half
14714 // each requiring a single input.
14715 if (Mask[0] >= 0 && Mask[1] >= 0 && (Mask[0] < 4) != (Mask[1] < 4))
14716 return false;
14717 if (Mask[2] >= 0 && Mask[3] >= 0 && (Mask[2] < 4) != (Mask[3] < 4))
14718 return false;
14719
14720 return true;
14721}
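// The same rule restated on a plain 4-element mask, as a standalone sketch:
// SHUFPS builds its low two lanes from one source and its high two lanes
// from one source, so neither half may mix inputs (index < 4 selects V1,
// index >= 4 selects V2, -1 is undef). The helper name is illustrative only.
#include <array>

static bool halvesDontMixInputs(const std::array<int, 4> &Mask) {
  auto Mixes = [](int A, int B) {
    return A >= 0 && B >= 0 && (A < 4) != (B < 4);
  };
  return !Mixes(Mask[0], Mask[1]) && !Mixes(Mask[2], Mask[3]);
}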
14722
14723/// Test whether the specified input (0 or 1) is in-place blended by the
14724/// given mask.
14725///
14726/// This returns true if the elements from a particular input are already in the
14727/// slot required by the given mask and require no permutation.
14728static bool isShuffleMaskInputInPlace(int Input, ArrayRef<int> Mask) {
14729  assert((Input == 0 || Input == 1) && "Only two inputs to shuffles.");
14730 int Size = Mask.size();
14731 for (int i = 0; i < Size; ++i)
14732 if (Mask[i] >= 0 && Mask[i] / Size == Input && Mask[i] % Size != i)
14733 return false;
14734
14735 return true;
14736}
14737
14738/// If we are extracting two 128-bit halves of a vector and shuffling the
14739/// result, match that to a 256-bit AVX2 vperm* instruction to avoid a
14740/// multi-shuffle lowering.
14741static SDValue lowerShuffleOfExtractsAsVperm(const SDLoc &DL, SDValue N0,
14742 SDValue N1, ArrayRef<int> Mask,
14743 SelectionDAG &DAG) {
14744 MVT VT = N0.getSimpleValueType();
14745  assert((VT.is128BitVector() &&
14746          (VT.getScalarSizeInBits() == 32 || VT.getScalarSizeInBits() == 64)) &&
14747         "VPERM* family of shuffles requires 32-bit or 64-bit elements");
14748
14749 // Check that both sources are extracts of the same source vector.
14750 if (!N0.hasOneUse() || !N1.hasOneUse() ||
14751 N0.getOpcode() != ISD::EXTRACT_SUBVECTOR ||
14752 N1.getOpcode() != ISD::EXTRACT_SUBVECTOR ||
14753 N0.getOperand(0) != N1.getOperand(0))
14754 return SDValue();
14755
14756 SDValue WideVec = N0.getOperand(0);
14757 MVT WideVT = WideVec.getSimpleValueType();
14758 if (!WideVT.is256BitVector())
14759 return SDValue();
14760
14761 // Match extracts of each half of the wide source vector. Commute the shuffle
14762 // if the extract of the low half is N1.
14763 unsigned NumElts = VT.getVectorNumElements();
14764 SmallVector<int, 4> NewMask(Mask);
14765 const APInt &ExtIndex0 = N0.getConstantOperandAPInt(1);
14766 const APInt &ExtIndex1 = N1.getConstantOperandAPInt(1);
14767 if (ExtIndex1 == 0 && ExtIndex0 == NumElts)
14768 ShuffleVectorSDNode::commuteMask(NewMask);
14769 else if (ExtIndex0 != 0 || ExtIndex1 != NumElts)
14770 return SDValue();
14771
14772 // Final bailout: if the mask is simple, we are better off using an extract
14773 // and a simple narrow shuffle. Prefer extract+unpack(h/l)ps to vpermps
14774 // because that avoids a constant load from memory.
14775 if (NumElts == 4 &&
14776 (isSingleSHUFPSMask(NewMask) || is128BitUnpackShuffleMask(NewMask, DAG)))
14777 return SDValue();
14778
14779 // Extend the shuffle mask with undef elements.
14780 NewMask.append(NumElts, -1);
14781
14782 // shuf (extract X, 0), (extract X, 4), M --> extract (shuf X, undef, M'), 0
14783 SDValue Shuf = DAG.getVectorShuffle(WideVT, DL, WideVec, DAG.getUNDEF(WideVT),
14784 NewMask);
14785 // This is free: ymm -> xmm.
14786 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Shuf,
14787 DAG.getIntPtrConstant(0, DL));
14788}
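// The mask rework above, restated: a shuffle of the two 128-bit halves of X
// becomes a shuffle of X itself with the same indices plus trailing undefs,
// and only the low half of the wide result is kept. The helper is a
// standalone illustration, not the LLVM API.
#include <vector>

static std::vector<int> widenHalfShuffleMask(std::vector<int> Mask,
                                             unsigned NumElts) {
  Mask.insert(Mask.end(), NumElts, -1); // pad with undef lanes
  return Mask;
}
// e.g. with NumElts == 4, { 0, 7, 2, 5 } becomes { 0, 7, 2, 5, -1, -1, -1, -1 },
// matching the "shuf (extract X, 0), (extract X, 4), M" comment above.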
14789
14790/// Try to lower broadcast of a single element.
14791///
14792/// For convenience, this code also bundles all of the subtarget feature set
14793/// filtering. While a little annoying to re-dispatch on type here, there isn't
14794/// a convenient way to factor it out.
14795static SDValue lowerShuffleAsBroadcast(const SDLoc &DL, MVT VT, SDValue V1,
14796 SDValue V2, ArrayRef<int> Mask,
14797 const X86Subtarget &Subtarget,
14798 SelectionDAG &DAG) {
14799 if (!((Subtarget.hasSSE3() && VT == MVT::v2f64) ||
14800 (Subtarget.hasAVX() && VT.isFloatingPoint()) ||
14801 (Subtarget.hasAVX2() && VT.isInteger())))
14802 return SDValue();
14803
14804 // With MOVDDUP (v2f64) we can broadcast from a register or a load, otherwise
14805 // we can only broadcast from a register with AVX2.
14806 unsigned NumEltBits = VT.getScalarSizeInBits();
14807 unsigned Opcode = (VT == MVT::v2f64 && !Subtarget.hasAVX2())
14808 ? X86ISD::MOVDDUP
14809 : X86ISD::VBROADCAST;
14810 bool BroadcastFromReg = (Opcode == X86ISD::MOVDDUP) || Subtarget.hasAVX2();
14811
14812 // Check that the mask is a broadcast.
14813 int BroadcastIdx = getSplatIndex(Mask);
14814 if (BroadcastIdx < 0)
14815 return SDValue();
14816  assert(BroadcastIdx < (int)Mask.size() && "We only expect to be called with "
14817                                            "a sorted mask where the broadcast "
14818                                            "comes from V1.");
14819
14820 // Go up the chain of (vector) values to find a scalar load that we can
14821 // combine with the broadcast.
14822 // TODO: Combine this logic with findEltLoadSrc() used by
14823 // EltsFromConsecutiveLoads().
14824 int BitOffset = BroadcastIdx * NumEltBits;
14825 SDValue V = V1;
14826 for (;;) {
14827 switch (V.getOpcode()) {
14828 case ISD::BITCAST: {
14829 V = V.getOperand(0);
14830 continue;
14831 }
14832 case ISD::CONCAT_VECTORS: {
14833 int OpBitWidth = V.getOperand(0).getValueSizeInBits();
14834 int OpIdx = BitOffset / OpBitWidth;
14835 V = V.getOperand(OpIdx);
14836 BitOffset %= OpBitWidth;
14837 continue;
14838 }
14839 case ISD::EXTRACT_SUBVECTOR: {
14840 // The extraction index adds to the existing offset.
14841 unsigned EltBitWidth = V.getScalarValueSizeInBits();
14842 unsigned Idx = V.getConstantOperandVal(1);
14843 unsigned BeginOffset = Idx * EltBitWidth;
14844 BitOffset += BeginOffset;
14845 V = V.getOperand(0);
14846 continue;
14847 }
14848 case ISD::INSERT_SUBVECTOR: {
14849 SDValue VOuter = V.getOperand(0), VInner = V.getOperand(1);
14850 int EltBitWidth = VOuter.getScalarValueSizeInBits();
14851 int Idx = (int)V.getConstantOperandVal(2);
14852 int NumSubElts = (int)VInner.getSimpleValueType().getVectorNumElements();
14853 int BeginOffset = Idx * EltBitWidth;
14854 int EndOffset = BeginOffset + NumSubElts * EltBitWidth;
14855 if (BeginOffset <= BitOffset && BitOffset < EndOffset) {
14856 BitOffset -= BeginOffset;
14857 V = VInner;
14858 } else {
14859 V = VOuter;
14860 }
14861 continue;
14862 }
14863 }
14864 break;
14865 }
14866 assert((BitOffset % NumEltBits) == 0 && "Illegal bit-offset");
14867 BroadcastIdx = BitOffset / NumEltBits;
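// Worked example of the offset bookkeeping above (illustrative; the types and
// index are hypothetical): broadcasting element 5 of a v8i32 gives
// BitOffset = 5 * 32 = 160. If V1 is a CONCAT_VECTORS of two v4i32 operands
// (128 bits each), OpIdx = 160 / 128 = 1 and BitOffset becomes 160 % 128 = 32,
// so the walk continues into the second operand and the final
// BroadcastIdx = 32 / 32 = 1 within that operand.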
14868
14869 // Do we need to bitcast the source to retrieve the original broadcast index?
14870 bool BitCastSrc = V.getScalarValueSizeInBits() != NumEltBits;
14871
14872 // Check if this is a broadcast of a scalar. We special case lowering
14873 // for scalars so that we can more effectively fold with loads.
14874 // If the original value has a larger element type than the shuffle, the
14875 // broadcast element is in essence truncated. Make that explicit to ease
14876 // folding.
14877 if (BitCastSrc && VT.isInteger())
14878 if (SDValue TruncBroadcast = lowerShuffleAsTruncBroadcast(
14879 DL, VT, V, BroadcastIdx, Subtarget, DAG))
14880 return TruncBroadcast;
14881
14882 // Also check the simpler case, where we can directly reuse the scalar.
14883 if (!BitCastSrc &&
14884 ((V.getOpcode() == ISD::BUILD_VECTOR && V.hasOneUse()) ||
14885 (V.getOpcode() == ISD::SCALAR_TO_VECTOR && BroadcastIdx == 0))) {
14886 V = V.getOperand(BroadcastIdx);
14887
14888 // If we can't broadcast from a register, check that the input is a load.
14889 if (!BroadcastFromReg && !isShuffleFoldableLoad(V))
14890 return SDValue();
14891 } else if (ISD::isNormalLoad(V.getNode()) &&
14892 cast<LoadSDNode>(V)->isSimple()) {
14893 // We do not check for one-use of the vector load because a broadcast load
14894 // is expected to be a win for code size, register pressure, and possibly
14895 // uops even if the original vector load is not eliminated.
14896
14897 // Reduce the vector load and shuffle to a broadcasted scalar load.
14898 LoadSDNode *Ld = cast<LoadSDNode>(V);
14899 SDValue BaseAddr = Ld->getOperand(1);
14900 MVT SVT = VT.getScalarType();
14901 unsigned Offset = BroadcastIdx * SVT.getStoreSize();
14902 assert((int)(Offset * 8) == BitOffset && "Unexpected bit-offset");
14903 SDValue NewAddr =
14904 DAG.getMemBasePlusOffset(BaseAddr, TypeSize::Fixed(Offset), DL);
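// Worked example (illustrative; the load and index are hypothetical): for a
// v4f32 shuffle that splats element 2 of a simple vector load, SVT is f32,
// Offset = 2 * 4 = 8 bytes (matching BitOffset = 64), and NewAddr points 8
// bytes past the original base, so on AVX targets the whole pattern becomes a
// single VBROADCAST_LOAD of that f32 element.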
14905
14906 // Directly form VBROADCAST_LOAD if we're using VBROADCAST opcode rather
14907 // than MOVDDUP.
14908 // FIXME: Should we add VBROADCAST_LOAD isel patterns for pre-AVX?
14909 if (Opcode == X86ISD::VBROADCAST) {
14910 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
14911 SDValue Ops[] = {Ld->getChain(), NewAddr};
14912 V = DAG.getMemIntrinsicNode(
14913 X86ISD::VBROADCAST_LOAD, DL, Tys, Ops, SVT,
14914 DAG.getMachineFunction().getMachineMemOperand(
14915 Ld->getMemOperand(), Offset, SVT.getStoreSize()));
14916 DAG.makeEquivalentMemoryOrdering(Ld, V);
14917 return DAG.getBitcast(VT, V);
14918 }
14919 assert(SVT == MVT::f64 && "Unexpected VT!");
14920 V = DAG.getLoad(SVT, DL, Ld->getChain(), NewAddr,
14921 DAG.getMachineFunction().getMachineMemOperand(
14922 Ld->getMemOperand(), Offset, SVT.getStoreSize()));
14923 DAG.makeEquivalentMemoryOrdering(Ld, V);
14924 } else if (!BroadcastFromReg) {
14925 // We can't broadcast from a vector register.
14926 return SDValue();
14927 } else if (BitOffset != 0) {
14928 // We can only broadcast from the zero-element of a vector register,
14929 // but it can be advantageous to broadcast from the zero-element of a
14930 // subvector.
14931 if (!VT.is256BitVector() && !VT.is512BitVector())
14932 return SDValue();
14933
14934 // VPERMQ/VPERMPD can perform the cross-lane shuffle directly.
14935 if (VT == MVT::v4f64 || VT == MVT::v4i64)
14936 return SDValue();
14937
14938 // Only broadcast the zero-element of a 128-bit subvector.
14939 if ((BitOffset % 128) != 0)
14940 return SDValue();
14941
14942 assert((BitOffset % V.getScalarValueSizeInBits()) == 0 &&
14943        "Unexpected bit-offset");
14944 assert((V.getValueSizeInBits() == 256 || V.getValueSizeInBits() == 512) &&
14945        "Unexpected vector size");
14946 unsigned ExtractIdx = BitOffset / V.getScalarValueSizeInBits();
14947 V = extract128BitVector(V, ExtractIdx, DAG, DL);
14948 }
14949
14950 // On AVX we can use VBROADCAST directly for scalar sources.
14951 if (Opcode == X86ISD::MOVDDUP && !V.getValueType().isVector()) {
14952 V = DAG.getBitcast(MVT::f64, V);
14953 if (Subtarget.hasAVX()) {
14954 V = DAG.getNode(X86ISD::VBROADCAST, DL, MVT::v2f64, V);
14955 return DAG.getBitcast(VT, V);
14956 }
14957 V = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64, V);
14958 }
14959
14960 // If this is a scalar, do the broadcast on this type and bitcast.
14961 if (!V.getValueType().isVector()) {
14962 assert(V.getScalarValueSizeInBits() == NumEltBits &&
14963        "Unexpected scalar size");
14964 MVT BroadcastVT = MVT::getVectorVT(V.getSimpleValueType(),
14965 VT.getVectorNumElements());
14966 return DAG.getBitcast(VT, DAG.getNode(Opcode, DL, BroadcastVT, V));
14967 }
14968
14969 // We only support broadcasting from 128-bit vectors to minimize the
14970 // number of patterns we need to deal with in isel. So extract down to
14971 // 128-bits, removing as many bitcasts as possible.
14972 if (V.getValueSizeInBits() > 128)
14973 V = extract128BitVector(peekThroughBitcasts(V), 0, DAG, DL);
14974
14975 // Otherwise cast V to a vector with the same element type as VT, but
14976 // possibly narrower than VT. Then perform the broadcast.
14977 unsigned NumSrcElts = V.getValueSizeInBits() / NumEltBits;
14978 MVT CastVT = MVT::getVectorVT(VT.getVectorElementType(), NumSrcElts);
14979 return DAG.getNode(Opcode, DL, VT, DAG.getBitcast(CastVT, V));
14980}
14981
14982// Check for whether we can use INSERTPS to perform the shuffle. We only use
14983// INSERTPS when the V1 elements are already in the correct locations
14984// because otherwise we can just always use two SHUFPS instructions which
14985// are much smaller to encode than a SHUFPS and an INSERTPS. We can also
14986// perform INSERTPS if a single V1 element is out of place and all V2
14987// elements are zeroable.
14988static bool matchShuffleAsInsertPS(SDValue &V1, SDValue &V2,
14989 unsigned &InsertPSMask,
14990 const APInt &Zeroable,
14991 ArrayRef<int> Mask, SelectionDAG &DAG) {
14992 assert(V1.getSimpleValueType().is128BitVector() && "Bad operand type!");
14993 assert(V2.getSimpleValueType().is128BitVector() && "Bad operand type!");
14994 assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
14995
14996 // Attempt to match INSERTPS with one element from VA or VB being
14997 // inserted into VA (or undef). If successful, V1, V2 and InsertPSMask
14998 // are updated.
14999 auto matchAsInsertPS = [&](SDValue VA, SDValue VB,
15000 ArrayRef<int> CandidateMask) {
15001 unsigned ZMask = 0;
15002 int VADstIndex = -1;
15003 int VBDstIndex = -1;
15004 bool VAUsedInPlace = false;
15005
15006 for (int i = 0; i < 4; ++i) {
15007 // Synthesize a zero mask from the zeroable elements (includes undefs).
15008 if (Zeroable[i]) {
15009 ZMask |= 1 << i;
15010 continue;
15011 }
15012
15013 // Flag if we use any VA inputs in place.
15014 if (i == CandidateMask[i]) {
15015 VAUsedInPlace = true;
15016 continue;
15017 }
15018
15019 // We can only insert a single non-zeroable element.
15020 if (VADstIndex >= 0 || VBDstIndex >= 0)
15021 return false;
15022
15023 if (CandidateMask[i] < 4) {
15024 // VA input out of place for insertion.
15025 VADstIndex = i;
15026 } else {
15027 // VB input for insertion.
15028 VBDstIndex = i;
15029 }
15030 }
15031
15032 // Don't bother if we have no (non-zeroable) element for insertion.
15033 if (VADstIndex < 0 && VBDstIndex < 0)
15034 return false;
15035
15036 // Determine element insertion src/dst indices. The src index is from the
15037 // start of the inserted vector, not the start of the concatenated vector.
15038 unsigned VBSrcIndex = 0;
15039 if (VADstIndex >= 0) {
15040 // If we have a VA input out of place, we use VA as the V2 element
15041 // insertion and don't use the original V2 at all.
15042 VBSrcIndex = CandidateMask[VADstIndex];
15043 VBDstIndex = VADstIndex;
15044 VB = VA;
15045 } else {
15046 VBSrcIndex = CandidateMask[VBDstIndex] - 4;
15047 }
15048
15049 // If no V1 inputs are used in place, then the result is created only from
15050 // the zero mask and the V2 insertion - so remove V1 dependency.
15051 if (!VAUsedInPlace)
15052 VA = DAG.getUNDEF(MVT::v4f32);
15053
15054 // Update V1, V2 and InsertPSMask accordingly.
15055 V1 = VA;
15056 V2 = VB;
15057
15058 // Insert the V2 element into the desired position.
15059 InsertPSMask = VBSrcIndex << 6 | VBDstIndex << 4 | ZMask;
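// Worked example of the immediate encoding above (illustrative; the indices
// are hypothetical): VBSrcIndex = 2, VBDstIndex = 1 and ZMask = 0b1000 give
// InsertPSMask = (2 << 6) | (1 << 4) | 0b1000 = 0x98, i.e. INSERTPS copies
// source element 2 into destination element 1 and zeroes destination element 3.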
15060 assert((InsertPSMask & ~0xFFu) == 0 && "Invalid mask!");
15061 return true;
15062 };
15063
15064 if (matchAsInsertPS(V1, V2, Mask))
15065 return true;
15066
15067 // Commute and try again.
15068 SmallVector<int, 4> CommutedMask(Mask);
15069 ShuffleVectorSDNode::commuteMask(CommutedMask);
15070 if (matchAsInsertPS(V2, V1, CommutedMask))
15071 return true;
15072
15073 return false;
15074}
15075
15076static SDValue lowerShuffleAsInsertPS(const SDLoc &DL, SDValue V1, SDValue V2,
15077 ArrayRef<int> Mask, const APInt &Zeroable,
15078 SelectionDAG &DAG) {
15079 assert(V1.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
15080 assert(V2.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
15081
15082 // Attempt to match the insertps pattern.
15083 unsigned InsertPSMask = 0;
15084 if (!matchShuffleAsInsertPS(V1, V2, InsertPSMask, Zeroable, Mask, DAG))
15085 return SDValue();
15086
15087 // Insert the V2 element into the desired position.
15088 return DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32, V1, V2,
15089 DAG.getTargetConstant(InsertPSMask, DL, MVT::i8));
15090}
15091
15092/// Handle lowering of 2-lane 64-bit floating point shuffles.
15093///
15094/// This is the basis function for the 2-lane 64-bit shuffles as we have full
15095/// support for floating point shuffles but not integer shuffles. These
15096/// instructions will incur a domain crossing penalty on some chips though so
15097/// it is better to avoid lowering through this for integer vectors where
15098/// possible.
15099static SDValue lowerV2F64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
15100 const APInt &Zeroable, SDValue V1, SDValue V2,
15101 const X86Subtarget &Subtarget,
15102 SelectionDAG &DAG) {
15103 assert(V1.getSimpleValueType() == MVT::v2f64 && "Bad operand type!");
15104 assert(V2.getSimpleValueType() == MVT::v2f64 && "Bad operand type!");
15105 assert(Mask.size() == 2 && "Unexpected mask size for v2 shuffle!");
15106
15107 if (V2.isUndef()) {
15108 // Check for being able to broadcast a single element.
15109 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v2f64, V1, V2,
15110 Mask, Subtarget, DAG))
15111 return Broadcast;
15112
15113 // Straight shuffle of a single input vector. Simulate this by using the
15114 // single input as both of the "inputs" to this instruction.
15115 unsigned SHUFPDMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1);
15116
15117 if (Subtarget.hasAVX()) {
15118 // If we have AVX, we can use VPERMILPS which will allow folding a load
15119 // into the shuffle.
15120 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v2f64, V1,
15121 DAG.getTargetConstant(SHUFPDMask, DL, MVT::i8));
15122 }
15123
15124 return DAG.getNode(
15125 X86ISD::SHUFP, DL, MVT::v2f64,
15126 Mask[0] == SM_SentinelUndef ? DAG.getUNDEF(MVT::v2f64) : V1,
15127 Mask[1] == SM_SentinelUndef ? DAG.getUNDEF(MVT::v2f64) : V1,
15128 DAG.getTargetConstant(SHUFPDMask, DL, MVT::i8));
15129 }
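// Worked example for the single-input path above (illustrative; the mask is
// hypothetical): Mask = {1, 0} gives SHUFPDMask = 1 | (0 << 1) = 1, so with
// AVX this becomes VPERMILPD with immediate 1, and without AVX it becomes
// SHUFPD of V1 with itself using the same immediate.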
15130 assert(Mask[0] >= 0 && "No undef lanes in multi-input v2 shuffles!");
15131 assert(Mask[1] >= 0 && "No undef lanes in multi-input v2 shuffles!");
15132 assert(Mask[0] < 2 && "We sort V1 to be the first input.");
15133 assert(Mask[1] >= 2 && "We sort V2 to be the second input.");
15134
15135 if (Subtarget.hasAVX2())
15136 if (SDValue Extract = lowerShuffleOfExtractsAsVperm(DL, V1, V2, Mask, DAG))
15137 return Extract;
15138
15139 // When loading a scalar and then shuffling it into a vector we can often do
15140 // the insertion cheaply.
15141 if (SDValue Insertion = lowerShuffleAsElementInsertion(
15142 DL, MVT::v2f64, V1, V2, Mask, Zeroable, Subtarget, DAG))
15143 return Insertion;
15144 // Try inverting the insertion since for v2 masks it is easy to do and we
15145 // can't reliably sort the mask one way or the other.
15146 int InverseMask[2] = {Mask[0] < 0 ? -1 : (Mask[0] ^ 2),
15147 Mask[1] < 0 ? -1 : (Mask[1] ^ 2)};
15148 if (SDValue Insertion = lowerShuffleAsElementInsertion(
15149 DL, MVT::v2f64, V2, V1, InverseMask, Zeroable, Subtarget, DAG))
15150 return Insertion;
15151
15152 // Try to use one of the special instruction patterns to handle two common
15153 // blend patterns if a zero-blend above didn't work.
15154 if (isShuffleEquivalent(Mask, {0, 3}, V1, V2) ||
15155 isShuffleEquivalent(Mask, {1, 3}, V1, V2))
15156 if (SDValue V1S = getScalarValueForVectorElement(V1, Mask[0], DAG))
15157 // We can either use a special instruction to load over the low double or
15158 // to move just the low double.
15159 return DAG.getNode(
15160 X86ISD::MOVSD, DL, MVT::v2f64, V2,
15161 DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64, V1S));
15162
15163 if (Subtarget.hasSSE41())
15164 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v2f64, V1, V2, Mask,
15165 Zeroable, Subtarget, DAG))
15166 return Blend;
15167
15168 // Use dedicated unpack instructions for masks that match their pattern.
15169 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v2f64, Mask, V1, V2, DAG))
15170 return V;
15171
15172 unsigned SHUFPDMask = (Mask[0] == 1) | (((Mask[1] - 2) == 1) << 1);
15173 return DAG.getNode(X86ISD::SHUFP, DL, MVT::v2f64, V1, V2,
15174 DAG.getTargetConstant(SHUFPDMask, DL, MVT::i8));
15175}
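// Worked example for the final two-input SHUFPD above (illustrative; the mask
// is hypothetical): Mask = {1, 3} gives SHUFPDMask = 1 | (1 << 1) = 3, i.e.
// SHUFPD selects element 1 of V1 and element 1 of V2.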
15176
15177/// Handle lowering of 2-lane 64-bit integer shuffles.
15178///
15179/// Tries to lower a 2-lane 64-bit shuffle using shuffle operations provided by
15180/// the integer unit to minimize domain crossing penalties. However, for blends
15181/// it falls back to the floating point shuffle operation with appropriate bit
15182/// casting.
15183static SDValue lowerV2I64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
15184 const APInt &Zeroable, SDValue V1, SDValue V2,
15185 const X86Subtarget &Subtarget,
15186 SelectionDAG &DAG) {
15187 assert(V1.getSimpleValueType() == MVT::v2i64 && "Bad operand type!");
15188 assert(V2.getSimpleValueType() == MVT::v2i64 && "Bad operand type!");
15189 assert(Mask.size() == 2 && "Unexpected mask size for v2 shuffle!");
15190
15191 if (V2.isUndef()) {
15192 // Check for being able to broadcast a single element.
15193 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v2i64, V1, V2,
15194 Mask, Subtarget, DAG))
15195 return Broadcast;
15196
15197 // Straight shuffle of a single input vector. For everything from SSE2
15198 // onward this has a single fast instruction with no scary immediates.
15199 // We have to map the mask as it is actually a v4i32 shuffle instruction.
15200 V1 = DAG.getBitcast(MVT::v4i32, V1);
15201 int WidenedMask[4] = {Mask[0] < 0 ? -1 : (Mask[0] * 2),
15202 Mask[0] < 0 ? -1 : ((Mask[0] * 2) + 1),
15203 Mask[1] < 0 ? -1 : (Mask[1] * 2),
15204 Mask[1] < 0 ? -1 : ((Mask[1] * 2) + 1)};
15205 return DAG.getBitcast(
15206 MVT::v2i64,
15207 DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, V1,
15208 getV4X86ShuffleImm8ForMask(WidenedMask, DL, DAG)));
15209 }
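// Worked example for the widened mask above (illustrative; the mask is
// hypothetical): a v2i64 Mask = {1, 0} widens to the v4i32 mask {2, 3, 0, 1},
// which getV4X86ShuffleImm8ForMask encodes as PSHUFD immediate 0x4E.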
15210 assert(Mask[0] != -1 && "No undef lanes in multi-input v2 shuffles!");
15211 assert(Mask[1] != -1 && "No undef lanes in multi-input v2 shuffles!");
15212 assert(Mask[0] < 2 && "We sort V1 to be the first input.");
15213 assert(Mask[1] >= 2 && "We sort V2 to be the second input.");
15214
15215 if (Subtarget.hasAVX2())
15216 if (SDValue Extract = lowerShuffleOfExtractsAsVperm(DL, V1, V2, Mask, DAG))
15217 return Extract;
15218
15219 // Try to use shift instructions.
15220 if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v2i64, V1, V2, Mask,
15221 Zeroable, Subtarget, DAG))
15222 return Shift;
15223
15224 // When loading a scalar and then shuffling it into a vector we can often do
15225 // the insertion cheaply.
15226 if (SDValue Insertion = lowerShuffleAsElementInsertion(
15227 DL, MVT::v2i64, V1, V2, Mask, Zeroable, Subtarget, DAG))
15228 return Insertion;
15229 // Try inverting the insertion since for v2 masks it is easy to do and we
15230 // can't reliably sort the mask one way or the other.
15231 int InverseMask[2] = {Mask[0] ^ 2, Mask[1] ^ 2};
15232 if (SDValue Insertion = lowerShuffleAsElementInsertion(
15233 DL, MVT::v2i64, V2, V1, InverseMask, Zeroable, Subtarget, DAG))
15234 return Insertion;
15235
15236 // We have different paths for blend lowering, but they all must use the
15237 // *exact* same predicate.
15238 bool IsBlendSupported = Subtarget.hasSSE41();
15239 if (IsBlendSupported)
15240 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v2i64, V1, V2, Mask,
15241 Zeroable, Subtarget, DAG))
15242 return Blend;
15243
15244 // Use dedicated unpack instructions for masks that match their pattern.
15245 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v2i64, Mask, V1, V2, DAG))
15246 return V;
15247
15248 // Try to use byte rotation instructions.
15249 // It's more profitable for pre-SSSE3 to use shuffles/unpacks.
15250 if (Subtarget.hasSSSE3()) {
15251 if (Subtarget.hasVLX())
15252 if (SDValue Rotate = lowerShuffleAsVALIGN(DL, MVT::v2i64, V1, V2, Mask,
15253 Subtarget, DAG))
15254 return Rotate;
15255
15256 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v2i64, V1, V2, Mask,
15257 Subtarget, DAG))
15258 return Rotate;
15259 }
15260
15261 // If we have direct support for blends, we should lower by decomposing into
15262 // a permute. That will be faster than the domain cross.
15263 if (IsBlendSupported)
15264 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v2i64, V1, V2, Mask,
15265 Subtarget, DAG);
15266
15267 // We implement this with SHUFPD which is pretty lame because it will likely
15268 // incur 2 cycles of stall for integer vectors on Nehalem and older chips.
15269 // However, all the alternatives are still more cycles and newer chips don't
15270 // have this problem. It would be really nice if x86 had better shuffles here.
15271 V1 = DAG.getBitcast(MVT::v2f64, V1);
15272 V2 = DAG.getBitcast(MVT::v2f64, V2);
15273 return DAG.getBitcast(MVT::v2i64,
15274 DAG.getVectorShuffle(MVT::v2f64, DL, V1, V2, Mask));
15275}
15276
15277/// Lower a vector shuffle using the SHUFPS instruction.
15278///
15279/// This is a helper routine dedicated to lowering vector shuffles using SHUFPS.
15280/// It makes no assumptions about whether this is the *best* lowering, it simply
15281/// uses it.
15282static SDValue lowerShuffleWithSHUFPS(const SDLoc &DL, MVT VT,
15283 ArrayRef<int> Mask, SDValue V1,
15284 SDValue V2, SelectionDAG &DAG) {
15285 SDValue LowV = V1, HighV = V2;
15286 SmallVector<int, 4> NewMask(Mask);
15287 int NumV2Elements = count_if(Mask, [](int M) { return M >= 4; });
15288
15289 if (NumV2Elements == 1) {
15290 int V2Index = find_if(Mask, [](int M) { return M >= 4; }) - Mask.begin();
15291
15292 // Compute the index adjacent to V2Index and in the same half by toggling
15293 // the low bit.
15294 int V2AdjIndex = V2Index ^ 1;
15295
15296 if (Mask[V2AdjIndex] < 0) {
15297 // Handles all the cases where we have a single V2 element and an undef.
15298 // This will only ever happen in the high lanes because we commute the
15299 // vector otherwise.
15300 if (V2Index < 2)
15301 std::swap(LowV, HighV);
15302 NewMask[V2Index] -= 4;
15303 } else {
15304 // Handle the case where the V2 element ends up adjacent to a V1 element.
15305 // To make this work, blend them together as the first step.
15306 int V1Index = V2AdjIndex;
15307 int BlendMask[4] = {Mask[V2Index] - 4, 0, Mask[V1Index], 0};
15308 V2 = DAG.getNode(X86ISD::SHUFP, DL, VT, V2, V1,
15309 getV4X86ShuffleImm8ForMask(BlendMask, DL, DAG));
15310
15311 // Now proceed to reconstruct the final blend as we have the necessary
15312 // high or low half formed.
15313 if (V2Index < 2) {
15314 LowV = V2;
15315 HighV = V1;
15316 } else {
15317 HighV = V2;
15318 }
15319 NewMask[V1Index] = 2; // We put the V1 element in V2[2].
15320 NewMask[V2Index] = 0; // We shifted the V2 element into V2[0].
15321 }
15322 } else if (NumV2Elements == 2) {
15323 if (Mask[0] < 4 && Mask[1] < 4) {
15324 // Handle the easy case where we have V1 in the low lanes and V2 in the
15325 // high lanes.
15326 NewMask[2] -= 4;
15327 NewMask[3] -= 4;
15328 } else if (Mask[2] < 4 && Mask[3] < 4) {
15329 // We also handle the reversed case because this utility may get called
15330 // when we detect a SHUFPS pattern but can't easily commute the shuffle to
15331 // arrange things in the right direction.
15332 NewMask[0] -= 4;
15333 NewMask[1] -= 4;
15334 HighV = V1;
15335 LowV = V2;
15336 } else {
15337 // We have a mixture of V1 and V2 in both low and high lanes. Rather than
15338 // trying to place elements directly, just blend them and set up the final
15339 // shuffle to place them.
15340
15341 // The first two blend mask elements are for V1, the second two are for
15342 // V2.
15343 int BlendMask[4] = {Mask[0] < 4 ? Mask[0] : Mask[1],
15344 Mask[2] < 4 ? Mask[2] : Mask[3],
15345 (Mask[0] >= 4 ? Mask[0] : Mask[1]) - 4,
15346 (Mask[2] >= 4 ? Mask[2] : Mask[3]) - 4};
15347 V1 = DAG.getNode(X86ISD::SHUFP, DL, VT, V1, V2,
15348 getV4X86ShuffleImm8ForMask(BlendMask, DL, DAG));
15349
15350 // Now we do a normal shuffle of V1 by giving V1 as both operands to
15351 // a blend.
15352 LowV = HighV = V1;
15353 NewMask[0] = Mask[0] < 4 ? 0 : 2;
15354 NewMask[1] = Mask[0] < 4 ? 2 : 0;
15355 NewMask[2] = Mask[2] < 4 ? 1 : 3;
15356 NewMask[3] = Mask[2] < 4 ? 3 : 1;
15357 }
15358 } else if (NumV2Elements == 3) {
15359 // Ideally canonicalizeShuffleMaskWithCommute should have caught this, but
15360 // we can get here due to other paths (e.g repeated mask matching) that we
15361 // don't want to do another round of lowerVECTOR_SHUFFLE.
15362 ShuffleVectorSDNode::commuteMask(NewMask);
15363 return lowerShuffleWithSHUFPS(DL, VT, NewMask, V2, V1, DAG);
15364 }
15365 return DAG.getNode(X86ISD::SHUFP, DL, VT, LowV, HighV,
15366 getV4X86ShuffleImm8ForMask(NewMask, DL, DAG));
15367}
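// Worked example for the mixed NumV2Elements == 2 case above (illustrative;
// the mask is hypothetical): Mask = {0, 4, 1, 5} first builds
// BlendMask = {0, 1, 0, 1}, producing [V1[0], V1[1], V2[0], V2[1]], and the
// final NewMask = {0, 2, 1, 3} on that temporary yields
// [V1[0], V2[0], V1[1], V2[1]] as required. (In practice such a mask is
// usually caught earlier by the UNPCK lowering.)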
15368
15369/// Lower 4-lane 32-bit floating point shuffles.
15370///
15371/// Uses instructions exclusively from the floating point unit to minimize
15372/// domain crossing penalties, as these are sufficient to implement all v4f32
15373/// shuffles.
15374static SDValue lowerV4F32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
15375 const APInt &Zeroable, SDValue V1, SDValue V2,
15376 const X86Subtarget &Subtarget,
15377 SelectionDAG &DAG) {
15378 assert(V1.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
15379 assert(V2.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
15380 assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
15381
15382 int NumV2Elements = count_if(Mask, [](int M) { return M >= 4; });
15383
15384 if (NumV2Elements == 0) {
15385 // Check for being able to broadcast a single element.
15386 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v4f32, V1, V2,
15387 Mask, Subtarget, DAG))
15388 return Broadcast;
15389
15390 // Use even/odd duplicate instructions for masks that match their pattern.
15391 if (Subtarget.hasSSE3()) {
15392 if (isShuffleEquivalent(Mask, {0, 0, 2, 2}, V1, V2))
15393 return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v4f32, V1);
15394 if (isShuffleEquivalent(Mask, {1, 1, 3, 3}, V1, V2))
15395 return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v4f32, V1);
15396 }
15397
15398 if (Subtarget.hasAVX()) {
15399 // If we have AVX, we can use VPERMILPS which will allow folding a load
15400 // into the shuffle.
15401 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v4f32, V1,
15402 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
15403 }
15404
15405 // Use MOVLHPS/MOVHLPS to simulate unary shuffles. These are only valid
15406 // in SSE1 because otherwise they are widened to v2f64 and never get here.
15407 if (!Subtarget.hasSSE2()) {
15408 if (isShuffleEquivalent(Mask, {0, 1, 0, 1}, V1, V2))
15409 return DAG.getNode(X86ISD::MOVLHPS, DL, MVT::v4f32, V1, V1);
15410 if (isShuffleEquivalent(Mask, {2, 3, 2, 3}, V1, V2))
15411 return DAG.getNode(X86ISD::MOVHLPS, DL, MVT::v4f32, V1, V1);
15412 }
15413
15414 // Otherwise, use a straight shuffle of a single input vector. We pass the
15415 // input vector to both operands to simulate this with a SHUFPS.
15416 return DAG.getNode(X86ISD::SHUFP, DL, MVT::v4f32, V1, V1,
15417 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
15418 }
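// Worked example of the immediate encoding used above (illustrative; the mask
// is hypothetical): getV4X86ShuffleImm8ForMask packs each index into two bits,
// least-significant element first, so Mask = {0, 0, 2, 2} encodes as 0xA0 and
// the unary SHUFPS/VPERMILPS duplicates the even elements just like MOVSLDUP.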
15419
15420 if (Subtarget.hasAVX2())
15421 if (SDValue Extract = lowerShuffleOfExtractsAsVperm(DL, V1, V2, Mask, DAG))
15422 return Extract;
15423
15424 // There are special ways we can lower some single-element blends. However, we
15425 // have custom ways we can lower more complex single-element blends below that
15426 // we defer to if both this and BLENDPS fail to match, so restrict this to
15427 // when the V2 input is targeting element 0 of the mask -- that is the fast
15428 // case here.
15429 if (NumV2Elements == 1 && Mask[0] >= 4)
15430 if (SDValue V = lowerShuffleAsElementInsertion(
15431 DL, MVT::v4f32, V1, V2, Mask, Zeroable, Subtarget, DAG))
15432 return V;
15433
15434 if (Subtarget.hasSSE41()) {
15435 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v4f32, V1, V2, Mask,
15436 Zeroable, Subtarget, DAG))
15437 return Blend;
15438
15439 // Use INSERTPS if we can complete the shuffle efficiently.
15440 if (SDValue V = lowerShuffleAsInsertPS(DL, V1, V2, Mask, Zeroable, DAG))
15441 return V;
15442
15443 if (!isSingleSHUFPSMask(Mask))
15444 if (SDValue BlendPerm = lowerShuffleAsBlendAndPermute(DL, MVT::v4f32, V1,
15445 V2, Mask, DAG))
15446 return BlendPerm;
15447 }
15448
15449 // Use low/high mov instructions. These are only valid in SSE1 because
15450 // otherwise they are widened to v2f64 and never get here.
15451 if (!Subtarget.hasSSE2()) {
15452 if (isShuffleEquivalent(Mask, {0, 1, 4, 5}, V1, V2))
15453 return DAG.getNode(X86ISD::MOVLHPS, DL, MVT::v4f32, V1, V2);
15454 if (isShuffleEquivalent(Mask, {2, 3, 6, 7}, V1, V2))
15455 return DAG.getNode(X86ISD::MOVHLPS, DL, MVT::v4f32, V2, V1);
15456 }
15457
15458 // Use dedicated unpack instructions for masks that match their pattern.
15459 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v4f32, Mask, V1, V2, DAG))
15460 return V;
15461
15462 // Otherwise fall back to a SHUFPS lowering strategy.
15463 return lowerShuffleWithSHUFPS(DL, MVT::v4f32, Mask, V1, V2, DAG);
15464}
15465
15466/// Lower 4-lane i32 vector shuffles.
15467///
15468/// We try to handle these with integer-domain shuffles where we can, but for
15469/// blends we use the floating point domain blend instructions.
15470static SDValue lowerV4I32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
15471 const APInt &Zeroable, SDValue V1, SDValue V2,
15472 const X86Subtarget &Subtarget,
15473 SelectionDAG &DAG) {
15474 assert(V1.getSimpleValueType() == MVT::v4i32 && "Bad operand type!");
15475 assert(V2.getSimpleValueType() == MVT::v4i32 && "Bad operand type!");
15476 assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
15477
15478 // Whenever we can lower this as a zext, that instruction is strictly faster
15479 // than any alternative. It also allows us to fold memory operands into the
15480 // shuffle in many cases.
15481 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(DL, MVT::v4i32, V1, V2, Mask,
15482 Zeroable, Subtarget, DAG))
15483 return ZExt;
15484
15485 int NumV2Elements = count_if(Mask, [](int M) { return M >= 4; });
15486
15487 if (NumV2Elements == 0) {
15488 // Try to use broadcast unless the mask only has one non-undef element.
15489 if (count_if(Mask, [](int M) { return M >= 0 && M < 4; }) > 1) {
15490 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v4i32, V1, V2,
15491 Mask, Subtarget, DAG))
15492 return Broadcast;
15493 }
15494
15495 // Straight shuffle of a single input vector. For everything from SSE2
15496 // onward this has a single fast instruction with no scary immediates.
15497 // We coerce the shuffle pattern to be compatible with UNPCK instructions
15498 // but we aren't actually going to use the UNPCK instruction because doing
15499 // so prevents folding a load into this instruction or making a copy.
15500 const int UnpackLoMask[] = {0, 0, 1, 1};
15501 const int UnpackHiMask[] = {2, 2, 3, 3};
15502 if (isShuffleEquivalent(Mask, {0, 0, 1, 1}, V1, V2))
15503 Mask = UnpackLoMask;
15504 else if (isShuffleEquivalent(Mask, {2, 2, 3, 3}, V1, V2))
15505 Mask = UnpackHiMask;
15506
15507 return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, V1,
15508 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
15509 }
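// Worked example for the coercion above (illustrative; the mask is
// hypothetical): a Mask equivalent to {0, 0, 1, 1} (undefs allowed) is
// canonicalized to UnpackLoMask and emitted as PSHUFD with immediate 0x50
// rather than PUNPCKLDQ, which keeps the option of folding a load.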
15510
15511 if (Subtarget.hasAVX2())
15512 if (SDValue Extract = lowerShuffleOfExtractsAsVperm(DL, V1, V2, Mask, DAG))
15513 return Extract;
15514
15515 // Try to use shift instructions.
15516 if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v4i32, V1, V2, Mask,
15517 Zeroable, Subtarget, DAG))
15518 return Shift;
15519
15520 // There are special ways we can lower some single-element blends.
15521 if (NumV2Elements == 1)
15522 if (SDValue V = lowerShuffleAsElementInsertion(
15523 DL, MVT::v4i32, V1, V2, Mask, Zeroable, Subtarget, DAG))
15524 return V;
15525
15526 // We have different paths for blend lowering, but they all must use the
15527 // *exact* same predicate.
15528 bool IsBlendSupported = Subtarget.hasSSE41();
15529 if (IsBlendSupported)
15530 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v4i32, V1, V2, Mask,
15531 Zeroable, Subtarget, DAG))
15532 return Blend;
15533
15534 if (SDValue Masked = lowerShuffleAsBitMask(DL, MVT::v4i32, V1, V2, Mask,
15535 Zeroable, Subtarget, DAG))
15536 return Masked;
15537
15538 // Use dedicated unpack instructions for masks that match their pattern.
15539 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v4i32, Mask, V1, V2, DAG))
15540 return V;
15541
15542 // Try to use byte rotation instructions.
15543 // It's more profitable for pre-SSSE3 to use shuffles/unpacks.
15544 if (Subtarget.hasSSSE3()) {
15545 if (Subtarget.hasVLX())
15546 if (SDValue Rotate = lowerShuffleAsVALIGN(DL, MVT::v4i32, V1, V2, Mask,
15547 Subtarget, DAG))
15548 return Rotate;
15549
15550 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v4i32, V1, V2, Mask,
15551 Subtarget, DAG))
15552 return Rotate;
15553 }
15554
15555 // Assume that a single SHUFPS is faster than an alternative sequence of
15556 // multiple instructions (even if the CPU has a domain penalty).
15557 // If some CPU is harmed by the domain switch, we can fix it in a later pass.
15558 if (!isSingleSHUFPSMask(Mask)) {
15559 // If we have direct support for blends, we should lower by decomposing into
15560 // a permute. That will be faster than the domain cross.
15561 if (IsBlendSupported)
15562 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v4i32, V1, V2, Mask,
15563 Subtarget, DAG);
15564
15565 // Try to lower by permuting the inputs into an unpack instruction.
15566 if (SDValue Unpack = lowerShuffleAsPermuteAndUnpack(DL, MVT::v4i32, V1, V2,
15567 Mask, Subtarget, DAG))
15568 return Unpack;
15569 }
15570
15571 // We implement this with SHUFPS because it can blend from two vectors.
15572 // Because we're going to eventually use SHUFPS, we use SHUFPS even to build
15573 // up the inputs, bypassing domain shift penalties that we would incur if we
15574 // directly used PSHUFD on Nehalem and older. For newer chips, this isn't
15575 // relevant.
15576 SDValue CastV1 = DAG.getBitcast(MVT::v4f32, V1);
15577 SDValue CastV2 = DAG.getBitcast(MVT::v4f32, V2);
15578 SDValue ShufPS = DAG.getVectorShuffle(MVT::v4f32, DL, CastV1, CastV2, Mask);
15579 return DAG.getBitcast(MVT::v4i32, ShufPS);
15580}
15581
15582/// Lowering of single-input v8i16 shuffles is the cornerstone of SSE2
15583/// shuffle lowering, and the most complex part.
15584///
15585/// The lowering strategy is to try to form pairs of input lanes which are
15586/// targeted at the same half of the final vector, and then use a dword shuffle
15587/// to place them onto the right half, and finally unpack the paired lanes into
15588/// their final position.
15589///
15590/// The exact breakdown of how to form these dword pairs and align them on the
15591/// correct sides is really tricky. See the comments within the function for
15592/// more of the details.
15593///
15594/// This code also handles repeated 128-bit lanes of v8i16 shuffles, but each
15595/// lane must shuffle the *exact* same way. In fact, you must pass a v8 Mask to
15596/// this routine for it to work correctly. To shuffle a 256-bit or 512-bit i16
15597/// vector, form the analogous 128-bit 8-element Mask.
15598static SDValue lowerV8I16GeneralSingleInputShuffle(
15599 const SDLoc &DL, MVT VT, SDValue V, MutableArrayRef<int> Mask,
15600 const X86Subtarget &Subtarget, SelectionDAG &DAG) {
15601 assert(VT.getVectorElementType() == MVT::i16 && "Bad input type!");
15602 MVT PSHUFDVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() / 2);
15603
15604 assert(Mask.size() == 8 && "Shuffle mask length doesn't match!");
15605 MutableArrayRef<int> LoMask = Mask.slice(0, 4);
15606 MutableArrayRef<int> HiMask = Mask.slice(4, 4);
15607
15608 // Attempt to directly match PSHUFLW or PSHUFHW.
15609 if (isUndefOrInRange(LoMask, 0, 4) &&
15610 isSequentialOrUndefInRange(HiMask, 0, 4, 4)) {
15611 return DAG.getNode(X86ISD::PSHUFLW, DL, VT, V,
15612 getV4X86ShuffleImm8ForMask(LoMask, DL, DAG));
15613 }
15614 if (isUndefOrInRange(HiMask, 4, 8) &&
15615 isSequentialOrUndefInRange(LoMask, 0, 4, 0)) {
15616 for (int i = 0; i != 4; ++i)
15617 HiMask[i] = (HiMask[i] < 0 ? HiMask[i] : (HiMask[i] - 4));
15618 return DAG.getNode(X86ISD::PSHUFHW, DL, VT, V,
15619 getV4X86ShuffleImm8ForMask(HiMask, DL, DAG));
15620 }
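// Worked examples for the direct matches above (illustrative; the masks are
// hypothetical): Mask = {2, 0, 3, 1, 4, 5, 6, 7} matches the PSHUFLW path
// directly, while Mask = {0, 1, 2, 3, 6, 4, 7, 5} takes the PSHUFHW path with
// HiMask rebased to {2, 0, 3, 1} before the immediate is formed.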
15621
15622 SmallVector<int, 4> LoInputs;
15623 copy_if(LoMask, std::back_inserter(LoInputs), [](int M) { return M >= 0; });
15624 array_pod_sort(LoInputs.begin(), LoInputs.end());
15625 LoInputs.erase(std::unique(LoInputs.begin(), LoInputs.end()), LoInputs.end());
15626 SmallVector<int, 4> HiInputs;
15627 copy_if(HiMask, std::back_inserter(HiInputs), [](int M) { return M >= 0; });
15628 array_pod_sort(HiInputs.begin(), HiInputs.end());
15629 HiInputs.erase(std::unique(HiInputs.begin(), HiInputs.end()), HiInputs.end());
15630 int NumLToL = llvm::lower_bound(LoInputs, 4) - LoInputs.begin();
15631 int NumHToL = LoInputs.size() - NumLToL;
15632 int NumLToH = llvm::lower_bound(HiInputs, 4) - HiInputs.begin();
15633 int NumHToH = HiInputs.size() - NumLToH;
15634 MutableArrayRef<int> LToLInputs(LoInputs.data(), NumLToL);
15635 MutableArrayRef<int> LToHInputs(HiInputs.data(), NumLToH);
15636 MutableArrayRef<int> HToLInputs(LoInputs.data() + NumLToL, NumHToL);
15637 MutableArrayRef<int> HToHInputs(HiInputs.data() + NumLToH, NumHToH);
15638
15639 // If we are shuffling values from one half - check how many different DWORD
15640 // pairs we need to create. If only 1 or 2 then we can perform this as a
15641 // PSHUFLW/PSHUFHW + PSHUFD instead of the PSHUFD+PSHUFLW+PSHUFHW chain below.
15642 auto ShuffleDWordPairs = [&](ArrayRef<int> PSHUFHalfMask,
15643 ArrayRef<int> PSHUFDMask, unsigned ShufWOp) {
15644 V = DAG.getNode(ShufWOp, DL, VT, V,
15645 getV4X86ShuffleImm8ForMask(PSHUFHalfMask, DL, DAG));
15646 V = DAG.getBitcast(PSHUFDVT, V);
15647 V = DAG.getNode(X86ISD::PSHUFD, DL, PSHUFDVT, V,
15648 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG));
15649 return DAG.getBitcast(VT, V);
15650 };
15651
15652 if ((NumHToL + NumHToH) == 0 || (NumLToL + NumLToH) == 0) {
15653 int PSHUFDMask[4] = { -1, -1, -1, -1 };
15654 SmallVector<std::pair<int, int>, 4> DWordPairs;
15655 int DOffset = ((NumHToL + NumHToH) == 0 ? 0 : 2);
15656
15657 // Collect the different DWORD pairs.
15658 for (int DWord = 0; DWord != 4; ++DWord) {
15659 int M0 = Mask[2 * DWord + 0];
15660 int M1 = Mask[2 * DWord + 1];
15661 M0 = (M0 >= 0 ? M0 % 4 : M0);
15662 M1 = (M1 >= 0 ? M1 % 4 : M1);
15663 if (M0 < 0 && M1 < 0)
15664 continue;
15665
15666 bool Match = false;
15667 for (int j = 0, e = DWordPairs.size(); j < e; ++j) {
15668 auto &DWordPair = DWordPairs[j];
15669 if ((M0 < 0 || isUndefOrEqual(DWordPair.first, M0)) &&
15670 (M1 < 0 || isUndefOrEqual(DWordPair.second, M1))) {
15671 DWordPair.first = (M0 >= 0 ? M0 : DWordPair.first);
15672 DWordPair.second = (M1 >= 0 ? M1 : DWordPair.second);
15673 PSHUFDMask[DWord] = DOffset + j;
15674 Match = true;
15675 break;
15676 }
15677 }
15678 if (!Match) {
15679 PSHUFDMask[DWord] = DOffset + DWordPairs.size();
15680 DWordPairs.push_back(std::make_pair(M0, M1));
15681 }
15682 }
15683
15684 if (DWordPairs.size() <= 2) {
15685 DWordPairs.resize(2, std::make_pair(-1, -1));
15686 int PSHUFHalfMask[4] = {DWordPairs[0].first, DWordPairs[0].second,
15687 DWordPairs[1].first, DWordPairs[1].second};
15688 if ((NumHToL + NumHToH) == 0)
15689 return ShuffleDWordPairs(PSHUFHalfMask, PSHUFDMask, X86ISD::PSHUFLW);
15690 if ((NumLToL + NumLToH) == 0)
15691 return ShuffleDWordPairs(PSHUFHalfMask, PSHUFDMask, X86ISD::PSHUFHW);
15692 }
15693 }
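// Worked example for the DWORD-pair path above (illustrative; the mask is
// hypothetical): Mask = {0, 1, 0, 1, 2, 3, 2, 3} uses only the low half, forms
// the two pairs (0,1) and (2,3), and is built as an (identity) PSHUFLW with
// half-mask {0, 1, 2, 3} followed by a PSHUFD with mask {0, 0, 1, 1}.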
15694
15695 // Simplify the 1-into-3 and 3-into-1 cases with a single pshufd. For all
15696 // such inputs we can swap two of the dwords across the half mark and end up
15697 // with <=2 inputs to each half in each half. Once there, we can fall through
15698 // to the generic code below. For example:
15699 //
15700 // Input: [a, b, c, d, e, f, g, h] -PSHUFD[0,2,1,3]-> [a, b, e, f, c, d, g, h]
15701 // Mask: [0, 1, 2, 7, 4, 5, 6, 3] -----------------> [0, 1, 4, 7, 2, 3, 6, 5]
15702 //
15703 // However in some very rare cases we have a 1-into-3 or 3-into-1 on one half
15704 // and an existing 2-into-2 on the other half. In this case we may have to
15705 // pre-shuffle the 2-into-2 half to avoid turning it into a 3-into-1 or
15706 // 1-into-3 which could cause us to cycle endlessly fixing each side in turn.
15707 // Fortunately, we don't have to handle anything but a 2-into-2 pattern
15708 // because any other situation (including a 3-into-1 or 1-into-3 in the other
15709 // half than the one we target for fixing) will be fixed when we re-enter this
15710 // path. We will also combine away any sequence of PSHUFD instructions that
15711 // result into a single instruction. Here is an example of the tricky case:
15712 //
15713 // Input: [a, b, c, d, e, f, g, h] -PSHUFD[0,2,1,3]-> [a, b, e, f, c, d, g, h]
15714 // Mask: [3, 7, 1, 0, 2, 7, 3, 5] -THIS-IS-BAD!!!!-> [5, 7, 1, 0, 4, 7, 5, 3]
15715 //
15716 // This now has a 1-into-3 in the high half! Instead, we do two shuffles:
15717 //
15718 // Input: [a, b, c, d, e, f, g, h] PSHUFHW[0,2,1,3]-> [a, b, c, d, e, g, f, h]
15719 // Mask: [3, 7, 1, 0, 2, 7, 3, 5] -----------------> [3, 7, 1, 0, 2, 7, 3, 6]
15720 //
15721 // Input: [a, b, c, d, e, g, f, h] -PSHUFD[0,2,1,3]-> [a, b, e, g, c, d, f, h]
15722 // Mask: [3, 7, 1, 0, 2, 7, 3, 6] -----------------> [5, 7, 1, 0, 4, 7, 5, 6]
15723 //
15724 // The result is fine to be handled by the generic logic.
15725 auto balanceSides = [&](ArrayRef<int> AToAInputs, ArrayRef<int> BToAInputs,
15726 ArrayRef<int> BToBInputs, ArrayRef<int> AToBInputs,
15727 int AOffset, int BOffset) {
15728 assert((AToAInputs.size() == 3 || AToAInputs.size() == 1) &&
15729        "Must call this with A having 3 or 1 inputs from the A half.");
15730 assert((BToAInputs.size() == 1 || BToAInputs.size() == 3) &&
15731        "Must call this with B having 1 or 3 inputs from the B half.");
15732 assert(AToAInputs.size() + BToAInputs.size() == 4 &&
15733        "Must call this with either 3:1 or 1:3 inputs (summing to 4).");
15734
15735 bool ThreeAInputs = AToAInputs.size() == 3;
15736
15737 // Compute the index of dword with only one word among the three inputs in
15738 // a half by taking the sum of the half with three inputs and subtracting
15739 // the sum of the actual three inputs. The difference is the remaining
15740 // slot.
15741 int ADWord = 0, BDWord = 0;
15742 int &TripleDWord = ThreeAInputs ? ADWord : BDWord;
15743 int &OneInputDWord = ThreeAInputs ? BDWord : ADWord;
15744 int TripleInputOffset = ThreeAInputs ? AOffset : BOffset;
15745 ArrayRef<int> TripleInputs = ThreeAInputs ? AToAInputs : BToAInputs;
15746 int OneInput = ThreeAInputs ? BToAInputs[0] : AToAInputs[0];
15747 int TripleInputSum = 0 + 1 + 2 + 3 + (4 * TripleInputOffset);
15748 int TripleNonInputIdx =
15749 TripleInputSum - std::accumulate(TripleInputs.begin(), TripleInputs.end(), 0);
15750 TripleDWord = TripleNonInputIdx / 2;
15751
15752 // We use xor with one to compute the adjacent DWord to whichever one the
15753 // OneInput is in.
15754 OneInputDWord = (OneInput / 2) ^ 1;
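// Worked example of the two index tricks above (illustrative; the inputs are
// hypothetical): with AOffset = 0 and AToAInputs = {0, 1, 3}, TripleInputSum
// is 6 and the actual inputs sum to 4, so TripleNonInputIdx = 2 and
// ADWord = 1. If the lone B-half input is word 5, it lives in dword 2, so
// OneInputDWord = (5 / 2) ^ 1 = 3, the adjacent dword in the same half.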
15755
15756 // Check for one tricky case: We're fixing a 3<-1 or a 1<-3 shuffle for AToA
15757 // and BToA inputs. If there is also such a problem with the BToB and AToB
15758 // inputs, we don't try to fix it necessarily -- we'll recurse and see it in
15759 // the next pass. However, if we have a 2<-2 in the BToB and AToB inputs, it
15760 // is essential that we don't *create* a 3<-1 as then we might oscillate.
15761 if (BToBInputs.size() == 2 && AToBInputs.size() == 2) {
15762 // Compute how many inputs will be flipped by swapping these DWords. We
15763 // need
15764 // to balance this to ensure we don't form a 3-1 shuffle in the other
15765 // half.
15766 int NumFlippedAToBInputs = llvm::count(AToBInputs, 2 * ADWord) +
15767 llvm::count(AToBInputs, 2 * ADWord + 1);
15768 int NumFlippedBToBInputs = llvm::count(BToBInputs, 2 * BDWord) +
15769 llvm::count(BToBInputs, 2 * BDWord + 1);
15770 if ((NumFlippedAToBInputs == 1 &&
15771 (NumFlippedBToBInputs == 0 || NumFlippedBToBInputs == 2)) ||
15772 (NumFlippedBToBInputs == 1 &&
15773 (NumFlippedAToBInputs == 0 || NumFlippedAToBInputs == 2))) {
15774 // We choose whether to fix the A half or B half based on whether that
15775 // half has zero flipped inputs. At zero, we may not be able to fix it
15776 // with that half. We also bias towards fixing the B half because that
15777 // will more commonly be the high half, and we have to bias one way.
15778 auto FixFlippedInputs = [&V, &DL, &Mask, &DAG](int PinnedIdx, int DWord,
15779 ArrayRef<int> Inputs) {
15780 int FixIdx = PinnedIdx ^ 1; // The adjacent slot to the pinned slot.
15781 bool IsFixIdxInput = is_contained(Inputs, PinnedIdx ^ 1);
15782 // Determine whether the free index is in the flipped dword or the
15783 // unflipped dword based on where the pinned index is. We use this bit
15784 // in an xor to conditionally select the adjacent dword.
15785 int FixFreeIdx = 2 * (DWord ^ (PinnedIdx / 2 == DWord));
15786 bool IsFixFreeIdxInput = is_contained(Inputs, FixFreeIdx);
15787 if (IsFixIdxInput == IsFixFreeIdxInput)
15788 FixFreeIdx += 1;
15789 IsFixFreeIdxInput = is_contained(Inputs, FixFreeIdx);
15790 assert(IsFixIdxInput != IsFixFreeIdxInput &&
15791        "We need to be changing the number of flipped inputs!");
15792 int PSHUFHalfMask[] = {0, 1, 2, 3};
15793 std::swap(PSHUFHalfMask[FixFreeIdx % 4], PSHUFHalfMask[FixIdx % 4]);
15794 V = DAG.getNode(
15795 FixIdx < 4 ? X86ISD::PSHUFLW : X86ISD::PSHUFHW, DL,
15796 MVT::getVectorVT(MVT::i16, V.getValueSizeInBits() / 16), V,
15797 getV4X86ShuffleImm8ForMask(PSHUFHalfMask, DL, DAG));
15798
15799 for (int &M : Mask)
15800 if (M >= 0 && M == FixIdx)
15801 M = FixFreeIdx;
15802 else if (M >= 0 && M == FixFreeIdx)
15803 M = FixIdx;
15804 };
15805 if (NumFlippedBToBInputs != 0) {
15806 int BPinnedIdx =
15807 BToAInputs.size() == 3 ? TripleNonInputIdx : OneInput;
15808 FixFlippedInputs(BPinnedIdx, BDWord, BToBInputs);
15809 } else {
15810 assert(NumFlippedAToBInputs != 0 && "Impossible given predicates!");
15811 int APinnedIdx = ThreeAInputs ? TripleNonInputIdx : OneInput;
15812 FixFlippedInputs(APinnedIdx, ADWord, AToBInputs);
15813 }
15814 }
15815 }
15816
15817 int PSHUFDMask[] = {0, 1, 2, 3};
15818 PSHUFDMask[ADWord] = BDWord;
15819 PSHUFDMask[BDWord] = ADWord;
15820 V = DAG.getBitcast(
15821 VT,
15822 DAG.getNode(X86ISD::PSHUFD, DL, PSHUFDVT, DAG.getBitcast(PSHUFDVT, V),
15823 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
15824
15825 // Adjust the mask to match the new locations of A and B.
15826 for (int &M : Mask)
15827 if (M >= 0 && M/2 == ADWord)
15828 M = 2 * BDWord + M % 2;
15829 else if (M >= 0 && M/2 == BDWord)
15830 M = 2 * ADWord + M % 2;
15831
15832 // Recurse back into this routine to re-compute state now that this isn't
15833 // a 3 and 1 problem.
15834 return lowerV8I16GeneralSingleInputShuffle(DL, VT, V, Mask, Subtarget, DAG);
15835 };
15836 if ((NumLToL == 3 && NumHToL == 1) || (NumLToL == 1 && NumHToL == 3))
15837 return balanceSides(LToLInputs, HToLInputs, HToHInputs, LToHInputs, 0, 4);
15838 if ((NumHToH == 3 && NumLToH == 1) || (NumHToH == 1 && NumLToH == 3))
15839 return balanceSides(HToHInputs, LToHInputs, LToLInputs, HToLInputs, 4, 0);
15840
15841 // At this point there are at most two inputs to the low and high halves from
15842 // each half. That means the inputs can always be grouped into dwords and
15843 // those dwords can then be moved to the correct half with a dword shuffle.
15844 // We use at most one low and one high word shuffle to collect these paired
15845 // inputs into dwords, and finally a dword shuffle to place them.
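// Rough illustration: for a single-input low-half mask such as <4,5,0,1>,
// words 0 and 1 stay put, the dword holding words 4 and 5 is pulled into the
// free low dword by the PSHUFD step, and a final PSHUFLW of <2,3,0,1> puts
// everything in order.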
15846 int PSHUFLMask[4] = {-1, -1, -1, -1};
15847 int PSHUFHMask[4] = {-1, -1, -1, -1};
15848 int PSHUFDMask[4] = {-1, -1, -1, -1};
15849
15850 // First fix the masks for all the inputs that are staying in their
15851 // original halves. This will then dictate the targets of the cross-half
15852 // shuffles.
15853 auto fixInPlaceInputs =
15854 [&PSHUFDMask](ArrayRef<int> InPlaceInputs, ArrayRef<int> IncomingInputs,
15855 MutableArrayRef<int> SourceHalfMask,
15856 MutableArrayRef<int> HalfMask, int HalfOffset) {
15857 if (InPlaceInputs.empty())
15858 return;
15859 if (InPlaceInputs.size() == 1) {
15860 SourceHalfMask[InPlaceInputs[0] - HalfOffset] =
15861 InPlaceInputs[0] - HalfOffset;
15862 PSHUFDMask[InPlaceInputs[0] / 2] = InPlaceInputs[0] / 2;
15863 return;
15864 }
15865 if (IncomingInputs.empty()) {
15866 // Just fix all of the in place inputs.
15867 for (int Input : InPlaceInputs) {
15868 SourceHalfMask[Input - HalfOffset] = Input - HalfOffset;
15869 PSHUFDMask[Input / 2] = Input / 2;
15870 }
15871 return;
15872 }
15873
15874 assert(InPlaceInputs.size() == 2 && "Cannot handle 3 or 4 inputs!");
15875 SourceHalfMask[InPlaceInputs[0] - HalfOffset] =
15876 InPlaceInputs[0] - HalfOffset;
15877 // Put the second input next to the first so that they are packed into
15878 // a dword. We find the adjacent index by toggling the low bit.
15879 int AdjIndex = InPlaceInputs[0] ^ 1;
15880 SourceHalfMask[AdjIndex - HalfOffset] = InPlaceInputs[1] - HalfOffset;
15881 std::replace(HalfMask.begin(), HalfMask.end(), InPlaceInputs[1], AdjIndex);
15882 PSHUFDMask[AdjIndex / 2] = AdjIndex / 2;
15883 };
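// For instance, in-place low inputs {0, 3} (with other inputs still incoming)
// roughly produce the word mask <0, 3, -1, -1>: word 3 is pulled next to
// word 0, every 3 in the half mask is rewritten to 1, and dword 0 is marked
// as staying put in PSHUFDMask.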
15884 fixInPlaceInputs(LToLInputs, HToLInputs, PSHUFLMask, LoMask, 0);
15885 fixInPlaceInputs(HToHInputs, LToHInputs, PSHUFHMask, HiMask, 4);
15886
15887 // Now gather the cross-half inputs and place them into a free dword of
15888 // their target half.
15889 // FIXME: This operation could almost certainly be simplified dramatically to
15890 // look more like the 3-1 fixing operation.
15891 auto moveInputsToRightHalf = [&PSHUFDMask](
15892 MutableArrayRef<int> IncomingInputs, ArrayRef<int> ExistingInputs,
15893 MutableArrayRef<int> SourceHalfMask, MutableArrayRef<int> HalfMask,
15894 MutableArrayRef<int> FinalSourceHalfMask, int SourceOffset,
15895 int DestOffset) {
15896 auto isWordClobbered = [](ArrayRef<int> SourceHalfMask, int Word) {
15897 return SourceHalfMask[Word] >= 0 && SourceHalfMask[Word] != Word;
15898 };
15899 auto isDWordClobbered = [&isWordClobbered](ArrayRef<int> SourceHalfMask,
15900 int Word) {
15901 int LowWord = Word & ~1;
15902 int HighWord = Word | 1;
15903 return isWordClobbered(SourceHalfMask, LowWord) ||
15904 isWordClobbered(SourceHalfMask, HighWord);
15905 };
15906
15907 if (IncomingInputs.empty())
15908 return;
15909
15910 if (ExistingInputs.empty()) {
15911 // Map any dwords with inputs from them into the right half.
15912 for (int Input : IncomingInputs) {
15913 // If the source half mask maps over the inputs, turn those into
15914 // swaps and use the swapped lane.
15915 if (isWordClobbered(SourceHalfMask, Input - SourceOffset)) {
15916 if (SourceHalfMask[SourceHalfMask[Input - SourceOffset]] < 0) {
15917 SourceHalfMask[SourceHalfMask[Input - SourceOffset]] =
15918 Input - SourceOffset;
15919 // We have to swap the uses in our half mask in one sweep.
15920 for (int &M : HalfMask)
15921 if (M == SourceHalfMask[Input - SourceOffset] + SourceOffset)
15922 M = Input;
15923 else if (M == Input)
15924 M = SourceHalfMask[Input - SourceOffset] + SourceOffset;
15925 } else {
15926 assert(SourceHalfMask[SourceHalfMask[Input - SourceOffset]] ==
15927 Input - SourceOffset &&
15928 "Previous placement doesn't match!");
15929 }
15930 // Note that this correctly re-maps both when we do a swap and when
15931 // we observe the other side of the swap above. We rely on that to
15932 // avoid swapping the members of the input list directly.
15933 Input = SourceHalfMask[Input - SourceOffset] + SourceOffset;
15934 }
15935
15936 // Map the input's dword into the correct half.
15937 if (PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] < 0)
15938 PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] = Input / 2;
15939 else
15940 assert(PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] ==
15941 Input / 2 &&
15942 "Previous placement doesn't match!");
15943 }
15944
15945 // And just directly shift any other-half mask elements to be same-half
15946 // as we will have mirrored the dword containing the element into the
15947 // same position within that half.
15948 for (int &M : HalfMask)
15949 if (M >= SourceOffset && M < SourceOffset + 4) {
15950 M = M - SourceOffset + DestOffset;
15951 assert(M >= 0 && "This should never wrap below zero!");
15952 }
15953 return;
15954 }
15955
15956 // Ensure we have the input in a viable dword of its current half. This
15957 // is particularly tricky because the original position may be clobbered
15958 // by inputs being moved and *staying* in that half.
15959 if (IncomingInputs.size() == 1) {
15960 if (isWordClobbered(SourceHalfMask, IncomingInputs[0] - SourceOffset)) {
15961 int InputFixed = find(SourceHalfMask, -1) - std::begin(SourceHalfMask) +
15962 SourceOffset;
15963 SourceHalfMask[InputFixed - SourceOffset] =
15964 IncomingInputs[0] - SourceOffset;
15965 std::replace(HalfMask.begin(), HalfMask.end(), IncomingInputs[0],
15966 InputFixed);
15967 IncomingInputs[0] = InputFixed;
15968 }
15969 } else if (IncomingInputs.size() == 2) {
15970 if (IncomingInputs[0] / 2 != IncomingInputs[1] / 2 ||
15971 isDWordClobbered(SourceHalfMask, IncomingInputs[0] - SourceOffset)) {
15972 // We have two non-adjacent or clobbered inputs we need to extract from
15973 // the source half. To do this, we need to map them into some adjacent
15974 // dword slot in the source mask.
15975 int InputsFixed[2] = {IncomingInputs[0] - SourceOffset,
15976 IncomingInputs[1] - SourceOffset};
15977
15978 // If there is a free slot in the source half mask adjacent to one of
15979 // the inputs, place the other input in it. We use (Index XOR 1) to
15980 // compute an adjacent index.
15981 if (!isWordClobbered(SourceHalfMask, InputsFixed[0]) &&
15982 SourceHalfMask[InputsFixed[0] ^ 1] < 0) {
15983 SourceHalfMask[InputsFixed[0]] = InputsFixed[0];
15984 SourceHalfMask[InputsFixed[0] ^ 1] = InputsFixed[1];
15985 InputsFixed[1] = InputsFixed[0] ^ 1;
15986 } else if (!isWordClobbered(SourceHalfMask, InputsFixed[1]) &&
15987 SourceHalfMask[InputsFixed[1] ^ 1] < 0) {
15988 SourceHalfMask[InputsFixed[1]] = InputsFixed[1];
15989 SourceHalfMask[InputsFixed[1] ^ 1] = InputsFixed[0];
15990 InputsFixed[0] = InputsFixed[1] ^ 1;
15991 } else if (SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1)] < 0 &&
15992 SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1) + 1] < 0) {
15993 // The two inputs are in the same DWord but it is clobbered and the
15994 // adjacent DWord isn't used at all. Move both inputs to the free
15995 // slot.
15996 SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1)] = InputsFixed[0];
15997 SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1) + 1] = InputsFixed[1];
15998 InputsFixed[0] = 2 * ((InputsFixed[0] / 2) ^ 1);
15999 InputsFixed[1] = 2 * ((InputsFixed[0] / 2) ^ 1) + 1;
16000 } else {
16001 // The only way we hit this point is if there is no clobbering
16002 // (because there are no off-half inputs to this half) and there is no
16003 // free slot adjacent to one of the inputs. In this case, we have to
16004 // swap an input with a non-input.
16005 for (int i = 0; i < 4; ++i)
16006 assert((SourceHalfMask[i] < 0 || SourceHalfMask[i] == i) &&
16007 "We can't handle any clobbers here!");
16008 assert(InputsFixed[1] != (InputsFixed[0] ^ 1) &&
16009 "Cannot have adjacent inputs here!");
16010
16011 SourceHalfMask[InputsFixed[0] ^ 1] = InputsFixed[1];
16012 SourceHalfMask[InputsFixed[1]] = InputsFixed[0] ^ 1;
16013
16014 // We also have to update the final source mask in this case because
16015 // it may need to undo the above swap.
16016 for (int &M : FinalSourceHalfMask)
16017 if (M == (InputsFixed[0] ^ 1) + SourceOffset)
16018 M = InputsFixed[1] + SourceOffset;
16019 else if (M == InputsFixed[1] + SourceOffset)
16020 M = (InputsFixed[0] ^ 1) + SourceOffset;
16021
16022 InputsFixed[1] = InputsFixed[0] ^ 1;
16023 }
16024
16025 // Point everything at the fixed inputs.
16026 for (int &M : HalfMask)
16027 if (M == IncomingInputs[0])
16028 M = InputsFixed[0] + SourceOffset;
16029 else if (M == IncomingInputs[1])
16030 M = InputsFixed[1] + SourceOffset;
16031
16032 IncomingInputs[0] = InputsFixed[0] + SourceOffset;
16033 IncomingInputs[1] = InputsFixed[1] + SourceOffset;
16034 }
16035 } else {
16036 llvm_unreachable("Unhandled input size!");
16037 }
16038
16039 // Now hoist the DWord down to the right half.
16040 int FreeDWord = (PSHUFDMask[DestOffset / 2] < 0 ? 0 : 1) + DestOffset / 2;
16041 assert(PSHUFDMask[FreeDWord] < 0 && "DWord not free");
16042 PSHUFDMask[FreeDWord] = IncomingInputs[0] / 2;
16043 for (int &M : HalfMask)
16044 for (int Input : IncomingInputs)
16045 if (M == Input)
16046 M = FreeDWord * 2 + Input % 2;
16047 };
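// For example, incoming high inputs {6, 7} with low dword 0 already claimed
// should land in low dword 1: PSHUFDMask[1] becomes 3 (their source dword)
// and the 6s and 7s in the low-half mask are rewritten to 2 and 3.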
16048 moveInputsToRightHalf(HToLInputs, LToLInputs, PSHUFHMask, LoMask, HiMask,
16049 /*SourceOffset*/ 4, /*DestOffset*/ 0);
16050 moveInputsToRightHalf(LToHInputs, HToHInputs, PSHUFLMask, HiMask, LoMask,
16051 /*SourceOffset*/ 0, /*DestOffset*/ 4);
16052
16053 // Now enact all the shuffles we've computed to move the inputs into their
16054 // target half.
16055 if (!isNoopShuffleMask(PSHUFLMask))
16056 V = DAG.getNode(X86ISD::PSHUFLW, DL, VT, V,
16057 getV4X86ShuffleImm8ForMask(PSHUFLMask, DL, DAG));
16058 if (!isNoopShuffleMask(PSHUFHMask))
16059 V = DAG.getNode(X86ISD::PSHUFHW, DL, VT, V,
16060 getV4X86ShuffleImm8ForMask(PSHUFHMask, DL, DAG));
16061 if (!isNoopShuffleMask(PSHUFDMask))
16062 V = DAG.getBitcast(
16063 VT,
16064 DAG.getNode(X86ISD::PSHUFD, DL, PSHUFDVT, DAG.getBitcast(PSHUFDVT, V),
16065 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
16066
16067 // At this point, each half should contain all its inputs, and we can then
16068 // just shuffle them into their final position.
16069 assert(count_if(LoMask, [](int M) { return M >= 4; }) == 0 &&
16070 "Failed to lift all the high half inputs to the low mask!");
16071 assert(count_if(HiMask, [](int M) { return M >= 0 && M < 4; }) == 0 &&
16072 "Failed to lift all the low half inputs to the high mask!");
16073
16074 // Do a half shuffle for the low mask.
16075 if (!isNoopShuffleMask(LoMask))
16076 V = DAG.getNode(X86ISD::PSHUFLW, DL, VT, V,
16077 getV4X86ShuffleImm8ForMask(LoMask, DL, DAG));
16078
16079 // Do a half shuffle with the high mask after shifting its values down.
16080 for (int &M : HiMask)
16081 if (M >= 0)
16082 M -= 4;
16083 if (!isNoopShuffleMask(HiMask))
16084 V = DAG.getNode(X86ISD::PSHUFHW, DL, VT, V,
16085 getV4X86ShuffleImm8ForMask(HiMask, DL, DAG));
16086
16087 return V;
16088}
16089
16090/// Helper to form a PSHUFB-based shuffle+blend, opportunistically avoiding the
16091/// blend if only one input is used.
16092static SDValue lowerShuffleAsBlendOfPSHUFBs(
16093 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
16094 const APInt &Zeroable, SelectionDAG &DAG, bool &V1InUse, bool &V2InUse) {
16095 assert(!is128BitLaneCrossingShuffleMask(VT, Mask) &&
16096 "Lane crossing shuffle masks not supported");
16097
16098 int NumBytes = VT.getSizeInBits() / 8;
16099 int Size = Mask.size();
16100 int Scale = NumBytes / Size;
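// Rough illustration: for a v8i16 shuffle, Size == 8, NumBytes == 16 and
// Scale == 2, so a mask element of 9 (word 1 of V2) expands to 0x80, 0x80 in
// V1's byte mask (zeroing those lanes) and to byte indices 2, 3 in V2's.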
16101
16102 SmallVector<SDValue, 64> V1Mask(NumBytes, DAG.getUNDEF(MVT::i8));
16103 SmallVector<SDValue, 64> V2Mask(NumBytes, DAG.getUNDEF(MVT::i8));
16104 V1InUse = false;
16105 V2InUse = false;
16106
16107 for (int i = 0; i < NumBytes; ++i) {
16108 int M = Mask[i / Scale];
16109 if (M < 0)
16110 continue;
16111
16112 const int ZeroMask = 0x80;
16113 int V1Idx = M < Size ? M * Scale + i % Scale : ZeroMask;
16114 int V2Idx = M < Size ? ZeroMask : (M - Size) * Scale + i % Scale;
16115 if (Zeroable[i / Scale])
16116 V1Idx = V2Idx = ZeroMask;
16117
16118 V1Mask[i] = DAG.getConstant(V1Idx, DL, MVT::i8);
16119 V2Mask[i] = DAG.getConstant(V2Idx, DL, MVT::i8);
16120 V1InUse |= (ZeroMask != V1Idx);
16121 V2InUse |= (ZeroMask != V2Idx);
16122 }
16123
16124 MVT ShufVT = MVT::getVectorVT(MVT::i8, NumBytes);
16125 if (V1InUse)
16126 V1 = DAG.getNode(X86ISD::PSHUFB, DL, ShufVT, DAG.getBitcast(ShufVT, V1),
16127 DAG.getBuildVector(ShufVT, DL, V1Mask));
16128 if (V2InUse)
16129 V2 = DAG.getNode(X86ISD::PSHUFB, DL, ShufVT, DAG.getBitcast(ShufVT, V2),
16130 DAG.getBuildVector(ShufVT, DL, V2Mask));
16131
16132 // If we need shuffled inputs from both, blend the two.
16133 SDValue V;
16134 if (V1InUse && V2InUse)
16135 V = DAG.getNode(ISD::OR, DL, ShufVT, V1, V2);
16136 else
16137 V = V1InUse ? V1 : V2;
16138
16139 // Cast the result back to the correct type.
16140 return DAG.getBitcast(VT, V);
16141}
16142
16143/// Generic lowering of 8-lane i16 shuffles.
16144///
16145/// This handles both single-input shuffles and combined shuffle/blends with
16146/// two inputs. The single input shuffles are immediately delegated to
16147/// a dedicated lowering routine.
16148///
16149/// The blends are lowered in one of three fundamental ways. If there are few
16150/// enough inputs, it delegates to a basic UNPCK-based strategy. If the shuffle
16151/// of the input is significantly cheaper when lowered as an interleaving of
16152/// the two inputs, try to interleave them. Otherwise, blend the low and high
16153/// halves of the inputs separately (making them have relatively few inputs)
16154/// and then concatenate them.
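/// As a rough illustration, an exact interleaving such as <0,8,1,9,2,10,3,11>
/// can be matched directly by the UNPCK path below, while masks with many
/// inputs per half typically fall through to the blend/decompose fallbacks.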
16155static SDValue lowerV8I16Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
16156 const APInt &Zeroable, SDValue V1, SDValue V2,
16157 const X86Subtarget &Subtarget,
16158 SelectionDAG &DAG) {
16159 assert(V1.getSimpleValueType() == MVT::v8i16 && "Bad operand type!");
16160 assert(V2.getSimpleValueType() == MVT::v8i16 && "Bad operand type!");
16161 assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
16162
16163 // Whenever we can lower this as a zext, that instruction is strictly faster
16164 // than any alternative.
16165 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(DL, MVT::v8i16, V1, V2, Mask,
16166 Zeroable, Subtarget, DAG))
16167 return ZExt;
16168
16169 // Try to lower using a truncation.
16170 if (SDValue V = lowerShuffleWithVPMOV(DL, MVT::v8i16, V1, V2, Mask, Zeroable,
16171 Subtarget, DAG))
16172 return V;
16173
16174 int NumV2Inputs = count_if(Mask, [](int M) { return M >= 8; });
16175
16176 if (NumV2Inputs == 0) {
16177 // Try to use shift instructions.
16178 if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v8i16, V1, V1, Mask,
16179 Zeroable, Subtarget, DAG))
16180 return Shift;
16181
16182 // Check for being able to broadcast a single element.
16183 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v8i16, V1, V2,
16184 Mask, Subtarget, DAG))
16185 return Broadcast;
16186
16187 // Try to use bit rotation instructions.
16188 if (SDValue Rotate = lowerShuffleAsBitRotate(DL, MVT::v8i16, V1, Mask,
16189 Subtarget, DAG))
16190 return Rotate;
16191
16192 // Use dedicated unpack instructions for masks that match their pattern.
16193 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v8i16, Mask, V1, V2, DAG))
16194 return V;
16195
16196 // Use dedicated pack instructions for masks that match their pattern.
16197 if (SDValue V = lowerShuffleWithPACK(DL, MVT::v8i16, Mask, V1, V2, DAG,
16198 Subtarget))
16199 return V;
16200
16201 // Try to use byte rotation instructions.
16202 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v8i16, V1, V1, Mask,
16203 Subtarget, DAG))
16204 return Rotate;
16205
16206 // Make a copy of the mask so it can be modified.
16207 SmallVector<int, 8> MutableMask(Mask);
16208 return lowerV8I16GeneralSingleInputShuffle(DL, MVT::v8i16, V1, MutableMask,
16209 Subtarget, DAG);
16210 }
16211
16212 assert(llvm::any_of(Mask, [](int M) { return M >= 0 && M < 8; }) &&
16213 "All single-input shuffles should be canonicalized to be V1-input "
16214 "shuffles.");
16215
16216 // Try to use shift instructions.
16217 if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v8i16, V1, V2, Mask,
16218 Zeroable, Subtarget, DAG))
16219 return Shift;
16220
16221 // See if we can use SSE4A Extraction / Insertion.
16222 if (Subtarget.hasSSE4A())
16223 if (SDValue V = lowerShuffleWithSSE4A(DL, MVT::v8i16, V1, V2, Mask,
16224 Zeroable, DAG))
16225 return V;
16226
16227 // There are special ways we can lower some single-element blends.
16228 if (NumV2Inputs == 1)
16229 if (SDValue V = lowerShuffleAsElementInsertion(
16230 DL, MVT::v8i16, V1, V2, Mask, Zeroable, Subtarget, DAG))
16231 return V;
16232
16233 // We have different paths for blend lowering, but they all must use the
16234 // *exact* same predicate.
16235 bool IsBlendSupported = Subtarget.hasSSE41();
16236 if (IsBlendSupported)
16237 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v8i16, V1, V2, Mask,
16238 Zeroable, Subtarget, DAG))
16239 return Blend;
16240
16241 if (SDValue Masked = lowerShuffleAsBitMask(DL, MVT::v8i16, V1, V2, Mask,
16242 Zeroable, Subtarget, DAG))
16243 return Masked;
16244
16245 // Use dedicated unpack instructions for masks that match their pattern.
16246 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v8i16, Mask, V1, V2, DAG))
16247 return V;
16248
16249 // Use dedicated pack instructions for masks that match their pattern.
16250 if (SDValue V = lowerShuffleWithPACK(DL, MVT::v8i16, Mask, V1, V2, DAG,
16251 Subtarget))
16252 return V;
16253
16254 // Try to lower using a truncation.
16255 if (SDValue V = lowerShuffleAsVTRUNC(DL, MVT::v8i16, V1, V2, Mask, Zeroable,
16256 Subtarget, DAG))
16257 return V;
16258
16259 // Try to use byte rotation instructions.
16260 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v8i16, V1, V2, Mask,
16261 Subtarget, DAG))
16262 return Rotate;
16263
16264 if (SDValue BitBlend =
16265 lowerShuffleAsBitBlend(DL, MVT::v8i16, V1, V2, Mask, DAG))
16266 return BitBlend;
16267
16268 // Try to use byte shift instructions to mask.
16269 if (SDValue V = lowerShuffleAsByteShiftMask(DL, MVT::v8i16, V1, V2, Mask,
16270 Zeroable, Subtarget, DAG))
16271 return V;
16272
16273 // Attempt to lower using compaction; SSE41 is necessary for PACKUSDW.
16274 // We could use SIGN_EXTEND_INREG+PACKSSDW for older targets but this seems to
16275 // be slower than a PSHUFLW+PSHUFHW+PSHUFD chain.
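// For example, the two-input mask <0,2,4,6,8,10,12,14> gives
// NumEvenDrops == 1: both inputs are ANDed with a per-dword 0x0000FFFF mask
// to clear the odd words and a single PACKUSDW compacts the surviving words.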
16276 int NumEvenDrops = canLowerByDroppingElements(Mask, true, false);
16277 if ((NumEvenDrops == 1 || NumEvenDrops == 2) && Subtarget.hasSSE41() &&
16278 !Subtarget.hasVLX()) {
16279 // Check if this is part of a 256-bit vector truncation.
16280 if (NumEvenDrops == 2 && Subtarget.hasAVX2() &&
16281 peekThroughBitcasts(V1).getOpcode() == ISD::EXTRACT_SUBVECTOR &&
16282 peekThroughBitcasts(V2).getOpcode() == ISD::EXTRACT_SUBVECTOR) {
16283 SDValue V1V2 = concatSubVectors(V1, V2, DAG, DL);
16284 V1V2 = DAG.getNode(X86ISD::BLENDI, DL, MVT::v16i16, V1V2,
16285 getZeroVector(MVT::v16i16, Subtarget, DAG, DL),
16286 DAG.getTargetConstant(0xEE, DL, MVT::i8));
16287 V1V2 = DAG.getBitcast(MVT::v8i32, V1V2);
16288 V1 = extract128BitVector(V1V2, 0, DAG, DL);
16289 V2 = extract128BitVector(V1V2, 4, DAG, DL);
16290 } else {
16291 SmallVector<SDValue, 4> DWordClearOps(4,
16292 DAG.getConstant(0, DL, MVT::i32));
16293 for (unsigned i = 0; i != 4; i += 1 << (NumEvenDrops - 1))
16294 DWordClearOps[i] = DAG.getConstant(0xFFFF, DL, MVT::i32);
16295 SDValue DWordClearMask =
16296 DAG.getBuildVector(MVT::v4i32, DL, DWordClearOps);
16297 V1 = DAG.getNode(ISD::AND, DL, MVT::v4i32, DAG.getBitcast(MVT::v4i32, V1),
16298 DWordClearMask);
16299 V2 = DAG.getNode(ISD::AND, DL, MVT::v4i32, DAG.getBitcast(MVT::v4i32, V2),
16300 DWordClearMask);
16301 }
16302 // Now pack things back together.
16303 SDValue Result = DAG.getNode(X86ISD::PACKUS, DL, MVT::v8i16, V1, V2);
16304 if (NumEvenDrops == 2) {
16305 Result = DAG.getBitcast(MVT::v4i32, Result);
16306 Result = DAG.getNode(X86ISD::PACKUS, DL, MVT::v8i16, Result, Result);
16307 }
16308 return Result;
16309 }
16310
16311 // When compacting odd (upper) elements, use PACKSS pre-SSE41.
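// For example, <1,3,5,7,9,11,13,15> shifts each 32-bit lane right by 16 so
// the odd words drop into the low halves, then packs with PACKUSDW (or,
// pre-SSE41, an arithmetic shift plus PACKSSDW).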
16312 int NumOddDrops = canLowerByDroppingElements(Mask, false, false);
16313 if (NumOddDrops == 1) {
16314 bool HasSSE41 = Subtarget.hasSSE41();
16315 V1 = DAG.getNode(HasSSE41 ? X86ISD::VSRLI : X86ISD::VSRAI, DL, MVT::v4i32,
16316 DAG.getBitcast(MVT::v4i32, V1),
16317 DAG.getTargetConstant(16, DL, MVT::i8));
16318 V2 = DAG.getNode(HasSSE41 ? X86ISD::VSRLI : X86ISD::VSRAI, DL, MVT::v4i32,
16319 DAG.getBitcast(MVT::v4i32, V2),
16320 DAG.getTargetConstant(16, DL, MVT::i8));
16321 return DAG.getNode(HasSSE41 ? X86ISD::PACKUS : X86ISD::PACKSS, DL,
16322 MVT::v8i16, V1, V2);
16323 }
16324
16325 // Try to lower by permuting the inputs into an unpack instruction.
16326 if (SDValue Unpack = lowerShuffleAsPermuteAndUnpack(DL, MVT::v8i16, V1, V2,
16327 Mask, Subtarget, DAG))
16328 return Unpack;
16329
16330 // If we can't directly blend but can use PSHUFB, that will be better as it
16331 // can both shuffle and set up the inefficient blend.
16332 if (!IsBlendSupported && Subtarget.hasSSSE3()) {
16333 bool V1InUse, V2InUse;
16334 return lowerShuffleAsBlendOfPSHUFBs(DL, MVT::v8i16, V1, V2, Mask,
16335 Zeroable, DAG, V1InUse, V2InUse);
16336 }
16337
16338 // We can always bit-blend if we have to so the fallback strategy is to
16339 // decompose into single-input permutes and blends/unpacks.
16340 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v8i16, V1, V2,
16341 Mask, Subtarget, DAG);
16342}
16343
16344/// Lower 8-lane 16-bit floating point shuffles.
16345static SDValue lowerV8F16Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
16346 const APInt &Zeroable, SDValue V1, SDValue V2,
16347 const X86Subtarget &Subtarget,
16348 SelectionDAG &DAG) {
16349 assert(V1.getSimpleValueType() == MVT::v8f16 && "Bad operand type!");
16350 assert(V2.getSimpleValueType() == MVT::v8f16 && "Bad operand type!");
16351 assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
16352 int NumV2Elements = count_if(Mask, [](int M) { return M >= 8; });
16353
16354 if (Subtarget.hasFP16()) {
16355 if (NumV2Elements == 0) {
16356 // Check for being able to broadcast a single element.
16357 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v8f16, V1, V2,
16358 Mask, Subtarget, DAG))
16359 return Broadcast;
16360 }
16361 if (NumV2Elements == 1 && Mask[0] >= 8)
16362 if (SDValue V = lowerShuffleAsElementInsertion(
16363 DL, MVT::v8f16, V1, V2, Mask, Zeroable, Subtarget, DAG))
16364 return V;
16365 }
16366
16367 V1 = DAG.getBitcast(MVT::v8i16, V1);
16368 V2 = DAG.getBitcast(MVT::v8i16, V2);
16369 return DAG.getBitcast(MVT::v8f16,
16370 DAG.getVectorShuffle(MVT::v8i16, DL, V1, V2, Mask));
16371}
16372
16373 // Lowers a unary/binary shuffle as VPERMV/VPERMV3. For non-VLX targets,
16374 // sub-512-bit shuffles are padded to 512 bits for the shuffle and then
16375 // the active subvector is extracted.
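// As a rough example, a v16i8 shuffle on an AVX512VBMI target without VLX is
// widened to v64i8, lowered as a single VPERMB/VPERMV3, and the low 128 bits
// of the result are extracted afterwards.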
16376static SDValue lowerShuffleWithPERMV(const SDLoc &DL, MVT VT,
16377 ArrayRef<int> Mask, SDValue V1, SDValue V2,
16378 const X86Subtarget &Subtarget,
16379 SelectionDAG &DAG) {
16380 MVT MaskVT = VT.changeTypeToInteger();
16381 SDValue MaskNode;
16382 MVT ShuffleVT = VT;
16383 if (!VT.is512BitVector() && !Subtarget.hasVLX()) {
16384 V1 = widenSubVector(V1, false, Subtarget, DAG, DL, 512);
16385 V2 = widenSubVector(V2, false, Subtarget, DAG, DL, 512);
16386 ShuffleVT = V1.getSimpleValueType();
16387
16388 // Adjust mask to correct indices for the second input.
16389 int NumElts = VT.getVectorNumElements();
16390 unsigned Scale = 512 / VT.getSizeInBits();
16391 SmallVector<int, 32> AdjustedMask(Mask);
16392 for (int &M : AdjustedMask)
16393 if (NumElts <= M)
16394 M += (Scale - 1) * NumElts;
16395 MaskNode = getConstVector(AdjustedMask, MaskVT, DAG, DL, true);
16396 MaskNode = widenSubVector(MaskNode, false, Subtarget, DAG, DL, 512);
16397 } else {
16398 MaskNode = getConstVector(Mask, MaskVT, DAG, DL, true);
16399 }
16400
16401 SDValue Result;
16402 if (V2.isUndef())
16403 Result = DAG.getNode(X86ISD::VPERMV, DL, ShuffleVT, MaskNode, V1);
16404 else
16405 Result = DAG.getNode(X86ISD::VPERMV3, DL, ShuffleVT, V1, MaskNode, V2);
16406
16407 if (VT != ShuffleVT)
16408 Result = extractSubVector(Result, 0, DAG, DL, VT.getSizeInBits());
16409
16410 return Result;
16411}
16412
16413/// Generic lowering of v16i8 shuffles.
16414///
16415/// This is a hybrid strategy to lower v16i8 vectors. It first attempts to
16416/// detect any complexity reducing interleaving. If that doesn't help, it uses
16417/// UNPCK to spread the i8 elements across two i16-element vectors, and uses
16418/// the existing lowering for v8i16 blends on each half, finally PACK-ing them
16419/// back together.
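/// As a rough illustration, an unstructured two-input mask typically becomes
/// two PSHUFBs (one per input) ORed together, whereas the byte-compaction
/// mask <0,2,4,...,30> deliberately skips the PSHUFB path and is lowered as
/// a PACKUSWB of two masked inputs.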
16420static SDValue lowerV16I8Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
16421 const APInt &Zeroable, SDValue V1, SDValue V2,
16422 const X86Subtarget &Subtarget,
16423 SelectionDAG &DAG) {
16424 assert(V1.getSimpleValueType() == MVT::v16i8 && "Bad operand type!");
16425 assert(V2.getSimpleValueType() == MVT::v16i8 && "Bad operand type!");
16426 assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
16427
16428 // Try to use shift instructions.
16429 if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v16i8, V1, V2, Mask,
16430 Zeroable, Subtarget, DAG))
16431 return Shift;
16432
16433 // Try to use byte rotation instructions.
16434 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v16i8, V1, V2, Mask,
16435 Subtarget, DAG))
16436 return Rotate;
16437
16438 // Use dedicated pack instructions for masks that match their pattern.
16439 if (SDValue V = lowerShuffleWithPACK(DL, MVT::v16i8, Mask, V1, V2, DAG,
16440 Subtarget))
16441 return V;
16442
16443 // Try to use a zext lowering.
16444 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(DL, MVT::v16i8, V1, V2, Mask,
16445 Zeroable, Subtarget, DAG))
16446 return ZExt;
16447
16448 // Try to lower using a truncation.
16449 if (SDValue V = lowerShuffleWithVPMOV(DL, MVT::v16i8, V1, V2, Mask, Zeroable,
16450 Subtarget, DAG))
16451 return V;
16452
16453 if (SDValue V = lowerShuffleAsVTRUNC(DL, MVT::v16i8, V1, V2, Mask, Zeroable,
16454 Subtarget, DAG))
16455 return V;
16456
16457 // See if we can use SSE4A Extraction / Insertion.
16458 if (Subtarget.hasSSE4A())
16459 if (SDValue V = lowerShuffleWithSSE4A(DL, MVT::v16i8, V1, V2, Mask,
16460 Zeroable, DAG))
16461 return V;
16462
16463 int NumV2Elements = count_if(Mask, [](int M) { return M >= 16; });
16464
16465 // For single-input shuffles, there are some nicer lowering tricks we can use.
16466 if (NumV2Elements == 0) {
16467 // Check for being able to broadcast a single element.
16468 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v16i8, V1, V2,
16469 Mask, Subtarget, DAG))
16470 return Broadcast;
16471
16472 // Try to use bit rotation instructions.
16473 if (SDValue Rotate = lowerShuffleAsBitRotate(DL, MVT::v16i8, V1, Mask,
16474 Subtarget, DAG))
16475 return Rotate;
16476
16477 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v16i8, Mask, V1, V2, DAG))
16478 return V;
16479
16480 // Check whether we can widen this to an i16 shuffle by duplicating bytes.
16481 // Notably, this handles splat and partial-splat shuffles more efficiently.
16482 // However, it only makes sense if the pre-duplication shuffle simplifies
16483 // things significantly. Currently, this means we need to be able to
16484 // express the pre-duplication shuffle as an i16 shuffle.
16485 //
16486 // FIXME: We should check for other patterns which can be widened into an
16487 // i16 shuffle as well.
16488 auto canWidenViaDuplication = [](ArrayRef<int> Mask) {
16489 for (int i = 0; i < 16; i += 2)
16490 if (Mask[i] >= 0 && Mask[i + 1] >= 0 && Mask[i] != Mask[i + 1])
16491 return false;
16492
16493 return true;
16494 };
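// For example, <7,7,7,7,3,3,3,3,3,3,3,3,7,7,7,7> passes this check: each
// even/odd byte pair duplicates a single source byte, so the shuffle can be
// done at i16 granularity and the bytes duplicated afterwards with an UNPCK.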
16495 auto tryToWidenViaDuplication = [&]() -> SDValue {
16496 if (!canWidenViaDuplication(Mask))
16497 return SDValue();
16498 SmallVector<int, 4> LoInputs;
16499 copy_if(Mask, std::back_inserter(LoInputs),
16500 [](int M) { return M >= 0 && M < 8; });
16501 array_pod_sort(LoInputs.begin(), LoInputs.end());
16502 LoInputs.erase(std::unique(LoInputs.begin(), LoInputs.end()),
16503 LoInputs.end());
16504 SmallVector<int, 4> HiInputs;
16505 copy_if(Mask, std::back_inserter(HiInputs), [](int M) { return M >= 8; });
16506 array_pod_sort(HiInputs.begin(), HiInputs.end());
16507 HiInputs.erase(std::unique(HiInputs.begin(), HiInputs.end()),
16508 HiInputs.end());
16509
16510 bool TargetLo = LoInputs.size() >= HiInputs.size();
16511 ArrayRef<int> InPlaceInputs = TargetLo ? LoInputs : HiInputs;
16512 ArrayRef<int> MovingInputs = TargetLo ? HiInputs : LoInputs;
16513
16514 int PreDupI16Shuffle[] = {-1, -1, -1, -1, -1, -1, -1, -1};
16515 SmallDenseMap<int, int, 8> LaneMap;
16516 for (int I : InPlaceInputs) {
16517 PreDupI16Shuffle[I/2] = I/2;
16518 LaneMap[I] = I;
16519 }
16520 int j = TargetLo ? 0 : 4, je = j + 4;
16521 for (int i = 0, ie = MovingInputs.size(); i < ie; ++i) {
16522 // Check if j is already a shuffle of this input. This happens when
16523 // there are two adjacent bytes after we move the low one.
16524 if (PreDupI16Shuffle[j] != MovingInputs[i] / 2) {
16525 // If we haven't yet mapped the input, search for a slot into which
16526 // we can map it.
16527 while (j < je && PreDupI16Shuffle[j] >= 0)
16528 ++j;
16529
16530 if (j == je)
16531 // We can't place the inputs into a single half with a simple i16 shuffle, so bail.
16532 return SDValue();
16533
16534 // Map this input with the i16 shuffle.
16535 PreDupI16Shuffle[j] = MovingInputs[i] / 2;
16536 }
16537
16538 // Update the lane map based on the mapping we ended up with.
16539 LaneMap[MovingInputs[i]] = 2 * j + MovingInputs[i] % 2;
16540 }
16541 V1 = DAG.getBitcast(
16542 MVT::v16i8,
16543 DAG.getVectorShuffle(MVT::v8i16, DL, DAG.getBitcast(MVT::v8i16, V1),
16544 DAG.getUNDEF(MVT::v8i16), PreDupI16Shuffle));
16545
16546 // Unpack the bytes to form the i16s that will be shuffled into place.
16547 bool EvenInUse = false, OddInUse = false;
16548 for (int i = 0; i < 16; i += 2) {
16549 EvenInUse |= (Mask[i + 0] >= 0);
16550 OddInUse |= (Mask[i + 1] >= 0);
16551 if (EvenInUse && OddInUse)
16552 break;
16553 }
16554 V1 = DAG.getNode(TargetLo ? X86ISD::UNPCKL : X86ISD::UNPCKH, DL,
16555 MVT::v16i8, EvenInUse ? V1 : DAG.getUNDEF(MVT::v16i8),
16556 OddInUse ? V1 : DAG.getUNDEF(MVT::v16i8));
16557
16558 int PostDupI16Shuffle[8] = {-1, -1, -1, -1, -1, -1, -1, -1};
16559 for (int i = 0; i < 16; ++i)
16560 if (Mask[i] >= 0) {
16561 int MappedMask = LaneMap[Mask[i]] - (TargetLo ? 0 : 8);
16562 assert(MappedMask < 8 && "Invalid v8 shuffle mask!");
16563 if (PostDupI16Shuffle[i / 2] < 0)
16564 PostDupI16Shuffle[i / 2] = MappedMask;
16565 else
16566 assert(PostDupI16Shuffle[i / 2] == MappedMask &&
16567 "Conflicting entries in the original shuffle!");
16568 }
16569 return DAG.getBitcast(
16570 MVT::v16i8,
16571 DAG.getVectorShuffle(MVT::v8i16, DL, DAG.getBitcast(MVT::v8i16, V1),
16572 DAG.getUNDEF(MVT::v8i16), PostDupI16Shuffle));
16573 };
16574 if (SDValue V = tryToWidenViaDuplication())
16575 return V;
16576 }
16577
16578 if (SDValue Masked = lowerShuffleAsBitMask(DL, MVT::v16i8, V1, V2, Mask,
16579 Zeroable, Subtarget, DAG))
16580 return Masked;
16581
16582 // Use dedicated unpack instructions for masks that match their pattern.
16583 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v16i8, Mask, V1, V2, DAG))
16584 return V;
16585
16586 // Try to use byte shift instructions to mask.
16587 if (SDValue V = lowerShuffleAsByteShiftMask(DL, MVT::v16i8, V1, V2, Mask,
16588 Zeroable, Subtarget, DAG))
16589 return V;
16590
16591 // Check for compaction patterns.
16592 bool IsSingleInput = V2.isUndef();
16593 int NumEvenDrops = canLowerByDroppingElements(Mask, true, IsSingleInput);
16594
16595 // Check for SSSE3 which lets us lower all v16i8 shuffles much more directly
16596 // with PSHUFB. It is important to do this before we attempt to generate any
16597 // blends but after all of the single-input lowerings. If the single input
16598 // lowerings can find an instruction sequence that is faster than a PSHUFB, we
16599 // want to preserve that and we can DAG combine any longer sequences into
16600 // a PSHUFB in the end. But once we start blending from multiple inputs,
16601 // the complexity of DAG combining bad patterns back into PSHUFB is too high,
16602 // and there are *very* few patterns that would actually be faster than the
16603 // PSHUFB approach because of its ability to zero lanes.
16604 //
16605 // If the mask is a binary compaction, we can more efficiently perform this
16606 // as a PACKUS(AND(),AND()) - which is quicker than UNPACK(PSHUFB(),PSHUFB()).
16607 //
16608 // FIXME: The only exceptions to the above are blends which are exact
16609 // interleavings with direct instructions supporting them. We currently don't
16610 // handle those well here.
16611 if (Subtarget.hasSSSE3() && (IsSingleInput || NumEvenDrops != 1)) {
16612 bool V1InUse = false;
16613 bool V2InUse = false;
16614
16615 SDValue PSHUFB = lowerShuffleAsBlendOfPSHUFBs(
16616 DL, MVT::v16i8, V1, V2, Mask, Zeroable, DAG, V1InUse, V2InUse);
16617
16618 // If both V1 and V2 are in use and we can use a direct blend or an unpack,
16619 // do so. This avoids using them to handle blends-with-zero which is
16620 // important as a single pshufb is significantly faster for that.
16621 if (V1InUse && V2InUse) {
16622 if (Subtarget.hasSSE41())
16623 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v16i8, V1, V2, Mask,
16624 Zeroable, Subtarget, DAG))
16625 return Blend;
16626
16627 // We can use an unpack to do the blending rather than an or in some
16628 // cases. Even though the OR may be (very slightly) more efficient, we
16629 // prefer this lowering because there are common cases where part of
16630 // the complexity of the shuffles goes away when we do the final blend as
16631 // an unpack.
16632 // FIXME: It might be worth trying to detect if the unpack-feeding
16633 // shuffles will both be pshufb, in which case we shouldn't bother with
16634 // this.
16635 if (SDValue Unpack = lowerShuffleAsPermuteAndUnpack(
16636 DL, MVT::v16i8, V1, V2, Mask, Subtarget, DAG))
16637 return Unpack;
16638
16639 // AVX512VBMI can lower to VPERMB (non-VLX will pad to v64i8).
16640 if (Subtarget.hasVBMI())
16641 return lowerShuffleWithPERMV(DL, MVT::v16i8, Mask, V1, V2, Subtarget,
16642 DAG);
16643
16644 // If we have XOP we can use one VPPERM instead of multiple PSHUFBs.
16645 if (Subtarget.hasXOP()) {
16646 SDValue MaskNode = getConstVector(Mask, MVT::v16i8, DAG, DL, true);
16647 return DAG.getNode(X86ISD::VPPERM, DL, MVT::v16i8, V1, V2, MaskNode);
16648 }
16649
16650 // Use PALIGNR+Permute if possible - permute might become PSHUFB but the
16651 // PALIGNR will be cheaper than the second PSHUFB+OR.
16652 if (SDValue V = lowerShuffleAsByteRotateAndPermute(
16653 DL, MVT::v16i8, V1, V2, Mask, Subtarget, DAG))
16654 return V;
16655 }
16656
16657 return PSHUFB;
16658 }
16659
16660 // There are special ways we can lower some single-element blends.
16661 if (NumV2Elements == 1)
16662 if (SDValue V = lowerShuffleAsElementInsertion(
16663 DL, MVT::v16i8, V1, V2, Mask, Zeroable, Subtarget, DAG))
16664 return V;
16665
16666 if (SDValue Blend = lowerShuffleAsBitBlend(DL, MVT::v16i8, V1, V2, Mask, DAG))
16667 return Blend;
16668
16669 // Check whether a compaction lowering can be done. This handles shuffles
16670 // which take every Nth element for some even N. See the helper function for
16671 // details.
16672 //
16673 // We special case these as they can be particularly efficiently handled with
16674 // the PACKUSWB instruction on x86, and they show up in common patterns of
16675 // rearranging bytes to truncate wide elements.
16676 if (NumEvenDrops) {
16677 // NumEvenDrops is the power of two stride of the elements. Another way of
16678 // thinking about it is that we need to drop the even elements this many
16679 // times to get the original input.
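// For instance, NumEvenDrops == 2 corresponds to keeping every fourth byte
// (<0,4,8,12,...>): one AND clears the dropped bytes and two PACKUS steps
// squeeze the survivors together.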
16680
16681 // First we need to zero all the dropped bytes.
16682 assert(NumEvenDrops <= 3 &&
16683 "No support for dropping even elements more than 3 times.");
16684 SmallVector<SDValue, 8> WordClearOps(8, DAG.getConstant(0, DL, MVT::i16));
16685 for (unsigned i = 0; i != 8; i += 1 << (NumEvenDrops - 1))
16686 WordClearOps[i] = DAG.getConstant(0xFF, DL, MVT::i16);
16687 SDValue WordClearMask = DAG.getBuildVector(MVT::v8i16, DL, WordClearOps);
16688 V1 = DAG.getNode(ISD::AND, DL, MVT::v8i16, DAG.getBitcast(MVT::v8i16, V1),
16689 WordClearMask);
16690 if (!IsSingleInput)
16691 V2 = DAG.getNode(ISD::AND, DL, MVT::v8i16, DAG.getBitcast(MVT::v8i16, V2),
16692 WordClearMask);
16693
16694 // Now pack things back together.
16695 SDValue Result = DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, V1,
16696 IsSingleInput ? V1 : V2);
16697 for (int i = 1; i < NumEvenDrops; ++i) {
16698 Result = DAG.getBitcast(MVT::v8i16, Result);
16699 Result = DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, Result, Result);
16700 }
16701 return Result;
16702 }
16703
16704 int NumOddDrops = canLowerByDroppingElements(Mask, false, IsSingleInput);
16705 if (NumOddDrops == 1) {
16706 V1 = DAG.getNode(X86ISD::VSRLI, DL, MVT::v8i16,
16707 DAG.getBitcast(MVT::v8i16, V1),
16708 DAG.getTargetConstant(8, DL, MVT::i8));
16709 if (!IsSingleInput)
16710 V2 = DAG.getNode(X86ISD::VSRLI, DL, MVT::v8i16,
16711 DAG.getBitcast(MVT::v8i16, V2),
16712 DAG.getTargetConstant(8, DL, MVT::i8));
16713 return DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, V1,
16714 IsSingleInput ? V1 : V2);
16715 }
16716
16717 // Handle multi-input cases by blending/unpacking single-input shuffles.
16718 if (NumV2Elements > 0)
16719 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v16i8, V1, V2, Mask,
16720 Subtarget, DAG);
16721
16722 // The fallback path for single-input shuffles widens this into two v8i16
16723 // vectors with unpacks, shuffles those, and then pulls them back together
16724 // with a pack.
16725 SDValue V = V1;
16726
16727 std::array<int, 8> LoBlendMask = {{-1, -1, -1, -1, -1, -1, -1, -1}};
16728 std::array<int, 8> HiBlendMask = {{-1, -1, -1, -1, -1, -1, -1, -1}};
16729 for (int i = 0; i < 16; ++i)
16730 if (Mask[i] >= 0)
16731 (i < 8 ? LoBlendMask[i] : HiBlendMask[i % 8]) = Mask[i];
16732
16733 SDValue VLoHalf, VHiHalf;
16734 // Check if any of the odd lanes in the v16i8 are used. If not, we can mask
16735 // them out and avoid using UNPCK{L,H} to extract the elements of V as
16736 // i16s.
16737 if (none_of(LoBlendMask, [](int M) { return M >= 0 && M % 2 == 1; }) &&
16738 none_of(HiBlendMask, [](int M) { return M >= 0 && M % 2 == 1; })) {
16739 // Use a mask to drop the high bytes.
16740 VLoHalf = DAG.getBitcast(MVT::v8i16, V);
16741 VLoHalf = DAG.getNode(ISD::AND, DL, MVT::v8i16, VLoHalf,
16742 DAG.getConstant(0x00FF, DL, MVT::v8i16));
16743
16744 // This will be a single vector shuffle instead of a blend so nuke VHiHalf.
16745 VHiHalf = DAG.getUNDEF(MVT::v8i16);
16746
16747 // Squash the masks to point directly into VLoHalf.
16748 for (int &M : LoBlendMask)
16749 if (M >= 0)
16750 M /= 2;
16751 for (int &M : HiBlendMask)
16752 if (M >= 0)
16753 M /= 2;
16754 } else {
16755 // Otherwise just unpack the low half of V into VLoHalf and the high half into
16756 // VHiHalf so that we can blend them as i16s.
16757 SDValue Zero = getZeroVector(MVT::v16i8, Subtarget, DAG, DL);
16758
16759 VLoHalf = DAG.getBitcast(
16760 MVT::v8i16, DAG.getNode(X86ISD::UNPCKL, DL, MVT::v16i8, V, Zero));
16761 VHiHalf = DAG.getBitcast(
16762 MVT::v8i16, DAG.getNode(X86ISD::UNPCKH, DL, MVT::v16i8, V, Zero));
16763 }
16764
16765 SDValue LoV = DAG.getVectorShuffle(MVT::v8i16, DL, VLoHalf, VHiHalf, LoBlendMask);
16766 SDValue HiV = DAG.getVectorShuffle(MVT::v8i16, DL, VLoHalf, VHiHalf, HiBlendMask);
16767
16768 return DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, LoV, HiV);
16769}
16770
16771/// Dispatching routine to lower various 128-bit x86 vector shuffles.
16772///
16773/// This routine breaks down the specific type of 128-bit shuffle and
16774/// dispatches to the lowering routines accordingly.
16775static SDValue lower128BitShuffle(const SDLoc &DL, ArrayRef<int> Mask,
16776 MVT VT, SDValue V1, SDValue V2,
16777 const APInt &Zeroable,
16778 const X86Subtarget &Subtarget,
16779 SelectionDAG &DAG) {
16780 switch (VT.SimpleTy) {
16781 case MVT::v2i64:
16782 return lowerV2I64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
16783 case MVT::v2f64:
16784 return lowerV2F64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
16785 case MVT::v4i32:
16786 return lowerV4I32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
16787 case MVT::v4f32:
16788 return lowerV4F32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
16789 case MVT::v8i16:
16790 return lowerV8I16Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
16791 case MVT::v8f16:
16792 return lowerV8F16Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
16793 case MVT::v16i8:
16794 return lowerV16I8Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
16795
16796 default:
16797 llvm_unreachable("Unimplemented!")::llvm::llvm_unreachable_internal("Unimplemented!", "llvm/lib/Target/X86/X86ISelLowering.cpp"
, 16797)
;
16798 }
16799}
16800
16801/// Generic routine to split vector shuffle into half-sized shuffles.
16802///
16803/// This routine just extracts two subvectors, shuffles them independently, and
16804/// then concatenates them back together. This should work effectively with all
16805/// AVX vector shuffle types.
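/// For example, a v16i16 shuffle is handled as two independent v8i16 half
/// shuffles (each possibly a pair of shuffles plus a blend) whose results are
/// concatenated back together with CONCAT_VECTORS.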
16806static SDValue splitAndLowerShuffle(const SDLoc &DL, MVT VT, SDValue V1,
16807 SDValue V2, ArrayRef<int> Mask,
16808 SelectionDAG &DAG) {
16809 assert(VT.getSizeInBits() >= 256 &&
16810 "Only for 256-bit or wider vector shuffles!");
16811 assert(V1.getSimpleValueType() == VT && "Bad operand type!");
16812 assert(V2.getSimpleValueType() == VT && "Bad operand type!");
16813
16814 ArrayRef<int> LoMask = Mask.slice(0, Mask.size() / 2);
16815 ArrayRef<int> HiMask = Mask.slice(Mask.size() / 2);
16816
16817 int NumElements = VT.getVectorNumElements();
16818 int SplitNumElements = NumElements / 2;
16819 MVT ScalarVT = VT.getVectorElementType();
16820 MVT SplitVT = MVT::getVectorVT(ScalarVT, SplitNumElements);
16821
16822 // Use splitVector/extractSubVector so that split build-vectors just build two
16823 // narrower build vectors. This helps shuffling with splats and zeros.
16824 auto SplitVector = [&](SDValue V) {
16825 SDValue LoV, HiV;
16826 std::tie(LoV, HiV) = splitVector(peekThroughBitcasts(V), DAG, DL);
16827 return std::make_pair(DAG.getBitcast(SplitVT, LoV),
16828 DAG.getBitcast(SplitVT, HiV));
16829 };
16830
16831 SDValue LoV1, HiV1, LoV2, HiV2;
16832 std::tie(LoV1, HiV1) = SplitVector(V1);
16833 std::tie(LoV2, HiV2) = SplitVector(V2);
16834
16835 // Now create two 4-way blends of these half-width vectors.
16836 auto HalfBlend = [&](ArrayRef<int> HalfMask) {
16837 bool UseLoV1 = false, UseHiV1 = false, UseLoV2 = false, UseHiV2 = false;
16838 SmallVector<int, 32> V1BlendMask((unsigned)SplitNumElements, -1);
16839 SmallVector<int, 32> V2BlendMask((unsigned)SplitNumElements, -1);
16840 SmallVector<int, 32> BlendMask((unsigned)SplitNumElements, -1);
16841 for (int i = 0; i < SplitNumElements; ++i) {
16842 int M = HalfMask[i];
16843 if (M >= NumElements) {
16844 if (M >= NumElements + SplitNumElements)
16845 UseHiV2 = true;
16846 else
16847 UseLoV2 = true;
16848 V2BlendMask[i] = M - NumElements;
16849 BlendMask[i] = SplitNumElements + i;
16850 } else if (M >= 0) {
16851 if (M >= SplitNumElements)
16852 UseHiV1 = true;
16853 else
16854 UseLoV1 = true;
16855 V1BlendMask[i] = M;
16856 BlendMask[i] = i;
16857 }
16858 }
16859
16860 // Because the lowering happens after all combining takes place, we need to
16861 // manually combine these blend masks as much as possible so that we create
16862 // a minimal number of high-level vector shuffle nodes.
16863
16864 // First try just blending the halves of V1 or V2.
16865 if (!UseLoV1 && !UseHiV1 && !UseLoV2 && !UseHiV2)
16866 return DAG.getUNDEF(SplitVT);
16867 if (!UseLoV2 && !UseHiV2)
16868 return DAG.getVectorShuffle(SplitVT, DL, LoV1, HiV1, V1BlendMask);
16869 if (!UseLoV1 && !UseHiV1)
16870 return DAG.getVectorShuffle(SplitVT, DL, LoV2, HiV2, V2BlendMask);
16871
16872 SDValue V1Blend, V2Blend;
16873 if (UseLoV1 && UseHiV1) {
16874 V1Blend =
16875 DAG.getVectorShuffle(SplitVT, DL, LoV1, HiV1, V1BlendMask);
16876 } else {
16877 // We only use half of V1 so map the usage down into the final blend mask.
16878 V1Blend = UseLoV1 ? LoV1 : HiV1;
16879 for (int i = 0; i < SplitNumElements; ++i)
16880 if (BlendMask[i] >= 0 && BlendMask[i] < SplitNumElements)
16881 BlendMask[i] = V1BlendMask[i] - (UseLoV1 ? 0 : SplitNumElements);
16882 }
16883 if (UseLoV2 && UseHiV2) {
16884 V2Blend =
16885 DAG.getVectorShuffle(SplitVT, DL, LoV2, HiV2, V2BlendMask);
16886 } else {
16887 // We only use half of V2 so map the usage down into the final blend mask.
16888 V2Blend = UseLoV2 ? LoV2 : HiV2;
16889 for (int i = 0; i < SplitNumElements; ++i)
16890 if (BlendMask[i] >= SplitNumElements)
16891 BlendMask[i] = V2BlendMask[i] + (UseLoV2 ? SplitNumElements : 0);
16892 }
16893 return DAG.getVectorShuffle(SplitVT, DL, V1Blend, V2Blend, BlendMask);
16894 };
16895 SDValue Lo = HalfBlend(LoMask);
16896 SDValue Hi = HalfBlend(HiMask);
16897 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi);
16898}
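// Illustrative trace (not part of the original source): for a hypothetical
// v8f32 interleave, splitAndLowerShuffle above reduces the 256-bit shuffle to
// two 128-bit shuffles that are concatenated back together:
//   Mask   = {0, 8, 1, 9, 4, 12, 5, 13}   (NumElements = 8, SplitNumElements = 4)
//   LoMask = {0, 8, 1, 9}   -> uses only LoV1/LoV2 -> shuffle(LoV1, LoV2, {0, 4, 1, 5})
//   HiMask = {4, 12, 5, 13} -> uses only HiV1/HiV2 -> shuffle(HiV1, HiV2, {0, 4, 1, 5})
//   Result = concat_vectors(unpcklps(LoV1, LoV2), unpcklps(HiV1, HiV2))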
16899
16900/// Either split a vector in halves or decompose the shuffles and the
16901/// blend/unpack.
16902///
16903/// This is provided as a good fallback for many lowerings of non-single-input
16904/// shuffles with more than one 128-bit lane. In those cases, we want to select
16905/// between splitting the shuffle into 128-bit components and stitching those
16906/// back together vs. extracting the single-input shuffles and blending those
16907/// results.
16908static SDValue lowerShuffleAsSplitOrBlend(const SDLoc &DL, MVT VT, SDValue V1,
16909 SDValue V2, ArrayRef<int> Mask,
16910 const X86Subtarget &Subtarget,
16911 SelectionDAG &DAG) {
16912 assert(!V2.isUndef() && "This routine must not be used to lower single-input "
16913 "shuffles as it could then recurse on itself.");
16914 int Size = Mask.size();
16915
16916 // If this can be modeled as a broadcast of two elements followed by a blend,
16917 // prefer that lowering. This is especially important because broadcasts can
16918 // often fold with memory operands.
16919 auto DoBothBroadcast = [&] {
16920 int V1BroadcastIdx = -1, V2BroadcastIdx = -1;
16921 for (int M : Mask)
16922 if (M >= Size) {
16923 if (V2BroadcastIdx < 0)
16924 V2BroadcastIdx = M - Size;
16925 else if (M - Size != V2BroadcastIdx)
16926 return false;
16927 } else if (M >= 0) {
16928 if (V1BroadcastIdx < 0)
16929 V1BroadcastIdx = M;
16930 else if (M != V1BroadcastIdx)
16931 return false;
16932 }
16933 return true;
16934 };
16935 if (DoBothBroadcast())
16936 return lowerShuffleAsDecomposedShuffleMerge(DL, VT, V1, V2, Mask, Subtarget,
16937 DAG);
16938
16939 // If the inputs all stem from a single 128-bit lane of each input, then we
16940 // split them rather than blending because the split will decompose to
16941 // unusually few instructions.
16942 int LaneCount = VT.getSizeInBits() / 128;
16943 int LaneSize = Size / LaneCount;
16944 SmallBitVector LaneInputs[2];
16945 LaneInputs[0].resize(LaneCount, false);
16946 LaneInputs[1].resize(LaneCount, false);
16947 for (int i = 0; i < Size; ++i)
16948 if (Mask[i] >= 0)
16949 LaneInputs[Mask[i] / Size][(Mask[i] % Size) / LaneSize] = true;
16950 if (LaneInputs[0].count() <= 1 && LaneInputs[1].count() <= 1)
16951 return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG);
16952
16953 // Otherwise, just fall back to decomposed shuffles and a blend/unpack. This
16954 // requires that the decomposed single-input shuffles don't end up here.
16955 return lowerShuffleAsDecomposedShuffleMerge(DL, VT, V1, V2, Mask, Subtarget,
16956 DAG);
16957}
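// Illustrative examples (not part of the original source) of the two
// heuristics in lowerShuffleAsSplitOrBlend above, assuming v8f32 inputs:
//  * DoBothBroadcast: Mask = {3, 11, 3, 11, 11, 3, 3, 11} reads only element 3
//    of V1 and element 3 of V2, so it decomposes into two splats plus a blend.
//  * Single-lane split: Mask = {0, 1, 9, 8, 2, 3, 10, 11} only touches lane 0
//    of V1 and lane 0 of V2 (both LaneInputs counts are 1), so the shuffle is
//    handed to splitAndLowerShuffle instead.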
16958
16959// Lower as SHUFPD(VPERM2F128(V1, V2), VPERM2F128(V1, V2)).
16960// TODO: Extend to support v8f32 (+ 512-bit shuffles).
16961static SDValue lowerShuffleAsLanePermuteAndSHUFP(const SDLoc &DL, MVT VT,
16962 SDValue V1, SDValue V2,
16963 ArrayRef<int> Mask,
16964 SelectionDAG &DAG) {
16965 assert(VT == MVT::v4f64 && "Only for v4f64 shuffles");
16966
16967 int LHSMask[4] = {-1, -1, -1, -1};
16968 int RHSMask[4] = {-1, -1, -1, -1};
16969 unsigned SHUFPMask = 0;
16970
16971 // As SHUFPD uses a single LHS/RHS element per lane, we can always
16972 // perform the shuffle once the lanes have been shuffled in place.
16973 for (int i = 0; i != 4; ++i) {
16974 int M = Mask[i];
16975 if (M < 0)
16976 continue;
16977 int LaneBase = i & ~1;
16978 auto &LaneMask = (i & 1) ? RHSMask : LHSMask;
16979 LaneMask[LaneBase + (M & 1)] = M;
16980 SHUFPMask |= (M & 1) << i;
16981 }
16982
16983 SDValue LHS = DAG.getVectorShuffle(VT, DL, V1, V2, LHSMask);
16984 SDValue RHS = DAG.getVectorShuffle(VT, DL, V1, V2, RHSMask);
16985 return DAG.getNode(X86ISD::SHUFP, DL, VT, LHS, RHS,
16986 DAG.getTargetConstant(SHUFPMask, DL, MVT::i8));
16987}
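// Illustrative trace (not part of the original source): for a hypothetical
// v4f64 Mask = {3, 7, 0, 4} the loop above produces
//   LHSMask   = {-1, 3, 0, -1}   (lane-swapped V1, feeds the even result elements)
//   RHSMask   = {-1, 7, 4, -1}   (lane-swapped V2, feeds the odd result elements)
//   SHUFPMask = 0b0011
// giving SHUFPD(VPERM2F128(V1, V2), VPERM2F128(V1, V2), 0x3), matching the
// pattern described in the comment above.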
16988
16989/// Lower a vector shuffle crossing multiple 128-bit lanes as
16990/// a lane permutation followed by a per-lane permutation.
16991///
16992/// This is mainly for cases where we can have non-repeating permutes
16993/// in each lane.
16994///
16995/// TODO: This is very similar to lowerShuffleAsLanePermuteAndRepeatedMask,
16996/// we should investigate merging them.
16997static SDValue lowerShuffleAsLanePermuteAndPermute(
16998 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
16999 SelectionDAG &DAG, const X86Subtarget &Subtarget) {
17000 int NumElts = VT.getVectorNumElements();
17001 int NumLanes = VT.getSizeInBits() / 128;
17002 int NumEltsPerLane = NumElts / NumLanes;
17003 bool CanUseSublanes = Subtarget.hasAVX2() && V2.isUndef();
17004
17005 /// Attempts to find a sublane permute with the given size
17006 /// that gets all elements into their target lanes.
17007 ///
17008 /// If successful, builds a cross-lane permute plus an in-lane permute and
17009 /// returns the shuffled result; if unsuccessful, returns an empty SDValue.
17010 auto getSublanePermute = [&](int NumSublanes) -> SDValue {
17011 int NumSublanesPerLane = NumSublanes / NumLanes;
17012 int NumEltsPerSublane = NumElts / NumSublanes;
17013
17014 SmallVector<int, 16> CrossLaneMask;
17015 SmallVector<int, 16> InLaneMask(NumElts, SM_SentinelUndef);
17016 // CrossLaneMask but one entry == one sublane.
17017 SmallVector<int, 16> CrossLaneMaskLarge(NumSublanes, SM_SentinelUndef);
17018
17019 for (int i = 0; i != NumElts; ++i) {
17020 int M = Mask[i];
17021 if (M < 0)
17022 continue;
17023
17024 int SrcSublane = M / NumEltsPerSublane;
17025 int DstLane = i / NumEltsPerLane;
17026
17027 // We only need to get the elements into the right lane, not sublane.
17028 // So search all sublanes that make up the destination lane.
17029 bool Found = false;
17030 int DstSubStart = DstLane * NumSublanesPerLane;
17031 int DstSubEnd = DstSubStart + NumSublanesPerLane;
17032 for (int DstSublane = DstSubStart; DstSublane < DstSubEnd; ++DstSublane) {
17033 if (!isUndefOrEqual(CrossLaneMaskLarge[DstSublane], SrcSublane))
17034 continue;
17035
17036 Found = true;
17037 CrossLaneMaskLarge[DstSublane] = SrcSublane;
17038 int DstSublaneOffset = DstSublane * NumEltsPerSublane;
17039 InLaneMask[i] = DstSublaneOffset + M % NumEltsPerSublane;
17040 break;
17041 }
17042 if (!Found)
17043 return SDValue();
17044 }
17045
17046 // Fill CrossLaneMask using CrossLaneMaskLarge.
17047 narrowShuffleMaskElts(NumEltsPerSublane, CrossLaneMaskLarge, CrossLaneMask);
17048
17049 if (!CanUseSublanes) {
17050 // If we're only shuffling a single lowest lane and the rest are identity
17051 // then don't bother.
17052 // TODO - isShuffleMaskInputInPlace could be extended to something like
17053 // this.
17054 int NumIdentityLanes = 0;
17055 bool OnlyShuffleLowestLane = true;
17056 for (int i = 0; i != NumLanes; ++i) {
17057 int LaneOffset = i * NumEltsPerLane;
17058 if (isSequentialOrUndefInRange(InLaneMask, LaneOffset, NumEltsPerLane,
17059 i * NumEltsPerLane))
17060 NumIdentityLanes++;
17061 else if (CrossLaneMask[LaneOffset] != 0)
17062 OnlyShuffleLowestLane = false;
17063 }
17064 if (OnlyShuffleLowestLane && NumIdentityLanes == (NumLanes - 1))
17065 return SDValue();
17066 }
17067
17068 // Avoid returning the same shuffle operation. For example,
17069 // t7: v16i16 = vector_shuffle<8,9,10,11,4,5,6,7,0,1,2,3,12,13,14,15> t5,
17070 // undef:v16i16
17071 if (CrossLaneMask == Mask || InLaneMask == Mask)
17072 return SDValue();
17073
17074 SDValue CrossLane = DAG.getVectorShuffle(VT, DL, V1, V2, CrossLaneMask);
17075 return DAG.getVectorShuffle(VT, DL, CrossLane, DAG.getUNDEF(VT),
17076 InLaneMask);
17077 };
17078
17079 // First attempt a solution with full lanes.
17080 if (SDValue V = getSublanePermute(/*NumSublanes=*/NumLanes))
17081 return V;
17082
17083 // The rest of the solutions use sublanes.
17084 if (!CanUseSublanes)
17085 return SDValue();
17086
17087 // Then attempt a solution with 64-bit sublanes (vpermq).
17088 if (SDValue V = getSublanePermute(/*NumSublanes=*/NumLanes * 2))
17089 return V;
17090
17091 // If that doesn't work and we have fast variable cross-lane shuffle,
17092 // attempt 32-bit sublanes (vpermd).
17093 if (!Subtarget.hasFastVariableCrossLaneShuffle())
17094 return SDValue();
17095
17096 return getSublanePermute(/*NumSublanes=*/NumLanes * 4);
17097}
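// Illustrative trace (not part of the original source): a hypothetical v8f32
// reverse, Mask = {7, 6, 5, 4, 3, 2, 1, 0}, is solved with full lanes
// (NumSublanes == NumLanes == 2):
//   CrossLaneMask = {4, 5, 6, 7, 0, 1, 2, 3}   (swap the 128-bit lanes)
//   InLaneMask    = {3, 2, 1, 0, 7, 6, 5, 4}   (reverse within each lane)
// i.e. a lane permute (e.g. vperm2f128) followed by an in-lane permute
// (e.g. vpermilps).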
17098
17099/// Lower a vector shuffle crossing multiple 128-bit lanes by shuffling one
17100/// source with a lane permutation.
17101///
17102/// This lowering strategy results in four instructions in the worst case for a
17103/// single-input cross lane shuffle which is lower than any other fully general
17104/// cross-lane shuffle strategy I'm aware of. Special cases for each particular
17105/// shuffle pattern should be handled prior to trying this lowering.
17106static SDValue lowerShuffleAsLanePermuteAndShuffle(
17107 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
17108 SelectionDAG &DAG, const X86Subtarget &Subtarget) {
17109 // FIXME: This should probably be generalized for 512-bit vectors as well.
17110 assert(VT.is256BitVector() && "Only for 256-bit vector shuffles!");
17111 int Size = Mask.size();
17112 int LaneSize = Size / 2;
17113
17114 // Fold to SHUFPD(VPERM2F128(V1, V2), VPERM2F128(V1, V2)).
17115 // Only do this if the elements aren't all from the lower lane,
17116 // otherwise we're (probably) better off doing a split.
17117 if (VT == MVT::v4f64 &&
17118 !all_of(Mask, [LaneSize](int M) { return M < LaneSize; }))
17119 return lowerShuffleAsLanePermuteAndSHUFP(DL, VT, V1, V2, Mask, DAG);
17120
17121 // If there are only inputs from one 128-bit lane, splitting will in fact be
17122 // less expensive. The flags track whether the given lane contains an element
17123 // that crosses to another lane.
17124 bool AllLanes;
17125 if (!Subtarget.hasAVX2()) {
17126 bool LaneCrossing[2] = {false, false};
17127 for (int i = 0; i < Size; ++i)
17128 if (Mask[i] >= 0 && ((Mask[i] % Size) / LaneSize) != (i / LaneSize))
17129 LaneCrossing[(Mask[i] % Size) / LaneSize] = true;
17130 AllLanes = LaneCrossing[0] && LaneCrossing[1];
17131 } else {
17132 bool LaneUsed[2] = {false, false};
17133 for (int i = 0; i < Size; ++i)
17134 if (Mask[i] >= 0)
17135 LaneUsed[(Mask[i] % Size) / LaneSize] = true;
17136 AllLanes = LaneUsed[0] && LaneUsed[1];
17137 }
17138
17139 // TODO - we could support shuffling V2 in the Flipped input.
17140 assert(V2.isUndef() &&
17141 "This last part of this routine only works on single input shuffles");
17142
17143 SmallVector<int, 32> InLaneMask(Mask);
17144 for (int i = 0; i < Size; ++i) {
17145 int &M = InLaneMask[i];
17146 if (M < 0)
17147 continue;
17148 if (((M % Size) / LaneSize) != (i / LaneSize))
17149 M = (M % LaneSize) + ((i / LaneSize) * LaneSize) + Size;
17150 }
17151 assert(!is128BitLaneCrossingShuffleMask(VT, InLaneMask) &&
17152 "In-lane shuffle mask expected");
17153
17154 // If we're not using elements from both 128-bit lanes and the in-lane mask
17155 // is not repeating, then we're better off splitting.
17156 if (!AllLanes && !is128BitLaneRepeatedShuffleMask(VT, InLaneMask))
17157 return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG);
17158
17159 // Flip the lanes, and shuffle the results which should now be in-lane.
17160 MVT PVT = VT.isFloatingPoint() ? MVT::v4f64 : MVT::v4i64;
17161 SDValue Flipped = DAG.getBitcast(PVT, V1);
17162 Flipped =
17163 DAG.getVectorShuffle(PVT, DL, Flipped, DAG.getUNDEF(PVT), {2, 3, 0, 1});
17164 Flipped = DAG.getBitcast(VT, Flipped);
17165 return DAG.getVectorShuffle(VT, DL, V1, Flipped, InLaneMask);
17166}
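// Illustrative trace (not part of the original source): for a hypothetical
// single-input v8f32 shuffle Mask = {5, 4, 7, 6, 1, 0, 3, 2}, every element
// crosses lanes, so AllLanes is true. The rewritten mask becomes
//   InLaneMask = {9, 8, 11, 10, 13, 12, 15, 14}
// which only reads the 'Flipped' operand (V1 with its 128-bit lanes swapped),
// so the final shuffle is in-lane and can be matched as a single in-lane
// permute such as vpermilps.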
17167
17168/// Handle lowering 2-lane 128-bit shuffles.
17169static SDValue lowerV2X128Shuffle(const SDLoc &DL, MVT VT, SDValue V1,
17170 SDValue V2, ArrayRef<int> Mask,
17171 const APInt &Zeroable,
17172 const X86Subtarget &Subtarget,
17173 SelectionDAG &DAG) {
17174 if (V2.isUndef()) {
17175 // Attempt to match VBROADCAST*128 subvector broadcast load.
17176 bool SplatLo = isShuffleEquivalent(Mask, {0, 1, 0, 1}, V1);
17177 bool SplatHi = isShuffleEquivalent(Mask, {2, 3, 2, 3}, V1);
17178 if ((SplatLo || SplatHi) && !Subtarget.hasAVX512() && V1.hasOneUse() &&
17179 X86::mayFoldLoad(peekThroughOneUseBitcasts(V1), Subtarget)) {
17180 MVT MemVT = VT.getHalfNumVectorElementsVT();
17181 unsigned Ofs = SplatLo ? 0 : MemVT.getStoreSize();
17182 auto *Ld = cast<LoadSDNode>(peekThroughOneUseBitcasts(V1));
17183 if (SDValue BcstLd = getBROADCAST_LOAD(X86ISD::SUBV_BROADCAST_LOAD, DL,
17184 VT, MemVT, Ld, Ofs, DAG))
17185 return BcstLd;
17186 }
17187
17188 // With AVX2, use VPERMQ/VPERMPD for unary shuffles to allow memory folding.
17189 if (Subtarget.hasAVX2())
17190 return SDValue();
17191 }
17192
17193 bool V2IsZero = !V2.isUndef() && ISD::isBuildVectorAllZeros(V2.getNode());
17194
17195 SmallVector<int, 4> WidenedMask;
17196 if (!canWidenShuffleElements(Mask, Zeroable, V2IsZero, WidenedMask))
17197 return SDValue();
17198
17199 bool IsLowZero = (Zeroable & 0x3) == 0x3;
17200 bool IsHighZero = (Zeroable & 0xc) == 0xc;
17201
17202 // Try to use an insert into a zero vector.
17203 if (WidenedMask[0] == 0 && IsHighZero) {
17204 MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), 2);
17205 SDValue LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V1,
17206 DAG.getIntPtrConstant(0, DL));
17207 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT,
17208 getZeroVector(VT, Subtarget, DAG, DL), LoV,
17209 DAG.getIntPtrConstant(0, DL));
17210 }
17211
17212 // TODO: If minimizing size and one of the inputs is a zero vector and the
17213 // zero vector has only one use, we could use a VPERM2X128 to save the
17214 // instruction bytes needed to explicitly generate the zero vector.
17215
17216 // Blends are faster and handle all the non-lane-crossing cases.
17217 if (SDValue Blend = lowerShuffleAsBlend(DL, VT, V1, V2, Mask, Zeroable,
17218 Subtarget, DAG))
17219 return Blend;
17220
17221 // If either input operand is a zero vector, use VPERM2X128 because its mask
17222 // allows us to replace the zero input with an implicit zero.
17223 if (!IsLowZero && !IsHighZero) {
17224 // Check for patterns which can be matched with a single insert of a 128-bit
17225 // subvector.
17226 bool OnlyUsesV1 = isShuffleEquivalent(Mask, {0, 1, 0, 1}, V1, V2);
17227 if (OnlyUsesV1 || isShuffleEquivalent(Mask, {0, 1, 4, 5}, V1, V2)) {
17228
17229 // With AVX1, use vperm2f128 (below) to allow load folding. Otherwise,
17230 // this will likely become vinsertf128 which can't fold a 256-bit memop.
17231 if (!isa<LoadSDNode>(peekThroughBitcasts(V1))) {
17232 MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), 2);
17233 SDValue SubVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT,
17234 OnlyUsesV1 ? V1 : V2,
17235 DAG.getIntPtrConstant(0, DL));
17236 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, V1, SubVec,
17237 DAG.getIntPtrConstant(2, DL));
17238 }
17239 }
17240
17241 // Try to use SHUF128 if possible.
17242 if (Subtarget.hasVLX()) {
17243 if (WidenedMask[0] < 2 && WidenedMask[1] >= 2) {
17244 unsigned PermMask = ((WidenedMask[0] % 2) << 0) |
17245 ((WidenedMask[1] % 2) << 1);
17246 return DAG.getNode(X86ISD::SHUF128, DL, VT, V1, V2,
17247 DAG.getTargetConstant(PermMask, DL, MVT::i8));
17248 }
17249 }
17250 }
17251
17252 // Otherwise form a 128-bit permutation. After accounting for undefs,
17253 // convert the 64-bit shuffle mask selection values into 128-bit
17254 // selection bits by dividing the indexes by 2 and shifting into positions
17255 // defined by a vperm2*128 instruction's immediate control byte.
17256
17257 // The immediate permute control byte looks like this:
17258 // [1:0] - select 128 bits from sources for low half of destination
17259 // [2] - ignore
17260 // [3] - zero low half of destination
17261 // [5:4] - select 128 bits from sources for high half of destination
17262 // [6] - ignore
17263 // [7] - zero high half of destination
17264
17265 assert((WidenedMask[0] >= 0 || IsLowZero) &&
17266 (WidenedMask[1] >= 0 || IsHighZero) && "Undef half?");
17267
17268 unsigned PermMask = 0;
17269 PermMask |= IsLowZero ? 0x08 : (WidenedMask[0] << 0);
17270 PermMask |= IsHighZero ? 0x80 : (WidenedMask[1] << 4);
17271
17272 // Check the immediate mask and replace unused sources with undef.
17273 if ((PermMask & 0x0a) != 0x00 && (PermMask & 0xa0) != 0x00)
17274 V1 = DAG.getUNDEF(VT);
17275 if ((PermMask & 0x0a) != 0x02 && (PermMask & 0xa0) != 0x20)
17276 V2 = DAG.getUNDEF(VT);
17277
17278 return DAG.getNode(X86ISD::VPERM2X128, DL, VT, V1, V2,
17279 DAG.getTargetConstant(PermMask, DL, MVT::i8));
17280}
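// Illustrative trace (not part of the original source): a hypothetical v4f64
// Mask = {2, 3, 4, 5} with nothing zeroable widens to WidenedMask = {1, 2}
// (upper half of V1, lower half of V2). Assuming no earlier blend/insert/VLX
// path matched, the immediate becomes
//   PermMask = (1 << 0) | (2 << 4) = 0x21
// i.e. vperm2f128/vperm2i128 with control byte 0x21.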
17281
17282/// Lower a vector shuffle by first fixing the 128-bit lanes and then
17283/// shuffling each lane.
17284///
17285/// This attempts to create a repeated lane shuffle where each lane uses one
17286/// or two of the lanes of the inputs. The lanes of the input vectors are
17287/// shuffled in one or two independent shuffles to get the lanes into the
17288/// position needed by the final shuffle.
17289static SDValue lowerShuffleAsLanePermuteAndRepeatedMask(
17290 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
17291 const X86Subtarget &Subtarget, SelectionDAG &DAG) {
17292 assert(!V2.isUndef() && "This is only useful with multiple inputs.");
12
'?' condition is true
17293
17294 if (is128BitLaneRepeatedShuffleMask(VT, Mask))
13
Assuming the condition is false
14
Taking false branch
17295 return SDValue();
17296
17297 int NumElts = Mask.size();
17298 int NumLanes = VT.getSizeInBits() / 128;
17299 int NumLaneElts = 128 / VT.getScalarSizeInBits();
15
'NumLaneElts' initialized here
17300 SmallVector<int, 16> RepeatMask(NumLaneElts, -1);
17301 SmallVector<std::array<int, 2>, 2> LaneSrcs(NumLanes, {{-1, -1}});
17302
17303 // First pass will try to fill in the RepeatMask from lanes that need two
17304 // sources.
17305 for (int Lane = 0; Lane != NumLanes; ++Lane) {
16
Assuming 'Lane' is not equal to 'NumLanes'
17
Loop condition is true. Entering loop body
22
Assuming 'Lane' is equal to 'NumLanes'
23
Loop condition is false. Execution continues on line 17377
17306 int Srcs[2] = {-1, -1};
17307 SmallVector<int, 16> InLaneMask(NumLaneElts, -1);
17308 for (int i = 0; i != NumLaneElts; ++i) {
18
Assuming 'i' is equal to 'NumLaneElts'
19
Loop condition is false. Execution continues on line 17330
17309 int M = Mask[(Lane * NumLaneElts) + i];
17310 if (M < 0)
17311 continue;
17312 // Determine which of the possible input lanes (NumLanes from each source)
17313 // this element comes from. Assign that as one of the sources for this
17314 // lane. We can assign up to 2 sources for this lane. If we run out of
17315 // sources we can't do anything.
17316 int LaneSrc = M / NumLaneElts;
17317 int Src;
17318 if (Srcs[0] < 0 || Srcs[0] == LaneSrc)
17319 Src = 0;
17320 else if (Srcs[1] < 0 || Srcs[1] == LaneSrc)
17321 Src = 1;
17322 else
17323 return SDValue();
17324
17325 Srcs[Src] = LaneSrc;
17326 InLaneMask[i] = (M % NumLaneElts) + Src * NumElts;
17327 }
17328
17329 // If this lane has two sources, see if it fits with the repeat mask so far.
17330 if (Srcs[1] < 0)
20
Taking true branch
17331 continue;
21
Execution continues on line 17305
17332
17333 LaneSrcs[Lane][0] = Srcs[0];
17334 LaneSrcs[Lane][1] = Srcs[1];
17335
17336 auto MatchMasks = [](ArrayRef<int> M1, ArrayRef<int> M2) {
17337 assert(M1.size() == M2.size() && "Unexpected mask size");
17338 for (int i = 0, e = M1.size(); i != e; ++i)
17339 if (M1[i] >= 0 && M2[i] >= 0 && M1[i] != M2[i])
17340 return false;
17341 return true;
17342 };
17343
17344 auto MergeMasks = [](ArrayRef<int> Mask, MutableArrayRef<int> MergedMask) {
17345 assert(Mask.size() == MergedMask.size() && "Unexpected mask size");
17346 for (int i = 0, e = MergedMask.size(); i != e; ++i) {
17347 int M = Mask[i];
17348 if (M < 0)
17349 continue;
17350 assert((MergedMask[i] < 0 || MergedMask[i] == M) &&
17351 "Unexpected mask element");
17352 MergedMask[i] = M;
17353 }
17354 };
17355
17356 if (MatchMasks(InLaneMask, RepeatMask)) {
17357 // Merge this lane mask into the final repeat mask.
17358 MergeMasks(InLaneMask, RepeatMask);
17359 continue;
17360 }
17361
17362 // Didn't find a match. Swap the operands and try again.
17363 std::swap(LaneSrcs[Lane][0], LaneSrcs[Lane][1]);
17364 ShuffleVectorSDNode::commuteMask(InLaneMask);
17365
17366 if (MatchMasks(InLaneMask, RepeatMask)) {
17367 // Merge this lane mask into the final repeat mask.
17368 MergeMasks(InLaneMask, RepeatMask);
17369 continue;
17370 }
17371
17372 // Couldn't find a match with the operands in either order.
17373 return SDValue();
17374 }
17375
17376 // Now handle any lanes with only one source.
17377 for (int Lane = 0; Lane != NumLanes; ++Lane) {
24
Loop condition is true. Entering loop body
28
Loop condition is false. Execution continues on line 17406
17378 // If this lane has already been processed, skip it.
17379 if (LaneSrcs[Lane][0] >= 0)
25
Assuming the condition is true
26
Taking true branch
17380 continue;
27
Execution continues on line 17377
17381
17382 for (int i = 0; i != NumLaneElts; ++i) {
17383 int M = Mask[(Lane * NumLaneElts) + i];
17384 if (M < 0)
17385 continue;
17386
17387 // If RepeatMask isn't defined yet we can define it ourselves.
17388 if (RepeatMask[i] < 0)
17389 RepeatMask[i] = M % NumLaneElts;
17390
17391 if (RepeatMask[i] < NumElts) {
17392 if (RepeatMask[i] != M % NumLaneElts)
17393 return SDValue();
17394 LaneSrcs[Lane][0] = M / NumLaneElts;
17395 } else {
17396 if (RepeatMask[i] != ((M % NumLaneElts) + NumElts))
17397 return SDValue();
17398 LaneSrcs[Lane][1] = M / NumLaneElts;
17399 }
17400 }
17401
17402 if (LaneSrcs[Lane][0] < 0 && LaneSrcs[Lane][1] < 0)
17403 return SDValue();
17404 }
17405
17406 SmallVector<int, 16> NewMask(NumElts, -1);
17407 for (int Lane = 0; Lane != NumLanes; ++Lane) {
29
Loop condition is true. Entering loop body
31
Loop condition is false. Execution continues on line 17416
17408 int Src = LaneSrcs[Lane][0];
17409 for (int i = 0; i != NumLaneElts; ++i) {
30
Loop condition is false. Execution continues on line 17407
17410 int M = -1;
17411 if (Src >= 0)
17412 M = Src * NumLaneElts + i;
17413 NewMask[Lane * NumLaneElts + i] = M;
17414 }
17415 }
17416 SDValue NewV1 = DAG.getVectorShuffle(VT, DL, V1, V2, NewMask);
17417 // Ensure we didn't get back the shuffle we started with.
17418 // FIXME: This is a hack to make up for some splat handling code in
17419 // getVectorShuffle.
17420 if (isa<ShuffleVectorSDNode>(NewV1) &&
32
Assuming 'NewV1' is not a 'ShuffleVectorSDNode'
17421 cast<ShuffleVectorSDNode>(NewV1)->getMask() == Mask)
17422 return SDValue();
17423
17424 for (int Lane = 0; Lane != NumLanes; ++Lane) {
33
Loop condition is true. Entering loop body
35
Loop condition is false. Execution continues on line 17433
17425 int Src = LaneSrcs[Lane][1];
17426 for (int i = 0; i != NumLaneElts; ++i) {
34
Loop condition is false. Execution continues on line 17424
17427 int M = -1;
17428 if (Src >= 0)
17429 M = Src * NumLaneElts + i;
17430 NewMask[Lane * NumLaneElts + i] = M;
17431 }
17432 }
17433 SDValue NewV2 = DAG.getVectorShuffle(VT, DL, V1, V2, NewMask);
17434 // Ensure we didn't get back the shuffle we started with.
17435 // FIXME: This is a hack to make up for some splat handling code in
17436 // getVectorShuffle.
17437 if (isa<ShuffleVectorSDNode>(NewV2) &&
36
Assuming 'NewV2' is not a 'ShuffleVectorSDNode'
17438 cast<ShuffleVectorSDNode>(NewV2)->getMask() == Mask)
17439 return SDValue();
17440
17441 for (int i = 0; i != NumElts; ++i) {
37
Assuming 'i' is not equal to 'NumElts'
38
Loop condition is true. Entering loop body
17442 NewMask[i] = RepeatMask[i % NumLaneElts];
39
Division by zero
17443 if (NewMask[i] < 0)
17444 continue;
17445
17446 NewMask[i] += (i / NumLaneElts) * NumLaneElts;
17447 }
17448 return DAG.getVectorShuffle(VT, DL, NewV1, NewV2, NewMask);
17449}
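// Illustrative note (not part of the original source): the final loop above
// tiles the per-lane RepeatMask across the whole vector. For v8f32
// (NumLaneElts == 4), a RepeatMask of {0, 8, 1, 9} expands to
//   NewMask = {0, 8, 1, 9, 4, 12, 5, 13}
// selecting from NewV1 (indices 0..7) and NewV2 (indices 8..15). The
// 'Division by zero' this report flags at line 17442 is the modulo
// 'i % NumLaneElts' on a path where the analyzer assumes NumLaneElts
// (computed at line 17299 as 128 / VT.getScalarSizeInBits()) to be zero.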
17450
17451/// If the input shuffle mask results in a vector that is undefined in all upper
17452/// or lower half elements and that mask accesses only 2 halves of the
17453/// shuffle's operands, return true. A mask of half the width with mask indexes
17454/// adjusted to access the extracted halves of the original shuffle operands is
17455 /// returned in HalfMask. HalfIdx1 and HalfIdx2 identify which half of each
17456 /// input operand is accessed (0/1 = lower/upper V1, 2/3 = lower/upper V2).
17457static bool
17458getHalfShuffleMask(ArrayRef<int> Mask, MutableArrayRef<int> HalfMask,
17459 int &HalfIdx1, int &HalfIdx2) {
17460 assert((Mask.size() == HalfMask.size() * 2) &&
17461 "Expected input mask to be twice as long as output");
17462
17463 // Exactly one half of the result must be undef to allow narrowing.
17464 bool UndefLower = isUndefLowerHalf(Mask);
17465 bool UndefUpper = isUndefUpperHalf(Mask);
17466 if (UndefLower == UndefUpper)
17467 return false;
17468
17469 unsigned HalfNumElts = HalfMask.size();
17470 unsigned MaskIndexOffset = UndefLower ? HalfNumElts : 0;
17471 HalfIdx1 = -1;
17472 HalfIdx2 = -1;
17473 for (unsigned i = 0; i != HalfNumElts; ++i) {
17474 int M = Mask[i + MaskIndexOffset];
17475 if (M < 0) {
17476 HalfMask[i] = M;
17477 continue;
17478 }
17479
17480 // Determine which of the 4 half vectors this element is from.
17481 // i.e. 0 = Lower V1, 1 = Upper V1, 2 = Lower V2, 3 = Upper V2.
17482 int HalfIdx = M / HalfNumElts;
17483
17484 // Determine the element index into its half vector source.
17485 int HalfElt = M % HalfNumElts;
17486
17487 // We can shuffle with up to 2 half vectors, set the new 'half'
17488 // shuffle mask accordingly.
17489 if (HalfIdx1 < 0 || HalfIdx1 == HalfIdx) {
17490 HalfMask[i] = HalfElt;
17491 HalfIdx1 = HalfIdx;
17492 continue;
17493 }
17494 if (HalfIdx2 < 0 || HalfIdx2 == HalfIdx) {
17495 HalfMask[i] = HalfElt + HalfNumElts;
17496 HalfIdx2 = HalfIdx;
17497 continue;
17498 }
17499
17500 // Too many half vectors referenced.
17501 return false;
17502 }
17503
17504 return true;
17505}
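// Illustrative trace (not part of the original source): for a hypothetical
// v8f32 Mask = {u, u, u, u, 2, 3, 10, 11} the lower half is undef, and
// getHalfShuffleMask above produces
//   HalfMask = {2, 3, 6, 7}, HalfIdx1 = 0 (lower V1), HalfIdx2 = 2 (lower V2)
// so the result can be built from two 128-bit extracts, one narrow shuffle and
// one insert (see getShuffleHalfVectors below).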
17506
17507/// Given the output values from getHalfShuffleMask(), create a half width
17508/// shuffle of extracted vectors followed by an insert back to full width.
17509static SDValue getShuffleHalfVectors(const SDLoc &DL, SDValue V1, SDValue V2,
17510 ArrayRef<int> HalfMask, int HalfIdx1,
17511 int HalfIdx2, bool UndefLower,
17512 SelectionDAG &DAG, bool UseConcat = false) {
17513 assert(V1.getValueType() == V2.getValueType() && "Different sized vectors?");
17514 assert(V1.getValueType().isSimple() && "Expecting only simple types");
17515
17516 MVT VT = V1.getSimpleValueType();
17517 MVT HalfVT = VT.getHalfNumVectorElementsVT();
17518 unsigned HalfNumElts = HalfVT.getVectorNumElements();
17519
17520 auto getHalfVector = [&](int HalfIdx) {
17521 if (HalfIdx < 0)
17522 return DAG.getUNDEF(HalfVT);
17523 SDValue V = (HalfIdx < 2 ? V1 : V2);
17524 HalfIdx = (HalfIdx % 2) * HalfNumElts;
17525 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, V,
17526 DAG.getIntPtrConstant(HalfIdx, DL));
17527 };
17528
17529 // ins undef, (shuf (ext V1, HalfIdx1), (ext V2, HalfIdx2), HalfMask), Offset
17530 SDValue Half1 = getHalfVector(HalfIdx1);
17531 SDValue Half2 = getHalfVector(HalfIdx2);
17532 SDValue V = DAG.getVectorShuffle(HalfVT, DL, Half1, Half2, HalfMask);
17533 if (UseConcat) {
17534 SDValue Op0 = V;
17535 SDValue Op1 = DAG.getUNDEF(HalfVT);
17536 if (UndefLower)
17537 std::swap(Op0, Op1);
17538 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Op0, Op1);
17539 }
17540
17541 unsigned Offset = UndefLower ? HalfNumElts : 0;
17542 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), V,
17543 DAG.getIntPtrConstant(Offset, DL));
17544}
17545
17546/// Lower shuffles where an entire half of a 256 or 512-bit vector is UNDEF.
17547/// This allows for fast cases such as subvector extraction/insertion
17548/// or shuffling smaller vector types which can lower more efficiently.
17549static SDValue lowerShuffleWithUndefHalf(const SDLoc &DL, MVT VT, SDValue V1,
17550 SDValue V2, ArrayRef<int> Mask,
17551 const X86Subtarget &Subtarget,
17552 SelectionDAG &DAG) {
17553 assert((VT.is256BitVector() || VT.is512BitVector()) &&
17554 "Expected 256-bit or 512-bit vector");
17555
17556 bool UndefLower = isUndefLowerHalf(Mask);
17557 if (!UndefLower && !isUndefUpperHalf(Mask))
17558 return SDValue();
17559
17560 assert((!UndefLower || !isUndefUpperHalf(Mask)) &&
17561 "Completely undef shuffle mask should have been simplified already");
17562
17563 // Upper half is undef and lower half is whole upper subvector.
17564 // e.g. vector_shuffle <4, 5, 6, 7, u, u, u, u> or <2, 3, u, u>
17565 MVT HalfVT = VT.getHalfNumVectorElementsVT();
17566 unsigned HalfNumElts = HalfVT.getVectorNumElements();
17567 if (!UndefLower &&
17568 isSequentialOrUndefInRange(Mask, 0, HalfNumElts, HalfNumElts)) {
17569 SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, V1,
17570 DAG.getIntPtrConstant(HalfNumElts, DL));
17571 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), Hi,
17572 DAG.getIntPtrConstant(0, DL));
17573 }
17574
17575 // Lower half is undef and upper half is whole lower subvector.
17576 // e.g. vector_shuffle <u, u, u, u, 0, 1, 2, 3> or <u, u, 0, 1>
17577 if (UndefLower &&
17578 isSequentialOrUndefInRange(Mask, HalfNumElts, HalfNumElts, 0)) {
17579 SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, V1,
17580 DAG.getIntPtrConstant(0, DL));
17581 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), Hi,
17582 DAG.getIntPtrConstant(HalfNumElts, DL));
17583 }
17584
17585 int HalfIdx1, HalfIdx2;
17586 SmallVector<int, 8> HalfMask(HalfNumElts);
17587 if (!getHalfShuffleMask(Mask, HalfMask, HalfIdx1, HalfIdx2))
17588 return SDValue();
17589
17590 assert(HalfMask.size() == HalfNumElts && "Unexpected shuffle mask length");
17591
17592 // Only shuffle the halves of the inputs when useful.
17593 unsigned NumLowerHalves =
17594 (HalfIdx1 == 0 || HalfIdx1 == 2) + (HalfIdx2 == 0 || HalfIdx2 == 2);
17595 unsigned NumUpperHalves =
17596 (HalfIdx1 == 1 || HalfIdx1 == 3) + (HalfIdx2 == 1 || HalfIdx2 == 3);
17597 assert(NumLowerHalves + NumUpperHalves <= 2 && "Only 1 or 2 halves allowed");
17598
17599 // Determine the larger pattern of undef/halves, then decide if it's worth
17600 // splitting the shuffle based on subtarget capabilities and types.
17601 unsigned EltWidth = VT.getVectorElementType().getSizeInBits();
17602 if (!UndefLower) {
17603 // XXXXuuuu: no insert is needed.
17604 // Always extract lowers when setting lower - these are all free subreg ops.
17605 if (NumUpperHalves == 0)
17606 return getShuffleHalfVectors(DL, V1, V2, HalfMask, HalfIdx1, HalfIdx2,
17607 UndefLower, DAG);
17608
17609 if (NumUpperHalves == 1) {
17610 // AVX2 has efficient 32/64-bit element cross-lane shuffles.
17611 if (Subtarget.hasAVX2()) {
17612 // extract128 + vunpckhps/vshufps, is better than vblend + vpermps.
17613 if (EltWidth == 32 && NumLowerHalves && HalfVT.is128BitVector() &&
17614 !is128BitUnpackShuffleMask(HalfMask, DAG) &&
17615 (!isSingleSHUFPSMask(HalfMask) ||
17616 Subtarget.hasFastVariableCrossLaneShuffle()))
17617 return SDValue();
17618 // If this is a unary shuffle (assume that the 2nd operand is
17619 // canonicalized to undef), then we can use vpermpd. Otherwise, we
17620 // are better off extracting the upper half of 1 operand and using a
17621 // narrow shuffle.
17622 if (EltWidth == 64 && V2.isUndef())
17623 return SDValue();
17624 }
17625 // AVX512 has efficient cross-lane shuffles for all legal 512-bit types.
17626 if (Subtarget.hasAVX512() && VT.is512BitVector())
17627 return SDValue();
17628 // Extract + narrow shuffle is better than the wide alternative.
17629 return getShuffleHalfVectors(DL, V1, V2, HalfMask, HalfIdx1, HalfIdx2,
17630 UndefLower, DAG);
17631 }
17632
17633 // Don't extract both uppers, instead shuffle and then extract.
17634 assert(NumUpperHalves == 2 && "Half vector count went wrong");
17635 return SDValue();
17636 }
17637
17638 // UndefLower - uuuuXXXX: an insert to high half is required if we split this.
17639 if (NumUpperHalves == 0) {
17640 // AVX2 has efficient 64-bit element cross-lane shuffles.
17641 // TODO: Refine to account for unary shuffle, splat, and other masks?
17642 if (Subtarget.hasAVX2() && EltWidth == 64)
17643 return SDValue();
17644 // AVX512 has efficient cross-lane shuffles for all legal 512-bit types.
17645 if (Subtarget.hasAVX512() && VT.is512BitVector())
17646 return SDValue();
17647 // Narrow shuffle + insert is better than the wide alternative.
17648 return getShuffleHalfVectors(DL, V1, V2, HalfMask, HalfIdx1, HalfIdx2,
17649 UndefLower, DAG);
17650 }
17651
17652 // NumUpperHalves != 0: don't bother with extract, shuffle, and then insert.
17653 return SDValue();
17654}
17655
17656/// Handle case where shuffle sources are coming from the same 128-bit lane and
17657/// every lane can be represented as the same repeating mask - allowing us to
17658/// shuffle the sources with the repeating shuffle and then permute the result
17659/// to the destination lanes.
17660static SDValue lowerShuffleAsRepeatedMaskAndLanePermute(
17661 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
17662 const X86Subtarget &Subtarget, SelectionDAG &DAG) {
17663 int NumElts = VT.getVectorNumElements();
17664 int NumLanes = VT.getSizeInBits() / 128;
17665 int NumLaneElts = NumElts / NumLanes;
17666
17667 // On AVX2 we may be able to just shuffle the lowest elements and then
17668 // broadcast the result.
17669 if (Subtarget.hasAVX2()) {
17670 for (unsigned BroadcastSize : {16, 32, 64}) {
17671 if (BroadcastSize <= VT.getScalarSizeInBits())
17672 continue;
17673 int NumBroadcastElts = BroadcastSize / VT.getScalarSizeInBits();
17674
17675 // Attempt to match a repeating pattern every NumBroadcastElts,
17676 // accounting for UNDEFs but only referencing the lowest 128-bit
17677 // lane of the inputs.
17678 auto FindRepeatingBroadcastMask = [&](SmallVectorImpl<int> &RepeatMask) {
17679 for (int i = 0; i != NumElts; i += NumBroadcastElts)
17680 for (int j = 0; j != NumBroadcastElts; ++j) {
17681 int M = Mask[i + j];
17682 if (M < 0)
17683 continue;
17684 int &R = RepeatMask[j];
17685 if (0 != ((M % NumElts) / NumLaneElts))
17686 return false;
17687 if (0 <= R && R != M)
17688 return false;
17689 R = M;
17690 }
17691 return true;
17692 };
17693
17694 SmallVector<int, 8> RepeatMask((unsigned)NumElts, -1);
17695 if (!FindRepeatingBroadcastMask(RepeatMask))
17696 continue;
17697
17698 // Shuffle the (lowest) repeated elements in place for broadcast.
17699 SDValue RepeatShuf = DAG.getVectorShuffle(VT, DL, V1, V2, RepeatMask);
17700
17701 // Shuffle the actual broadcast.
17702 SmallVector<int, 8> BroadcastMask((unsigned)NumElts, -1);
17703 for (int i = 0; i != NumElts; i += NumBroadcastElts)
17704 for (int j = 0; j != NumBroadcastElts; ++j)
17705 BroadcastMask[i + j] = j;
17706 return DAG.getVectorShuffle(VT, DL, RepeatShuf, DAG.getUNDEF(VT),
17707 BroadcastMask);
17708 }
17709 }
17710
17711 // Bail if the shuffle mask doesn't cross 128-bit lanes.
17712 if (!is128BitLaneCrossingShuffleMask(VT, Mask))
17713 return SDValue();
17714
17715 // Bail if we already have a repeated lane shuffle mask.
17716 if (is128BitLaneRepeatedShuffleMask(VT, Mask))
17717 return SDValue();
17718
17719 // Helper to look for a repeated mask in each split sublane, checking that
17720 // those sublanes can then be permuted into place.
17721 auto ShuffleSubLanes = [&](int SubLaneScale) {
17722 int NumSubLanes = NumLanes * SubLaneScale;
17723 int NumSubLaneElts = NumLaneElts / SubLaneScale;
17724
17725 // Check that all the sources are coming from the same lane and see if we
17726 // can form a repeating shuffle mask (local to each sub-lane). At the same
17727 // time, determine the source sub-lane for each destination sub-lane.
17728 int TopSrcSubLane = -1;
17729 SmallVector<int, 8> Dst2SrcSubLanes((unsigned)NumSubLanes, -1);
17730 SmallVector<SmallVector<int, 8>> RepeatedSubLaneMasks(
17731 SubLaneScale,
17732 SmallVector<int, 8>((unsigned)NumSubLaneElts, SM_SentinelUndef));
17733
17734 for (int DstSubLane = 0; DstSubLane != NumSubLanes; ++DstSubLane) {
17735 // Extract the sub-lane mask, check that it all comes from the same lane
17736 // and normalize the mask entries to come from the first lane.
17737 int SrcLane = -1;
17738 SmallVector<int, 8> SubLaneMask((unsigned)NumSubLaneElts, -1);
17739 for (int Elt = 0; Elt != NumSubLaneElts; ++Elt) {
17740 int M = Mask[(DstSubLane * NumSubLaneElts) + Elt];
17741 if (M < 0)
17742 continue;
17743 int Lane = (M % NumElts) / NumLaneElts;
17744 if ((0 <= SrcLane) && (SrcLane != Lane))
17745 return SDValue();
17746 SrcLane = Lane;
17747 int LocalM = (M % NumLaneElts) + (M < NumElts ? 0 : NumElts);
17748 SubLaneMask[Elt] = LocalM;
17749 }
17750
17751 // Whole sub-lane is UNDEF.
17752 if (SrcLane < 0)
17753 continue;
17754
17755 // Attempt to match against the candidate repeated sub-lane masks.
17756 for (int SubLane = 0; SubLane != SubLaneScale; ++SubLane) {
17757 auto MatchMasks = [NumSubLaneElts](ArrayRef<int> M1, ArrayRef<int> M2) {
17758 for (int i = 0; i != NumSubLaneElts; ++i) {
17759 if (M1[i] < 0 || M2[i] < 0)
17760 continue;
17761 if (M1[i] != M2[i])
17762 return false;
17763 }
17764 return true;
17765 };
17766
17767 auto &RepeatedSubLaneMask = RepeatedSubLaneMasks[SubLane];
17768 if (!MatchMasks(SubLaneMask, RepeatedSubLaneMask))
17769 continue;
17770
17771 // Merge the sub-lane mask into the matching repeated sub-lane mask.
17772 for (int i = 0; i != NumSubLaneElts; ++i) {
17773 int M = SubLaneMask[i];
17774 if (M < 0)
17775 continue;
17776 assert((RepeatedSubLaneMask[i] < 0 || RepeatedSubLaneMask[i] == M) &&
17777 "Unexpected mask element");
17778 RepeatedSubLaneMask[i] = M;
17779 }
17780
17781 // Track the top most source sub-lane - by setting the remaining to
17782 // UNDEF we can greatly simplify shuffle matching.
17783 int SrcSubLane = (SrcLane * SubLaneScale) + SubLane;
17784 TopSrcSubLane = std::max(TopSrcSubLane, SrcSubLane);
17785 Dst2SrcSubLanes[DstSubLane] = SrcSubLane;
17786 break;
17787 }
17788
17789 // Bail if we failed to find a matching repeated sub-lane mask.
17790 if (Dst2SrcSubLanes[DstSubLane] < 0)
17791 return SDValue();
17792 }
17793 assert(0 <= TopSrcSubLane && TopSrcSubLane < NumSubLanes &&
17794 "Unexpected source lane");
17795
17796 // Create a repeating shuffle mask for the entire vector.
17797 SmallVector<int, 8> RepeatedMask((unsigned)NumElts, -1);
17798 for (int SubLane = 0; SubLane <= TopSrcSubLane; ++SubLane) {
17799 int Lane = SubLane / SubLaneScale;
17800 auto &RepeatedSubLaneMask = RepeatedSubLaneMasks[SubLane % SubLaneScale];
17801 for (int Elt = 0; Elt != NumSubLaneElts; ++Elt) {
17802 int M = RepeatedSubLaneMask[Elt];
17803 if (M < 0)
17804 continue;
17805 int Idx = (SubLane * NumSubLaneElts) + Elt;
17806 RepeatedMask[Idx] = M + (Lane * NumLaneElts);
17807 }
17808 }
17809
17810 // Shuffle each source sub-lane to its destination.
17811 SmallVector<int, 8> SubLaneMask((unsigned)NumElts, -1);
17812 for (int i = 0; i != NumElts; i += NumSubLaneElts) {
17813 int SrcSubLane = Dst2SrcSubLanes[i / NumSubLaneElts];
17814 if (SrcSubLane < 0)
17815 continue;
17816 for (int j = 0; j != NumSubLaneElts; ++j)
17817 SubLaneMask[i + j] = j + (SrcSubLane * NumSubLaneElts);
17818 }
17819
17820 // Avoid returning the same shuffle operation.
17821 // v8i32 = vector_shuffle<0,1,4,5,2,3,6,7> t5, undef:v8i32
17822 if (RepeatedMask == Mask || SubLaneMask == Mask)
17823 return SDValue();
17824
17825 SDValue RepeatedShuffle =
17826 DAG.getVectorShuffle(VT, DL, V1, V2, RepeatedMask);
17827
17828 return DAG.getVectorShuffle(VT, DL, RepeatedShuffle, DAG.getUNDEF(VT),
17829 SubLaneMask);
17830 };
17831
17832 // On AVX2 targets we can permute 256-bit vectors as 64-bit sub-lanes
17833 // (with PERMQ/PERMPD). On AVX2/AVX512BW targets, permuting 32-bit sub-lanes,
17834 // even with a variable shuffle, can be worth it for v32i8/v64i8 vectors.
17835 // Otherwise we can only permute whole 128-bit lanes.
17836 int MinSubLaneScale = 1, MaxSubLaneScale = 1;
17837 if (Subtarget.hasAVX2() && VT.is256BitVector()) {
17838 bool OnlyLowestElts = isUndefOrInRange(Mask, 0, NumLaneElts);
17839 MinSubLaneScale = 2;
17840 MaxSubLaneScale =
17841 (!OnlyLowestElts && V2.isUndef() && VT == MVT::v32i8) ? 4 : 2;
17842 }
17843 if (Subtarget.hasBWI() && VT == MVT::v64i8)
17844 MinSubLaneScale = MaxSubLaneScale = 4;
17845
17846 for (int Scale = MinSubLaneScale; Scale <= MaxSubLaneScale; Scale *= 2)
17847 if (SDValue Shuffle = ShuffleSubLanes(Scale))
17848 return Shuffle;
17849
17850 return SDValue();
17851}
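// Illustrative trace (not part of the original source) of the AVX2 broadcast
// path above: a hypothetical v8i32 Mask = {1, 0, 1, 0, 1, 0, 1, 0} repeats a
// 64-bit pattern that only reads the lowest 128-bit lane, so with
// BroadcastSize = 64:
//   RepeatMask    = {1, 0, u, u, u, u, u, u}   (shuffle the low elements in place)
//   BroadcastMask = {0, 1, 0, 1, 0, 1, 0, 1}   (then broadcast the 64-bit pair,
//                                               e.g. vpbroadcastq)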
17852
17853static bool matchShuffleWithSHUFPD(MVT VT, SDValue &V1, SDValue &V2,
17854 bool &ForceV1Zero, bool &ForceV2Zero,
17855 unsigned &ShuffleImm, ArrayRef<int> Mask,
17856 const APInt &Zeroable) {
17857 int NumElts = VT.getVectorNumElements();
17858 assert(VT.getScalarSizeInBits() == 64 &&
17859 (NumElts == 2 || NumElts == 4 || NumElts == 8) &&
17860 "Unexpected data type for VSHUFPD");
17861 assert(isUndefOrZeroOrInRange(Mask, 0, 2 * NumElts) &&
17862 "Illegal shuffle mask");
17863
17864 bool ZeroLane[2] = { true, true };
17865 for (int i = 0; i < NumElts; ++i)
17866 ZeroLane[i & 1] &= Zeroable[i];
17867
17868 // Mask for V8F64: 0/1, 8/9, 2/3, 10/11, 4/5, ..
17869 // Mask for V4F64; 0/1, 4/5, 2/3, 6/7..
17870 ShuffleImm = 0;
17871 bool ShufpdMask = true;
17872 bool CommutableMask = true;
17873 for (int i = 0; i < NumElts; ++i) {
17874 if (Mask[i] == SM_SentinelUndef || ZeroLane[i & 1])
17875 continue;
17876 if (Mask[i] < 0)
17877 return false;
17878 int Val = (i & 6) + NumElts * (i & 1);
17879 int CommutVal = (i & 0xe) + NumElts * ((i & 1) ^ 1);
17880 if (Mask[i] < Val || Mask[i] > Val + 1)
17881 ShufpdMask = false;
17882 if (Mask[i] < CommutVal || Mask[i] > CommutVal + 1)
17883 CommutableMask = false;
17884 ShuffleImm |= (Mask[i] % 2) << i;
17885 }
17886
17887 if (!ShufpdMask && !CommutableMask)
17888 return false;
17889
17890 if (!ShufpdMask && CommutableMask)
17891 std::swap(V1, V2);
17892
17893 ForceV1Zero = ZeroLane[0];
17894 ForceV2Zero = ZeroLane[1];
17895 return true;
17896}
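// Illustrative trace (not part of the original source): for a hypothetical
// v4f64 Mask = {1, 5, 3, 7} with nothing zeroable, matchShuffleWithSHUFPD
// above accepts the mask directly (ShufpdMask stays true) and builds
//   ShuffleImm = (1&1)<<0 | (5&1)<<1 | (3&1)<<2 | (7&1)<<3 = 0xF
// i.e. SHUFPD with immediate 0xF, taking the high element of each 128-bit
// lane from both operands: {V1[1], V2[1], V1[3], V2[3]}.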
17897
17898static SDValue lowerShuffleWithSHUFPD(const SDLoc &DL, MVT VT, SDValue V1,
17899 SDValue V2, ArrayRef<int> Mask,
17900 const APInt &Zeroable,
17901 const X86Subtarget &Subtarget,
17902 SelectionDAG &DAG) {
17903 assert((VT == MVT::v2f64 || VT == MVT::v4f64 || VT == MVT::v8f64) &&
17904 "Unexpected data type for VSHUFPD");
17905
17906 unsigned Immediate = 0;
17907 bool ForceV1Zero = false, ForceV2Zero = false;
17908 if (!matchShuffleWithSHUFPD(VT, V1, V2, ForceV1Zero, ForceV2Zero, Immediate,
17909 Mask, Zeroable))
17910 return SDValue();
17911
17912 // Create a REAL zero vector - ISD::isBuildVectorAllZeros allows UNDEFs.
17913 if (ForceV1Zero)
17914 V1 = getZeroVector(VT, Subtarget, DAG, DL);
17915 if (ForceV2Zero)
17916 V2 = getZeroVector(VT, Subtarget, DAG, DL);
17917
17918 return DAG.getNode(X86ISD::SHUFP, DL, VT, V1, V2,
17919 DAG.getTargetConstant(Immediate, DL, MVT::i8));
17920}
17921
17922 // Look for {0, 8, 16, 24, 32, 40, 48, 56} in the first 8 elements, followed
17923 // by zeroable elements in the remaining 24 elements. Turn this into two
17924 // vpmovqb instructions shuffled together.
17925static SDValue lowerShuffleAsVTRUNCAndUnpack(const SDLoc &DL, MVT VT,
17926 SDValue V1, SDValue V2,
17927 ArrayRef<int> Mask,
17928 const APInt &Zeroable,
17929 SelectionDAG &DAG) {
17930 assert(VT == MVT::v32i8 && "Unexpected type!");
17931
17932 // The first 8 indices should be every 8th element.
17933 if (!isSequentialOrUndefInRange(Mask, 0, 8, 0, 8))
17934 return SDValue();
17935
17936 // Remaining elements need to be zeroable.
17937 if (Zeroable.countLeadingOnes() < (Mask.size() - 8))
17938 return SDValue();
17939
17940 V1 = DAG.getBitcast(MVT::v4i64, V1);
17941 V2 = DAG.getBitcast(MVT::v4i64, V2);
17942
17943 V1 = DAG.getNode(X86ISD::VTRUNC, DL, MVT::v16i8, V1);
17944 V2 = DAG.getNode(X86ISD::VTRUNC, DL, MVT::v16i8, V2);
17945
17946 // The VTRUNCs will put 0s in the upper 12 bytes. Use them to put zeroes in
17947 // the upper bits of the result using an unpckldq.
17948 SDValue Unpack = DAG.getVectorShuffle(MVT::v16i8, DL, V1, V2,
17949 { 0, 1, 2, 3, 16, 17, 18, 19,
17950 4, 5, 6, 7, 20, 21, 22, 23 });
17951 // Insert the unpckldq into a zero vector to widen to v32i8.
17952 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, MVT::v32i8,
17953 DAG.getConstant(0, DL, MVT::v32i8), Unpack,
17954 DAG.getIntPtrConstant(0, DL));
17955}
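// Illustrative data flow for the lowering above (not part of the original
// source): after the v4i64 bitcasts, each VTRUNC keeps the low byte of every
// 64-bit element and zeroes the upper 12 bytes, so
//   V1 -> { v1[0], v1[8], v1[16], v1[24], 0 x 12 }
//   V2 -> { v2[0], v2[8], v2[16], v2[24], 0 x 12 }
// The unpckldq-style shuffle then interleaves the two 4-byte groups, giving
//   { v1[0], v1[8], v1[16], v1[24], v2[0], v2[8], v2[16], v2[24], 0 x 8 }
// which matches the required mask {0, 8, 16, 24, 32, 40, 48, 56} once the
// result is inserted into the all-zero v32i8 vector.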
17956
17957// a = shuffle v1, v2, mask1 ; interleaving lower lanes of v1 and v2
17958// b = shuffle v1, v2, mask2 ; interleaving higher lanes of v1 and v2
17959// =>
17960// ul = unpckl v1, v2
17961// uh = unpckh v1, v2
17962// a = vperm ul, uh
17963// b = vperm ul, uh
17964//
17965// Pattern-match interleave(256b v1, 256b v2) -> 512b v3 and lower it into unpck
17966// and permute. We cannot directly match v3 because it is split into two
17967// 256-bit vectors in earlier isel stages. Therefore, this function matches a
17968// pair of 256-bit shuffles and makes sure the masks are consecutive.
17969//
17970// Once unpck and permute nodes are created, the permute corresponding to this
17971// shuffle is returned, while the other permute replaces the other half of the
17972// shuffle in the selection dag.
17973static SDValue lowerShufflePairAsUNPCKAndPermute(const SDLoc &DL, MVT VT,
17974 SDValue V1, SDValue V2,
17975 ArrayRef<int> Mask,
17976 SelectionDAG &DAG) {
17977 if (VT != MVT::v8f32 && VT != MVT::v8i32 && VT != MVT::v16i16 &&
17978 VT != MVT::v32i8)
17979 return SDValue();
17980 // <B0, B1, B0+1, B1+1, ..., >
17981 auto IsInterleavingPattern = [&](ArrayRef<int> Mask, unsigned Begin0,
17982 unsigned Begin1) {
17983 size_t Size = Mask.size();
17984     assert(Size % 2 == 0 && "Expected even mask size");
17985 for (unsigned I = 0; I < Size; I += 2) {
17986 if (Mask[I] != (int)(Begin0 + I / 2) ||
17987 Mask[I + 1] != (int)(Begin1 + I / 2))
17988 return false;
17989 }
17990 return true;
17991 };
17992   // Check which half of the interleave this shuffle node is.
17993 int NumElts = VT.getVectorNumElements();
17994 size_t FirstQtr = NumElts / 2;
17995 size_t ThirdQtr = NumElts + NumElts / 2;
17996 bool IsFirstHalf = IsInterleavingPattern(Mask, 0, NumElts);
17997 bool IsSecondHalf = IsInterleavingPattern(Mask, FirstQtr, ThirdQtr);
17998 if (!IsFirstHalf && !IsSecondHalf)
17999 return SDValue();
18000
18001 // Find the intersection between shuffle users of V1 and V2.
18002 SmallVector<SDNode *, 2> Shuffles;
18003 for (SDNode *User : V1->uses())
18004 if (User->getOpcode() == ISD::VECTOR_SHUFFLE && User->getOperand(0) == V1 &&
18005 User->getOperand(1) == V2)
18006 Shuffles.push_back(User);
18007 // Limit user size to two for now.
18008 if (Shuffles.size() != 2)
18009 return SDValue();
18010   // Find out which half of the 512-bit shuffle each smaller shuffle is.
18011 auto *SVN1 = cast<ShuffleVectorSDNode>(Shuffles[0]);
18012 auto *SVN2 = cast<ShuffleVectorSDNode>(Shuffles[1]);
18013 SDNode *FirstHalf;
18014 SDNode *SecondHalf;
18015 if (IsInterleavingPattern(SVN1->getMask(), 0, NumElts) &&
18016 IsInterleavingPattern(SVN2->getMask(), FirstQtr, ThirdQtr)) {
18017 FirstHalf = Shuffles[0];
18018 SecondHalf = Shuffles[1];
18019 } else if (IsInterleavingPattern(SVN1->getMask(), FirstQtr, ThirdQtr) &&
18020 IsInterleavingPattern(SVN2->getMask(), 0, NumElts)) {
18021 FirstHalf = Shuffles[1];
18022 SecondHalf = Shuffles[0];
18023 } else {
18024 return SDValue();
18025 }
18026 // Lower into unpck and perm. Return the perm of this shuffle and replace
18027 // the other.
18028 SDValue Unpckl = DAG.getNode(X86ISD::UNPCKL, DL, VT, V1, V2);
18029 SDValue Unpckh = DAG.getNode(X86ISD::UNPCKH, DL, VT, V1, V2);
18030 SDValue Perm1 = DAG.getNode(X86ISD::VPERM2X128, DL, VT, Unpckl, Unpckh,
18031 DAG.getTargetConstant(0x20, DL, MVT::i8));
18032 SDValue Perm2 = DAG.getNode(X86ISD::VPERM2X128, DL, VT, Unpckl, Unpckh,
18033 DAG.getTargetConstant(0x31, DL, MVT::i8));
18034 if (IsFirstHalf) {
18035 DAG.ReplaceAllUsesWith(SecondHalf, &Perm2);
18036 return Perm1;
18037 }
18038 DAG.ReplaceAllUsesWith(FirstHalf, &Perm1);
18039 return Perm2;
18040}
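// Illustrative example (not part of the original source), for VT == v8f32
// with inputs a = {a0..a7} and b = {b0..b7}:
//   unpckl = { a0,b0,a1,b1 | a4,b4,a5,b5 }   (per 128-bit lane)
//   unpckh = { a2,b2,a3,b3 | a6,b6,a7,b7 }
// VPERM2X128 with immediate 0x20 selects the low 128-bit halves of both
// operands and 0x31 selects the high halves, so
//   Perm1 = { a0,b0,a1,b1,a2,b2,a3,b3 }   (first half of the interleave)
//   Perm2 = { a4,b4,a5,b5,a6,b6,a7,b7 }   (second half of the interleave)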
18041
18042/// Handle lowering of 4-lane 64-bit floating point shuffles.
18043///
18044/// Also ends up handling lowering of 4-lane 64-bit integer shuffles when AVX2
18045/// isn't available.
18046static SDValue lowerV4F64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
18047 const APInt &Zeroable, SDValue V1, SDValue V2,
18048 const X86Subtarget &Subtarget,
18049 SelectionDAG &DAG) {
18050   assert(V1.getSimpleValueType() == MVT::v4f64 && "Bad operand type!");
18051   assert(V2.getSimpleValueType() == MVT::v4f64 && "Bad operand type!");
18052   assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
18053
18054 if (SDValue V = lowerV2X128Shuffle(DL, MVT::v4f64, V1, V2, Mask, Zeroable,
18055 Subtarget, DAG))
18056 return V;
18057
18058 if (V2.isUndef()) {
18059 // Check for being able to broadcast a single element.
18060 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v4f64, V1, V2,
18061 Mask, Subtarget, DAG))
18062 return Broadcast;
18063
18064 // Use low duplicate instructions for masks that match their pattern.
18065 if (isShuffleEquivalent(Mask, {0, 0, 2, 2}, V1, V2))
18066 return DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v4f64, V1);
18067
18068 if (!is128BitLaneCrossingShuffleMask(MVT::v4f64, Mask)) {
18069 // Non-half-crossing single input shuffles can be lowered with an
18070 // interleaved permutation.
18071 unsigned VPERMILPMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1) |
18072 ((Mask[2] == 3) << 2) | ((Mask[3] == 3) << 3);
18073 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v4f64, V1,
18074 DAG.getTargetConstant(VPERMILPMask, DL, MVT::i8));
18075 }
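      // Illustrative example (not part of the original source): for the
      // in-lane mask {1, 0, 3, 2} the immediate computed above is
      //   (Mask[0]==1) | (Mask[1]==1)<<1 | (Mask[2]==3)<<2 | (Mask[3]==3)<<3
      //   = 1 | 0 | 4 | 0 = 0b0101,
      // i.e. VPERMILPD picks the high element of lane 0 into result 0, the
      // low element of lane 0 into result 1, and likewise for lane 1.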
18076
18077 // With AVX2 we have direct support for this permutation.
18078 if (Subtarget.hasAVX2())
18079 return DAG.getNode(X86ISD::VPERMI, DL, MVT::v4f64, V1,
18080 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
18081
18082 // Try to create an in-lane repeating shuffle mask and then shuffle the
18083 // results into the target lanes.
18084 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
18085 DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
18086 return V;
18087
18088 // Try to permute the lanes and then use a per-lane permute.
18089 if (SDValue V = lowerShuffleAsLanePermuteAndPermute(DL, MVT::v4f64, V1, V2,
18090 Mask, DAG, Subtarget))
18091 return V;
18092
18093 // Otherwise, fall back.
18094 return lowerShuffleAsLanePermuteAndShuffle(DL, MVT::v4f64, V1, V2, Mask,
18095 DAG, Subtarget);
18096 }
18097
18098 // Use dedicated unpack instructions for masks that match their pattern.
18099 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v4f64, Mask, V1, V2, DAG))
18100 return V;
18101
18102 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v4f64, V1, V2, Mask,
18103 Zeroable, Subtarget, DAG))
18104 return Blend;
18105
18106 // Check if the blend happens to exactly fit that of SHUFPD.
18107 if (SDValue Op = lowerShuffleWithSHUFPD(DL, MVT::v4f64, V1, V2, Mask,
18108 Zeroable, Subtarget, DAG))
18109 return Op;
18110
18111 bool V1IsInPlace = isShuffleMaskInputInPlace(0, Mask);
18112 bool V2IsInPlace = isShuffleMaskInputInPlace(1, Mask);
18113
18114 // If we have lane crossing shuffles AND they don't all come from the lower
18115 // lane elements, lower to SHUFPD(VPERM2F128(V1, V2), VPERM2F128(V1, V2)).
18116 // TODO: Handle BUILD_VECTOR sources which getVectorShuffle currently
18117   // canonicalizes to a blend of splats, which isn't necessary for this combine.
18118 if (is128BitLaneCrossingShuffleMask(MVT::v4f64, Mask) &&
18119 !all_of(Mask, [](int M) { return M < 2 || (4 <= M && M < 6); }) &&
18120 (V1.getOpcode() != ISD::BUILD_VECTOR) &&
18121 (V2.getOpcode() != ISD::BUILD_VECTOR))
18122 return lowerShuffleAsLanePermuteAndSHUFP(DL, MVT::v4f64, V1, V2, Mask, DAG);
18123
18124 // If we have one input in place, then we can permute the other input and
18125 // blend the result.
18126 if (V1IsInPlace || V2IsInPlace)
18127 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v4f64, V1, V2, Mask,
18128 Subtarget, DAG);
18129
18130 // Try to create an in-lane repeating shuffle mask and then shuffle the
18131 // results into the target lanes.
18132 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
18133 DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
18134 return V;
18135
18136 // Try to simplify this by merging 128-bit lanes to enable a lane-based
18137   // shuffle. However, if we have AVX2 and either input is already in place,
18138   // we will be able to shuffle the other input even across lanes in a single
18139   // instruction, so skip this pattern.
18140 if (!(Subtarget.hasAVX2() && (V1IsInPlace || V2IsInPlace)))
18141 if (SDValue V = lowerShuffleAsLanePermuteAndRepeatedMask(
18142 DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
18143 return V;
18144
18145 // If we have VLX support, we can use VEXPAND.
18146 if (Subtarget.hasVLX())
18147 if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v4f64, Zeroable, Mask, V1, V2,
18148 DAG, Subtarget))
18149 return V;
18150
18151   // If we have AVX2 then we always want to lower with a blend because at v4 we
18152 // can fully permute the elements.
18153 if (Subtarget.hasAVX2())
18154 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v4f64, V1, V2, Mask,
18155 Subtarget, DAG);
18156
18157 // Otherwise fall back on generic lowering.
18158 return lowerShuffleAsSplitOrBlend(DL, MVT::v4f64, V1, V2, Mask,
18159 Subtarget, DAG);
18160}
18161
18162/// Handle lowering of 4-lane 64-bit integer shuffles.
18163///
18164/// This routine is only called when we have AVX2 and thus a reasonable
18165/// instruction set for v4i64 shuffling.
18166static SDValue lowerV4I64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
18167 const APInt &Zeroable, SDValue V1, SDValue V2,
18168 const X86Subtarget &Subtarget,
18169 SelectionDAG &DAG) {
18170   assert(V1.getSimpleValueType() == MVT::v4i64 && "Bad operand type!");
18171   assert(V2.getSimpleValueType() == MVT::v4i64 && "Bad operand type!");
18172   assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
18173   assert(Subtarget.hasAVX2() && "We can only lower v4i64 with AVX2!");
18174
18175 if (SDValue V = lowerV2X128Shuffle(DL, MVT::v4i64, V1, V2, Mask, Zeroable,
18176 Subtarget, DAG))
18177 return V;
18178
18179 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v4i64, V1, V2, Mask,
18180 Zeroable, Subtarget, DAG))
18181 return Blend;
18182
18183 // Check for being able to broadcast a single element.
18184 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v4i64, V1, V2, Mask,
18185 Subtarget, DAG))
18186 return Broadcast;
18187
18188 if (V2.isUndef()) {
18189 // When the shuffle is mirrored between the 128-bit lanes of the unit, we
18190 // can use lower latency instructions that will operate on both lanes.
18191 SmallVector<int, 2> RepeatedMask;
18192 if (is128BitLaneRepeatedShuffleMask(MVT::v4i64, Mask, RepeatedMask)) {
18193 SmallVector<int, 4> PSHUFDMask;
18194 narrowShuffleMaskElts(2, RepeatedMask, PSHUFDMask);
18195 return DAG.getBitcast(
18196 MVT::v4i64,
18197 DAG.getNode(X86ISD::PSHUFD, DL, MVT::v8i32,
18198 DAG.getBitcast(MVT::v8i32, V1),
18199 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
18200 }
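    // Illustrative example (not part of the original source): a v4i64 mask
    // {1, 0, 3, 2} repeats across the two 128-bit lanes as RepeatedMask
    // {1, 0}; narrowShuffleMaskElts(2, ...) expands it to the dword mask
    // {2, 3, 0, 1}, so the PSHUFD above uses immediate 0x4E and swaps the
    // two 64-bit halves of each 128-bit lane.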
18201
18202 // AVX2 provides a direct instruction for permuting a single input across
18203 // lanes.
18204 return DAG.getNode(X86ISD::VPERMI, DL, MVT::v4i64, V1,
18205 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
18206 }
18207
18208 // Try to use shift instructions.
18209 if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v4i64, V1, V2, Mask,
18210 Zeroable, Subtarget, DAG))
18211 return Shift;
18212
18213 // If we have VLX support, we can use VALIGN or VEXPAND.
18214 if (Subtarget.hasVLX()) {
18215 if (SDValue Rotate = lowerShuffleAsVALIGN(DL, MVT::v4i64, V1, V2, Mask,
18216 Subtarget, DAG))
18217 return Rotate;
18218
18219 if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v4i64, Zeroable, Mask, V1, V2,
18220 DAG, Subtarget))
18221 return V;
18222 }
18223
18224 // Try to use PALIGNR.
18225 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v4i64, V1, V2, Mask,
18226 Subtarget, DAG))
18227 return Rotate;
18228
18229 // Use dedicated unpack instructions for masks that match their pattern.
18230 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v4i64, Mask, V1, V2, DAG))
18231 return V;
18232
18233 bool V1IsInPlace = isShuffleMaskInputInPlace(0, Mask);
18234 bool V2IsInPlace = isShuffleMaskInputInPlace(1, Mask);
18235
18236 // If we have one input in place, then we can permute the other input and
18237 // blend the result.
18238 if (V1IsInPlace || V2IsInPlace)
18239 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v4i64, V1, V2, Mask,
18240 Subtarget, DAG);
18241
18242 // Try to create an in-lane repeating shuffle mask and then shuffle the
18243 // results into the target lanes.
18244 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
18245 DL, MVT::v4i64, V1, V2, Mask, Subtarget, DAG))
18246 return V;
18247
18248 // Try to lower to PERMQ(BLENDD(V1,V2)).
18249 if (SDValue V =
18250 lowerShuffleAsBlendAndPermute(DL, MVT::v4i64, V1, V2, Mask, DAG))
18251 return V;
18252
18253 // Try to simplify this by merging 128-bit lanes to enable a lane-based
18254   // shuffle. However, if we have AVX2 and either input is already in place,
18255   // we will be able to shuffle the other input even across lanes in a single
18256   // instruction, so skip this pattern.
18257 if (!V1IsInPlace && !V2IsInPlace)
18258 if (SDValue Result = lowerShuffleAsLanePermuteAndRepeatedMask(
18259 DL, MVT::v4i64, V1, V2, Mask, Subtarget, DAG))
18260 return Result;
18261
18262 // Otherwise fall back on generic blend lowering.
18263 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v4i64, V1, V2, Mask,
18264 Subtarget, DAG);
18265}
18266
18267/// Handle lowering of 8-lane 32-bit floating point shuffles.
18268///
18269/// Also ends up handling lowering of 8-lane 32-bit integer shuffles when AVX2
18270/// isn't available.
18271static SDValue lowerV8F32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
18272 const APInt &Zeroable, SDValue V1, SDValue V2,
18273 const X86Subtarget &Subtarget,
18274 SelectionDAG &DAG) {
18275   assert(V1.getSimpleValueType() == MVT::v8f32 && "Bad operand type!");
  [Step 1] '?' condition is true
18276   assert(V2.getSimpleValueType() == MVT::v8f32 && "Bad operand type!");
  [Step 2] '?' condition is true
18277   assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
  [Step 3] Assuming the condition is true
  [Step 4] '?' condition is true
18278
18279 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v8f32, V1, V2, Mask,
  [Step 5] Taking false branch
18280 Zeroable, Subtarget, DAG))
18281 return Blend;
18282
18283 // Check for being able to broadcast a single element.
18284 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v8f32, V1, V2, Mask,
  [Step 6] Taking false branch
18285 Subtarget, DAG))
18286 return Broadcast;
18287
18288 // If the shuffle mask is repeated in each 128-bit lane, we have many more
18289 // options to efficiently lower the shuffle.
18290 SmallVector<int, 4> RepeatedMask;
18291 if (is128BitLaneRepeatedShuffleMask(MVT::v8f32, Mask, RepeatedMask)) {
  [Step 7] Assuming the condition is false
  [Step 8] Taking false branch
18292     assert(RepeatedMask.size() == 4 &&
18293            "Repeated masks must be half the mask width!");
18294
18295 // Use even/odd duplicate instructions for masks that match their pattern.
18296 if (isShuffleEquivalent(RepeatedMask, {0, 0, 2, 2}, V1, V2))
18297 return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v8f32, V1);
18298 if (isShuffleEquivalent(RepeatedMask, {1, 1, 3, 3}, V1, V2))
18299 return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v8f32, V1);
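    // Illustrative note (not part of the original source): MOVSLDUP
    // duplicates the even (low) element of each pair, producing {0,0,2,2}
    // per lane, while MOVSHDUP duplicates the odd (high) element, producing
    // {1,1,3,3} per lane, which is exactly what the two checks above match.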
18300
18301 if (V2.isUndef())
18302 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v8f32, V1,
18303 getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
18304
18305 // Use dedicated unpack instructions for masks that match their pattern.
18306 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v8f32, Mask, V1, V2, DAG))
18307 return V;
18308
18309 // Otherwise, fall back to a SHUFPS sequence. Here it is important that we
18310 // have already handled any direct blends.
18311 return lowerShuffleWithSHUFPS(DL, MVT::v8f32, RepeatedMask, V1, V2, DAG);
18312 }
18313
18314 // Try to create an in-lane repeating shuffle mask and then shuffle the
18315 // results into the target lanes.
18316 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
  [Step 9] Taking false branch
18317 DL, MVT::v8f32, V1, V2, Mask, Subtarget, DAG))
18318 return V;
18319
18320 // If we have a single input shuffle with different shuffle patterns in the
18321   // two 128-bit lanes, use a variable-mask VPERMILPS.
18322 if (V2.isUndef()) {
  [Step 10] Taking false branch
18323 if (!is128BitLaneCrossingShuffleMask(MVT::v8f32, Mask)) {
18324 SDValue VPermMask = getConstVector(Mask, MVT::v8i32, DAG, DL, true);
18325 return DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v8f32, V1, VPermMask);
18326 }
18327 if (Subtarget.hasAVX2()) {
18328 SDValue VPermMask = getConstVector(Mask, MVT::v8i32, DAG, DL, true);
18329 return DAG.getNode(X86ISD::VPERMV, DL, MVT::v8f32, VPermMask, V1);
18330 }
18331 // Otherwise, fall back.
18332 return lowerShuffleAsLanePermuteAndShuffle(DL, MVT::v8f32, V1, V2, Mask,
18333 DAG, Subtarget);
18334 }
18335
18336 // Try to simplify this by merging 128-bit lanes to enable a lane-based
18337 // shuffle.
18338 if (SDValue Result = lowerShuffleAsLanePermuteAndRepeatedMask(
  [Step 11] Calling 'lowerShuffleAsLanePermuteAndRepeatedMask'
18339 DL, MVT::v8f32, V1, V2, Mask, Subtarget, DAG))
18340 return Result;
18341
18342 // If we have VLX support, we can use VEXPAND.
18343 if (Subtarget.hasVLX())
18344 if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v8f32, Zeroable, Mask, V1, V2,
18345 DAG, Subtarget))
18346 return V;
18347
18348 // Try to match an interleave of two v8f32s and lower them as unpck and
18349 // permutes using ymms. This needs to go before we try to split the vectors.
18350 //
18351 // TODO: Expand this to AVX1. Currently v8i32 is casted to v8f32 and hits
18352 // this path inadvertently.
18353 if (Subtarget.hasAVX2() && !Subtarget.hasAVX512())
18354 if (SDValue V = lowerShufflePairAsUNPCKAndPermute(DL, MVT::v8f32, V1, V2,
18355 Mask, DAG))
18356 return V;
18357
18358   // For non-AVX512, if the mask consists of 16-bit elements within each lane,
18359   // then try to split, since after the split we get more efficient code using
18360   // vpunpcklwd and vpunpckhwd instead of vblend.
18361 if (!Subtarget.hasAVX512() && isUnpackWdShuffleMask(Mask, MVT::v8f32, DAG))
18362 return lowerShuffleAsSplitOrBlend(DL, MVT::v8f32, V1, V2, Mask, Subtarget,
18363 DAG);
18364
18365 // If we have AVX2 then we always want to lower with a blend because at v8 we
18366 // can fully permute the elements.
18367 if (Subtarget.hasAVX2())
18368 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v8f32, V1, V2, Mask,
18369 Subtarget, DAG);
18370
18371 // Otherwise fall back on generic lowering.
18372 return lowerShuffleAsSplitOrBlend(DL, MVT::v8f32, V1, V2, Mask,
18373 Subtarget, DAG);
18374}
18375
18376/// Handle lowering of 8-lane 32-bit integer shuffles.
18377///
18378/// This routine is only called when we have AVX2 and thus a reasonable
18379/// instruction set for v8i32 shuffling.
18380static SDValue lowerV8I32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
18381 const APInt &Zeroable, SDValue V1, SDValue V2,
18382 const X86Subtarget &Subtarget,
18383 SelectionDAG &DAG) {
18384   assert(V1.getSimpleValueType() == MVT::v8i32 && "Bad operand type!");
18385   assert(V2.getSimpleValueType() == MVT::v8i32 && "Bad operand type!");
18386   assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
18387   assert(Subtarget.hasAVX2() && "We can only lower v8i32 with AVX2!");
18388
18389 // Whenever we can lower this as a zext, that instruction is strictly faster
18390 // than any alternative. It also allows us to fold memory operands into the
18391 // shuffle in many cases.
18392 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(DL, MVT::v8i32, V1, V2, Mask,
18393 Zeroable, Subtarget, DAG))
18394 return ZExt;
18395
18396 // Try to match an interleave of two v8i32s and lower them as unpck and
18397 // permutes using ymms. This needs to go before we try to split the vectors.
18398 if (!Subtarget.hasAVX512())
18399 if (SDValue V = lowerShufflePairAsUNPCKAndPermute(DL, MVT::v8i32, V1, V2,
18400 Mask, DAG))
18401 return V;
18402
18403 // For non-AVX512 if the Mask is of 16bit elements in lane then try to split
18404 // since after split we get a more efficient code than vblend by using
18405 // vpunpcklwd and vpunpckhwd instrs.
18406 if (isUnpackWdShuffleMask(Mask, MVT::v8i32, DAG) && !V2.isUndef() &&
18407 !Subtarget.hasAVX512())
18408 return lowerShuffleAsSplitOrBlend(DL, MVT::v8i32, V1, V2, Mask, Subtarget,
18409 DAG);
18410
18411 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v8i32, V1, V2, Mask,
18412 Zeroable, Subtarget, DAG))
18413 return Blend;
18414
18415 // Check for being able to broadcast a single element.
18416 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v8i32, V1, V2, Mask,
18417 Subtarget, DAG))
18418 return Broadcast;
18419
18420 // If the shuffle mask is repeated in each 128-bit lane we can use more
18421 // efficient instructions that mirror the shuffles across the two 128-bit
18422 // lanes.
18423 SmallVector<int, 4> RepeatedMask;
18424 bool Is128BitLaneRepeatedShuffle =
18425 is128BitLaneRepeatedShuffleMask(MVT::v8i32, Mask, RepeatedMask);
18426 if (Is128BitLaneRepeatedShuffle) {
18427     assert(RepeatedMask.size() == 4 && "Unexpected repeated mask size!");
18428 if (V2.isUndef())
18429 return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v8i32, V1,
18430 getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
18431
18432 // Use dedicated unpack instructions for masks that match their pattern.
18433 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v8i32, Mask, V1, V2, DAG))
18434 return V;
18435 }
18436
18437 // Try to use shift instructions.
18438 if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v8i32, V1, V2, Mask,
18439 Zeroable, Subtarget, DAG))
18440 return Shift;
18441
18442 // If we have VLX support, we can use VALIGN or EXPAND.
18443 if (Subtarget.hasVLX()) {
18444 if (SDValue Rotate = lowerShuffleAsVALIGN(DL, MVT::v8i32, V1, V2, Mask,
18445 Subtarget, DAG))
18446 return Rotate;
18447
18448 if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v8i32, Zeroable, Mask, V1, V2,
18449 DAG, Subtarget))
18450 return V;
18451 }
18452
18453 // Try to use byte rotation instructions.
18454 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v8i32, V1, V2, Mask,
18455 Subtarget, DAG))
18456 return Rotate;
18457
18458 // Try to create an in-lane repeating shuffle mask and then shuffle the
18459 // results into the target lanes.
18460 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
18461 DL, MVT::v8i32, V1, V2, Mask, Subtarget, DAG))
18462 return V;
18463
18464 if (V2.isUndef()) {
18465 // Try to produce a fixed cross-128-bit lane permute followed by unpack
18466 // because that should be faster than the variable permute alternatives.
18467 if (SDValue V = lowerShuffleWithUNPCK256(DL, MVT::v8i32, Mask, V1, V2, DAG))
18468 return V;
18469
18470 // If the shuffle patterns aren't repeated but it's a single input, directly
18471 // generate a cross-lane VPERMD instruction.
18472 SDValue VPermMask = getConstVector(Mask, MVT::v8i32, DAG, DL, true);
18473 return DAG.getNode(X86ISD::VPERMV, DL, MVT::v8i32, VPermMask, V1);
18474 }
18475
18476 // Assume that a single SHUFPS is faster than an alternative sequence of
18477 // multiple instructions (even if the CPU has a domain penalty).
18478 // If some CPU is harmed by the domain switch, we can fix it in a later pass.
18479 if (Is128BitLaneRepeatedShuffle && isSingleSHUFPSMask(RepeatedMask)) {
18480 SDValue CastV1 = DAG.getBitcast(MVT::v8f32, V1);
18481 SDValue CastV2 = DAG.getBitcast(MVT::v8f32, V2);
18482 SDValue ShufPS = lowerShuffleWithSHUFPS(DL, MVT::v8f32, RepeatedMask,
18483 CastV1, CastV2, DAG);
18484 return DAG.getBitcast(MVT::v8i32, ShufPS);
18485 }
18486
18487 // Try to simplify this by merging 128-bit lanes to enable a lane-based
18488 // shuffle.
18489 if (SDValue Result = lowerShuffleAsLanePermuteAndRepeatedMask(
18490 DL, MVT::v8i32, V1, V2, Mask, Subtarget, DAG))
18491 return Result;
18492
18493 // Otherwise fall back on generic blend lowering.
18494 return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v8i32, V1, V2, Mask,
18495 Subtarget, DAG);
18496}
18497
18498/// Handle lowering of 16-lane 16-bit integer shuffles.
18499///
18500/// This routine is only called when we have AVX2 and thus a reasonable
18501/// instruction set for v16i16 shuffling.
18502static SDValue lowerV16I16Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
18503 const APInt &Zeroable, SDValue V1, SDValue V2,
18504 const X86Subtarget &Subtarget,
18505 SelectionDAG &DAG) {
18506   assert(V1.getSimpleValueType() == MVT::v16i16 && "Bad operand type!");
18507   assert(V2.getSimpleValueType() == MVT::v16i16 && "Bad operand type!");
18508   assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
18509   assert(Subtarget.hasAVX2() && "We can only lower v16i16 with AVX2!");
18510
18511 // Whenever we can lower this as a zext, that instruction is strictly faster
18512 // than any alternative. It also allows us to fold memory operands into the
18513 // shuffle in many cases.
18514 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(
18515 DL, MVT::v16i16, V1, V2, Mask, Zeroable, Subtarget, DAG))
18516 return ZExt;
18517
18518 // Check for being able to broadcast a single element.
18519 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v16i16, V1, V2, Mask,
18520 Subtarget, DAG))
18521 return Broadcast;
18522
18523 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v16i16, V1, V2, Mask,
18524 Zeroable, Subtarget, DAG))
18525 return Blend;
18526
18527 // Use dedicated unpack instructions for masks that match their pattern.
18528 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v16i16, Mask, V1, V2, DAG))
18529 return V;
18530
18531 // Use dedicated pack instructions for masks that match their pattern.
18532 if (SDValue V = lowerShuffleWithPACK(DL, MVT::v16i16, Mask, V1, V2, DAG,
18533 Subtarget))
18534 return V;
18535
18536   // Try to lower using a truncation.
18537 if (SDValue V = lowerShuffleAsVTRUNC(DL, MVT::v16i16, V1, V2, Mask, Zeroable,
18538 Subtarget, DAG))
18539 return V;
18540
18541 // Try to use shift instructions.
18542 if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v16i16, V1, V2, Mask,
18543 Zeroable, Subtarget, DAG))
18544 return Shift;
18545
18546 // Try to use byte rotation instructions.
18547 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v16i16, V1, V2, Mask,
18548 Subtarget, DAG))
18549 return Rotate;
18550
18551 // Try to create an in-lane repeating shuffle mask and then shuffle the
18552 // results into the target lanes.
18553 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
18554 DL, MVT::v16i16, V1, V2, Mask, Subtarget, DAG))
18555 return V;
18556
18557 if (V2.isUndef()) {
18558 // Try to use bit rotation instructions.
18559 if (SDValue Rotate =
18560 lowerShuffleAsBitRotate(DL, MVT::v16i16, V1, Mask, Subtarget, DAG))
18561 return Rotate;
18562
18563 // Try to produce a fixed cross-128-bit lane permute followed by unpack
18564 // because that should be faster than the variable permute alternatives.
18565 if (SDValue V = lowerShuffleWithUNPCK256(DL, MVT::v16i16, Mask, V1, V2, DAG))
18566 return V;
18567
18568 // There are no generalized cross-lane shuffle operations available on i16
18569 // element types.
18570 if (is128BitLaneCrossingShuffleMask(MVT::v16i16, Mask)) {
18571 if (SDValue V = lowerShuffleAsLanePermuteAndPermute(
18572 DL, MVT::v16i16, V1, V2, Mask, DAG, Subtarget))
18573 return V;
18574
18575 return lowerShuffleAsLanePermuteAndShuffle(DL, MVT::v16i16, V1, V2, Mask,
18576 DAG, Subtarget);
18577 }
18578
18579 SmallVector<int, 8> RepeatedMask;
18580 if (is128BitLaneRepeatedShuffleMask(MVT::v16i16, Mask, RepeatedMask)) {
18581 // As this is a single-input shuffle, the repeated mask should be
18582 // a strictly valid v8i16 mask that we can pass through to the v8i16
18583 // lowering to handle even the v16 case.
18584 return lowerV8I16GeneralSingleInputShuffle(
18585 DL, MVT::v16i16, V1, RepeatedMask, Subtarget, DAG);
18586 }
18587 }
18588
18589 if (SDValue PSHUFB = lowerShuffleWithPSHUFB(DL, MVT::v16i16, Mask, V1, V2,
18590 Zeroable, Subtarget, DAG))
18591 return PSHUFB;
18592
18593 // AVX512BW can lower to VPERMW (non-VLX will pad to v32i16).
18594 if (Subtarget.hasBWI())
18595 return lowerShuffleWithPERMV(DL, MVT::v16i16, Mask, V1, V2, Subtarget, DAG);
18596
18597 // Try to simplify this by merging 128-bit lanes to enable a lane-based
18598 // shuffle.
18599 if (SDValue Result = lowerShuffleAsLanePermuteAndRepeatedMask(
18600 DL, MVT::v16i16, V1, V2, Mask, Subtarget, DAG))
18601 return Result;
18602
18603 // Try to permute the lanes and then use a per-lane permute.
18604 if (SDValue V = lowerShuffleAsLanePermuteAndPermute(
18605 DL, MVT::v16i16, V1, V2, Mask, DAG, Subtarget))
18606 return V;
18607
18608 // Try to match an interleave of two v16i16s and lower them as unpck and
18609 // permutes using ymms.
18610 if (!Subtarget.hasAVX512())
18611 if (SDValue V = lowerShufflePairAsUNPCKAndPermute(DL, MVT::v16i16, V1, V2,
18612 Mask, DAG))
18613 return V;
18614
18615 // Otherwise fall back on generic lowering.
18616 return lowerShuffleAsSplitOrBlend(DL, MVT::v16i16, V1, V2, Mask,
18617 Subtarget, DAG);
18618}
18619
18620/// Handle lowering of 32-lane 8-bit integer shuffles.
18621///
18622/// This routine is only called when we have AVX2 and thus a reasonable
18623/// instruction set for v32i8 shuffling.
18624static SDValue lowerV32I8Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
18625 const APInt &Zeroable, SDValue V1, SDValue V2,
18626 const X86Subtarget &Subtarget,
18627 SelectionDAG &DAG) {
18628   assert(V1.getSimpleValueType() == MVT::v32i8 && "Bad operand type!");
18629   assert(V2.getSimpleValueType() == MVT::v32i8 && "Bad operand type!");
18630   assert(Mask.size() == 32 && "Unexpected mask size for v32 shuffle!");
18631   assert(Subtarget.hasAVX2() && "We can only lower v32i8 with AVX2!");
18632
18633 // Whenever we can lower this as a zext, that instruction is strictly faster
18634 // than any alternative. It also allows us to fold memory operands into the
18635 // shuffle in many cases.
18636 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(DL, MVT::v32i8, V1, V2, Mask,
18637 Zeroable, Subtarget, DAG))
18638 return ZExt;
18639
18640 // Check for being able to broadcast a single element.
18641 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v32i8, V1, V2, Mask,
18642 Subtarget, DAG))
18643 return Broadcast;
18644
18645 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v32i8, V1, V2, Mask,
18646 Zeroable, Subtarget, DAG))
18647 return Blend;
18648
18649 // Use dedicated unpack instructions for masks that match their pattern.
18650 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v32i8, Mask, V1, V2, DAG))
18651 return V;
18652
18653 // Use dedicated pack instructions for masks that match their pattern.
18654 if (SDValue V = lowerShuffleWithPACK(DL, MVT::v32i8, Mask, V1, V2, DAG,
18655 Subtarget))
18656 return V;
18657
18658   // Try to lower using a truncation.
18659 if (SDValue V = lowerShuffleAsVTRUNC(DL, MVT::v32i8, V1, V2, Mask, Zeroable,
18660 Subtarget, DAG))
18661 return V;
18662
18663 // Try to use shift instructions.
18664 if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v32i8, V1, V2, Mask,
18665 Zeroable, Subtarget, DAG))
18666 return Shift;
18667
18668 // Try to use byte rotation instructions.
18669 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v32i8, V1, V2, Mask,
18670 Subtarget, DAG))
18671 return Rotate;
18672
18673 // Try to use bit rotation instructions.
18674 if (V2.isUndef())
18675 if (SDValue Rotate =
18676 lowerShuffleAsBitRotate(DL, MVT::v32i8, V1, Mask, Subtarget, DAG))
18677 return Rotate;
18678
18679 // Try to create an in-lane repeating shuffle mask and then shuffle the
18680 // results into the target lanes.
18681 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
18682 DL, MVT::v32i8, V1, V2, Mask, Subtarget, DAG))
18683 return V;
18684
18685 // There are no generalized cross-lane shuffle operations available on i8
18686 // element types.
18687 if (V2.isUndef() && is128BitLaneCrossingShuffleMask(MVT::v32i8, Mask)) {
18688 // Try to produce a fixed cross-128-bit lane permute followed by unpack
18689 // because that should be faster than the variable permute alternatives.
18690 if (SDValue V = lowerShuffleWithUNPCK256(DL, MVT::v32i8, Mask, V1, V2, DAG))
18691 return V;
18692
18693 if (SDValue V = lowerShuffleAsLanePermuteAndPermute(
18694 DL, MVT::v32i8, V1, V2, Mask, DAG, Subtarget))
18695 return V;
18696
18697 return lowerShuffleAsLanePermuteAndShuffle(DL, MVT::v32i8, V1, V2, Mask,
18698 DAG, Subtarget);
18699 }
18700
18701 if (SDValue PSHUFB = lowerShuffleWithPSHUFB(DL, MVT::v32i8, Mask, V1, V2,
18702 Zeroable, Subtarget, DAG))
18703 return PSHUFB;
18704
18705 // AVX512VBMI can lower to VPERMB (non-VLX will pad to v64i8).
18706 if (Subtarget.hasVBMI())
18707 return lowerShuffleWithPERMV(DL, MVT::v32i8, Mask, V1, V2, Subtarget, DAG);
18708
18709 // Try to simplify this by merging 128-bit lanes to enable a lane-based
18710 // shuffle.
18711 if (SDValue Result = lowerShuffleAsLanePermuteAndRepeatedMask(
18712 DL, MVT::v32i8, V1, V2, Mask, Subtarget, DAG))
18713 return Result;
18714
18715 // Try to permute the lanes and then use a per-lane permute.
18716 if (SDValue V = lowerShuffleAsLanePermuteAndPermute(
18717 DL, MVT::v32i8, V1, V2, Mask, DAG, Subtarget))
18718 return V;
18719
18720   // Look for {0, 8, 16, 24, 32, 40, 48, 56} in the first 8 elements, followed
18721 // by zeroable elements in the remaining 24 elements. Turn this into two
18722 // vmovqb instructions shuffled together.
18723 if (Subtarget.hasVLX())
18724 if (SDValue V = lowerShuffleAsVTRUNCAndUnpack(DL, MVT::v32i8, V1, V2,
18725 Mask, Zeroable, DAG))
18726 return V;
18727
18728 // Try to match an interleave of two v32i8s and lower them as unpck and
18729 // permutes using ymms.
18730 if (!Subtarget.hasAVX512())
18731 if (SDValue V = lowerShufflePairAsUNPCKAndPermute(DL, MVT::v32i8, V1, V2,
18732 Mask, DAG))
18733 return V;
18734
18735 // Otherwise fall back on generic lowering.
18736 return lowerShuffleAsSplitOrBlend(DL, MVT::v32i8, V1, V2, Mask,
18737 Subtarget, DAG);
18738}
18739
18740/// High-level routine to lower various 256-bit x86 vector shuffles.
18741///
18742/// This routine either breaks down the specific type of a 256-bit x86 vector
18743/// shuffle or splits it into two 128-bit shuffles and fuses the results back
18744/// together based on the available instructions.
18745static SDValue lower256BitShuffle(const SDLoc &DL, ArrayRef<int> Mask, MVT VT,
18746 SDValue V1, SDValue V2, const APInt &Zeroable,
18747 const X86Subtarget &Subtarget,
18748 SelectionDAG &DAG) {
18749 // If we have a single input to the zero element, insert that into V1 if we
18750 // can do so cheaply.
18751 int NumElts = VT.getVectorNumElements();
18752 int NumV2Elements = count_if(Mask, [NumElts](int M) { return M >= NumElts; });
18753
18754 if (NumV2Elements == 1 && Mask[0] >= NumElts)
18755 if (SDValue Insertion = lowerShuffleAsElementInsertion(
18756 DL, VT, V1, V2, Mask, Zeroable, Subtarget, DAG))
18757 return Insertion;
18758
18759 // Handle special cases where the lower or upper half is UNDEF.
18760 if (SDValue V =
18761 lowerShuffleWithUndefHalf(DL, VT, V1, V2, Mask, Subtarget, DAG))
18762 return V;
18763
18764 // There is a really nice hard cut-over between AVX1 and AVX2 that means we
18765 // can check for those subtargets here and avoid much of the subtarget
18766 // querying in the per-vector-type lowering routines. With AVX1 we have
18767 // essentially *zero* ability to manipulate a 256-bit vector with integer
18768 // types. Since we'll use floating point types there eventually, just
18769 // immediately cast everything to a float and operate entirely in that domain.
18770 if (VT.isInteger() && !Subtarget.hasAVX2()) {
18771 int ElementBits = VT.getScalarSizeInBits();
18772 if (ElementBits < 32) {
18773       // No floating point type available; if we can't use the bit operations
18774 // for masking/blending then decompose into 128-bit vectors.
18775 if (SDValue V = lowerShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable,
18776 Subtarget, DAG))
18777 return V;
18778 if (SDValue V = lowerShuffleAsBitBlend(DL, VT, V1, V2, Mask, DAG))
18779 return V;
18780 return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG);
18781 }
18782
18783 MVT FpVT = MVT::getVectorVT(MVT::getFloatingPointVT(ElementBits),
18784 VT.getVectorNumElements());
18785 V1 = DAG.getBitcast(FpVT, V1);
18786 V2 = DAG.getBitcast(FpVT, V2);
18787 return DAG.getBitcast(VT, DAG.getVectorShuffle(FpVT, DL, V1, V2, Mask));
18788 }
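  // Illustrative note (not part of the original source): without AVX2 a
  // v4i64 or v8i32 shuffle is simply re-queried as v4f64/v8f32 via the
  // bitcasts above, while the sub-32-bit types (v16i16, v32i8) either become
  // a bit mask / bit blend or are split into two 128-bit shuffles, since
  // AVX1 has essentially no 256-bit integer shuffle support.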
18789
18790 if (VT == MVT::v16f16) {
18791 V1 = DAG.getBitcast(MVT::v16i16, V1);
18792 V2 = DAG.getBitcast(MVT::v16i16, V2);
18793 return DAG.getBitcast(MVT::v16f16,
18794 DAG.getVectorShuffle(MVT::v16i16, DL, V1, V2, Mask));
18795 }
18796
18797 switch (VT.SimpleTy) {
18798 case MVT::v4f64:
18799 return lowerV4F64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
18800 case MVT::v4i64:
18801 return lowerV4I64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
18802 case MVT::v8f32:
18803 return lowerV8F32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
18804 case MVT::v8i32:
18805 return lowerV8I32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
18806 case MVT::v16i16:
18807 return lowerV16I16Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
18808 case MVT::v32i8:
18809 return lowerV32I8Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
18810
18811 default:
18812     llvm_unreachable("Not a valid 256-bit x86 vector type!");
18813 }
18814}
18815
18816/// Try to lower a vector shuffle as a shuffle of 128-bit subvectors.
18817static SDValue lowerV4X128Shuffle(const SDLoc &DL, MVT VT, ArrayRef<int> Mask,
18818 const APInt &Zeroable, SDValue V1, SDValue V2,
18819 const X86Subtarget &Subtarget,
18820 SelectionDAG &DAG) {
18821   assert(VT.getScalarSizeInBits() == 64 &&
18822          "Unexpected element type size for 128bit shuffle.");
18823
18824   // Handling a 256-bit vector requires VLX, and lowerV2X128VectorShuffle()
18825   // is most probably the better solution for that case.
18826   assert(VT.is512BitVector() && "Unexpected vector size for 512bit shuffle.");
18827
18828 // TODO - use Zeroable like we do for lowerV2X128VectorShuffle?
18829 SmallVector<int, 4> Widened128Mask;
18830 if (!canWidenShuffleElements(Mask, Widened128Mask))
18831 return SDValue();
18832   assert(Widened128Mask.size() == 4 && "Shuffle widening mismatch");
18833
18834 // Try to use an insert into a zero vector.
18835 if (Widened128Mask[0] == 0 && (Zeroable & 0xf0) == 0xf0 &&
18836 (Widened128Mask[1] == 1 || (Zeroable & 0x0c) == 0x0c)) {
18837 unsigned NumElts = ((Zeroable & 0x0c) == 0x0c) ? 2 : 4;
18838 MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), NumElts);
18839 SDValue LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V1,
18840 DAG.getIntPtrConstant(0, DL));
18841 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT,
18842 getZeroVector(VT, Subtarget, DAG, DL), LoV,
18843 DAG.getIntPtrConstant(0, DL));
18844 }
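  // Illustrative note (not part of the original source): Zeroable here has
  // one bit per 64-bit element, so (Zeroable & 0xf0) == 0xf0 means elements
  // 4-7 (the upper 256 bits) are zeroable, and (Zeroable & 0x0c) == 0x0c
  // additionally means elements 2-3 are zeroable, in which case only the low
  // 128 bits of V1 need to be inserted into the zero vector above.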
18845
18846 // Check for patterns which can be matched with a single insert of a 256-bit
18847 // subvector.
18848 bool OnlyUsesV1 = isShuffleEquivalent(Mask, {0, 1, 2, 3, 0, 1, 2, 3}, V1, V2);
18849 if (OnlyUsesV1 ||
18850 isShuffleEquivalent(Mask, {0, 1, 2, 3, 8, 9, 10, 11}, V1, V2)) {
18851 MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), 4);
18852 SDValue SubVec =
18853 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, OnlyUsesV1 ? V1 : V2,
18854 DAG.getIntPtrConstant(0, DL));
18855 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, V1, SubVec,
18856 DAG.getIntPtrConstant(4, DL));
18857 }
18858
18859 // See if this is an insertion of the lower 128-bits of V2 into V1.
18860 bool IsInsert = true;
18861 int V2Index = -1;
18862 for (int i = 0; i < 4; ++i) {
18863     assert(Widened128Mask[i] >= -1 && "Illegal shuffle sentinel value");
18864 if (Widened128Mask[i] < 0)
18865 continue;
18866
18867 // Make sure all V1 subvectors are in place.
18868 if (Widened128Mask[i] < 4) {
18869 if (Widened128Mask[i] != i) {
18870 IsInsert = false;
18871 break;
18872 }
18873 } else {
18874       // Make sure we only have a single V2 index and it's the lowest 128 bits.
18875 if (V2Index >= 0 || Widened128Mask[i] != 4) {
18876 IsInsert = false;
18877 break;
18878 }
18879 V2Index = i;
18880 }
18881 }
18882 if (IsInsert && V2Index >= 0) {
18883 MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), 2);
18884 SDValue Subvec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V2,
18885 DAG.getIntPtrConstant(0, DL));
18886 return insert128BitVector(V1, Subvec, V2Index * 2, DAG, DL);
18887 }
18888
18889   // See if we can widen to a 256-bit lane shuffle. We're going to lose 128-lane
18890 // UNDEF info by lowering to X86ISD::SHUF128 anyway, so by widening where
18891 // possible we at least ensure the lanes stay sequential to help later
18892 // combines.
18893 SmallVector<int, 2> Widened256Mask;
18894 if (canWidenShuffleElements(Widened128Mask, Widened256Mask)) {
18895 Widened128Mask.clear();
18896 narrowShuffleMaskElts(2, Widened256Mask, Widened128Mask);
18897 }
18898
18899 // Try to lower to vshuf64x2/vshuf32x4.
18900 SDValue Ops[2] = {DAG.getUNDEF(VT), DAG.getUNDEF(VT)};
18901 unsigned PermMask = 0;
18902   // Ensure elements came from the same Op.
18903 for (int i = 0; i < 4; ++i) {
18904     assert(Widened128Mask[i] >= -1 && "Illegal shuffle sentinel value");
18905 if (Widened128Mask[i] < 0)
18906 continue;
18907
18908 SDValue Op = Widened128Mask[i] >= 4 ? V2 : V1;
18909 unsigned OpIndex = i / 2;
18910 if (Ops[OpIndex].isUndef())
18911 Ops[OpIndex] = Op;
18912 else if (Ops[OpIndex] != Op)
18913 return SDValue();
18914
18915 // Convert the 128-bit shuffle mask selection values into 128-bit selection
18916 // bits defined by a vshuf64x2 instruction's immediate control byte.
18917 PermMask |= (Widened128Mask[i] % 4) << (i * 2);
18918 }
18919
18920 return DAG.getNode(X86ISD::SHUF128, DL, VT, Ops[0], Ops[1],
18921 DAG.getTargetConstant(PermMask, DL, MVT::i8));
18922}
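// Illustrative worked example (not part of the original source): with
// Widened128Mask = {2, 3, 4, 5}, result chunks 0-1 come from V1 and chunks
// 2-3 from V2, so Ops = {V1, V2} and
//   PermMask = (2%4) | (3%4)<<2 | (4%4)<<4 | (5%4)<<6 = 0x4E,
// i.e. SHUF128 selects V1's upper two 128-bit chunks followed by V2's lower
// two chunks.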
18923
18924/// Handle lowering of 8-lane 64-bit floating point shuffles.
18925static SDValue lowerV8F64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
18926 const APInt &Zeroable, SDValue V1, SDValue V2,
18927 const X86Subtarget &Subtarget,
18928 SelectionDAG &DAG) {
18929   assert(V1.getSimpleValueType() == MVT::v8f64 && "Bad operand type!");
18930   assert(V2.getSimpleValueType() == MVT::v8f64 && "Bad operand type!");
18931   assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
18932
18933 if (V2.isUndef()) {
18934 // Use low duplicate instructions for masks that match their pattern.
18935 if (isShuffleEquivalent(Mask, {0, 0, 2, 2, 4, 4, 6, 6}, V1, V2))
18936 return DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v8f64, V1);
18937
18938 if (!is128BitLaneCrossingShuffleMask(MVT::v8f64, Mask)) {
18939 // Non-half-crossing single input shuffles can be lowered with an
18940 // interleaved permutation.
18941 unsigned VPERMILPMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1) |
18942 ((Mask[2] == 3) << 2) | ((Mask[3] == 3) << 3) |
18943 ((Mask[4] == 5) << 4) | ((Mask[5] == 5) << 5) |
18944 ((Mask[6] == 7) << 6) | ((Mask[7] == 7) << 7);
18945 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v8f64, V1,
18946 DAG.getTargetConstant(VPERMILPMask, DL, MVT::i8));
18947 }
18948
18949 SmallVector<int, 4> RepeatedMask;
18950 if (is256BitLaneRepeatedShuffleMask(MVT::v8f64, Mask, RepeatedMask))
18951 return DAG.getNode(X86ISD::VPERMI, DL, MVT::v8f64, V1,
18952 getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
18953 }
18954
18955 if (SDValue Shuf128 = lowerV4X128Shuffle(DL, MVT::v8f64, Mask, Zeroable, V1,
18956 V2, Subtarget, DAG))
18957 return Shuf128;
18958
18959 if (SDValue Unpck = lowerShuffleWithUNPCK(DL, MVT::v8f64, Mask, V1, V2, DAG))
18960 return Unpck;
18961
18962 // Check if the blend happens to exactly fit that of SHUFPD.
18963 if (SDValue Op = lowerShuffleWithSHUFPD(DL, MVT::v8f64, V1, V2, Mask,
18964 Zeroable, Subtarget, DAG))
18965 return Op;
18966
18967 if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v8f64, Zeroable, Mask, V1, V2,
18968 DAG, Subtarget))
18969 return V;
18970
18971 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v8f64, V1, V2, Mask,
18972 Zeroable, Subtarget, DAG))
18973 return Blend;
18974
18975 return lowerShuffleWithPERMV(DL, MVT::v8f64, Mask, V1, V2, Subtarget, DAG);
18976}
18977
18978/// Handle lowering of 16-lane 32-bit floating point shuffles.
18979static SDValue lowerV16F32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
18980 const APInt &Zeroable, SDValue V1, SDValue V2,
18981 const X86Subtarget &Subtarget,
18982 SelectionDAG &DAG) {
18983   assert(V1.getSimpleValueType() == MVT::v16f32 && "Bad operand type!");
18984   assert(V2.getSimpleValueType() == MVT::v16f32 && "Bad operand type!");
18985   assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
18986
18987 // If the shuffle mask is repeated in each 128-bit lane, we have many more
18988 // options to efficiently lower the shuffle.
18989 SmallVector<int, 4> RepeatedMask;
18990 if (is128BitLaneRepeatedShuffleMask(MVT::v16f32, Mask, RepeatedMask)) {
18991     assert(RepeatedMask.size() == 4 && "Unexpected repeated mask size!");
18992
18993 // Use even/odd duplicate instructions for masks that match their pattern.
18994 if (isShuffleEquivalent(RepeatedMask, {0, 0, 2, 2}, V1, V2))
18995 return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v16f32, V1);
18996 if (isShuffleEquivalent(RepeatedMask, {1, 1, 3, 3}, V1, V2))
18997 return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v16f32, V1);
18998
18999 if (V2.isUndef())
19000 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v16f32, V1,
19001 getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
19002
19003 // Use dedicated unpack instructions for masks that match their pattern.
19004 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v16f32, Mask, V1, V2, DAG))
19005 return V;
19006
19007 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v16f32, V1, V2, Mask,
19008 Zeroable, Subtarget, DAG))
19009 return Blend;
19010
19011 // Otherwise, fall back to a SHUFPS sequence.
19012 return lowerShuffleWithSHUFPS(DL, MVT::v16f32, RepeatedMask, V1, V2, DAG);
19013 }
19014
19015 // Try to create an in-lane repeating shuffle mask and then shuffle the
19016 // results into the target lanes.
19017 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
19018 DL, MVT::v16f32, V1, V2, Mask, Subtarget, DAG))
19019 return V;
19020
19021 // If we have a single-input shuffle with different shuffle patterns in the
19022 // 128-bit lanes but no lane crossing, use a variable-mask VPERMILPS.
19023 if (V2.isUndef() &&
19024 !is128BitLaneCrossingShuffleMask(MVT::v16f32, Mask)) {
19025 SDValue VPermMask = getConstVector(Mask, MVT::v16i32, DAG, DL, true);
19026 return DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v16f32, V1, VPermMask);
19027 }
19028
19029 // If we have AVX512F support, we can use VEXPAND.
19030 if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v16f32, Zeroable, Mask,
19031 V1, V2, DAG, Subtarget))
19032 return V;
19033
19034 return lowerShuffleWithPERMV(DL, MVT::v16f32, Mask, V1, V2, Subtarget, DAG);
19035}
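// --- Illustrative sketch (hypothetical helper, not the LLVM implementation) ---
// What "repeated in each 128-bit lane" means for a v16f32 mask, simplified to
// the single-input case: with 4 floats per 128-bit lane, every lane must apply
// the same 4-element pattern at its own offset, and undef (-1) matches
// anything. The real is128BitLaneRepeatedShuffleMask also handles two inputs.
#include <cstddef>
#include <vector>

static bool isRepeatedPer128BitLane(const std::vector<int> &Mask,
                                    std::vector<int> &Repeated) {
  const int EltsPerLane = 4;                   // 4 x f32 per 128-bit lane
  Repeated.assign(EltsPerLane, -1);
  for (size_t i = 0; i != Mask.size(); ++i) {
    int M = Mask[i];
    if (M < 0)
      continue;                                // undef: no constraint
    if (M / EltsPerLane != int(i) / EltsPerLane)
      return false;                            // crosses a 128-bit lane
    int &R = Repeated[i % EltsPerLane];
    if (R < 0)
      R = M % EltsPerLane;                     // first lane fixes the pattern
    else if (R != M % EltsPerLane)
      return false;                            // lanes disagree
  }
  return true;
}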
19036
19037/// Handle lowering of 8-lane 64-bit integer shuffles.
19038static SDValue lowerV8I64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
19039 const APInt &Zeroable, SDValue V1, SDValue V2,
19040 const X86Subtarget &Subtarget,
19041 SelectionDAG &DAG) {
19042 assert(V1.getSimpleValueType() == MVT::v8i64 && "Bad operand type!");
19043 assert(V2.getSimpleValueType() == MVT::v8i64 && "Bad operand type!");
19044 assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
19045
19046 if (V2.isUndef()) {
19047 // When the shuffle is mirrored between the 128-bit lanes of the unit, we
19048 // can use lower latency instructions that will operate on all four
19049 // 128-bit lanes.
19050 SmallVector<int, 2> Repeated128Mask;
19051 if (is128BitLaneRepeatedShuffleMask(MVT::v8i64, Mask, Repeated128Mask)) {
19052 SmallVector<int, 4> PSHUFDMask;
19053 narrowShuffleMaskElts(2, Repeated128Mask, PSHUFDMask);
19054 return DAG.getBitcast(
19055 MVT::v8i64,
19056 DAG.getNode(X86ISD::PSHUFD, DL, MVT::v16i32,
19057 DAG.getBitcast(MVT::v16i32, V1),
19058 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
19059 }
19060
19061 SmallVector<int, 4> Repeated256Mask;
19062 if (is256BitLaneRepeatedShuffleMask(MVT::v8i64, Mask, Repeated256Mask))
19063 return DAG.getNode(X86ISD::VPERMI, DL, MVT::v8i64, V1,
19064 getV4X86ShuffleImm8ForMask(Repeated256Mask, DL, DAG));
19065 }
19066
19067 if (SDValue Shuf128 = lowerV4X128Shuffle(DL, MVT::v8i64, Mask, Zeroable, V1,
19068 V2, Subtarget, DAG))
19069 return Shuf128;
19070
19071 // Try to use shift instructions.
19072 if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v8i64, V1, V2, Mask,
19073 Zeroable, Subtarget, DAG))
19074 return Shift;
19075
19076 // Try to use VALIGN.
19077 if (SDValue Rotate = lowerShuffleAsVALIGN(DL, MVT::v8i64, V1, V2, Mask,
19078 Subtarget, DAG))
19079 return Rotate;
19080
19081 // Try to use PALIGNR.
19082 if (Subtarget.hasBWI())
19083 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v8i64, V1, V2, Mask,
19084 Subtarget, DAG))
19085 return Rotate;
19086
19087 if (SDValue Unpck = lowerShuffleWithUNPCK(DL, MVT::v8i64, Mask, V1, V2, DAG))
19088 return Unpck;
19089
19090 // If we have AVX512F support, we can use VEXPAND.
19091 if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v8i64, Zeroable, Mask, V1, V2,
19092 DAG, Subtarget))
19093 return V;
19094
19095 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v8i64, V1, V2, Mask,
19096 Zeroable, Subtarget, DAG))
19097 return Blend;
19098
19099 return lowerShuffleWithPERMV(DL, MVT::v8i64, Mask, V1, V2, Subtarget, DAG);
19100}
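// --- Illustrative sketch (hypothetical helper) ---
// How the 2-element per-128-bit-lane v8i64 mask above is narrowed into the
// 4-element v16i32 PSHUFD mask: each 64-bit selection M expands into its two
// 32-bit halves {2*M, 2*M+1}, and undef (-1) expands into two undefs.
#include <vector>

static std::vector<int> narrowMaskEltsBy2(const std::vector<int> &WideMask) {
  std::vector<int> Narrow;
  Narrow.reserve(WideMask.size() * 2);
  for (int M : WideMask) {
    Narrow.push_back(M < 0 ? -1 : 2 * M);
    Narrow.push_back(M < 0 ? -1 : 2 * M + 1);
  }
  return Narrow;
}
// e.g. the per-lane i64 swap {1, 0} narrows to the PSHUFD pattern {2, 3, 0, 1}.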
19101
19102/// Handle lowering of 16-lane 32-bit integer shuffles.
19103static SDValue lowerV16I32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
19104 const APInt &Zeroable, SDValue V1, SDValue V2,
19105 const X86Subtarget &Subtarget,
19106 SelectionDAG &DAG) {
19107 assert(V1.getSimpleValueType() == MVT::v16i32 && "Bad operand type!");
19108 assert(V2.getSimpleValueType() == MVT::v16i32 && "Bad operand type!");
19109 assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
19110
19111 // Whenever we can lower this as a zext, that instruction is strictly faster
19112 // than any alternative. It also allows us to fold memory operands into the
19113 // shuffle in many cases.
19114 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(
19115 DL, MVT::v16i32, V1, V2, Mask, Zeroable, Subtarget, DAG))
19116 return ZExt;
19117
19118 // If the shuffle mask is repeated in each 128-bit lane we can use more
19119 // efficient instructions that mirror the shuffles across the four 128-bit
19120 // lanes.
19121 SmallVector<int, 4> RepeatedMask;
19122 bool Is128BitLaneRepeatedShuffle =
19123 is128BitLaneRepeatedShuffleMask(MVT::v16i32, Mask, RepeatedMask);
19124 if (Is128BitLaneRepeatedShuffle) {
19125 assert(RepeatedMask.size() == 4 && "Unexpected repeated mask size!");
19126 if (V2.isUndef())
19127 return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v16i32, V1,
19128 getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
19129
19130 // Use dedicated unpack instructions for masks that match their pattern.
19131 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v16i32, Mask, V1, V2, DAG))
19132 return V;
19133 }
19134
19135 // Try to use shift instructions.
19136 if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v16i32, V1, V2, Mask,
19137 Zeroable, Subtarget, DAG))
19138 return Shift;
19139
19140 // Try to use VALIGN.
19141 if (SDValue Rotate = lowerShuffleAsVALIGN(DL, MVT::v16i32, V1, V2, Mask,
19142 Subtarget, DAG))
19143 return Rotate;
19144
19145 // Try to use byte rotation instructions.
19146 if (Subtarget.hasBWI())
19147 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v16i32, V1, V2, Mask,
19148 Subtarget, DAG))
19149 return Rotate;
19150
19151 // Assume that a single SHUFPS is faster than using a permv shuffle.
19152 // If some CPU is harmed by the domain switch, we can fix it in a later pass.
19153 if (Is128BitLaneRepeatedShuffle && isSingleSHUFPSMask(RepeatedMask)) {
19154 SDValue CastV1 = DAG.getBitcast(MVT::v16f32, V1);
19155 SDValue CastV2 = DAG.getBitcast(MVT::v16f32, V2);
19156 SDValue ShufPS = lowerShuffleWithSHUFPS(DL, MVT::v16f32, RepeatedMask,
19157 CastV1, CastV2, DAG);
19158 return DAG.getBitcast(MVT::v16i32, ShufPS);
19159 }
19160
19161 // Try to create an in-lane repeating shuffle mask and then shuffle the
19162 // results into the target lanes.
19163 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
19164 DL, MVT::v16i32, V1, V2, Mask, Subtarget, DAG))
19165 return V;
19166
19167 // If we have AVX512F support, we can use VEXPAND.
19168 if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v16i32, Zeroable, Mask, V1, V2,
19169 DAG, Subtarget))
19170 return V;
19171
19172 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v16i32, V1, V2, Mask,
19173 Zeroable, Subtarget, DAG))
19174 return Blend;
19175
19176 return lowerShuffleWithPERMV(DL, MVT::v16i32, Mask, V1, V2, Subtarget, DAG);
19177}
19178
19179/// Handle lowering of 32-lane 16-bit integer shuffles.
19180static SDValue lowerV32I16Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
19181 const APInt &Zeroable, SDValue V1, SDValue V2,
19182 const X86Subtarget &Subtarget,
19183 SelectionDAG &DAG) {
19184 assert(V1.getSimpleValueType() == MVT::v32i16 && "Bad operand type!");
19185 assert(V2.getSimpleValueType() == MVT::v32i16 && "Bad operand type!");
19186 assert(Mask.size() == 32 && "Unexpected mask size for v32 shuffle!");
19187 assert(Subtarget.hasBWI() && "We can only lower v32i16 with AVX-512-BWI!");
19188
19189 // Whenever we can lower this as a zext, that instruction is strictly faster
19190 // than any alternative. It also allows us to fold memory operands into the
19191 // shuffle in many cases.
19192 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(
19193 DL, MVT::v32i16, V1, V2, Mask, Zeroable, Subtarget, DAG))
19194 return ZExt;
19195
19196 // Use dedicated unpack instructions for masks that match their pattern.
19197 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v32i16, Mask, V1, V2, DAG))
19198 return V;
19199
19200 // Use dedicated pack instructions for masks that match their pattern.
19201 if (SDValue V =
19202 lowerShuffleWithPACK(DL, MVT::v32i16, Mask, V1, V2, DAG, Subtarget))
19203 return V;
19204
19205 // Try to use shift instructions.
19206 if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v32i16, V1, V2, Mask,
19207 Zeroable, Subtarget, DAG))
19208 return Shift;
19209
19210 // Try to use byte rotation instructions.
19211 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v32i16, V1, V2, Mask,
19212 Subtarget, DAG))
19213 return Rotate;
19214
19215 if (V2.isUndef()) {
19216 // Try to use bit rotation instructions.
19217 if (SDValue Rotate =
19218 lowerShuffleAsBitRotate(DL, MVT::v32i16, V1, Mask, Subtarget, DAG))
19219 return Rotate;
19220
19221 SmallVector<int, 8> RepeatedMask;
19222 if (is128BitLaneRepeatedShuffleMask(MVT::v32i16, Mask, RepeatedMask)) {
19223 // As this is a single-input shuffle, the repeated mask should be
19224 // a strictly valid v8i16 mask that we can pass through to the v8i16
19225 // lowering to handle even the v32 case.
19226 return lowerV8I16GeneralSingleInputShuffle(DL, MVT::v32i16, V1,
19227 RepeatedMask, Subtarget, DAG);
19228 }
19229 }
19230
19231 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v32i16, V1, V2, Mask,
19232 Zeroable, Subtarget, DAG))
19233 return Blend;
19234
19235 if (SDValue PSHUFB = lowerShuffleWithPSHUFB(DL, MVT::v32i16, Mask, V1, V2,
19236 Zeroable, Subtarget, DAG))
19237 return PSHUFB;
19238
19239 return lowerShuffleWithPERMV(DL, MVT::v32i16, Mask, V1, V2, Subtarget, DAG);
19240}
19241
19242/// Handle lowering of 64-lane 8-bit integer shuffles.
19243static SDValue lowerV64I8Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
19244 const APInt &Zeroable, SDValue V1, SDValue V2,
19245 const X86Subtarget &Subtarget,
19246 SelectionDAG &DAG) {
19247 assert(V1.getSimpleValueType() == MVT::v64i8 && "Bad operand type!");
19248 assert(V2.getSimpleValueType() == MVT::v64i8 && "Bad operand type!");
19249 assert(Mask.size() == 64 && "Unexpected mask size for v64 shuffle!");
19250 assert(Subtarget.hasBWI() && "We can only lower v64i8 with AVX-512-BWI!");
19251
19252 // Whenever we can lower this as a zext, that instruction is strictly faster
19253 // than any alternative. It also allows us to fold memory operands into the
19254 // shuffle in many cases.
19255 if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(
19256 DL, MVT::v64i8, V1, V2, Mask, Zeroable, Subtarget, DAG))
19257 return ZExt;
19258
19259 // Use dedicated unpack instructions for masks that match their pattern.
19260 if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v64i8, Mask, V1, V2, DAG))
19261 return V;
19262
19263 // Use dedicated pack instructions for masks that match their pattern.
19264 if (SDValue V = lowerShuffleWithPACK(DL, MVT::v64i8, Mask, V1, V2, DAG,
19265 Subtarget))
19266 return V;
19267
19268 // Try to use shift instructions.
19269 if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v64i8, V1, V2, Mask,
19270 Zeroable, Subtarget, DAG))
19271 return Shift;
19272
19273 // Try to use byte rotation instructions.
19274 if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v64i8, V1, V2, Mask,
19275 Subtarget, DAG))
19276 return Rotate;
19277
19278 // Try to use bit rotation instructions.
19279 if (V2.isUndef())
19280 if (SDValue Rotate =
19281 lowerShuffleAsBitRotate(DL, MVT::v64i8, V1, Mask, Subtarget, DAG))
19282 return Rotate;
19283
19284 // Lower as AND if possible.
19285 if (SDValue Masked = lowerShuffleAsBitMask(DL, MVT::v64i8, V1, V2, Mask,
19286 Zeroable, Subtarget, DAG))
19287 return Masked;
19288
19289 if (SDValue PSHUFB = lowerShuffleWithPSHUFB(DL, MVT::v64i8, Mask, V1, V2,
19290 Zeroable, Subtarget, DAG))
19291 return PSHUFB;
19292
19293 // Try to create an in-lane repeating shuffle mask and then shuffle the
19294 // results into the target lanes.
19295 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
19296 DL, MVT::v64i8, V1, V2, Mask, Subtarget, DAG))
19297 return V;
19298
19299 if (SDValue Result = lowerShuffleAsLanePermuteAndPermute(
19300 DL, MVT::v64i8, V1, V2, Mask, DAG, Subtarget))
19301 return Result;
19302
19303 if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v64i8, V1, V2, Mask,
19304 Zeroable, Subtarget, DAG))
19305 return Blend;
19306
19307 if (!is128BitLaneCrossingShuffleMask(MVT::v64i8, Mask)) {
19308 // Use PALIGNR+Permute if possible - permute might become PSHUFB but the
19309 // PALIGNR will be cheaper than the second PSHUFB+OR.
19310 if (SDValue V = lowerShuffleAsByteRotateAndPermute(DL, MVT::v64i8, V1, V2,
19311 Mask, Subtarget, DAG))
19312 return V;
19313
19314 // If we can't directly blend but can use PSHUFB, that will be better as it
19315 // can both shuffle and set up the inefficient blend.
19316 bool V1InUse, V2InUse;
19317 return lowerShuffleAsBlendOfPSHUFBs(DL, MVT::v64i8, V1, V2, Mask, Zeroable,
19318 DAG, V1InUse, V2InUse);
19319 }
19320
19321 // Try to simplify this by merging 128-bit lanes to enable a lane-based
19322 // shuffle.
19323 if (!V2.isUndef())
19324 if (SDValue Result = lowerShuffleAsLanePermuteAndRepeatedMask(
19325 DL, MVT::v64i8, V1, V2, Mask, Subtarget, DAG))
19326 return Result;
19327
19328 // VBMI can use VPERMV/VPERMV3 byte shuffles.
19329 if (Subtarget.hasVBMI())
19330 return lowerShuffleWithPERMV(DL, MVT::v64i8, Mask, V1, V2, Subtarget, DAG);
19331
19332 return splitAndLowerShuffle(DL, MVT::v64i8, V1, V2, Mask, DAG);
19333}
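// --- Illustrative sketch (hypothetical helper, unary case) ---
// The rotation pattern the PALIGNR-style byte-rotate lowering above looks for:
// a mask is a rotate by R when every defined element i reads from (i + R)
// modulo the width. In practice PALIGNR applies per 128-bit lane; a single
// 16-byte vector is shown here for simplicity.
#include <vector>

static int matchUnaryRotate(const std::vector<int> &Mask) {
  int Size = (int)Mask.size(), Rot = -1;
  for (int i = 0; i != Size; ++i) {
    int M = Mask[i];
    if (M < 0)
      continue;                                // undef: no constraint
    int R = (M - i + Size) % Size;             // rotation implied by this lane
    if (Rot < 0)
      Rot = R;
    else if (Rot != R)
      return -1;                               // lanes imply different rotations
  }
  return Rot;                                  // -1 when all-undef or no match
}
// e.g. the v16i8 mask {3, 4, 5, ..., 15, 0, 1, 2} matches a rotate by 3.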
19334
19335/// High-level routine to lower various 512-bit x86 vector shuffles.
19336///
19337/// This routine either breaks down the specific type of a 512-bit x86 vector
19338/// shuffle or splits it into two 256-bit shuffles and fuses the results back
19339/// together based on the available instructions.
19340static SDValue lower512BitShuffle(const SDLoc &DL, ArrayRef<int> Mask,
19341 MVT VT, SDValue V1, SDValue V2,
19342 const APInt &Zeroable,
19343 const X86Subtarget &Subtarget,
19344 SelectionDAG &DAG) {
19345 assert(Subtarget.hasAVX512() &&
19346        "Cannot lower 512-bit vectors w/ basic ISA!");
19347
19348 // If we have a single input to the zero element, insert that into V1 if we
19349 // can do so cheaply.
19350 int NumElts = Mask.size();
19351 int NumV2Elements = count_if(Mask, [NumElts](int M) { return M >= NumElts; });
19352
19353 if (NumV2Elements == 1 && Mask[0] >= NumElts)
19354 if (SDValue Insertion = lowerShuffleAsElementInsertion(
19355 DL, VT, V1, V2, Mask, Zeroable, Subtarget, DAG))
19356 return Insertion;
19357
19358 // Handle special cases where the lower or upper half is UNDEF.
19359 if (SDValue V =
19360 lowerShuffleWithUndefHalf(DL, VT, V1, V2, Mask, Subtarget, DAG))
19361 return V;
19362
19363 // Check for being able to broadcast a single element.
19364 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, VT, V1, V2, Mask,
19365 Subtarget, DAG))
19366 return Broadcast;
19367
19368 if ((VT == MVT::v32i16 || VT == MVT::v64i8) && !Subtarget.hasBWI()) {
19369 // Try using bit ops for masking and blending before falling back to
19370 // splitting.
19371 if (SDValue V = lowerShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable,
19372 Subtarget, DAG))
19373 return V;
19374 if (SDValue V = lowerShuffleAsBitBlend(DL, VT, V1, V2, Mask, DAG))
19375 return V;
19376
19377 return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG);
19378 }
19379
19380 if (VT == MVT::v32f16) {
19381 V1 = DAG.getBitcast(MVT::v32i16, V1);
19382 V2 = DAG.getBitcast(MVT::v32i16, V2);
19383 return DAG.getBitcast(MVT::v32f16,
19384 DAG.getVectorShuffle(MVT::v32i16, DL, V1, V2, Mask));
19385 }
19386
19387 // Dispatch to each element type for lowering. If we don't have support for
19388 // specific element type shuffles at 512 bits, immediately split them and
19389 // lower them. Each lowering routine of a given type is allowed to assume that
19390 // the requisite ISA extensions for that element type are available.
19391 switch (VT.SimpleTy) {
19392 case MVT::v8f64:
19393 return lowerV8F64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
19394 case MVT::v16f32:
19395 return lowerV16F32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
19396 case MVT::v8i64:
19397 return lowerV8I64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
19398 case MVT::v16i32:
19399 return lowerV16I32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
19400 case MVT::v32i16:
19401 return lowerV32I16Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
19402 case MVT::v64i8:
19403 return lowerV64I8Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
19404
19405 default:
19406 llvm_unreachable("Not a valid 512-bit x86 vector type!")::llvm::llvm_unreachable_internal("Not a valid 512-bit x86 vector type!"
, "llvm/lib/Target/X86/X86ISelLowering.cpp", 19406)
;
19407 }
19408}
19409
19410static SDValue lower1BitShuffleAsKSHIFTR(const SDLoc &DL, ArrayRef<int> Mask,
19411 MVT VT, SDValue V1, SDValue V2,
19412 const X86Subtarget &Subtarget,
19413 SelectionDAG &DAG) {
19414 // Shuffle should be unary.
19415 if (!V2.isUndef())
19416 return SDValue();
19417
19418 int ShiftAmt = -1;
19419 int NumElts = Mask.size();
19420 for (int i = 0; i != NumElts; ++i) {
19421 int M = Mask[i];
19422 assert((M == SM_SentinelUndef || (0 <= M && M < NumElts)) &&
19423        "Unexpected mask index.");
19424 if (M < 0)
19425 continue;
19426
19427 // The first non-undef element determines our shift amount.
19428 if (ShiftAmt < 0) {
19429 ShiftAmt = M - i;
19430 // Need to be shifting right.
19431 if (ShiftAmt <= 0)
19432 return SDValue();
19433 }
19434 // All non-undef elements must shift by the same amount.
19435 if (ShiftAmt != M - i)
19436 return SDValue();
19437 }
19438 assert(ShiftAmt >= 0 && "All undef?");
19439
19440 // Great we found a shift right.
19441 MVT WideVT = VT;
19442 if ((!Subtarget.hasDQI() && NumElts == 8) || NumElts < 8)
19443 WideVT = Subtarget.hasDQI() ? MVT::v8i1 : MVT::v16i1;
19444 SDValue Res = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, WideVT,
19445 DAG.getUNDEF(WideVT), V1,
19446 DAG.getIntPtrConstant(0, DL));
19447 Res = DAG.getNode(X86ISD::KSHIFTR, DL, WideVT, Res,
19448 DAG.getTargetConstant(ShiftAmt, DL, MVT::i8));
19449 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
19450 DAG.getIntPtrConstant(0, DL));
19451}
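// --- Illustrative sketch (hypothetical helper) ---
// The shift-detection loop above in plain C++: a unary mask encodes a KSHIFTR
// when every defined element i reads from i + ShiftAmt for a single positive
// ShiftAmt. Returns -1 where the original gives up (or, unlike the original,
// when the mask is entirely undef).
#include <vector>

static int matchUnaryRightShift(const std::vector<int> &Mask) {
  int ShiftAmt = -1;
  for (int i = 0, e = (int)Mask.size(); i != e; ++i) {
    int M = Mask[i];
    if (M < 0)
      continue;                                // undef: no constraint
    if (ShiftAmt < 0) {
      ShiftAmt = M - i;                        // first defined element fixes it
      if (ShiftAmt <= 0)
        return -1;                             // not shifting right
    } else if (ShiftAmt != M - i) {
      return -1;                               // elements disagree
    }
  }
  return ShiftAmt;
}
// e.g. {2, 3, -1, 5, 6, 7, -1, -1} for a v8i1 shuffle matches ShiftAmt == 2.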
19452
19453// Determine if this shuffle can be implemented with a KSHIFT instruction.
19454// Returns the shift amount if possible or -1 if not. This is a simplified
19455// version of matchShuffleAsShift.
19456static int match1BitShuffleAsKSHIFT(unsigned &Opcode, ArrayRef<int> Mask,
19457 int MaskOffset, const APInt &Zeroable) {
19458 int Size = Mask.size();
19459
19460 auto CheckZeros = [&](int Shift, bool Left) {
19461 for (int j = 0; j < Shift; ++j)
19462 if (!Zeroable[j + (Left ? 0 : (Size - Shift))])
19463 return false;
19464
19465 return true;
19466 };
19467
19468 auto MatchShift = [&](int Shift, bool Left) {
19469 unsigned Pos = Left ? Shift : 0;
19470 unsigned Low = Left ? 0 : Shift;
19471 unsigned Len = Size - Shift;
19472 return isSequentialOrUndefInRange(Mask, Pos, Len, Low + MaskOffset);
19473 };
19474
19475 for (int Shift = 1; Shift != Size; ++Shift)
19476 for (bool Left : {true, false})
19477 if (CheckZeros(Shift, Left) && MatchShift(Shift, Left)) {
19478 Opcode = Left ? X86ISD::KSHIFTL : X86ISD::KSHIFTR;
19479 return Shift;
19480 }
19481
19482 return -1;
19483}
19484
19485
19486// Lower vXi1 vector shuffles.
19487 // There is no dedicated instruction on AVX-512 that shuffles the masks.
19488 // The only way to shuffle bits is to sign-extend the mask vector to a SIMD
19489 // vector, shuffle it, and then truncate back.
19490static SDValue lower1BitShuffle(const SDLoc &DL, ArrayRef<int> Mask,
19491 MVT VT, SDValue V1, SDValue V2,
19492 const APInt &Zeroable,
19493 const X86Subtarget &Subtarget,
19494 SelectionDAG &DAG) {
19495 assert(Subtarget.hasAVX512() &&
19496        "Cannot lower 512-bit vectors w/o basic ISA!");
19497
19498 int NumElts = Mask.size();
19499
19500 // Try to recognize shuffles that are just padding a subvector with zeros.
19501 int SubvecElts = 0;
19502 int Src = -1;
19503 for (int i = 0; i != NumElts; ++i) {
19504 if (Mask[i] >= 0) {
19505 // Grab the source from the first valid mask. All subsequent elements need
19506 // to use this same source.
19507 if (Src < 0)
19508 Src = Mask[i] / NumElts;
19509 if (Src != (Mask[i] / NumElts) || (Mask[i] % NumElts) != i)
19510 break;
19511 }
19512
19513 ++SubvecElts;
19514 }
19515 assert(SubvecElts != NumElts && "Identity shuffle?");
19516
19517 // Clip to a power of 2.
19518 SubvecElts = PowerOf2Floor(SubvecElts);
19519
19520 // Make sure the number of zeroable bits in the top at least covers the bits
19521 // not covered by the subvector.
19522 if ((int)Zeroable.countLeadingOnes() >= (NumElts - SubvecElts)) {
19523 assert(Src >= 0 && "Expected a source!");
19524 MVT ExtractVT = MVT::getVectorVT(MVT::i1, SubvecElts);
19525 SDValue Extract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, ExtractVT,
19526 Src == 0 ? V1 : V2,
19527 DAG.getIntPtrConstant(0, DL));
19528 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT,
19529 DAG.getConstant(0, DL, VT),
19530 Extract, DAG.getIntPtrConstant(0, DL));
19531 }
19532
19533 // Try a simple shift right with undef elements. Later we'll try with zeros.
19534 if (SDValue Shift = lower1BitShuffleAsKSHIFTR(DL, Mask, VT, V1, V2, Subtarget,
19535 DAG))
19536 return Shift;
19537
19538 // Try to match KSHIFTs.
19539 unsigned Offset = 0;
19540 for (SDValue V : { V1, V2 }) {
19541 unsigned Opcode;
19542 int ShiftAmt = match1BitShuffleAsKSHIFT(Opcode, Mask, Offset, Zeroable);
19543 if (ShiftAmt >= 0) {
19544 MVT WideVT = VT;
19545 if ((!Subtarget.hasDQI() && NumElts == 8) || NumElts < 8)
19546 WideVT = Subtarget.hasDQI() ? MVT::v8i1 : MVT::v16i1;
19547 SDValue Res = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, WideVT,
19548 DAG.getUNDEF(WideVT), V,
19549 DAG.getIntPtrConstant(0, DL));
19550 // Widened right shifts need two shifts to ensure we shift in zeroes.
19551 if (Opcode == X86ISD::KSHIFTR && WideVT != VT) {
19552 int WideElts = WideVT.getVectorNumElements();
19553 // Shift left to put the original vector in the MSBs of the new size.
19554 Res = DAG.getNode(X86ISD::KSHIFTL, DL, WideVT, Res,
19555 DAG.getTargetConstant(WideElts - NumElts, DL, MVT::i8));
19556 // Increase the shift amount to account for the left shift.
19557 ShiftAmt += WideElts - NumElts;
19558 }
19559
19560 Res = DAG.getNode(Opcode, DL, WideVT, Res,
19561 DAG.getTargetConstant(ShiftAmt, DL, MVT::i8));
19562 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
19563 DAG.getIntPtrConstant(0, DL));
19564 }
19565 Offset += NumElts; // Increment for next iteration.
19566 }
19567
19568 // If we're broadcasting a SETCC result, try to broadcast the ops instead.
19569 // TODO: What other unary shuffles would benefit from this?
19570 if (isBroadcastShuffleMask(Mask) && V1.getOpcode() == ISD::SETCC &&
19571 V1->hasOneUse()) {
19572 SDValue Op0 = V1.getOperand(0);
19573 SDValue Op1 = V1.getOperand(1);
19574 ISD::CondCode CC = cast<CondCodeSDNode>(V1.getOperand(2))->get();
19575 EVT OpVT = Op0.getValueType();
19576 return DAG.getSetCC(
19577 DL, VT, DAG.getVectorShuffle(OpVT, DL, Op0, DAG.getUNDEF(OpVT), Mask),
19578 DAG.getVectorShuffle(OpVT, DL, Op1, DAG.getUNDEF(OpVT), Mask), CC);
19579 }
19580
19581 MVT ExtVT;
19582 switch (VT.SimpleTy) {
19583 default:
19584 llvm_unreachable("Expected a vector of i1 elements")::llvm::llvm_unreachable_internal("Expected a vector of i1 elements"
, "llvm/lib/Target/X86/X86ISelLowering.cpp", 19584)
;
19585 case MVT::v2i1:
19586 ExtVT = MVT::v2i64;
19587 break;
19588 case MVT::v4i1:
19589 ExtVT = MVT::v4i32;
19590 break;
19591 case MVT::v8i1:
19592 // Take a 512-bit type; there are more shuffles available on KNL. If we have
19593 // VLX, use a 256-bit shuffle.
19594 ExtVT = Subtarget.hasVLX() ? MVT::v8i32 : MVT::v8i64;
19595 break;
19596 case MVT::v16i1:
19597 // Take 512-bit type, unless we are avoiding 512-bit types and have the
19598 // 256-bit operation available.
19599 ExtVT = Subtarget.canExtendTo512DQ() ? MVT::v16i32 : MVT::v16i16;
19600 break;
19601 case MVT::v32i1:
19602 // Take 512-bit type, unless we are avoiding 512-bit types and have the
19603 // 256-bit operation available.
19604 assert(Subtarget.hasBWI() && "Expected AVX512BW support");
19605 ExtVT = Subtarget.canExtendTo512BW() ? MVT::v32i16 : MVT::v32i8;
19606 break;
19607 case MVT::v64i1:
19608 // Fall back to scalarization. FIXME: We can do better if the shuffle
19609 // can be partitioned cleanly.
19610 if (!Subtarget.useBWIRegs())
19611 return SDValue();
19612 ExtVT = MVT::v64i8;
19613 break;
19614 }
19615
19616 V1 = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, V1);
19617 V2 = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, V2);
19618
19619 SDValue Shuffle = DAG.getVectorShuffle(ExtVT, DL, V1, V2, Mask);
19620 // Since i1 was sign-extended, we can use X86ISD::CVT2MASK.
19621 int NumElems = VT.getVectorNumElements();
19622 if ((Subtarget.hasBWI() && (NumElems >= 32)) ||
19623 (Subtarget.hasDQI() && (NumElems < 32)))
19624 return DAG.getSetCC(DL, VT, DAG.getConstant(0, DL, ExtVT),
19625 Shuffle, ISD::SETGT);
19626
19627 return DAG.getNode(ISD::TRUNCATE, DL, VT, Shuffle);
19628}
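// --- Illustrative sketch (hypothetical helper, first-operand form) ---
// The "subvector padded with zeros" pattern recognized at the top of
// lower1BitShuffle above: a leading identity run from V1 (undefs allowed),
// clipped to a power of two, with every remaining top element zeroable.
// Returns the subvector length, or 0 if the pattern does not match.
#include <vector>

static int matchZeroPaddedSubvector(const std::vector<int> &Mask,
                                    const std::vector<bool> &Zeroable) {
  int NumElts = (int)Mask.size();
  int Run = 0;
  while (Run != NumElts && (Mask[Run] < 0 || Mask[Run] == Run))
    ++Run;                                     // leading identity/undef run
  if (Run == 0)
    return 0;
  int Subvec = 1;
  while (Subvec * 2 <= Run)
    Subvec *= 2;                               // clip to a power of 2
  for (int i = Subvec; i != NumElts; ++i)
    if (!Zeroable[i])
      return 0;                                // the padding must be zeroable
  return Subvec;
}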
19629
19630/// Helper function that returns true if the shuffle mask should be
19631/// commuted to improve canonicalization.
19632static bool canonicalizeShuffleMaskWithCommute(ArrayRef<int> Mask) {
19633 int NumElements = Mask.size();
19634
19635 int NumV1Elements = 0, NumV2Elements = 0;
19636 for (int M : Mask)
19637 if (M < 0)
19638 continue;
19639 else if (M < NumElements)
19640 ++NumV1Elements;
19641 else
19642 ++NumV2Elements;
19643
19644 // Commute the shuffle as needed such that more elements come from V1 than
19645 // V2. This allows us to match the shuffle pattern strictly on how many
19646 // elements come from V1 without handling the symmetric cases.
19647 if (NumV2Elements > NumV1Elements)
19648 return true;
19649
19650 assert(NumV1Elements > 0 && "No V1 indices");
19651
19652 if (NumV2Elements == 0)
19653 return false;
19654
19655 // When the number of V1 and V2 elements is the same, try to minimize the
19656 // number of uses of V2 in the low half of the vector. When that is tied,
19657 // ensure that the sum of indices for V1 is equal to or lower than the sum of
19658 // indices for V2. When those are equal, try to ensure that the number of odd
19659 // indices for V1 is lower than the number of odd indices for V2.
19660 if (NumV1Elements == NumV2Elements) {
19661 int LowV1Elements = 0, LowV2Elements = 0;
19662 for (int M : Mask.slice(0, NumElements / 2))
19663 if (M >= NumElements)
19664 ++LowV2Elements;
19665 else if (M >= 0)
19666 ++LowV1Elements;
19667 if (LowV2Elements > LowV1Elements)
19668 return true;
19669 if (LowV2Elements == LowV1Elements) {
19670 int SumV1Indices = 0, SumV2Indices = 0;
19671 for (int i = 0, Size = Mask.size(); i < Size; ++i)
19672 if (Mask[i] >= NumElements)
19673 SumV2Indices += i;
19674 else if (Mask[i] >= 0)
19675 SumV1Indices += i;
19676 if (SumV2Indices < SumV1Indices)
19677 return true;
19678 if (SumV2Indices == SumV1Indices) {
19679 int NumV1OddIndices = 0, NumV2OddIndices = 0;
19680 for (int i = 0, Size = Mask.size(); i < Size; ++i)
19681 if (Mask[i] >= NumElements)
19682 NumV2OddIndices += i % 2;
19683 else if (Mask[i] >= 0)
19684 NumV1OddIndices += i % 2;
19685 if (NumV2OddIndices < NumV1OddIndices)
19686 return true;
19687 }
19688 }
19689 }
19690
19691 return false;
19692}
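// --- Illustrative sketch (hypothetical helpers) ---
// The first and cheapest commute criterion above -- prefer the operand order
// where more elements come from V1 -- plus the mask rewrite the caller then
// performs via ShuffleVectorSDNode::commuteMask.
#include <vector>

static bool shouldCommuteByElementCount(const std::vector<int> &Mask) {
  int NumElts = (int)Mask.size(), NumV1 = 0, NumV2 = 0;
  for (int M : Mask) {
    if (M < 0)
      continue;                                // undef counts for neither
    (M < NumElts ? NumV1 : NumV2) += 1;
  }
  return NumV2 > NumV1;
}

static void commuteMask(std::vector<int> &Mask) {
  int NumElts = (int)Mask.size();
  for (int &M : Mask)
    if (M >= 0)
      M = M < NumElts ? M + NumElts : M - NumElts;
}
// e.g. {4, 5, 0, 6} commutes (3 of 4 lanes read V2) and becomes {0, 1, 4, 2}.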
19693
19694static bool canCombineAsMaskOperation(SDValue V1, SDValue V2,
19695 const X86Subtarget &Subtarget) {
19696 if (!Subtarget.hasAVX512())
19697 return false;
19698
19699 MVT VT = V1.getSimpleValueType().getScalarType();
19700 if ((VT == MVT::i16 || VT == MVT::i8) && !Subtarget.hasBWI())
19701 return false;
19702
19703 // i8 is better widened to i16, because there is PBLENDW for vXi16
19704 // when the vector bit size is 128 or 256.
19705 if (VT == MVT::i8 && V1.getSimpleValueType().getSizeInBits() < 512)
19706 return false;
19707
19708 auto HasMaskOperation = [&](SDValue V) {
19709 // TODO: Currently we only check a limited set of opcodes. We could probably
19710 // extend this to all binary operations by checking TLI.isBinOp().
19711 switch (V->getOpcode()) {
19712 default:
19713 return false;
19714 case ISD::ADD:
19715 case ISD::SUB:
19716 case ISD::AND:
19717 case ISD::XOR:
19718 break;
19719 }
19720 if (!V->hasOneUse())
19721 return false;
19722
19723 return true;
19724 };
19725
19726 if (HasMaskOperation(V1) || HasMaskOperation(V2))
19727 return true;
19728
19729 return false;
19730}
19731
19732// Forward declaration.
19733static SDValue canonicalizeShuffleMaskWithHorizOp(
19734 MutableArrayRef<SDValue> Ops, MutableArrayRef<int> Mask,
19735 unsigned RootSizeInBits, const SDLoc &DL, SelectionDAG &DAG,
19736 const X86Subtarget &Subtarget);
19737
19738 /// Top-level lowering for x86 vector shuffles.
19739///
19740/// This handles decomposition, canonicalization, and lowering of all x86
19741/// vector shuffles. Most of the specific lowering strategies are encapsulated
19742/// above in helper routines. The canonicalization attempts to widen shuffles
19743/// to involve fewer lanes of wider elements, consolidate symmetric patterns
19744/// s.t. only one of the two inputs needs to be tested, etc.
19745static SDValue lowerVECTOR_SHUFFLE(SDValue Op, const X86Subtarget &Subtarget,
19746 SelectionDAG &DAG) {
19747 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
19748 ArrayRef<int> OrigMask = SVOp->getMask();
19749 SDValue V1 = Op.getOperand(0);
19750 SDValue V2 = Op.getOperand(1);
19751 MVT VT = Op.getSimpleValueType();
19752 int NumElements = VT.getVectorNumElements();
19753 SDLoc DL(Op);
19754 bool Is1BitVector = (VT.getVectorElementType() == MVT::i1);
19755
19756 assert((VT.getSizeInBits() != 64 || Is1BitVector) &&
19757        "Can't lower MMX shuffles");
19758
19759 bool V1IsUndef = V1.isUndef();
19760 bool V2IsUndef = V2.isUndef();
19761 if (V1IsUndef && V2IsUndef)
19762 return DAG.getUNDEF(VT);
19763
19764 // When we create a shuffle node we put the UNDEF node as the second operand,
19765 // but in some cases the first operand may be transformed to UNDEF.
19766 // In that case we should just commute the node.
19767 if (V1IsUndef)
19768 return DAG.getCommutedVectorShuffle(*SVOp);
19769
19770 // Check for non-undef masks pointing at an undef vector and make the masks
19771 // undef as well. This makes it easier to match the shuffle based solely on
19772 // the mask.
19773 if (V2IsUndef &&
19774 any_of(OrigMask, [NumElements](int M) { return M >= NumElements; })) {
19775 SmallVector<int, 8> NewMask(OrigMask);
19776 for (int &M : NewMask)
19777 if (M >= NumElements)
19778 M = -1;
19779 return DAG.getVectorShuffle(VT, DL, V1, V2, NewMask);
19780 }
19781
19782 // Check for illegal shuffle mask element index values.
19783 int MaskUpperLimit = OrigMask.size() * (V2IsUndef ? 1 : 2);
19784 (void)MaskUpperLimit;
19785 assert(llvm::all_of(OrigMask,
19786        [&](int M) { return -1 <= M && M < MaskUpperLimit; }) &&
19787        "Out of bounds shuffle index");
19788
19789 // We actually see shuffles that are entirely re-arrangements of a set of
19790 // zero inputs. This mostly happens while decomposing complex shuffles into
19791 // simple ones. Directly lower these as a buildvector of zeros.
19792 APInt KnownUndef, KnownZero;
19793 computeZeroableShuffleElements(OrigMask, V1, V2, KnownUndef, KnownZero);
19794
19795 APInt Zeroable = KnownUndef | KnownZero;
19796 if (Zeroable.isAllOnes())
19797 return getZeroVector(VT, Subtarget, DAG, DL);
19798
19799 bool V2IsZero = !V2IsUndef && ISD::isBuildVectorAllZeros(V2.getNode());
19800
19801 // Try to collapse shuffles into using a vector type with fewer elements but
19802 // wider element types. We cap this to not form integers or floating point
19803 // elements wider than 64 bits. It does not seem beneficial to form i128
19804 // integers to handle flipping the low and high halves of AVX 256-bit vectors.
19805 SmallVector<int, 16> WidenedMask;
19806 if (VT.getScalarSizeInBits() < 64 && !Is1BitVector &&
19807 !canCombineAsMaskOperation(V1, V2, Subtarget) &&
19808 canWidenShuffleElements(OrigMask, Zeroable, V2IsZero, WidenedMask)) {
19809 // Shuffle mask widening should not interfere with a broadcast opportunity
19810 // by obfuscating the operands with bitcasts.
19811 // TODO: Avoid lowering directly from this top-level function: make this
19812 // a query (canLowerAsBroadcast) and defer lowering to the type-based calls.
19813 if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, VT, V1, V2, OrigMask,
19814 Subtarget, DAG))
19815 return Broadcast;
19816
19817 MVT NewEltVT = VT.isFloatingPoint()
19818 ? MVT::getFloatingPointVT(VT.getScalarSizeInBits() * 2)
19819 : MVT::getIntegerVT(VT.getScalarSizeInBits() * 2);
19820 int NewNumElts = NumElements / 2;
19821 MVT NewVT = MVT::getVectorVT(NewEltVT, NewNumElts);
19822 // Make sure that the new vector type is legal. For example, v2f64 isn't
19823 // legal on SSE1.
19824 if (DAG.getTargetLoweringInfo().isTypeLegal(NewVT)) {
19825 if (V2IsZero) {
19826 // Modify the new Mask to take all zeros from the all-zero vector.
19827 // Choose indices that are blend-friendly.
19828 bool UsedZeroVector = false;
19829 assert(is_contained(WidenedMask, SM_SentinelZero) &&
19830        "V2's non-undef elements are used?!");
19831 for (int i = 0; i != NewNumElts; ++i)
19832 if (WidenedMask[i] == SM_SentinelZero) {
19833 WidenedMask[i] = i + NewNumElts;
19834 UsedZeroVector = true;
19835 }
19836 // Ensure all elements of V2 are zero - isBuildVectorAllZeros permits
19837 // some elements to be undef.
19838 if (UsedZeroVector)
19839 V2 = getZeroVector(NewVT, Subtarget, DAG, DL);
19840 }
19841 V1 = DAG.getBitcast(NewVT, V1);
19842 V2 = DAG.getBitcast(NewVT, V2);
19843 return DAG.getBitcast(
19844 VT, DAG.getVectorShuffle(NewVT, DL, V1, V2, WidenedMask));
19845 }
19846 }
19847
19848 SmallVector<SDValue> Ops = {V1, V2};
19849 SmallVector<int> Mask(OrigMask);
19850
19851 // Canonicalize the shuffle with any horizontal ops inputs.
19852 // NOTE: This may update Ops and Mask.
19853 if (SDValue HOp = canonicalizeShuffleMaskWithHorizOp(
19854 Ops, Mask, VT.getSizeInBits(), DL, DAG, Subtarget))
19855 return DAG.getBitcast(VT, HOp);
19856
19857 V1 = DAG.getBitcast(VT, Ops[0]);
19858 V2 = DAG.getBitcast(VT, Ops[1]);
19859 assert(NumElements == (int)Mask.size() &&
19860        "canonicalizeShuffleMaskWithHorizOp "
19861        "shouldn't alter the shuffle mask size");
19862
19863 // Commute the shuffle if it will improve canonicalization.
19864 if (canonicalizeShuffleMaskWithCommute(Mask)) {
19865 ShuffleVectorSDNode::commuteMask(Mask);
19866 std::swap(V1, V2);
19867 }
19868
19869 // For each vector width, delegate to a specialized lowering routine.
19870 if (VT.is128BitVector())
19871 return lower128BitShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget, DAG);
19872
19873 if (VT.is256BitVector())
19874 return lower256BitShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget, DAG);
19875
19876 if (VT.is512BitVector())
19877 return lower512BitShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget, DAG);
19878
19879 if (Is1BitVector)
19880 return lower1BitShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget, DAG);
19881
19882 llvm_unreachable("Unimplemented!")::llvm::llvm_unreachable_internal("Unimplemented!", "llvm/lib/Target/X86/X86ISelLowering.cpp"
, 19882)
;
19883}
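// --- Illustrative sketch (hypothetical helper, single-input form) ---
// The element-widening step attempted in lowerVECTOR_SHUFFLE above: adjacent
// mask elements {2*k, 2*k+1} collapse into one wide element k, and a pair
// with one undef adopts its defined half. The real canWidenShuffleElements
// additionally folds zeroable elements; that part is omitted here.
#include <cassert>
#include <vector>

static bool widenUnaryMaskBy2(const std::vector<int> &Mask,
                              std::vector<int> &Widened) {
  assert(Mask.size() % 2 == 0 && "expected an even-sized mask");
  Widened.clear();
  for (size_t i = 0; i < Mask.size(); i += 2) {
    int Lo = Mask[i], Hi = Mask[i + 1];
    if (Lo < 0 && Hi < 0) {
      Widened.push_back(-1);                   // fully undef pair
      continue;
    }
    if (Lo >= 0 && Lo % 2 != 0)
      return false;                            // low half is misaligned
    if (Hi >= 0 && Hi % 2 != 1)
      return false;                            // high half is misaligned
    if (Lo >= 0 && Hi >= 0 && Hi != Lo + 1)
      return false;                            // pair splits a wide element
    Widened.push_back((Lo >= 0 ? Lo : Hi) / 2);
  }
  return true;
}
// e.g. the v8i32 mask {2,3, 6,7, -1,-1, 0,1} widens to the v4i64 mask {1, 3, -1, 0}.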
19884
19885/// Try to lower a VSELECT instruction to a vector shuffle.
19886static SDValue lowerVSELECTtoVectorShuffle(SDValue Op,
19887 const X86Subtarget &Subtarget,
19888 SelectionDAG &DAG) {
19889 SDValue Cond = Op.getOperand(0);
19890 SDValue LHS = Op.getOperand(1);
19891 SDValue RHS = Op.getOperand(2);
19892 MVT VT = Op.getSimpleValueType();
19893
19894 // Only non-legal VSELECTs reach this lowering, convert those into generic
19895 // shuffles and re-use the shuffle lowering path for blends.
19896 if (ISD::isBuildVectorOfConstantSDNodes(Cond.getNode())) {
19897 SmallVector<int, 32> Mask;
19898 if (createShuffleMaskFromVSELECT(Mask, Cond))
19899 return DAG.getVectorShuffle(VT, SDLoc(Op), LHS, RHS, Mask);
19900 }
19901
19902 return SDValue();
19903}
19904
19905SDValue X86TargetLowering::LowerVSELECT(SDValue Op, SelectionDAG &DAG) const {
19906 SDValue Cond = Op.getOperand(0);
19907 SDValue LHS = Op.getOperand(1);
19908 SDValue RHS = Op.getOperand(2);
19909
19910 SDLoc dl(Op);
19911 MVT VT = Op.getSimpleValueType();
19912 if (isSoftFP16(VT)) {
19913 MVT NVT = VT.changeVectorElementTypeToInteger();
19914 return DAG.getBitcast(VT, DAG.getNode(ISD::VSELECT, dl, NVT, Cond,
19915 DAG.getBitcast(NVT, LHS),
19916 DAG.getBitcast(NVT, RHS)));
19917 }
19918
19919 // A vselect where all conditions and data are constants can be optimized into
19920 // a single vector load by SelectionDAGLegalize::ExpandBUILD_VECTOR().
19921 if (ISD::isBuildVectorOfConstantSDNodes(Cond.getNode()) &&
19922 ISD::isBuildVectorOfConstantSDNodes(LHS.getNode()) &&
19923 ISD::isBuildVectorOfConstantSDNodes(RHS.getNode()))
19924 return SDValue();
19925
19926 // Try to lower this to a blend-style vector shuffle. This can handle all
19927 // constant condition cases.
19928 if (SDValue BlendOp = lowerVSELECTtoVectorShuffle(Op, Subtarget, DAG))
19929 return BlendOp;
19930
19931 // If this VSELECT has a vector of i1 as a mask, it will be directly matched
19932 // with patterns on the mask registers on AVX-512.
19933 MVT CondVT = Cond.getSimpleValueType();
19934 unsigned CondEltSize = Cond.getScalarValueSizeInBits();
19935 if (CondEltSize == 1)
19936 return Op;
19937
19938 // Variable blends are only legal from SSE4.1 onward.
19939 if (!Subtarget.hasSSE41())
19940 return SDValue();
19941
19942 unsigned EltSize = VT.getScalarSizeInBits();
19943 unsigned NumElts = VT.getVectorNumElements();
19944
19945 // Expand v32i16/v64i8 without BWI.
19946 if ((VT == MVT::v32i16 || VT == MVT::v64i8) && !Subtarget.hasBWI())
19947 return SDValue();
19948
19949 // If the VSELECT is on a 512-bit type, we have to convert a non-i1 condition
19950 // into an i1 condition so that we can use the mask-based 512-bit blend
19951 // instructions.
19952 if (VT.getSizeInBits() == 512) {
19953 // Build a mask by testing the condition against zero.
19954 MVT MaskVT = MVT::getVectorVT(MVT::i1, NumElts);
19955 SDValue Mask = DAG.getSetCC(dl, MaskVT, Cond,
19956 DAG.getConstant(0, dl, CondVT),
19957 ISD::SETNE);
19958 // Now return a new VSELECT using the mask.
19959 return DAG.getSelect(dl, VT, Mask, LHS, RHS);
19960 }
19961
19962 // SEXT/TRUNC cases where the mask doesn't match the destination size.
19963 if (CondEltSize != EltSize) {
19964 // If we don't have a sign splat, rely on the expansion.
19965 if (CondEltSize != DAG.ComputeNumSignBits(Cond))
19966 return SDValue();
19967
19968 MVT NewCondSVT = MVT::getIntegerVT(EltSize);
19969 MVT NewCondVT = MVT::getVectorVT(NewCondSVT, NumElts);
19970 Cond = DAG.getSExtOrTrunc(Cond, dl, NewCondVT);
19971 return DAG.getNode(ISD::VSELECT, dl, VT, Cond, LHS, RHS);
19972 }
19973
19974 // Only some types will be legal on some subtargets. If we can emit a legal
19975 // VSELECT-matching blend, return Op, but if we need to expand, return
19976 // a null value.
19977 switch (VT.SimpleTy) {
19978 default:
19979 // Most of the vector types have blends past SSE4.1.
19980 return Op;
19981
19982 case MVT::v32i8:
19983 // The byte blends for AVX vectors were introduced only in AVX2.
19984 if (Subtarget.hasAVX2())
19985 return Op;
19986
19987 return SDValue();
19988
19989 case MVT::v8i16:
19990 case MVT::v16i16: {
19991 // Bitcast everything to the vXi8 type and use a vXi8 vselect.
19992 MVT CastVT = MVT::getVectorVT(MVT::i8, NumElts * 2);
19993 Cond = DAG.getBitcast(CastVT, Cond);
19994 LHS = DAG.getBitcast(CastVT, LHS);
19995 RHS = DAG.getBitcast(CastVT, RHS);
19996 SDValue Select = DAG.getNode(ISD::VSELECT, dl, CastVT, Cond, LHS, RHS);
19997 return DAG.getBitcast(VT, Select);
19998 }
19999 }
20000}
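// --- Illustrative sketch (hypothetical helper) ---
// How a constant VSELECT condition becomes the blend-style shuffle mask used
// by the lowering path above: lane i reads LHS when the condition is true
// (mask element i) and RHS otherwise (mask element i + NumElts). The real
// createShuffleMaskFromVSELECT also handles undef condition elements.
#include <vector>

static std::vector<int> blendMaskFromConstantCond(const std::vector<bool> &Cond) {
  int NumElts = (int)Cond.size();
  std::vector<int> Mask(NumElts);
  for (int i = 0; i != NumElts; ++i)
    Mask[i] = Cond[i] ? i : i + NumElts;
  return Mask;
}
// e.g. a v4i32 condition {true, false, false, true} yields {0, 5, 6, 3}, which
// the shuffle lowering can match as a single immediate blend.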
20001
20002static SDValue LowerEXTRACT_VECTOR_ELT_SSE4(SDValue Op, SelectionDAG &DAG) {
20003 MVT VT = Op.getSimpleValueType();
20004 SDValue Vec = Op.getOperand(0);
20005 SDValue Idx = Op.getOperand(1);
20006 assert(isa<ConstantSDNode>(Idx) && "Constant index expected");
20007 SDLoc dl(Op);
20008
20009 if (!Vec.getSimpleValueType().is128BitVector())
20010 return SDValue();
20011
20012 if (VT.getSizeInBits() == 8) {
20013 // If IdxVal is 0, it's cheaper to do a move instead of a pextrb, unless
20014 // we're going to zero extend the register or fold the store.
20015 if (llvm::isNullConstant(Idx) && !X86::mayFoldIntoZeroExtend(Op) &&
20016 !X86::mayFoldIntoStore(Op))
20017 return DAG.getNode(ISD::TRUNCATE, dl, MVT::i8,
20018 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
20019 DAG.getBitcast(MVT::v4i32, Vec), Idx));
20020
20021 unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
20022 SDValue Extract = DAG.getNode(X86ISD::PEXTRB, dl, MVT::i32, Vec,
20023 DAG.getTargetConstant(IdxVal, dl, MVT::i8));
20024 return DAG.getNode(ISD::TRUNCATE, dl, VT, Extract);
20025 }
20026
20027 if (VT == MVT::f32) {
20028 // EXTRACTPS outputs to a GPR32 register which will require a movd to copy
20029 // the result back to FR32 register. It's only worth matching if the
20030 // result has a single use which is a store or a bitcast to i32. And in
20031 // the case of a store, it's not worth it if the index is a constant 0,
20032 // because a MOVSSmr can be used instead, which is smaller and faster.
20033 if (!Op.hasOneUse())
20034 return SDValue();
20035 SDNode *User = *Op.getNode()->use_begin();
20036 if ((User->getOpcode() != ISD::STORE || isNullConstant(Idx)) &&
20037 (User->getOpcode() != ISD::BITCAST ||
20038 User->getValueType(0) != MVT::i32))
20039 return SDValue();
20040 SDValue Extract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
20041 DAG.getBitcast(MVT::v4i32, Vec), Idx);
20042 return DAG.getBitcast(MVT::f32, Extract);
20043 }
20044
20045 if (VT == MVT::i32 || VT == MVT::i64)
20046 return Op;
20047
20048 return SDValue();
20049}
20050
20051/// Extract one bit from mask vector, like v16i1 or v8i1.
20052/// AVX-512 feature.
20053static SDValue ExtractBitFromMaskVector(SDValue Op, SelectionDAG &DAG,
20054 const X86Subtarget &Subtarget) {
20055 SDValue Vec = Op.getOperand(0);
20056 SDLoc dl(Vec);
20057 MVT VecVT = Vec.getSimpleValueType();
20058 SDValue Idx = Op.getOperand(1);
20059 auto* IdxC = dyn_cast<ConstantSDNode>(Idx);
20060 MVT EltVT = Op.getSimpleValueType();
20061
20062 assert((VecVT.getVectorNumElements() <= 16 || Subtarget.hasBWI()) &&
20063        "Unexpected vector type in ExtractBitFromMaskVector");
20064
20065 // A variable index can't be handled in mask registers,
20066 // so extend the vector to VR512/VR128.
20067 if (!IdxC) {
20068 unsigned NumElts = VecVT.getVectorNumElements();
20069 // Extending v8i1/v16i1 to 512-bit gets better performance on KNL
20070 // than extending to 128/256-bit.
20071 MVT ExtEltVT = (NumElts <= 8) ? MVT::getIntegerVT(128 / NumElts) : MVT::i8;
20072 MVT ExtVecVT = MVT::getVectorVT(ExtEltVT, NumElts);
20073 SDValue Ext = DAG.getNode(ISD::SIGN_EXTEND, dl, ExtVecVT, Vec);
20074 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ExtEltVT, Ext, Idx);
20075 return DAG.getNode(ISD::TRUNCATE, dl, EltVT, Elt);
20076 }
20077
20078 unsigned IdxVal = IdxC->getZExtValue();
20079 if (IdxVal == 0) // the operation is legal
20080 return Op;
20081
20082 // Extend to natively supported kshift.
20083 unsigned NumElems = VecVT.getVectorNumElements();
20084 MVT WideVecVT = VecVT;
20085 if ((!Subtarget.hasDQI() && NumElems == 8) || NumElems < 8) {
20086 WideVecVT = Subtarget.hasDQI() ? MVT::v8i1 : MVT::v16i1;
20087 Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideVecVT,
20088 DAG.getUNDEF(WideVecVT), Vec,
20089 DAG.getIntPtrConstant(0, dl));
20090 }
20091
20092 // Use kshiftr instruction to move to the lower element.
20093 Vec = DAG.getNode(X86ISD::KSHIFTR, dl, WideVecVT, Vec,
20094 DAG.getTargetConstant(IdxVal, dl, MVT::i8));
20095
20096 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, Op.getValueType(), Vec,
20097 DAG.getIntPtrConstant(0, dl));
20098}
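// --- Illustrative analogy (not the DAG code above) ---
// The KSHIFTR + extract-element-0 sequence in ExtractBitFromMaskVector is the
// vector-mask form of reading one bit of a scalar: shift the register down by
// the index and take the lowest bit.
#include <cstdint>

static bool extractMaskBit(uint16_t KReg, unsigned IdxVal) {
  return ((KReg >> IdxVal) & 1u) != 0;         // KSHIFTR, then read lane 0
}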
20099
20100SDValue
20101X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
20102 SelectionDAG &DAG) const {
20103 SDLoc dl(Op);
20104 SDValue Vec = Op.getOperand(0);
20105 MVT VecVT = Vec.getSimpleValueType();
20106 SDValue Idx = Op.getOperand(1);
20107 auto* IdxC = dyn_cast<ConstantSDNode>(Idx);
20108
20109 if (VecVT.getVectorElementType() == MVT::i1)
20110 return ExtractBitFromMaskVector(Op, DAG, Subtarget);
20111
20112 if (!IdxC) {
20113 // It's more profitable to go through memory (1 cycle throughput)
20114 // than to use a VMOVD + VPERMV/PSHUFB sequence (2/3 cycles throughput).
20115 // The IACA tool was used to get the performance estimate
20116 // (https://software.intel.com/en-us/articles/intel-architecture-code-analyzer)
20117 //
20118 // example : extractelement <16 x i8> %a, i32 %i
20119 //
20120 // Block Throughput: 3.00 Cycles
20121 // Throughput Bottleneck: Port5
20122 //
20123 // | Num Of | Ports pressure in cycles | |
20124 // | Uops | 0 - DV | 5 | 6 | 7 | |
20125 // ---------------------------------------------
20126 // | 1 | | 1.0 | | | CP | vmovd xmm1, edi
20127 // | 1 | | 1.0 | | | CP | vpshufb xmm0, xmm0, xmm1
20128 // | 2 | 1.0 | 1.0 | | | CP | vpextrb eax, xmm0, 0x0
20129 // Total Num Of Uops: 4
20130 //
20131 //
20132 // Block Throughput: 1.00 Cycles
20133 // Throughput Bottleneck: PORT2_AGU, PORT3_AGU, Port4
20134 //
20135 // | | Ports pressure in cycles | |
20136 // |Uops| 1 | 2 - D |3 - D | 4 | 5 | |
20137 // ---------------------------------------------------------
20138 // |2^ | | 0.5 | 0.5 |1.0| |CP| vmovaps xmmword ptr [rsp-0x18], xmm0
20139 // |1 |0.5| | | |0.5| | lea rax, ptr [rsp-0x18]
20140 // |1 | |0.5, 0.5|0.5, 0.5| | |CP| mov al, byte ptr [rdi+rax*1]
20141 // Total Num Of Uops: 4
20142
20143 return SDValue();
20144 }
20145
20146 unsigned IdxVal = IdxC->getZExtValue();
20147
20148 // If this is a 256-bit or 512-bit vector result, first extract the
20149 // containing 128-bit vector and then extract the element from it.
20150 if (VecVT.is256BitVector() || VecVT.is512BitVector()) {
20151 // Get the 128-bit vector.
20152 Vec = extract128BitVector(Vec, IdxVal, DAG, dl);
20153 MVT EltVT = VecVT.getVectorElementType();
20154
20155 unsigned ElemsPerChunk = 128 / EltVT.getSizeInBits();
20156 assert(isPowerOf2_32(ElemsPerChunk) && "Elements per chunk not power of 2");
20157
20158 // Find IdxVal modulo ElemsPerChunk. Since ElemsPerChunk is a power of 2
20159 // this can be done with a mask.
20160 IdxVal &= ElemsPerChunk - 1;
20161 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, Op.getValueType(), Vec,
20162 DAG.getIntPtrConstant(IdxVal, dl));
20163 }
20164
20165 assert(VecVT.is128BitVector() && "Unexpected vector length");
20166
20167 MVT VT = Op.getSimpleValueType();
20168
20169 if (VT == MVT::i16) {
20170 // If IdxVal is 0, it's cheaper to do a move instead of a pextrw, unless
20171 // we're going to zero extend the register or fold the store (SSE41 only).
20172 if (IdxVal == 0 && !X86::mayFoldIntoZeroExtend(Op) &&
20173 !(Subtarget.hasSSE41() && X86::mayFoldIntoStore(Op))) {
20174 if (Subtarget.hasFP16())
20175 return Op;
20176
20177 return DAG.getNode(ISD::TRUNCATE, dl, MVT::i16,
20178 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
20179 DAG.getBitcast(MVT::v4i32, Vec), Idx));
20180 }
20181
20182 SDValue Extract = DAG.getNode(X86ISD::PEXTRW, dl, MVT::i32, Vec,
20183 DAG.getTargetConstant(IdxVal, dl, MVT::i8));
20184 return DAG.getNode(ISD::TRUNCATE, dl, VT, Extract);
20185 }
20186
20187 if (Subtarget.hasSSE41())
20188 if (SDValue Res = LowerEXTRACT_VECTOR_ELT_SSE4(Op, DAG))
20189 return Res;
20190
20191 // TODO: We only extract a single element from v16i8, so we can probably
20192 // afford to be more aggressive here before falling back to the default
20193 // approach of spilling to the stack.
20194 if (VT.getSizeInBits() == 8 && Op->isOnlyUserOf(Vec.getNode())) {
20195 // Extract either the lowest i32 or any i16, and extract the sub-byte.
20196 int DWordIdx = IdxVal / 4;
20197 if (DWordIdx == 0) {
20198 SDValue Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
20199 DAG.getBitcast(MVT::v4i32, Vec),
20200 DAG.getIntPtrConstant(DWordIdx, dl));
20201 int ShiftVal = (IdxVal % 4) * 8;
20202 if (ShiftVal != 0)
20203 Res = DAG.getNode(ISD::SRL, dl, MVT::i32, Res,
20204 DAG.getConstant(ShiftVal, dl, MVT::i8));
20205 return DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
20206 }
20207
20208 int WordIdx = IdxVal / 2;
20209 SDValue Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16,
20210 DAG.getBitcast(MVT::v8i16, Vec),
20211 DAG.getIntPtrConstant(WordIdx, dl));
20212 int ShiftVal = (IdxVal % 2) * 8;
20213 if (ShiftVal != 0)
20214 Res = DAG.getNode(ISD::SRL, dl, MVT::i16, Res,
20215 DAG.getConstant(ShiftVal, dl, MVT::i8));
20216 return DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
20217 }
20218
20219 if (VT == MVT::f16 || VT.getSizeInBits() == 32) {
20220 if (IdxVal == 0)
20221 return Op;
20222
20223 // Shuffle the element to the lowest element, then movss or movsh.
20224 SmallVector<int, 8> Mask(VecVT.getVectorNumElements(), -1);
20225 Mask[0] = static_cast<int>(IdxVal);
20226 Vec = DAG.getVectorShuffle(VecVT, dl, Vec, DAG.getUNDEF(VecVT), Mask);
20227 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec,
20228 DAG.getIntPtrConstant(0, dl));
20229 }
20230
20231 if (VT.getSizeInBits() == 64) {
20232 // FIXME: .td only matches this for <2 x f64>, not <2 x i64> on 32b
20233 // FIXME: seems like this should be unnecessary if mov{h,l}pd were taught
20234 // to match extract_elt for f64.
20235 if (IdxVal == 0)
20236 return Op;
20237
20238 // UNPCKHPD the element to the lowest double word, then movsd.
20239 // Note if the lower 64 bits of the result of the UNPCKHPD are then stored
20240 // to a f64mem, the whole operation is folded into a single MOVHPDmr.
20241 int Mask[2] = { 1, -1 };
20242 Vec = DAG.getVectorShuffle(VecVT, dl, Vec, DAG.getUNDEF(VecVT), Mask);
20243 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec,
20244 DAG.getIntPtrConstant(0, dl));
20245 }
20246
20247 return SDValue();
20248}
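
The v16i8 path above avoids a stack spill by extracting a whole i32 (or i16)
lane and then shifting the wanted byte down. Below is a standalone sketch of
that scalar arithmetic (a hypothetical helper, not LLVM code); it also shows
why the earlier "IdxVal &= ElemsPerChunk - 1" is a valid replacement for a
modulo by a power of two:

#include <cassert>
#include <cstdint>

// Extract byte 'Idx' (0-15) of a 128-bit value that is only readable in
// 32-bit lanes: pick the containing dword, shift the byte down, truncate.
static uint8_t extractByteFromDWords(const uint32_t Lanes[4], unsigned Idx) {
  assert(Idx < 16 && "index out of range");
  unsigned DWordIdx = Idx / 4;        // which 32-bit lane holds the byte
  unsigned ShiftVal = (Idx % 4) * 8;  // bit offset of the byte inside that lane
  // Since 4 is a power of two, 'Idx % 4' equals 'Idx & 3', mirroring the
  // "IdxVal &= ElemsPerChunk - 1" masking used in the lowering above.
  return static_cast<uint8_t>(Lanes[DWordIdx] >> ShiftVal);
}

The lane numbering assumed here is little-endian, matching the x86 layout the
lowering relies on.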
20249
20250/// Insert one bit into a mask vector, like v16i1 or v8i1.
20251/// AVX-512 feature.
20252static SDValue InsertBitToMaskVector(SDValue Op, SelectionDAG &DAG,
20253 const X86Subtarget &Subtarget) {
20254 SDLoc dl(Op);
20255 SDValue Vec = Op.getOperand(0);
20256 SDValue Elt = Op.getOperand(1);
20257 SDValue Idx = Op.getOperand(2);
20258 MVT VecVT = Vec.getSimpleValueType();
20259
20260 if (!isa<ConstantSDNode>(Idx)) {
20261 // Non constant index. Extend source and destination,
20262 // insert element and then truncate the result.
20263 unsigned NumElts = VecVT.getVectorNumElements();
20264 MVT ExtEltVT = (NumElts <= 8) ? MVT::getIntegerVT(128 / NumElts) : MVT::i8;
20265 MVT ExtVecVT = MVT::getVectorVT(ExtEltVT, NumElts);
20266 SDValue ExtOp = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, ExtVecVT,
20267 DAG.getNode(ISD::SIGN_EXTEND, dl, ExtVecVT, Vec),
20268 DAG.getNode(ISD::SIGN_EXTEND, dl, ExtEltVT, Elt), Idx);
20269 return DAG.getNode(ISD::TRUNCATE, dl, VecVT, ExtOp);
20270 }
20271
20272 // Copy into a k-register, extract to v1i1 and insert_subvector.
20273 SDValue EltInVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v1i1, Elt);
20274 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, VecVT, Vec, EltInVec, Idx);
20275}
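
On an integer view of a k-mask, inserting a single bit at a constant index is
a clear-then-set. The scalar sketch below (a hypothetical helper, not LLVM
code) mirrors what copying the bit into a k-register and inserting the v1i1
subvector accomplishes for a v16i1 value:

#include <cstdint>

// Set or clear bit 'Idx' (0-15) of a 16-lane predicate mask, the scalar
// analogue of inserting one element into a v16i1 mask vector.
static uint16_t insertMaskBit(uint16_t Mask, unsigned Idx, bool Bit) {
  uint16_t Cleared = Mask & static_cast<uint16_t>(~(1u << Idx));
  return Cleared | static_cast<uint16_t>(static_cast<uint16_t>(Bit) << Idx);
}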
20276
20277SDValue X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
20278 SelectionDAG &DAG) const {
20279 MVT VT = Op.getSimpleValueType();
20280 MVT EltVT = VT.getVectorElementType();
20281 unsigned NumElts = VT.getVectorNumElements();
20282 unsigned EltSizeInBits = EltVT.getScalarSizeInBits();
20283
20284 if (EltVT == MVT::i1)
20285 return InsertBitToMaskVector(Op, DAG, Subtarget);
20286
20287 SDLoc dl(Op);
20288 SDValue N0 = Op.getOperand(0);
20289 SDValue N1 = Op.getOperand(1);
20290 SDValue N2 = Op.getOperand(2);
20291 auto *N2C = dyn_cast<ConstantSDNode>(N2);
20292
20293 if (!N2C) {
20294 // For variable insertion indices we're usually better off spilling to the
20295 // stack, but AVX512 can use a variable compare+select by comparing against
20296 // all possible vector indices, and FP insertion has less gpr->simd traffic.
20297 if (!(Subtarget.hasBWI() ||
20298 (Subtarget.hasAVX512() && EltSizeInBits >= 32) ||
20299 (Subtarget.hasSSE41() && (EltVT == MVT::f32 || EltVT == MVT::f64))))
20300 return SDValue();
20301
20302 MVT IdxSVT = MVT::getIntegerVT(EltSizeInBits);
20303 MVT IdxVT = MVT::getVectorVT(IdxSVT, NumElts);
20304 if (!isTypeLegal(IdxSVT) || !isTypeLegal(IdxVT))
20305 return SDValue();
20306
20307 SDValue IdxExt = DAG.getZExtOrTrunc(N2, dl, IdxSVT);
20308 SDValue IdxSplat = DAG.getSplatBuildVector(IdxVT, dl, IdxExt);
20309 SDValue EltSplat = DAG.getSplatBuildVector(VT, dl, N1);
20310
20311 SmallVector<SDValue, 16> RawIndices;
20312 for (unsigned I = 0; I != NumElts; ++I)
20313 RawIndices.push_back(DAG.getConstant(I, dl, IdxSVT));
20314 SDValue Indices = DAG.getBuildVector(IdxVT, dl, RawIndices);
20315
20316 // inselt N0, N1, N2 --> select (SplatN2 == {0,1,2...}) ? SplatN1 : N0.
20317 return DAG.getSelectCC(dl, IdxSplat, Indices, EltSplat, N0,
20318 ISD::CondCode::SETEQ);
20319 }
20320
20321 if (N2C->getAPIntValue().uge(NumElts))
20322 return SDValue();
20323 uint64_t IdxVal = N2C->getZExtValue();
20324
20325 bool IsZeroElt = X86::isZeroNode(N1);
20326 bool IsAllOnesElt = VT.isInteger() && llvm::isAllOnesConstant(N1);
20327
20328 if (IsZeroElt || IsAllOnesElt) {
20329 // Lower insertion of v16i8/v32i8/v16i16 -1 elts as an 'OR' blend.
20330 // We don't deal with i8 0 since it appears to be handled elsewhere.
20331 if (IsAllOnesElt &&
20332 ((VT == MVT::v16i8 && !Subtarget.hasSSE41()) ||
20333 ((VT == MVT::v32i8 || VT == MVT::v16i16) && !Subtarget.hasInt256()))) {
20334 SDValue ZeroCst = DAG.getConstant(0, dl, VT.getScalarType());
20335 SDValue OnesCst = DAG.getAllOnesConstant(dl, VT.getScalarType());
20336 SmallVector<SDValue, 8> CstVectorElts(NumElts, ZeroCst);
20337 CstVectorElts[IdxVal] = OnesCst;
20338 SDValue CstVector = DAG.getBuildVector(VT, dl, CstVectorElts);
20339 return DAG.getNode(ISD::OR, dl, VT, N0, CstVector);
20340 }
20341 // See if we can do this more efficiently with a blend shuffle with a
20342 // rematerializable vector.
20343 if (Subtarget.hasSSE41() &&
20344 (EltSizeInBits >= 16 || (IsZeroElt && !VT.is128BitVector()))) {
20345 SmallVector<int, 8> BlendMask;
20346 for (unsigned i = 0; i != NumElts; ++i)
20347 BlendMask.push_back(i == IdxVal ? i + NumElts : i);
20348 SDValue CstVector = IsZeroElt ? getZeroVector(VT, Subtarget, DAG, dl)
20349 : getOnesVector(VT, DAG, dl);
20350 return DAG.getVectorShuffle(VT, dl, N0, CstVector, BlendMask);
20351 }
20352 }
20353
20354 // If the vector is wider than 128 bits, extract the 128-bit subvector, insert
20355 // into that, and then insert the subvector back into the result.
20356 if (VT.is256BitVector() || VT.is512BitVector()) {
20357 // With a 256-bit vector, we can insert into the zero element efficiently
20358 // using a blend if we have AVX or AVX2 and the right data type.
20359 if (VT.is256BitVector() && IdxVal == 0) {
20360 // TODO: It is worthwhile to cast integer to floating point and back
20361 // and incur a domain crossing penalty if that's what we'll end up
20362 // doing anyway after extracting to a 128-bit vector.
20363 if ((Subtarget.hasAVX() && (EltVT == MVT::f64 || EltVT == MVT::f32)) ||
20364 (Subtarget.hasAVX2() && (EltVT == MVT::i32 || EltVT == MVT::i64))) {
20365 SDValue N1Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, N1);
20366 return DAG.getNode(X86ISD::BLENDI, dl, VT, N0, N1Vec,
20367 DAG.getTargetConstant(1, dl, MVT::i8));
20368 }
20369 }
20370
20371 unsigned NumEltsIn128 = 128 / EltSizeInBits;
20372 assert(isPowerOf2_32(NumEltsIn128) &&
20373 "Vectors will always have power-of-two number of elements.");
20374
20375 // If we are not inserting into the low 128-bit vector chunk,
20376 // then prefer the broadcast+blend sequence.
20377 // FIXME: relax the profitability check iff all N1 uses are insertions.
20378 if (IdxVal >= NumEltsIn128 &&
20379 ((Subtarget.hasAVX2() && EltSizeInBits != 8) ||
20380 (Subtarget.hasAVX() && (EltSizeInBits >= 32) &&
20381 X86::mayFoldLoad(N1, Subtarget)))) {
20382 SDValue N1SplatVec = DAG.getSplatBuildVector(VT, dl, N1);
20383 SmallVector<int, 8> BlendMask;
20384 for (unsigned i = 0; i != NumElts; ++i)
20385 BlendMask.push_back(i == IdxVal ? i + NumElts : i);
20386 return DAG.getVectorShuffle(VT, dl, N0, N1SplatVec, BlendMask);
20387 }
20388
20389 // Get the desired 128-bit vector chunk.
20390 SDValue V = extract128BitVector(N0, IdxVal, DAG, dl);
20391
20392 // Insert the element into the desired chunk.
20393 // Since NumEltsIn128 is a power of 2 we can use mask instead of modulo.
20394 unsigned IdxIn128 = IdxVal & (NumEltsIn128 - 1);
20395
20396 V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, V.getValueType(), V, N1,
20397 DAG.getIntPtrConstant(IdxIn128, dl));
20398
20399 // Insert the changed part back into the bigger vector
20400 return insert128BitVector(N0, V, IdxVal, DAG, dl);
20401 }
20402 assert(VT.is128BitVector() && "Only 128-bit vector types should be left!");
20403
20404 // This will be just movw/movd/movq/movsh/movss/movsd.
20405 if (IdxVal == 0 && ISD::isBuildVectorAllZeros(N0.getNode())) {
20406 if (EltVT == MVT::i32 || EltVT == MVT::f32 || EltVT == MVT::f64 ||
20407 EltVT == MVT::f16 || EltVT == MVT::i64) {
20408 N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, N1);
20409 return getShuffleVectorZeroOrUndef(N1, 0, true, Subtarget, DAG);
20410 }
20411
20412 // We can't directly insert an i8 or i16 into a vector, so zero extend
20413 // it to i32 first.
20414 if (EltVT == MVT::i16 || EltVT == MVT::i8) {
20415 N1 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, N1);
20416 MVT ShufVT = MVT::getVectorVT(MVT::i32, VT.getSizeInBits() / 32);
20417 N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, ShufVT, N1);
20418 N1 = getShuffleVectorZeroOrUndef(N1, 0, true, Subtarget, DAG);
20419 return DAG.getBitcast(VT, N1);
20420 }
20421 }
20422
20423 // Transform it so it matches pinsr{b,w}, which expects a GR32 as its second
20424 // argument. SSE41 is required for pinsrb.
20425 if (VT == MVT::v8i16 || (VT == MVT::v16i8 && Subtarget.hasSSE41())) {
20426 unsigned Opc;
20427 if (VT == MVT::v8i16) {
20428 assert(Subtarget.hasSSE2() && "SSE2 required for PINSRW");
20429 Opc = X86ISD::PINSRW;
20430 } else {
20431 assert(VT == MVT::v16i8 && "PINSRB requires v16i8 vector");
20432 assert(Subtarget.hasSSE41() && "SSE41 required for PINSRB");
20433 Opc = X86ISD::PINSRB;
20434 }
20435
20436 assert(N1.getValueType() != MVT::i32 && "Unexpected VT");
20437 N1 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, N1);
20438 N2 = DAG.getTargetConstant(IdxVal, dl, MVT::i8);
20439 return DAG.getNode(Opc, dl, VT, N0, N1, N2);
20440 }
20441
20442 if (Subtarget.hasSSE41()) {
20443 if (EltVT == MVT::f32) {
20444 // Bits [7:6] of the constant are the source select. This will always be
20445 // zero here. The DAG Combiner may combine an extract_elt index into
20446 // these bits. For example (insert (extract, 3), 2) could be matched by
20447 // putting the '3' into bits [7:6] of X86ISD::INSERTPS.
20448 // Bits [5:4] of the constant are the destination select. This is the
20449 // value of the incoming immediate.
20450 // Bits [3:0] of the constant are the zero mask. The DAG Combiner may
20451 // combine either bitwise AND or insert of float 0.0 to set these bits.
20452
20453 bool MinSize = DAG.getMachineFunction().getFunction().hasMinSize();
20454 if (IdxVal == 0 && (!MinSize || !X86::mayFoldLoad(N1, Subtarget))) {
20455 // If this is an insertion of 32-bits into the low 32-bits of
20456 // a vector, we prefer to generate a blend with immediate rather
20457 // than an insertps. Blends are simpler operations in hardware and so
20458 // will always have equal or better performance than insertps.
20459 // But if optimizing for size and there's a load folding opportunity,
20460 // generate insertps because blendps does not have a 32-bit memory
20461 // operand form.
20462 N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, N1);
20463 return DAG.getNode(X86ISD::BLENDI, dl, VT, N0, N1,
20464 DAG.getTargetConstant(1, dl, MVT::i8));
20465 }
20466 // Create this as a scalar-to-vector.
20467 N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, N1);
20468 return DAG.getNode(X86ISD::INSERTPS, dl, VT, N0, N1,
20469 DAG.getTargetConstant(IdxVal << 4, dl, MVT::i8));
20470 }
20471
20472 // PINSR* works with constant index.
20473 if (EltVT == MVT::i32 || EltVT == MVT::i64)
20474 return Op;
20475 }
20476
20477 return SDValue();
20478}
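
For a variable insertion index the code above builds
"select (splat(Idx) == {0,1,2,...}) ? splat(Elt) : Vec". A scalar model of the
same idea (a hypothetical helper, not LLVM code):

#include <cstddef>

// Lane-wise compare+select: every lane keeps its old value except the one
// whose position equals Idx, which takes the new element.
template <typename T, size_t N>
static void insertViaSelect(T (&Vec)[N], size_t Idx, T Elt) {
  for (size_t I = 0; I != N; ++I)
    Vec[I] = (I == Idx) ? Elt : Vec[I];
}

For example, insertViaSelect(V, i, x) on a float V[8] models the AVX512 path
taken for an 8 x float vector with a non-constant index i.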
20479
20480static SDValue LowerSCALAR_TO_VECTOR(SDValue Op, const X86Subtarget &Subtarget,
20481 SelectionDAG &DAG) {
20482 SDLoc dl(Op);
20483 MVT OpVT = Op.getSimpleValueType();
20484
20485 // It's always cheaper to replace a xor+movd with xorps, and it simplifies
20486 // further combines.
20487 if (X86::isZeroNode(Op.getOperand(0)))
20488 return getZeroVector(OpVT, Subtarget, DAG, dl);
20489
20490 // If this is a wider-than-128-bit vector result, first insert into a
20491 // 128-bit vector and then insert that into the full-width vector.
20492 if (!OpVT.is128BitVector()) {
20493 // Insert into a 128-bit vector.
20494 unsigned SizeFactor = OpVT.getSizeInBits() / 128;
20495 MVT VT128 = MVT::getVectorVT(OpVT.getVectorElementType(),
20496 OpVT.getVectorNumElements() / SizeFactor);
20497
20498 Op = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT128, Op.getOperand(0));
20499
20500 // Insert the 128-bit vector.
20501 return insert128BitVector(DAG.getUNDEF(OpVT), Op, 0, DAG, dl);
20502 }
20503 assert(OpVT.is128BitVector() && OpVT.isInteger() && OpVT != MVT::v2i64 &&
20504 "Expected an SSE type!");
20505
20506 // Pass through a v4i32 or v8i16 SCALAR_TO_VECTOR as that's what we use in
20507 // tblgen.
20508 if (OpVT == MVT::v4i32 || (OpVT == MVT::v8i16 && Subtarget.hasFP16()))
20509 return Op;
20510
20511 SDValue AnyExt = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, Op.getOperand(0));
20512 return DAG.getBitcast(
20513 OpVT, DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, AnyExt));
20514}
20515
20516// Lower a node with an INSERT_SUBVECTOR opcode. This may result in a
20517// simple superregister reference or explicit instructions to insert
20518// the upper bits of a vector.
20519static SDValue LowerINSERT_SUBVECTOR(SDValue Op, const X86Subtarget &Subtarget,
20520 SelectionDAG &DAG) {
20521 assert(Op.getSimpleValueType().getVectorElementType() == MVT::i1);
20522
20523 return insert1BitVector(Op, DAG, Subtarget);
20524}
20525
20526static SDValue LowerEXTRACT_SUBVECTOR(SDValue Op, const X86Subtarget &Subtarget,
20527 SelectionDAG &DAG) {
20528 assert(Op.getSimpleValueType().getVectorElementType() == MVT::i1 &&
20529 "Only vXi1 extract_subvectors need custom lowering");
20530
20531 SDLoc dl(Op);
20532 SDValue Vec = Op.getOperand(0);
20533 uint64_t IdxVal = Op.getConstantOperandVal(1);
20534
20535 if (IdxVal == 0) // the operation is legal
20536 return Op;
20537
20538 MVT VecVT = Vec.getSimpleValueType();
20539 unsigned NumElems = VecVT.getVectorNumElements();
20540
20541 // Extend to natively supported kshift.
20542 MVT WideVecVT = VecVT;
20543 if ((!Subtarget.hasDQI() && NumElems == 8) || NumElems < 8) {
20544 WideVecVT = Subtarget.hasDQI() ? MVT::v8i1 : MVT::v16i1;
20545 Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideVecVT,
20546 DAG.getUNDEF(WideVecVT), Vec,
20547 DAG.getIntPtrConstant(0, dl));
20548 }
20549
20550 // Shift to the LSB.
20551 Vec = DAG.getNode(X86ISD::KSHIFTR, dl, WideVecVT, Vec,
20552 DAG.getTargetConstant(IdxVal, dl, MVT::i8));
20553
20554 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, Op.getValueType(), Vec,
20555 DAG.getIntPtrConstant(0, dl));
20556}
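
Viewed as an integer mask, the KSHIFTR plus low-subvector extraction above is
a right shift followed by keeping the low bits. A scalar sketch (a
hypothetical helper, not LLVM code):

#include <cstdint>

// Extract 'Width' consecutive mask bits starting at bit 'Idx' of a 16-lane
// predicate mask: shift the wanted bits down, then keep only the low bits.
static uint16_t extractSubMask(uint16_t Mask, unsigned Idx, unsigned Width) {
  uint16_t LowBits = static_cast<uint16_t>((1u << Width) - 1);
  return static_cast<uint16_t>((Mask >> Idx) & LowBits);
}

For instance, extractSubMask(M, 8, 4) corresponds to extracting the v4i1
subvector that starts at element 8 of a v16i1 mask.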
20557
20558// Returns the appropriate wrapper opcode for a global reference.
20559unsigned X86TargetLowering::getGlobalWrapperKind(
20560 const GlobalValue *GV, const unsigned char OpFlags) const {
20561 // References to absolute symbols are never PC-relative.
20562 if (GV && GV->isAbsoluteSymbolRef())
20563 return X86ISD::Wrapper;
20564
20565 CodeModel::Model M = getTargetMachine().getCodeModel();
20566 if (Subtarget.isPICStyleRIPRel() &&
20567 (M == CodeModel::Small || M == CodeModel::Kernel))
20568 return X86ISD::WrapperRIP;
20569
20570 // In the medium model, functions can always be referenced RIP-relatively,
20571 // since they must be within 2GiB. This is also possible in non-PIC mode, and
20572 // shorter than the 64-bit absolute immediate that would otherwise be emitted.
20573 if (M == CodeModel::Medium && isa_and_nonnull<Function>(GV))
20574 return X86ISD::WrapperRIP;
20575
20576 // GOTPCREL references must always use RIP.
20577 if (OpFlags == X86II::MO_GOTPCREL || OpFlags == X86II::MO_GOTPCREL_NORELAX)
20578 return X86ISD::WrapperRIP;
20579
20580 return X86ISD::Wrapper;
20581}
20582
20583// ConstantPool, JumpTable, GlobalAddress, and ExternalSymbol are lowered as
20584// their target counterparts wrapped in the X86ISD::Wrapper node. Suppose N is
20585// one of the above-mentioned nodes. It has to be wrapped because otherwise
20586// Select(N) returns N. So the raw TargetGlobalAddress nodes, etc. can only
20587// be used to form an addressing mode. These wrapped nodes will be selected
20588// into MOV32ri.
20589SDValue
20590X86TargetLowering::LowerConstantPool(SDValue Op, SelectionDAG &DAG) const {
20591 ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op);
20592
20593 // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
20594 // global base reg.
20595 unsigned char OpFlag = Subtarget.classifyLocalReference(nullptr);
20596
20597 auto PtrVT = getPointerTy(DAG.getDataLayout());
20598 SDValue Result = DAG.getTargetConstantPool(
20599 CP->getConstVal(), PtrVT, CP->getAlign(), CP->getOffset(), OpFlag);
20600 SDLoc DL(CP);
20601 Result = DAG.getNode(getGlobalWrapperKind(), DL, PtrVT, Result);
20602 // With PIC, the address is actually $g + Offset.
20603 if (OpFlag) {
20604 Result =
20605 DAG.getNode(ISD::ADD, DL, PtrVT,
20606 DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), Result);
20607 }
20608
20609 return Result;
20610}
20611
20612SDValue X86TargetLowering::LowerJumpTable(SDValue Op, SelectionDAG &DAG) const {
20613 JumpTableSDNode *JT = cast<JumpTableSDNode>(Op);
20614
20615 // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
20616 // global base reg.
20617 unsigned char OpFlag = Subtarget.classifyLocalReference(nullptr);
20618
20619 auto PtrVT = getPointerTy(DAG.getDataLayout());
20620 SDValue Result = DAG.getTargetJumpTable(JT->getIndex(), PtrVT, OpFlag);
20621 SDLoc DL(JT);
20622 Result = DAG.getNode(getGlobalWrapperKind(), DL, PtrVT, Result);
20623
20624 // With PIC, the address is actually $g + Offset.
20625 if (OpFlag)
20626 Result =
20627 DAG.getNode(ISD::ADD, DL, PtrVT,
20628 DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), Result);
20629
20630 return Result;
20631}
20632
20633SDValue X86TargetLowering::LowerExternalSymbol(SDValue Op,
20634 SelectionDAG &DAG) const {
20635 return LowerGlobalOrExternal(Op, DAG, /*ForCall=*/false);
20636}
20637
20638SDValue
20639X86TargetLowering::LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const {
20640 // Create the TargetBlockAddressAddress node.
20641 unsigned char OpFlags =
20642 Subtarget.classifyBlockAddressReference();
20643 const BlockAddress *BA = cast<BlockAddressSDNode>(Op)->getBlockAddress();
20644 int64_t Offset = cast<BlockAddressSDNode>(Op)->getOffset();
20645 SDLoc dl(Op);
20646 auto PtrVT = getPointerTy(DAG.getDataLayout());
20647 SDValue Result = DAG.getTargetBlockAddress(BA, PtrVT, Offset, OpFlags);
20648 Result = DAG.getNode(getGlobalWrapperKind(), dl, PtrVT, Result);
20649
20650 // With PIC, the address is actually $g + Offset.
20651 if (isGlobalRelativeToPICBase(OpFlags)) {
20652 Result = DAG.getNode(ISD::ADD, dl, PtrVT,
20653 DAG.getNode(X86ISD::GlobalBaseReg, dl, PtrVT), Result);
20654 }
20655
20656 return Result;
20657}
20658
20659/// Creates target global address or external symbol nodes for calls or
20660/// other uses.
20661SDValue X86TargetLowering::LowerGlobalOrExternal(SDValue Op, SelectionDAG &DAG,
20662 bool ForCall) const {
20663 // Unpack the global address or external symbol.
20664 const SDLoc &dl = SDLoc(Op);
20665 const GlobalValue *GV = nullptr;
20666 int64_t Offset = 0;
20667 const char *ExternalSym = nullptr;
20668 if (const auto *G = dyn_cast<GlobalAddressSDNode>(Op)) {
20669 GV = G->getGlobal();
20670 Offset = G->getOffset();
20671 } else {
20672 const auto *ES = cast<ExternalSymbolSDNode>(Op);
20673 ExternalSym = ES->getSymbol();
20674 }
20675
20676 // Calculate some flags for address lowering.
20677 const Module &Mod = *DAG.getMachineFunction().getFunction().getParent();
20678 unsigned char OpFlags;
20679 if (ForCall)
20680 OpFlags = Subtarget.classifyGlobalFunctionReference(GV, Mod);
20681 else
20682 OpFlags = Subtarget.classifyGlobalReference(GV, Mod);
20683 bool HasPICReg = isGlobalRelativeToPICBase(OpFlags);
20684 bool NeedsLoad = isGlobalStubReference(OpFlags);
20685
20686 CodeModel::Model M = DAG.getTarget().getCodeModel();
20687 auto PtrVT = getPointerTy(DAG.getDataLayout());
20688 SDValue Result;
20689
20690 if (GV) {
20691 // Create a target global address if this is a global. If possible, fold the
20692 // offset into the global address reference. Otherwise, ADD it on later.
20693 // Suppress the folding if Offset is negative: movl foo-1, %eax is not
20694 // allowed because if the address of foo is 0, the ELF R_X86_64_32
20695 // relocation will compute to a negative value, which is invalid.
20696 int64_t GlobalOffset = 0;
20697 if (OpFlags == X86II::MO_NO_FLAG && Offset >= 0 &&
20698 X86::isOffsetSuitableForCodeModel(Offset, M, true)) {
20699 std::swap(GlobalOffset, Offset);
20700 }
20701 Result = DAG.getTargetGlobalAddress(GV, dl, PtrVT, GlobalOffset, OpFlags);
20702 } else {
20703 // If this is not a global address, this must be an external symbol.
20704 Result = DAG.getTargetExternalSymbol(ExternalSym, PtrVT, OpFlags);
20705 }
20706
20707 // If this is a direct call, avoid the wrapper if we don't need to do any
20708 // loads or adds. This allows SDAG ISel to match direct calls.
20709 if (ForCall && !NeedsLoad && !HasPICReg && Offset == 0)
20710 return Result;
20711
20712 Result = DAG.getNode(getGlobalWrapperKind(GV, OpFlags), dl, PtrVT, Result);
20713
20714 // With PIC, the address is actually $g + Offset.
20715 if (HasPICReg) {
20716 Result = DAG.getNode(ISD::ADD, dl, PtrVT,
20717 DAG.getNode(X86ISD::GlobalBaseReg, dl, PtrVT), Result);
20718 }
20719
20720 // For globals that require a load from a stub to get the address, emit the
20721 // load.
20722 if (NeedsLoad)
20723 Result = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Result,
20724 MachinePointerInfo::getGOT(DAG.getMachineFunction()));
20725
20726 // If there was a non-zero offset that we didn't fold, create an explicit
20727 // addition for it.
20728 if (Offset != 0)
20729 Result = DAG.getNode(ISD::ADD, dl, PtrVT, Result,
20730 DAG.getConstant(Offset, dl, PtrVT));
20731
20732 return Result;
20733}
20734
20735SDValue
20736X86TargetLowering::LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const {
20737 return LowerGlobalOrExternal(Op, DAG, /*ForCall=*/false);
20738}
20739
20740static SDValue
20741GetTLSADDR(SelectionDAG &DAG, SDValue Chain, GlobalAddressSDNode *GA,
20742 SDValue *InFlag, const EVT PtrVT, unsigned ReturnReg,
20743 unsigned char OperandFlags, bool LocalDynamic = false) {
20744 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
20745 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
20746 SDLoc dl(GA);
20747 SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
20748 GA->getValueType(0),
20749 GA->getOffset(),
20750 OperandFlags);
20751
20752 X86ISD::NodeType CallType = LocalDynamic ? X86ISD::TLSBASEADDR
20753 : X86ISD::TLSADDR;
20754
20755 if (InFlag) {
20756 SDValue Ops[] = { Chain, TGA, *InFlag };
20757 Chain = DAG.getNode(CallType, dl, NodeTys, Ops);
20758 } else {
20759 SDValue Ops[] = { Chain, TGA };
20760 Chain = DAG.getNode(CallType, dl, NodeTys, Ops);
20761 }
20762
20763 // TLSADDR will be codegen'ed as a call. Inform MFI that the function has calls.
20764 MFI.setAdjustsStack(true);
20765 MFI.setHasCalls(true);
20766
20767 SDValue Flag = Chain.getValue(1);
20768 return DAG.getCopyFromReg(Chain, dl, ReturnReg, PtrVT, Flag);
20769}
20770
20771// Lower ISD::GlobalTLSAddress using the "general dynamic" model, 32 bit
20772static SDValue
20773LowerToTLSGeneralDynamicModel32(GlobalAddressSDNode *GA, SelectionDAG &DAG,
20774 const EVT PtrVT) {
20775 SDValue InFlag;
20776 SDLoc dl(GA); // ? function entry point might be better
20777 SDValue Chain = DAG.getCopyToReg(DAG.getEntryNode(), dl, X86::EBX,
20778 DAG.getNode(X86ISD::GlobalBaseReg,
20779 SDLoc(), PtrVT), InFlag);
20780 InFlag = Chain.getValue(1);
20781
20782 return GetTLSADDR(DAG, Chain, GA, &InFlag, PtrVT, X86::EAX, X86II::MO_TLSGD);
20783}
20784
20785// Lower ISD::GlobalTLSAddress using the "general dynamic" model, 64 bit LP64
20786static SDValue
20787LowerToTLSGeneralDynamicModel64(GlobalAddressSDNode *GA, SelectionDAG &DAG,
20788 const EVT PtrVT) {
20789 return GetTLSADDR(DAG, DAG.getEntryNode(), GA, nullptr, PtrVT,
20790 X86::RAX, X86II::MO_TLSGD);
20791}
20792
20793// Lower ISD::GlobalTLSAddress using the "general dynamic" model, 64 bit ILP32
20794static SDValue
20795LowerToTLSGeneralDynamicModelX32(GlobalAddressSDNode *GA, SelectionDAG &DAG,
20796 const EVT PtrVT) {
20797 return GetTLSADDR(DAG, DAG.getEntryNode(), GA, nullptr, PtrVT,
20798 X86::EAX, X86II::MO_TLSGD);
20799}
20800
20801static SDValue LowerToTLSLocalDynamicModel(GlobalAddressSDNode *GA,
20802 SelectionDAG &DAG, const EVT PtrVT,
20803 bool Is64Bit, bool Is64BitLP64) {
20804 SDLoc dl(GA);
20805
20806 // Get the start address of the TLS block for this module.
20807 X86MachineFunctionInfo *MFI = DAG.getMachineFunction()
20808 .getInfo<X86MachineFunctionInfo>();
20809 MFI->incNumLocalDynamicTLSAccesses();
20810
20811 SDValue Base;
20812 if (Is64Bit) {
20813 unsigned ReturnReg = Is64BitLP64 ? X86::RAX : X86::EAX;
20814 Base = GetTLSADDR(DAG, DAG.getEntryNode(), GA, nullptr, PtrVT, ReturnReg,
20815 X86II::MO_TLSLD, /*LocalDynamic=*/true);
20816 } else {
20817 SDValue InFlag;
20818 SDValue Chain = DAG.getCopyToReg(DAG.getEntryNode(), dl, X86::EBX,
20819 DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), InFlag);
20820 InFlag = Chain.getValue(1);
20821 Base = GetTLSADDR(DAG, Chain, GA, &InFlag, PtrVT, X86::EAX,
20822 X86II::MO_TLSLDM, /*LocalDynamic=*/true);
20823 }
20824
20825 // Note: the CleanupLocalDynamicTLSPass will remove redundant computations
20826 // of Base.
20827
20828 // Build x@dtpoff.
20829 unsigned char OperandFlags = X86II::MO_DTPOFF;
20830 unsigned WrapperKind = X86ISD::Wrapper;
20831 SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
20832 GA->getValueType(0),
20833 GA->getOffset(), OperandFlags);
20834 SDValue Offset = DAG.getNode(WrapperKind, dl, PtrVT, TGA);
20835
20836 // Add x@dtpoff with the base.
20837 return DAG.getNode(ISD::ADD, dl, PtrVT, Offset, Base);
20838}
20839
20840// Lower ISD::GlobalTLSAddress using the "initial exec" or "local exec" model.
20841static SDValue LowerToTLSExecModel(GlobalAddressSDNode *GA, SelectionDAG &DAG,
20842 const EVT PtrVT, TLSModel::Model model,
20843 bool is64Bit, bool isPIC) {
20844 SDLoc dl(GA);
20845
20846 // Get the Thread Pointer, which is %gs:0 (32-bit) or %fs:0 (64-bit).
20847 Value *Ptr = Constant::getNullValue(Type::getInt8PtrTy(*DAG.getContext(),
20848 is64Bit ? 257 : 256));
20849
20850 SDValue ThreadPointer =
20851 DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), DAG.getIntPtrConstant(0, dl),
20852 MachinePointerInfo(Ptr));
20853
20854 unsigned char OperandFlags = 0;
20855 // Most TLS accesses are not RIP relative, even on x86-64. One exception is
20856 // initialexec.
20857 unsigned WrapperKind = X86ISD::Wrapper;
20858 if (model == TLSModel::LocalExec) {
20859 OperandFlags = is64Bit ? X86II::MO_TPOFF : X86II::MO_NTPOFF;
20860 } else if (model == TLSModel::InitialExec) {
20861 if (is64Bit) {
20862 OperandFlags = X86II::MO_GOTTPOFF;
20863 WrapperKind = X86ISD::WrapperRIP;
20864 } else {
20865 OperandFlags = isPIC ? X86II::MO_GOTNTPOFF : X86II::MO_INDNTPOFF;
20866 }
20867 } else {
20868 llvm_unreachable("Unexpected model")::llvm::llvm_unreachable_internal("Unexpected model", "llvm/lib/Target/X86/X86ISelLowering.cpp"
, 20868)
;
20869 }
20870
20871 // emit "addl x@ntpoff,%eax" (local exec)
20872 // or "addl x@indntpoff,%eax" (initial exec)
20873 // or "addl x@gotntpoff(%ebx) ,%eax" (initial exec, 32-bit pic)
20874 SDValue TGA =
20875 DAG.getTargetGlobalAddress(GA->getGlobal(), dl, GA->getValueType(0),
20876 GA->getOffset(), OperandFlags);
20877 SDValue Offset = DAG.getNode(WrapperKind, dl, PtrVT, TGA);
20878
20879 if (model == TLSModel::InitialExec) {
20880 if (isPIC && !is64Bit) {
20881 Offset = DAG.getNode(ISD::ADD, dl, PtrVT,
20882 DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT),
20883 Offset);
20884 }
20885
20886 Offset = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Offset,
20887 MachinePointerInfo::getGOT(DAG.getMachineFunction()));
20888 }
20889
20890 // The address of the thread local variable is the add of the thread
20891 // pointer with the offset of the variable.
20892 return DAG.getNode(ISD::ADD, dl, PtrVT, ThreadPointer, Offset);
20893}
20894
20895SDValue
20896X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const {
20897
20898 GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
20899
20900 if (DAG.getTarget().useEmulatedTLS())
20901 return LowerToTLSEmulatedModel(GA, DAG);
20902
20903 const GlobalValue *GV = GA->getGlobal();
20904 auto PtrVT = getPointerTy(DAG.getDataLayout());
20905 bool PositionIndependent = isPositionIndependent();
20906
20907 if (Subtarget.isTargetELF()) {
20908 TLSModel::Model model = DAG.getTarget().getTLSModel(GV);
20909 switch (model) {
20910 case TLSModel::GeneralDynamic:
20911 if (Subtarget.is64Bit()) {
20912 if (Subtarget.isTarget64BitLP64())
20913 return LowerToTLSGeneralDynamicModel64(GA, DAG, PtrVT);
20914 return LowerToTLSGeneralDynamicModelX32(GA, DAG, PtrVT);
20915 }
20916 return LowerToTLSGeneralDynamicModel32(GA, DAG, PtrVT);
20917 case TLSModel::LocalDynamic:
20918 return LowerToTLSLocalDynamicModel(GA, DAG, PtrVT, Subtarget.is64Bit(),
20919 Subtarget.isTarget64BitLP64());
20920 case TLSModel::InitialExec:
20921 case TLSModel::LocalExec:
20922 return LowerToTLSExecModel(GA, DAG, PtrVT, model, Subtarget.is64Bit(),
20923 PositionIndependent);
20924 }
20925 llvm_unreachable("Unknown TLS model.")::llvm::llvm_unreachable_internal("Unknown TLS model.", "llvm/lib/Target/X86/X86ISelLowering.cpp"
, 20925)
;
20926 }
20927
20928 if (Subtarget.isTargetDarwin()) {
20929 // Darwin only has one model of TLS. Lower to that.
20930 unsigned char OpFlag = 0;
20931 unsigned WrapperKind = Subtarget.isPICStyleRIPRel() ?
20932 X86ISD::WrapperRIP : X86ISD::Wrapper;
20933
20934 // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
20935 // global base reg.
20936 bool PIC32 = PositionIndependent && !Subtarget.is64Bit();
20937 if (PIC32)
20938 OpFlag = X86II::MO_TLVP_PIC_BASE;
20939 else
20940 OpFlag = X86II::MO_TLVP;
20941 SDLoc DL(Op);
20942 SDValue Result = DAG.getTargetGlobalAddress(GA->getGlobal(), DL,
20943 GA->getValueType(0),
20944 GA->getOffset(), OpFlag);
20945 SDValue Offset = DAG.getNode(WrapperKind, DL, PtrVT, Result);
20946
20947 // With PIC32, the address is actually $g + Offset.
20948 if (PIC32)
20949 Offset = DAG.getNode(ISD::ADD, DL, PtrVT,
20950 DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT),
20951 Offset);
20952
20953 // Lowering the machine ISD will make sure everything is in the right
20954 // location.
20955 SDValue Chain = DAG.getEntryNode();
20956 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
20957 Chain = DAG.getCALLSEQ_START(Chain, 0, 0, DL);
20958 SDValue Args[] = { Chain, Offset };
20959 Chain = DAG.getNode(X86ISD::TLSCALL, DL, NodeTys, Args);
20960 Chain = DAG.getCALLSEQ_END(Chain, 0, 0, Chain.getValue(1), DL);
20961
20962 // TLSCALL will be codegen'ed as a call. Inform MFI that the function has calls.
20963 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
20964 MFI.setAdjustsStack(true);
20965
20966 // And our return value (tls address) is in the standard call return value
20967 // location.
20968 unsigned Reg = Subtarget.is64Bit() ? X86::RAX : X86::EAX;
20969 return DAG.getCopyFromReg(Chain, DL, Reg, PtrVT, Chain.getValue(1));
20970 }
20971
20972 if (Subtarget.isOSWindows()) {
20973 // Just use the implicit TLS architecture.
20974 // We need to generate something similar to:
20975 // mov rdx, qword [gs:abs 58H]; Load pointer to ThreadLocalStorage
20976 // ; from TEB
20977 // mov ecx, dword [rel _tls_index]: Load index (from C runtime)
20978 // mov rcx, qword [rdx+rcx*8]
20979 // mov eax, .tls$:tlsvar
20980 // [rax+rcx] contains the address
20981 // Windows 64bit: gs:0x58
20982 // Windows 32bit: fs:__tls_array
20983
20984 SDLoc dl(GA);
20985 SDValue Chain = DAG.getEntryNode();
20986
20987 // Get the Thread Pointer, which is %fs:__tls_array (32-bit) or
20988 // %gs:0x58 (64-bit). On MinGW, __tls_array is not available, so directly
20989 // use its literal value of 0x2C.
20990 Value *Ptr = Constant::getNullValue(Subtarget.is64Bit()
20991 ? Type::getInt8PtrTy(*DAG.getContext(),
20992 256)
20993 : Type::getInt32PtrTy(*DAG.getContext(),
20994 257));
20995
20996 SDValue TlsArray = Subtarget.is64Bit()
20997 ? DAG.getIntPtrConstant(0x58, dl)
20998 : (Subtarget.isTargetWindowsGNU()
20999 ? DAG.getIntPtrConstant(0x2C, dl)
21000 : DAG.getExternalSymbol("_tls_array", PtrVT));
21001
21002 SDValue ThreadPointer =
21003 DAG.getLoad(PtrVT, dl, Chain, TlsArray, MachinePointerInfo(Ptr));
21004
21005 SDValue res;
21006 if (GV->getThreadLocalMode() == GlobalVariable::LocalExecTLSModel) {
21007 res = ThreadPointer;
21008 } else {
21009 // Load the _tls_index variable
21010 SDValue IDX = DAG.getExternalSymbol("_tls_index", PtrVT);
21011 if (Subtarget.is64Bit())
21012 IDX = DAG.getExtLoad(ISD::ZEXTLOAD, dl, PtrVT, Chain, IDX,
21013 MachinePointerInfo(), MVT::i32);
21014 else
21015 IDX = DAG.getLoad(PtrVT, dl, Chain, IDX, MachinePointerInfo());
21016
21017 const DataLayout &DL = DAG.getDataLayout();
21018 SDValue Scale =
21019 DAG.getConstant(Log2_64_Ceil(DL.getPointerSize()), dl, MVT::i8);
21020 IDX = DAG.getNode(ISD::SHL, dl, PtrVT, IDX, Scale);
21021
21022 res = DAG.getNode(ISD::ADD, dl, PtrVT, ThreadPointer, IDX);
21023 }
21024
21025 res = DAG.getLoad(PtrVT, dl, Chain, res, MachinePointerInfo());
21026
21027 // Get the offset of start of .tls section
21028 SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
21029 GA->getValueType(0),
21030 GA->getOffset(), X86II::MO_SECREL);
21031 SDValue Offset = DAG.getNode(X86ISD::Wrapper, dl, PtrVT, TGA);
21032
21033 // The address of the thread local variable is the add of the thread
21034 // pointer with the offset of the variable.
21035 return DAG.getNode(ISD::ADD, dl, PtrVT, res, Offset);
21036 }
21037
21038 llvm_unreachable("TLS not implemented for this target.")::llvm::llvm_unreachable_internal("TLS not implemented for this target."
, "llvm/lib/Target/X86/X86ISelLowering.cpp", 21038)
;
21039}
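
For the Windows branch above, the address is the TEB's thread-local-storage
array slot for this module plus the variable's section-relative offset. A
standalone sketch of that arithmetic (a hypothetical helper, not LLVM or
Windows API code; the gs:[0x58] / fs:__tls_array locations come from the
comment above):

#include <cstdint>

// Scalar model of the implicit-TLS address computation:
//   slot = ThreadLocalStoragePointer[_tls_index]
//   addr = slot + var@SECREL (offset of the variable within the .tls section)
static std::uintptr_t winTlsAddress(const std::uintptr_t *TlsSlots,
                                    std::uint32_t TlsIndex,
                                    std::uintptr_t SecRelOffset) {
  return TlsSlots[TlsIndex] + SecRelOffset;
}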
21040
21041/// Lower SRA_PARTS and friends, which return two i32 values
21042/// and take a 2 x i32 value to shift plus a shift amount.
21043/// TODO: Can this be moved to general expansion code?
21044static SDValue LowerShiftParts(SDValue Op, SelectionDAG &DAG) {
21045 SDValue Lo, Hi;
21046 DAG.getTargetLoweringInfo().expandShiftParts(Op.getNode(), Lo, Hi, DAG);
21047 return DAG.getMergeValues({Lo, Hi}, SDLoc(Op));
21048}
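
expandShiftParts splits a double-width shift into operations on the two
halves. As a reference for what SRA_PARTS computes, here is a scalar sketch of
a 64-bit arithmetic right shift held as two 32-bit parts (a hypothetical
helper, not the exact expansion LLVM emits; it assumes '>>' on a negative int
is an arithmetic shift, which holds on the targets discussed here):

#include <cstdint>

// Arithmetic right shift of the 64-bit value {Hi,Lo} by Amt (0..63),
// producing the two 32-bit result parts.
static void sra64Parts(uint32_t Lo, int32_t Hi, unsigned Amt,
                       uint32_t &OutLo, int32_t &OutHi) {
  if (Amt == 0) {                 // avoid shifting a 32-bit value by 32 below
    OutLo = Lo;
    OutHi = Hi;
  } else if (Amt < 32) {
    OutLo = (Lo >> Amt) | (static_cast<uint32_t>(Hi) << (32 - Amt));
    OutHi = Hi >> Amt;            // sign bits shift in from the top
  } else {
    OutLo = static_cast<uint32_t>(Hi >> (Amt - 32));
    OutHi = Hi >> 31;             // all copies of the sign bit
  }
}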
21049
21050// Try to use a packed vector operation to handle i64 on 32-bit targets when
21051// AVX512DQ is enabled.
21052static SDValue LowerI64IntToFP_AVX512DQ(SDValue Op, SelectionDAG &DAG,
21053 const X86Subtarget &Subtarget) {
21054 assert((Op.getOpcode() == ISD::SINT_TO_FP ||
21055 Op.getOpcode() == ISD::STRICT_SINT_TO_FP ||
21056 Op.getOpcode() == ISD::STRICT_UINT_TO_FP ||
21057 Op.getOpcode() == ISD::UINT_TO_FP) &&
21058 "Unexpected opcode!");
21059 bool IsStrict = Op->isStrictFPOpcode();
21060 unsigned OpNo = IsStrict ? 1 : 0;
21061 SDValue Src = Op.getOperand(OpNo);
21062 MVT SrcVT = Src.getSimpleValueType();
21063 MVT VT = Op.getSimpleValueType();
21064
21065 if (!Subtarget.hasDQI() || SrcVT != MVT::i64 || Subtarget.is64Bit() ||
21066 (VT != MVT::f32 && VT != MVT::f64))
21067 return SDValue();
21068
21069 // Pack the i64 into a vector, do the operation and extract.
21070
21071 // Use a 256-bit vector to ensure the result is 128 bits for the f32 case.
21072 unsigned NumElts = Subtarget.hasVLX() ? 4 : 8;
21073 MVT VecInVT = MVT::getVectorVT(MVT::i64, NumElts);
21074 MVT VecVT = MVT::getVectorVT(VT, NumElts);
21075
21076 SDLoc dl(Op);
21077 SDValue InVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecInVT, Src);
21078 if (IsStrict) {
21079 SDValue CvtVec = DAG.getNode(Op.getOpcode(), dl, {VecVT, MVT::Other},
21080 {Op.getOperand(0), InVec});
21081 SDValue Chain = CvtVec.getValue(1);
21082 SDValue Value = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, CvtVec,
21083 DAG.getIntPtrConstant(0, dl));
21084 return DAG.getMergeValues({Value, Chain}, dl);
21085 }
21086
21087 SDValue CvtVec = DAG.getNode(Op.getOpcode(), dl, VecVT, InVec);
21088
21089 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, CvtVec,
21090 DAG.getIntPtrConstant(0, dl));
21091}
21092
21093// Try to use a packed vector operation to handle i64 on 32-bit targets.
21094static SDValue LowerI64IntToFP16(SDValue Op, SelectionDAG &DAG,
21095 const X86Subtarget &Subtarget) {
21096 assert((Op.getOpcode() == ISD::SINT_TO_FP ||
21097 Op.getOpcode() == ISD::STRICT_SINT_TO_FP ||
21098 Op.getOpcode() == ISD::STRICT_UINT_TO_FP ||
21099 Op.getOpcode() == ISD::UINT_TO_FP) &&
21100 "Unexpected opcode!");
21101 bool IsStrict = Op->isStrictFPOpcode();
21102 SDValue Src = Op.getOperand(IsStrict ? 1 : 0);
21103 MVT SrcVT = Src.getSimpleValueType();
21104 MVT VT = Op.getSimpleValueType();
21105
21106 if (SrcVT != MVT::i64 || Subtarget.is64Bit() || VT != MVT::f16)
21107 return SDValue();
21108
21109 // Pack the i64 into a vector, do the operation and extract.
21110
21111 assert(Subtarget.hasFP16() && "Expected FP16");
21112
21113 SDLoc dl(Op);
21114 SDValue InVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Src);
21115 if (IsStrict) {
21116 SDValue CvtVec = DAG.getNode(Op.getOpcode(), dl, {MVT::v2f16, MVT::Other},
21117 {Op.getOperand(0), InVec});
21118 SDValue Chain = CvtVec.getValue(1);
21119 SDValue Value = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, CvtVec,
21120 DAG.getIntPtrConstant(0, dl));
21121 return DAG.getMergeValues({Value, Chain}, dl);
21122 }
21123
21124 SDValue CvtVec = DAG.getNode(Op.getOpcode(), dl, MVT::v2f16, InVec);
21125
21126 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, CvtVec,
21127 DAG.getIntPtrConstant(0, dl));
21128}
21129
21130static bool useVectorCast(unsigned Opcode, MVT FromVT, MVT ToVT,
21131 const X86Subtarget &Subtarget) {
21132 switch (Opcode) {
21133 case ISD::SINT_TO_FP:
21134 // TODO: Handle wider types with AVX/AVX512.
21135 if (!Subtarget.hasSSE2() || FromVT != MVT::v4i32)
21136 return false;
21137 // CVTDQ2PS or (V)CVTDQ2PD
21138 return ToVT == MVT::v4f32 || (Subtarget.hasAVX() && ToVT == MVT::v4f64);
21139
21140 case ISD::UINT_TO_FP:
21141 // TODO: Handle wider types and i64 elements.
21142 if (!Subtarget.hasAVX512() || FromVT != MVT::v4i32)
21143 return false;
21144 // VCVTUDQ2PS or VCVTUDQ2PD
21145 return ToVT == MVT::v4f32 || ToVT == MVT::v4f64;
21146
21147 default:
21148 return false;
21149 }
21150}
21151
21152/// Given a scalar cast operation that is extracted from a vector, try to
21153/// vectorize the cast op followed by extraction. This will avoid an expensive
21154/// round-trip between XMM and GPR.
21155static SDValue vectorizeExtractedCast(SDValue Cast, SelectionDAG &DAG,
21156 const X86Subtarget &Subtarget) {
21157 // TODO: This could be enhanced to handle smaller integer types by peeking
21158 // through an extend.
21159 SDValue Extract = Cast.getOperand(0);
21160 MVT DestVT = Cast.getSimpleValueType();
21161 if (Extract.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
21162 !isa<ConstantSDNode>(Extract.getOperand(1)))
21163 return SDValue();
21164
21165 // See if we have a 128-bit vector cast op for this type of cast.
21166 SDValue VecOp = Extract.getOperand(0);
21167 MVT FromVT = VecOp.getSimpleValueType();
21168 unsigned NumEltsInXMM = 128 / FromVT.getScalarSizeInBits();
21169 MVT Vec128VT = MVT::getVectorVT(FromVT.getScalarType(), NumEltsInXMM);
21170 MVT ToVT = MVT::getVectorVT(DestVT, NumEltsInXMM);
21171 if (!useVectorCast(Cast.getOpcode(), Vec128VT, ToVT, Subtarget))
21172 return SDValue();
21173
21174 // If we are extracting from a non-zero element, first shuffle the source
21175 // vector to allow extracting from element zero.
21176 SDLoc DL(Cast);
21177 if (!isNullConstant(Extract.getOperand(1))) {
21178 SmallVector<int, 16> Mask(FromVT.getVectorNumElements(), -1);
21179 Mask[0] = Extract.getConstantOperandVal(1);
21180 VecOp = DAG.getVectorShuffle(FromVT, DL, VecOp, DAG.getUNDEF(FromVT), Mask);
21181 }
21182 // If the source vector is wider than 128-bits, extract the low part. Do not
21183 // create an unnecessarily wide vector cast op.
21184 if (FromVT != Vec128VT)
21185 VecOp = extract128BitVector(VecOp, 0, DAG, DL);
21186
21187 // cast (extelt V, 0) --> extelt (cast (extract_subv V)), 0
21188 // cast (extelt V, C) --> extelt (cast (extract_subv (shuffle V, [C...]))), 0
21189 SDValue VCast = DAG.getNode(Cast.getOpcode(), DL, ToVT, VecOp);
21190 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, DestVT, VCast,
21191 DAG.getIntPtrConstant(0, DL));
21192}
21193
21194/// Given a scalar cast to FP with a cast to integer operand (almost an ftrunc),
21195/// try to vectorize the cast ops. This will avoid an expensive round-trip
21196/// between XMM and GPR.
21197static SDValue lowerFPToIntToFP(SDValue CastToFP, SelectionDAG &DAG,
21198 const X86Subtarget &Subtarget) {
21199 // TODO: Allow FP_TO_UINT.
21200 SDValue CastToInt = CastToFP.getOperand(0);
21201 MVT VT = CastToFP.getSimpleValueType();
21202 if (CastToInt.getOpcode() != ISD::FP_TO_SINT || VT.isVector())
21203 return SDValue();
21204
21205 MVT IntVT = CastToInt.getSimpleValueType();
21206 SDValue X = CastToInt.getOperand(0);
21207 MVT SrcVT = X.getSimpleValueType();
21208 if (SrcVT != MVT::f32 && SrcVT != MVT::f64)
21209 return SDValue();
21210
21211 // See if we have 128-bit vector cast instructions for this type of cast.
21212 // We need cvttps2dq/cvttpd2dq and cvtdq2ps/cvtdq2pd.
21213 if (!Subtarget.hasSSE2() || (VT != MVT::f32 && VT != MVT::f64) ||
21214 IntVT != MVT::i32)
21215 return SDValue();
21216
21217 unsigned SrcSize = SrcVT.getSizeInBits();
21218 unsigned IntSize = IntVT.getSizeInBits();
21219 unsigned VTSize = VT.getSizeInBits();
21220 MVT VecSrcVT = MVT::getVectorVT(SrcVT, 128 / SrcSize);
21221 MVT VecIntVT = MVT::getVectorVT(IntVT, 128 / IntSize);
21222 MVT VecVT = MVT::getVectorVT(VT, 128 / VTSize);
21223
21224 // We need target-specific opcodes if this is v2f64 -> v4i32 -> v2f64.
21225 unsigned ToIntOpcode =
21226 SrcSize != IntSize ? X86ISD::CVTTP2SI : (unsigned)ISD::FP_TO_SINT;
21227 unsigned ToFPOpcode =
21228 IntSize != VTSize ? X86ISD::CVTSI2P : (unsigned)ISD::SINT_TO_FP;
21229
21230 // sint_to_fp (fp_to_sint X) --> extelt (sint_to_fp (fp_to_sint (s2v X))), 0
21231 //
21232 // We are not defining the high elements (for example, by zeroing them) because
21233 // that could nullify any performance advantage that we hoped to gain from
21234 // this vector op hack. We do not expect any adverse effects (like denorm
21235 // penalties) with cast ops.
21236 SDLoc DL(CastToFP);
21237 SDValue ZeroIdx = DAG.getIntPtrConstant(0, DL);
21238 SDValue VecX = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecSrcVT, X);
21239 SDValue VCastToInt = DAG.getNode(ToIntOpcode, DL, VecIntVT, VecX);
21240 SDValue VCastToFP = DAG.getNode(ToFPOpcode, DL, VecVT, VCastToInt);
21241 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, VCastToFP, ZeroIdx);
21242}
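
The float -> int -> float round trip that this lowering vectorizes can be
written directly with the SSE2 conversion intrinsics it targets (cvttps2dq and
cvtdq2ps). A minimal sketch that keeps the value in an XMM register the whole
time:

#include <emmintrin.h>  // SSE2: _mm_cvttps_epi32 / _mm_cvtepi32_ps

// (float)(int)X without bouncing through a GPR: truncating float->i32
// conversion followed by i32->float, all in lane 0 of an XMM register.
// Unlike the lowering above, _mm_set_ss zeroes the upper lanes; that is
// harmless here and keeps the example self-contained.
static float truncateViaVector(float X) {
  __m128 V = _mm_set_ss(X);         // scalar into lane 0
  __m128i I = _mm_cvttps_epi32(V);  // cvttps2dq: truncate to int32
  __m128 R = _mm_cvtepi32_ps(I);    // cvtdq2ps: back to float
  return _mm_cvtss_f32(R);          // read lane 0
}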
21243
21244static SDValue lowerINT_TO_FP_vXi64(SDValue Op, SelectionDAG &DAG,
21245 const X86Subtarget &Subtarget) {
21246 SDLoc DL(Op);
21247 bool IsStrict = Op->isStrictFPOpcode();
21248 MVT VT = Op->getSimpleValueType(0);
21249 SDValue Src = Op->getOperand(IsStrict ? 1 : 0);
21250
21251 if (Subtarget.hasDQI()) {
21252 assert(!Subtarget.hasVLX() && "Unexpected features");
21253
21254 assert((Src.getSimpleValueType() == MVT::v2i64 ||
21255         Src.getSimpleValueType() == MVT::v4i64) &&
21256        "Unsupported custom type");
21257
21258 // With AVX512DQ, but not VLX we need to widen to get a 512-bit result type.
21259 assert((VT == MVT::v4f32 || VT == MVT::v2f64 || VT == MVT::v4f64) &&
21260        "Unexpected VT!");
21261 MVT WideVT = VT == MVT::v4f32 ? MVT::v8f32 : MVT::v8f64;
21262
21263 // Need to concat with zero vector for strict fp to avoid spurious
21264 // exceptions.
21265 SDValue Tmp = IsStrict ? DAG.getConstant(0, DL, MVT::v8i64)
21266 : DAG.getUNDEF(MVT::v8i64);
21267 Src = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, MVT::v8i64, Tmp, Src,
21268 DAG.getIntPtrConstant(0, DL));
21269 SDValue Res, Chain;
21270 if (IsStrict) {
21271 Res = DAG.getNode(Op.getOpcode(), DL, {WideVT, MVT::Other},
21272 {Op->getOperand(0), Src});
21273 Chain = Res.getValue(1);
21274 } else {
21275 Res = DAG.getNode(Op.getOpcode(), DL, WideVT, Src);
21276 }
21277
21278 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
21279 DAG.getIntPtrConstant(0, DL));
21280
21281 if (IsStrict)
21282 return DAG.getMergeValues({Res, Chain}, DL);
21283 return Res;
21284 }
21285
21286 bool IsSigned = Op->getOpcode() == ISD::SINT_TO_FP ||
21287 Op->getOpcode() == ISD::STRICT_SINT_TO_FP;
21288 if (VT != MVT::v4f32 || IsSigned)
21289 return SDValue();
21290
21291 SDValue Zero = DAG.getConstant(0, DL, MVT::v4i64);
21292 SDValue One = DAG.getConstant(1, DL, MVT::v4i64);
21293 SDValue Sign = DAG.getNode(ISD::OR, DL, MVT::v4i64,
21294 DAG.getNode(ISD::SRL, DL, MVT::v4i64, Src, One),
21295 DAG.getNode(ISD::AND, DL, MVT::v4i64, Src, One));
21296 SDValue IsNeg = DAG.getSetCC(DL, MVT::v4i64, Src, Zero, ISD::SETLT);
21297 SDValue SignSrc = DAG.getSelect(DL, MVT::v4i64, IsNeg, Sign, Src);
21298 SmallVector<SDValue, 4> SignCvts(4);
21299 SmallVector<SDValue, 4> Chains(4);
21300 for (int i = 0; i != 4; ++i) {
21301 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i64, SignSrc,
21302 DAG.getIntPtrConstant(i, DL));
21303 if (IsStrict) {
21304 SignCvts[i] =
21305 DAG.getNode(ISD::STRICT_SINT_TO_FP, DL, {MVT::f32, MVT::Other},
21306 {Op.getOperand(0), Elt});
21307 Chains[i] = SignCvts[i].getValue(1);
21308 } else {
21309 SignCvts[i] = DAG.getNode(ISD::SINT_TO_FP, DL, MVT::f32, Elt);
21310 }
21311 }
21312 SDValue SignCvt = DAG.getBuildVector(VT, DL, SignCvts);
21313
21314 SDValue Slow, Chain;
21315 if (IsStrict) {
21316 Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains);
21317 Slow = DAG.getNode(ISD::STRICT_FADD, DL, {MVT::v4f32, MVT::Other},
21318 {Chain, SignCvt, SignCvt});
21319 Chain = Slow.getValue(1);
21320 } else {
21321 Slow = DAG.getNode(ISD::FADD, DL, MVT::v4f32, SignCvt, SignCvt);
21322 }
21323
21324 IsNeg = DAG.getNode(ISD::TRUNCATE, DL, MVT::v4i32, IsNeg);
21325 SDValue Cvt = DAG.getSelect(DL, MVT::v4f32, IsNeg, Slow, SignCvt);
21326
21327 if (IsStrict)
21328 return DAG.getMergeValues({Cvt, Chain}, DL);
21329
21330 return Cvt;
21331}
21332
21333static SDValue promoteXINT_TO_FP(SDValue Op, SelectionDAG &DAG) {
21334 bool IsStrict = Op->isStrictFPOpcode();
21335 SDValue Src = Op.getOperand(IsStrict ? 1 : 0);
21336 SDValue Chain = IsStrict ? Op->getOperand(0) : DAG.getEntryNode();
21337 MVT VT = Op.getSimpleValueType();
21338 MVT NVT = VT.isVector() ? VT.changeVectorElementType(MVT::f32) : MVT::f32;
21339 SDLoc dl(Op);
21340
21341 SDValue Rnd = DAG.getIntPtrConstant(0, dl);
21342 if (IsStrict)
21343 return DAG.getNode(
21344 ISD::STRICT_FP_ROUND, dl, {VT, MVT::Other},
21345 {Chain,
21346 DAG.getNode(Op.getOpcode(), dl, {NVT, MVT::Other}, {Chain, Src}),
21347 Rnd});
21348 return DAG.getNode(ISD::FP_ROUND, dl, VT,
21349 DAG.getNode(Op.getOpcode(), dl, NVT, Src), Rnd);
21350}
21351
21352static bool isLegalConversion(MVT VT, bool IsSigned,
21353 const X86Subtarget &Subtarget) {
21354 if (VT == MVT::v4i32 && Subtarget.hasSSE2() && IsSigned)
21355 return true;
21356 if (VT == MVT::v8i32 && Subtarget.hasAVX() && IsSigned)
21357 return true;
21358 if (Subtarget.hasVLX() && (VT == MVT::v4i32 || VT == MVT::v8i32))
21359 return true;
21360 if (Subtarget.useAVX512Regs()) {
21361 if (VT == MVT::v16i32)
21362 return true;
21363 if (VT == MVT::v8i64 && Subtarget.hasDQI())
21364 return true;
21365 }
21366 if (Subtarget.hasDQI() && Subtarget.hasVLX() &&
21367 (VT == MVT::v2i64 || VT == MVT::v4i64))
21368 return true;
21369 return false;
21370}
21371
21372SDValue X86TargetLowering::LowerSINT_TO_FP(SDValue Op,
21373 SelectionDAG &DAG) const {
21374 bool IsStrict = Op->isStrictFPOpcode();
21375 unsigned OpNo = IsStrict ? 1 : 0;
21376 SDValue Src = Op.getOperand(OpNo);
21377 SDValue Chain = IsStrict ? Op->getOperand(0) : DAG.getEntryNode();
21378 MVT SrcVT = Src.getSimpleValueType();
21379 MVT VT = Op.getSimpleValueType();
21380 SDLoc dl(Op);
21381
21382 if (isSoftFP16(VT))
21383 return promoteXINT_TO_FP(Op, DAG);
21384 else if (isLegalConversion(SrcVT, true, Subtarget))
21385 return Op;
21386
21387 if (Subtarget.isTargetWin64() && SrcVT == MVT::i128)
21388 return LowerWin64_INT128_TO_FP(Op, DAG);
21389
21390 if (SDValue Extract = vectorizeExtractedCast(Op, DAG, Subtarget))
21391 return Extract;
21392
21393 if (SDValue R = lowerFPToIntToFP(Op, DAG, Subtarget))
21394 return R;
21395
21396 if (SrcVT.isVector()) {
21397 if (SrcVT == MVT::v2i32 && VT == MVT::v2f64) {
21398 // Note: Since v2f64 is a legal type, we don't need to zero extend the
21399 // source for strict FP.
21400 if (IsStrict)
21401 return DAG.getNode(
21402 X86ISD::STRICT_CVTSI2P, dl, {VT, MVT::Other},
21403 {Chain, DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, Src,
21404 DAG.getUNDEF(SrcVT))});
21405 return DAG.getNode(X86ISD::CVTSI2P, dl, VT,
21406 DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, Src,
21407 DAG.getUNDEF(SrcVT)));
21408 }
21409 if (SrcVT == MVT::v2i64 || SrcVT == MVT::v4i64)
21410 return lowerINT_TO_FP_vXi64(Op, DAG, Subtarget);
21411
21412 return SDValue();
21413 }
21414
21415 assert(SrcVT <= MVT::i64 && SrcVT >= MVT::i16 &&
21416        "Unknown SINT_TO_FP to lower!");
21417
21418 bool UseSSEReg = isScalarFPTypeInSSEReg(VT);
21419
21420 // These are really Legal; return the operand so the caller accepts it as
21421 // Legal.
21422 if (SrcVT == MVT::i32 && UseSSEReg)
21423 return Op;
21424 if (SrcVT == MVT::i64 && UseSSEReg && Subtarget.is64Bit())
21425 return Op;
21426
21427 if (SDValue V = LowerI64IntToFP_AVX512DQ(Op, DAG, Subtarget))
21428 return V;
21429 if (SDValue V = LowerI64IntToFP16(Op, DAG, Subtarget))
21430 return V;
21431
21432 // SSE doesn't have an i16 conversion so we need to promote.
21433 if (SrcVT == MVT::i16 && (UseSSEReg || VT == MVT::f128)) {
21434 SDValue Ext = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::i32, Src);
21435 if (IsStrict)
21436 return DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {VT, MVT::Other},
21437 {Chain, Ext});
21438
21439 return DAG.getNode(ISD::SINT_TO_FP, dl, VT, Ext);
21440 }
21441
21442 if (VT == MVT::f128 || !Subtarget.hasX87())
21443 return SDValue();
21444
21445 SDValue ValueToStore = Src;
21446 if (SrcVT == MVT::i64 && Subtarget.hasSSE2() && !Subtarget.is64Bit())
21447 // Bitcasting to f64 here allows us to do a single 64-bit store from
21448 // an SSE register, avoiding the store forwarding penalty that would come
21449 // with two 32-bit stores.
21450 ValueToStore = DAG.getBitcast(MVT::f64, ValueToStore);
21451
21452 unsigned Size = SrcVT.getStoreSize();
21453 Align Alignment(Size);
21454 MachineFunction &MF = DAG.getMachineFunction();
21455 auto PtrVT = getPointerTy(MF.getDataLayout());
21456 int SSFI = MF.getFrameInfo().CreateStackObject(Size, Alignment, false);
21457 MachinePointerInfo MPI =
21458 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI);
21459 SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT);
21460 Chain = DAG.getStore(Chain, dl, ValueToStore, StackSlot, MPI, Alignment);
21461 std::pair<SDValue, SDValue> Tmp =
21462 BuildFILD(VT, SrcVT, dl, Chain, StackSlot, MPI, Alignment, DAG);
21463
21464 if (IsStrict)
21465 return DAG.getMergeValues({Tmp.first, Tmp.second}, dl);
21466
21467 return Tmp.first;
21468}
21469
21470std::pair<SDValue, SDValue> X86TargetLowering::BuildFILD(
21471 EVT DstVT, EVT SrcVT, const SDLoc &DL, SDValue Chain, SDValue Pointer,
21472 MachinePointerInfo PtrInfo, Align Alignment, SelectionDAG &DAG) const {
21473 // Build the FILD
21474 SDVTList Tys;
21475 bool useSSE = isScalarFPTypeInSSEReg(DstVT);
21476 if (useSSE)
21477 Tys = DAG.getVTList(MVT::f80, MVT::Other);
21478 else
21479 Tys = DAG.getVTList(DstVT, MVT::Other);
21480
21481 SDValue FILDOps[] = {Chain, Pointer};
21482 SDValue Result =
21483 DAG.getMemIntrinsicNode(X86ISD::FILD, DL, Tys, FILDOps, SrcVT, PtrInfo,
21484 Alignment, MachineMemOperand::MOLoad);
21485 Chain = Result.getValue(1);
21486
21487 if (useSSE) {
21488 MachineFunction &MF = DAG.getMachineFunction();
21489 unsigned SSFISize = DstVT.getStoreSize();
21490 int SSFI =
21491 MF.getFrameInfo().CreateStackObject(SSFISize, Align(SSFISize), false);
21492 auto PtrVT = getPointerTy(MF.getDataLayout());
21493 SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT);
21494 Tys = DAG.getVTList(MVT::Other);
21495 SDValue FSTOps[] = {Chain, Result, StackSlot};
21496 MachineMemOperand *StoreMMO = DAG.getMachineFunction().getMachineMemOperand(
21497 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI),
21498 MachineMemOperand::MOStore, SSFISize, Align(SSFISize));
21499
21500 Chain =
21501 DAG.getMemIntrinsicNode(X86ISD::FST, DL, Tys, FSTOps, DstVT, StoreMMO);
21502 Result = DAG.getLoad(
21503 DstVT, DL, Chain, StackSlot,
21504 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI));
21505 Chain = Result.getValue(1);
21506 }
21507
21508 return { Result, Chain };
21509}
21510
21511/// Horizontal vector math instructions may be slower than normal math with
21512/// shuffles. Limit horizontal op codegen based on size/speed trade-offs, uarch
21513/// implementation, and likely shuffle complexity of the alternate sequence.
21514static bool shouldUseHorizontalOp(bool IsSingleSource, SelectionDAG &DAG,
21515 const X86Subtarget &Subtarget) {
21516 bool IsOptimizingSize = DAG.shouldOptForSize();
21517 bool HasFastHOps = Subtarget.hasFastHorizontalOps();
21518 return !IsSingleSource || IsOptimizingSize || HasFastHOps;
21519}
21520
21521/// 64-bit unsigned integer to double expansion.
21522static SDValue LowerUINT_TO_FP_i64(SDValue Op, SelectionDAG &DAG,
21523 const X86Subtarget &Subtarget) {
21524 // We can't use this algorithm for strict fp. It produces -0.0 instead of +0.0
21525 // when converting 0 with rounding toward negative infinity. The caller will
21526 // fall back to Expand when i64 is legal, or will use FILD in 32-bit mode.
21527 assert(!Op->isStrictFPOpcode() && "Expected non-strict uint_to_fp!");
21528 // This algorithm is not obvious. Here is what we're trying to output:
21529 /*
21530 movq %rax, %xmm0
21531 punpckldq (c0), %xmm0 // c0: (uint4){ 0x43300000U, 0x45300000U, 0U, 0U }
21532 subpd (c1), %xmm0 // c1: (double2){ 0x1.0p52, 0x1.0p52 * 0x1.0p32 }
21533 #ifdef __SSE3__
21534 haddpd %xmm0, %xmm0
21535 #else
21536 pshufd $0x4e, %xmm0, %xmm1
21537 addpd %xmm1, %xmm0
21538 #endif
21539 */
21540
21541 SDLoc dl(Op);
21542 LLVMContext *Context = DAG.getContext();
21543
21544 // Build some magic constants.
21545 static const uint32_t CV0[] = { 0x43300000, 0x45300000, 0, 0 };
21546 Constant *C0 = ConstantDataVector::get(*Context, CV0);
21547 auto PtrVT = DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout());
21548 SDValue CPIdx0 = DAG.getConstantPool(C0, PtrVT, Align(16));
21549
21550 SmallVector<Constant*,2> CV1;
21551 CV1.push_back(
21552 ConstantFP::get(*Context, APFloat(APFloat::IEEEdouble(),
21553 APInt(64, 0x4330000000000000ULL))));
21554 CV1.push_back(
21555 ConstantFP::get(*Context, APFloat(APFloat::IEEEdouble(),
21556 APInt(64, 0x4530000000000000ULL))));
21557 Constant *C1 = ConstantVector::get(CV1);
21558 SDValue CPIdx1 = DAG.getConstantPool(C1, PtrVT, Align(16));
21559
21560 // Load the 64-bit value into an XMM register.
21561 SDValue XR1 =
21562 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Op.getOperand(0));
21563 SDValue CLod0 = DAG.getLoad(
21564 MVT::v4i32, dl, DAG.getEntryNode(), CPIdx0,
21565 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), Align(16));
21566 SDValue Unpck1 =
21567 getUnpackl(DAG, dl, MVT::v4i32, DAG.getBitcast(MVT::v4i32, XR1), CLod0);
21568
21569 SDValue CLod1 = DAG.getLoad(
21570 MVT::v2f64, dl, CLod0.getValue(1), CPIdx1,
21571 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), Align(16));
21572 SDValue XR2F = DAG.getBitcast(MVT::v2f64, Unpck1);
21573 // TODO: Are there any fast-math-flags to propagate here?
21574 SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, XR2F, CLod1);
21575 SDValue Result;
21576
21577 if (Subtarget.hasSSE3() &&
21578 shouldUseHorizontalOp(true, DAG, Subtarget)) {
21579 Result = DAG.getNode(X86ISD::FHADD, dl, MVT::v2f64, Sub, Sub);
21580 } else {
21581 SDValue Shuffle = DAG.getVectorShuffle(MVT::v2f64, dl, Sub, Sub, {1,-1});
21582 Result = DAG.getNode(ISD::FADD, dl, MVT::v2f64, Shuffle, Sub);
21583 }
21584 Result = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Result,
21585 DAG.getIntPtrConstant(0, dl));
21586 return Result;
21587}
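// Illustrative sketch (hypothetical scalar model, not part of the original
// source) of the magic-constant sequence above: reinterpreted as doubles,
// 0x43300000xxxxxxxx is 2^52 + lo32 and 0x45300000xxxxxxxx is
// 2^84 + hi32 * 2^32, so subtracting 2^52 and 2^84 and summing reproduces the
// u64 value with a single final rounding (subpd followed by haddpd/addpd).
#include <cstdint>
#include <cstring>

static double U64ToDoubleSketch(uint64_t X) {
  uint64_t LoBits = 0x4330000000000000ULL | (X & 0xffffffffULL);
  uint64_t HiBits = 0x4530000000000000ULL | (X >> 32);
  double Lo, Hi;
  std::memcpy(&Lo, &LoBits, sizeof(Lo));
  std::memcpy(&Hi, &HiBits, sizeof(Hi));
  return (Hi - 0x1.0p84) + (Lo - 0x1.0p52); // hi32 * 2^32 + lo32, rounded once
}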
21588
21589/// 32-bit unsigned integer to float expansion.
21590static SDValue LowerUINT_TO_FP_i32(SDValue Op, SelectionDAG &DAG,
21591 const X86Subtarget &Subtarget) {
21592 unsigned OpNo = Op.getNode()->isStrictFPOpcode() ? 1 : 0;
21593 SDLoc dl(Op);
21594 // FP constant to bias correct the final result.
21595 SDValue Bias = DAG.getConstantFP(BitsToDouble(0x4330000000000000ULL), dl,
21596 MVT::f64);
21597
21598 // Load the 32-bit value into an XMM register.
21599 SDValue Load =
21600 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, Op.getOperand(OpNo));
21601
21602 // Zero out the upper parts of the register.
21603 Load = getShuffleVectorZeroOrUndef(Load, 0, true, Subtarget, DAG);
21604
21605 // Or the load with the bias.
21606 SDValue Or = DAG.getNode(
21607 ISD::OR, dl, MVT::v2i64,
21608 DAG.getBitcast(MVT::v2i64, Load),
21609 DAG.getBitcast(MVT::v2i64,
21610 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f64, Bias)));
21611 Or =
21612 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
21613 DAG.getBitcast(MVT::v2f64, Or), DAG.getIntPtrConstant(0, dl));
21614
21615 if (Op.getNode()->isStrictFPOpcode()) {
21616 // Subtract the bias.
21617 // TODO: Are there any fast-math-flags to propagate here?
21618 SDValue Chain = Op.getOperand(0);
21619 SDValue Sub = DAG.getNode(ISD::STRICT_FSUB, dl, {MVT::f64, MVT::Other},
21620 {Chain, Or, Bias});
21621
21622 if (Op.getValueType() == Sub.getValueType())
21623 return Sub;
21624
21625 // Handle final rounding.
21626 std::pair<SDValue, SDValue> ResultPair = DAG.getStrictFPExtendOrRound(
21627 Sub, Sub.getValue(1), dl, Op.getSimpleValueType());
21628
21629 return DAG.getMergeValues({ResultPair.first, ResultPair.second}, dl);
21630 }
21631
21632 // Subtract the bias.
21633 // TODO: Are there any fast-math-flags to propagate here?
21634 SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::f64, Or, Bias);
21635
21636 // Handle final rounding.
21637 return DAG.getFPExtendOrRound(Sub, dl, Op.getSimpleValueType());
21638}
21639
21640static SDValue lowerUINT_TO_FP_v2i32(SDValue Op, SelectionDAG &DAG,
21641 const X86Subtarget &Subtarget,
21642 const SDLoc &DL) {
21643 if (Op.getSimpleValueType() != MVT::v2f64)
21644 return SDValue();
21645
21646 bool IsStrict = Op->isStrictFPOpcode();
21647
21648 SDValue N0 = Op.getOperand(IsStrict ? 1 : 0);
21649 assert(N0.getSimpleValueType() == MVT::v2i32 && "Unexpected input type");
21650
21651 if (Subtarget.hasAVX512()) {
21652 if (!Subtarget.hasVLX()) {
21653 // Let generic type legalization widen this.
21654 if (!IsStrict)
21655 return SDValue();
21656 // Otherwise pad the integer input with 0s and widen the operation.
21657 N0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4i32, N0,
21658 DAG.getConstant(0, DL, MVT::v2i32));
21659 SDValue Res = DAG.getNode(Op->getOpcode(), DL, {MVT::v4f64, MVT::Other},
21660 {Op.getOperand(0), N0});
21661 SDValue Chain = Res.getValue(1);
21662 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2f64, Res,
21663 DAG.getIntPtrConstant(0, DL));
21664 return DAG.getMergeValues({Res, Chain}, DL);
21665 }
21666
21667 // Legalize to v4i32 type.
21668 N0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4i32, N0,
21669 DAG.getUNDEF(MVT::v2i32));
21670 if (IsStrict)
21671 return DAG.getNode(X86ISD::STRICT_CVTUI2P, DL, {MVT::v2f64, MVT::Other},
21672 {Op.getOperand(0), N0});
21673 return DAG.getNode(X86ISD::CVTUI2P, DL, MVT::v2f64, N0);
21674 }
21675
21676 // Zero extend to 2i64, OR with the floating point representation of 2^52.
21677 // This gives us the floating point equivalent of 2^52 + the i32 integer
21678 // since double has 52-bits of mantissa. Then subtract 2^52 in floating
21679 // point leaving just our i32 integers in double format.
21680 SDValue ZExtIn = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::v2i64, N0);
21681 SDValue VBias =
21682 DAG.getConstantFP(BitsToDouble(0x4330000000000000ULL), DL, MVT::v2f64);
21683 SDValue Or = DAG.getNode(ISD::OR, DL, MVT::v2i64, ZExtIn,
21684 DAG.getBitcast(MVT::v2i64, VBias));
21685 Or = DAG.getBitcast(MVT::v2f64, Or);
21686
21687 if (IsStrict)
21688 return DAG.getNode(ISD::STRICT_FSUB, DL, {MVT::v2f64, MVT::Other},
21689 {Op.getOperand(0), Or, VBias});
21690 return DAG.getNode(ISD::FSUB, DL, MVT::v2f64, Or, VBias);
21691}
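// Illustrative sketch (hypothetical, not part of the original source): the
// non-AVX512 path above at the intrinsics level. Zero-extending each u32 into
// the low mantissa bits of 2^52 (bit pattern 0x4330000000000000) gives
// 2^52 + x per lane, so subtracting 2^52 leaves the exact value as a double.
#include <emmintrin.h> // SSE2 intrinsics (assumed for this sketch only)

static __m128d U32x2ToF64x2Sketch(__m128i V) { // two u32 in the low 64 bits
  const __m128d Bias = _mm_set1_pd(0x1.0p52);  // bits: 0x4330000000000000
  __m128i ZExt = _mm_unpacklo_epi32(V, _mm_setzero_si128()); // zero-extend
  __m128d Or = _mm_or_pd(_mm_castsi128_pd(ZExt), Bias);
  return _mm_sub_pd(Or, Bias);                 // exact per-lane result
}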
21692
21693static SDValue lowerUINT_TO_FP_vXi32(SDValue Op, SelectionDAG &DAG,
21694 const X86Subtarget &Subtarget) {
21695 SDLoc DL(Op);
21696 bool IsStrict = Op->isStrictFPOpcode();
21697 SDValue V = Op->getOperand(IsStrict ? 1 : 0);
21698 MVT VecIntVT = V.getSimpleValueType();
21699 assert((VecIntVT == MVT::v4i32 || VecIntVT == MVT::v8i32) &&
21700        "Unsupported custom type");
21701
21702 if (Subtarget.hasAVX512()) {
21703 // With AVX512, but not VLX we need to widen to get a 512-bit result type.
21704 assert(!Subtarget.hasVLX() && "Unexpected features");
21705 MVT VT = Op->getSimpleValueType(0);
21706
21707 // v8i32->v8f64 is legal with AVX512 so just return it.
21708 if (VT == MVT::v8f64)
21709 return Op;
21710
21711 assert((VT == MVT::v4f32 || VT == MVT::v8f32 || VT == MVT::v4f64) &&
21712        "Unexpected VT!");
21713 MVT WideVT = VT == MVT::v4f64 ? MVT::v8f64 : MVT::v16f32;
21714 MVT WideIntVT = VT == MVT::v4f64 ? MVT::v8i32 : MVT::v16i32;
21715 // Need to concat with zero vector for strict fp to avoid spurious
21716 // exceptions.
21717 SDValue Tmp =
21718 IsStrict ? DAG.getConstant(0, DL, WideIntVT) : DAG.getUNDEF(WideIntVT);
21719 V = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, WideIntVT, Tmp, V,
21720 DAG.getIntPtrConstant(0, DL));
21721 SDValue Res, Chain;
21722 if (IsStrict) {
21723 Res = DAG.getNode(ISD::STRICT_UINT_TO_FP, DL, {WideVT, MVT::Other},
21724 {Op->getOperand(0), V});
21725 Chain = Res.getValue(1);
21726 } else {
21727 Res = DAG.getNode(ISD::UINT_TO_FP, DL, WideVT, V);
21728 }
21729
21730 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
21731 DAG.getIntPtrConstant(0, DL));
21732
21733 if (IsStrict)
21734 return DAG.getMergeValues({Res, Chain}, DL);
21735 return Res;
21736 }
21737
21738 if (Subtarget.hasAVX() && VecIntVT == MVT::v4i32 &&
21739 Op->getSimpleValueType(0) == MVT::v4f64) {
21740 SDValue ZExtIn = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::v4i64, V);
21741 Constant *Bias = ConstantFP::get(
21742 *DAG.getContext(),
21743 APFloat(APFloat::IEEEdouble(), APInt(64, 0x4330000000000000ULL)));
21744 auto PtrVT = DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout());
21745 SDValue CPIdx = DAG.getConstantPool(Bias, PtrVT, Align(8));
21746 SDVTList Tys = DAG.getVTList(MVT::v4f64, MVT::Other);
21747 SDValue Ops[] = {DAG.getEntryNode(), CPIdx};
21748 SDValue VBias = DAG.getMemIntrinsicNode(
21749 X86ISD::VBROADCAST_LOAD, DL, Tys, Ops, MVT::f64,
21750 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), Align(8),
21751 MachineMemOperand::MOLoad);
21752
21753 SDValue Or = DAG.getNode(ISD::OR, DL, MVT::v4i64, ZExtIn,
21754 DAG.getBitcast(MVT::v4i64, VBias));
21755 Or = DAG.getBitcast(MVT::v4f64, Or);
21756
21757 if (IsStrict)
21758 return DAG.getNode(ISD::STRICT_FSUB, DL, {MVT::v4f64, MVT::Other},
21759 {Op.getOperand(0), Or, VBias});
21760 return DAG.getNode(ISD::FSUB, DL, MVT::v4f64, Or, VBias);
21761 }
21762
21763 // The algorithm is the following:
21764 // #ifdef __SSE4_1__
21765 // uint4 lo = _mm_blend_epi16( v, (uint4) 0x4b000000, 0xaa);
21766 // uint4 hi = _mm_blend_epi16( _mm_srli_epi32(v,16),
21767 // (uint4) 0x53000000, 0xaa);
21768 // #else
21769 // uint4 lo = (v & (uint4) 0xffff) | (uint4) 0x4b000000;
21770 // uint4 hi = (v >> 16) | (uint4) 0x53000000;
21771 // #endif
21772 // float4 fhi = (float4) hi - (0x1.0p39f + 0x1.0p23f);
21773 // return (float4) lo + fhi;
21774
21775 bool Is128 = VecIntVT == MVT::v4i32;
21776 MVT VecFloatVT = Is128 ? MVT::v4f32 : MVT::v8f32;
21777 // If we convert to something other than the supported type, e.g., to v4f64,
21778 // abort early.
21779 if (VecFloatVT != Op->getSimpleValueType(0))
21780 return SDValue();
21781
21782 // In the #ifdef/#else code, we have in common:
21783 // - The vector of constants:
21784 // -- 0x4b000000
21785 // -- 0x53000000
21786 // - A shift:
21787 // -- v >> 16
21788
21789 // Create the splat vector for 0x4b000000.
21790 SDValue VecCstLow = DAG.getConstant(0x4b000000, DL, VecIntVT);
21791 // Create the splat vector for 0x53000000.
21792 SDValue VecCstHigh = DAG.getConstant(0x53000000, DL, VecIntVT);
21793
21794 // Create the right shift.
21795 SDValue VecCstShift = DAG.getConstant(16, DL, VecIntVT);
21796 SDValue HighShift = DAG.getNode(ISD::SRL, DL, VecIntVT, V, VecCstShift);
21797
21798 SDValue Low, High;
21799 if (Subtarget.hasSSE41()) {
21800 MVT VecI16VT = Is128 ? MVT::v8i16 : MVT::v16i16;
21801 // uint4 lo = _mm_blend_epi16( v, (uint4) 0x4b000000, 0xaa);
21802 SDValue VecCstLowBitcast = DAG.getBitcast(VecI16VT, VecCstLow);
21803 SDValue VecBitcast = DAG.getBitcast(VecI16VT, V);
21804 // Low will be bitcasted right away, so do not bother bitcasting back to its
21805 // original type.
21806 Low = DAG.getNode(X86ISD::BLENDI, DL, VecI16VT, VecBitcast,
21807 VecCstLowBitcast, DAG.getTargetConstant(0xaa, DL, MVT::i8));
21808 // uint4 hi = _mm_blend_epi16( _mm_srli_epi32(v,16),
21809 // (uint4) 0x53000000, 0xaa);
21810 SDValue VecCstHighBitcast = DAG.getBitcast(VecI16VT, VecCstHigh);
21811 SDValue VecShiftBitcast = DAG.getBitcast(VecI16VT, HighShift);
21812 // High will be bitcasted right away, so do not bother bitcasting back to
21813 // its original type.
21814 High = DAG.getNode(X86ISD::BLENDI, DL, VecI16VT, VecShiftBitcast,
21815 VecCstHighBitcast, DAG.getTargetConstant(0xaa, DL, MVT::i8));
21816 } else {
21817 SDValue VecCstMask = DAG.getConstant(0xffff, DL, VecIntVT);
21818 // uint4 lo = (v & (uint4) 0xffff) | (uint4) 0x4b000000;
21819 SDValue LowAnd = DAG.getNode(ISD::AND, DL, VecIntVT, V, VecCstMask);
21820 Low = DAG.getNode(ISD::OR, DL, VecIntVT, LowAnd, VecCstLow);
21821
21822 // uint4 hi = (v >> 16) | (uint4) 0x53000000;
21823 High = DAG.getNode(ISD::OR, DL, VecIntVT, HighShift, VecCstHigh);
21824 }
21825
21826 // Create the vector constant for (0x1.0p39f + 0x1.0p23f).
21827 SDValue VecCstFSub = DAG.getConstantFP(
21828 APFloat(APFloat::IEEEsingle(), APInt(32, 0x53000080)), DL, VecFloatVT);
21829
21830 // float4 fhi = (float4) hi - (0x1.0p39f + 0x1.0p23f);
21831 // NOTE: By using fsub of a positive constant instead of fadd of a negative
21832 // constant, we avoid reassociation in MachineCombiner when unsafe-fp-math is
21833 // enabled. See PR24512.
21834 SDValue HighBitcast = DAG.getBitcast(VecFloatVT, High);
21835 // TODO: Are there any fast-math-flags to propagate here?
21836 // (float4) lo;
21837 SDValue LowBitcast = DAG.getBitcast(VecFloatVT, Low);
21838 // return (float4) lo + fhi;
21839 if (IsStrict) {
21840 SDValue FHigh = DAG.getNode(ISD::STRICT_FSUB, DL, {VecFloatVT, MVT::Other},
21841 {Op.getOperand(0), HighBitcast, VecCstFSub});
21842 return DAG.getNode(ISD::STRICT_FADD, DL, {VecFloatVT, MVT::Other},
21843 {FHigh.getValue(1), LowBitcast, FHigh});
21844 }
21845
21846 SDValue FHigh =
21847 DAG.getNode(ISD::FSUB, DL, VecFloatVT, HighBitcast, VecCstFSub);
21848 return DAG.getNode(ISD::FADD, DL, VecFloatVT, LowBitcast, FHigh);
21849}
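// Illustrative sketch (hypothetical scalar model of one lane, not part of the
// original source) of the SSE2/SSE4.1 algorithm above: 0x4b000000 is the bit
// pattern of 2^23 and 0x53000000 of 2^39, so "lo" is 2^23 + low16 and "hi" is
// 2^39 + high16 * 2^16. Subtracting (2^39 + 2^23), i.e. 0x53000080, from "hi"
// and adding "lo" reproduces the u32 with a single final rounding.
#include <cstdint>
#include <cstring>

static float U32ToFloatSketch(uint32_t V) {
  uint32_t LoBits = (V & 0xffffu) | 0x4b000000u; // 2^23 + low16
  uint32_t HiBits = (V >> 16) | 0x53000000u;     // 2^39 + high16 * 2^16
  float Lo, Hi;
  std::memcpy(&Lo, &LoBits, sizeof(Lo));
  std::memcpy(&Hi, &HiBits, sizeof(Hi));
  float FHi = Hi - (0x1.0p39f + 0x1.0p23f);      // exact subtraction
  return Lo + FHi;                               // rounded once, like addps
}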
21850
21851static SDValue lowerUINT_TO_FP_vec(SDValue Op, SelectionDAG &DAG,
21852 const X86Subtarget &Subtarget) {
21853 unsigned OpNo = Op.getNode()->isStrictFPOpcode() ? 1 : 0;
21854 SDValue N0 = Op.getOperand(OpNo);
21855 MVT SrcVT = N0.getSimpleValueType();
21856 SDLoc dl(Op);
21857
21858 switch (SrcVT.SimpleTy) {
21859 default:
21860 llvm_unreachable("Custom UINT_TO_FP is not supported!");
21861 case MVT::v2i32:
21862 return lowerUINT_TO_FP_v2i32(Op, DAG, Subtarget, dl);
21863 case MVT::v4i32:
21864 case MVT::v8i32:
21865 return lowerUINT_TO_FP_vXi32(Op, DAG, Subtarget);
21866 case MVT::v2i64:
21867 case MVT::v4i64:
21868 return lowerINT_TO_FP_vXi64(Op, DAG, Subtarget);
21869 }
21870}
21871
21872SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op,
21873 SelectionDAG &DAG) const {
21874 bool IsStrict = Op->isStrictFPOpcode();
21875 unsigned OpNo = IsStrict ? 1 : 0;
21876 SDValue Src = Op.getOperand(OpNo);
21877 SDLoc dl(Op);
21878 auto PtrVT = getPointerTy(DAG.getDataLayout());
21879 MVT SrcVT = Src.getSimpleValueType();
21880 MVT DstVT = Op->getSimpleValueType(0);
21881 SDValue Chain = IsStrict ? Op.getOperand(0) : DAG.getEntryNode();
21882
21883 // Bail out when we don't have native conversion instructions.
21884 if (DstVT == MVT::f128)
21885 return SDValue();
21886
21887 if (isSoftFP16(DstVT))
21888 return promoteXINT_TO_FP(Op, DAG);
21889 else if (isLegalConversion(SrcVT, false, Subtarget))
21890 return Op;
21891
21892 if (DstVT.isVector())
21893 return lowerUINT_TO_FP_vec(Op, DAG, Subtarget);
21894
21895 if (Subtarget.isTargetWin64() && SrcVT == MVT::i128)
21896 return LowerWin64_INT128_TO_FP(Op, DAG);
21897
21898 if (SDValue Extract = vectorizeExtractedCast(Op, DAG, Subtarget))
21899 return Extract;
21900
21901 if (Subtarget.hasAVX512() && isScalarFPTypeInSSEReg(DstVT) &&
21902 (SrcVT == MVT::i32 || (SrcVT == MVT::i64 && Subtarget.is64Bit()))) {
21903 // Conversions from unsigned i32 to f32/f64 are legal,
21904 // using VCVTUSI2SS/SD. Same for i64 in 64-bit mode.
21905 return Op;
21906 }
21907
21908 // Promote i32 to i64 and use a signed conversion on 64-bit targets.
21909 if (SrcVT == MVT::i32 && Subtarget.is64Bit()) {
21910 Src = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, Src);
21911 if (IsStrict)
21912 return DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {DstVT, MVT::Other},
21913 {Chain, Src});
21914 return DAG.getNode(ISD::SINT_TO_FP, dl, DstVT, Src);
21915 }
21916
21917 if (SDValue V = LowerI64IntToFP_AVX512DQ(Op, DAG, Subtarget))
21918 return V;
21919 if (SDValue V = LowerI64IntToFP16(Op, DAG, Subtarget))
21920 return V;
21921
21922 // The transform for i64->f64 isn't correct for 0 when rounding to negative
21923 // infinity. It produces -0.0, so disable under strictfp.
21924 if (SrcVT == MVT::i64 && DstVT == MVT::f64 && Subtarget.hasSSE2() &&
21925 !IsStrict)
21926 return LowerUINT_TO_FP_i64(Op, DAG, Subtarget);
21927 // The transform for i32->f64/f32 isn't correct for 0 when rounding to
21928 // negative infinity, so disable it under strictfp and use FILD instead.
21929 if (SrcVT == MVT::i32 && Subtarget.hasSSE2() && DstVT != MVT::f80 &&
21930 !IsStrict)
21931 return LowerUINT_TO_FP_i32(Op, DAG, Subtarget);
21932 if (Subtarget.is64Bit() && SrcVT == MVT::i64 &&
21933 (DstVT == MVT::f32 || DstVT == MVT::f64))
21934 return SDValue();
21935
21936 // Make a 64-bit buffer, and use it to build an FILD.
21937 SDValue StackSlot = DAG.CreateStackTemporary(MVT::i64, 8);
21938 int SSFI = cast<FrameIndexSDNode>(StackSlot)->getIndex();
21939 Align SlotAlign(8);
21940 MachinePointerInfo MPI =
21941 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI);
21942 if (SrcVT == MVT::i32) {
21943 SDValue OffsetSlot =
21944 DAG.getMemBasePlusOffset(StackSlot, TypeSize::Fixed(4), dl);
21945 SDValue Store1 = DAG.getStore(Chain, dl, Src, StackSlot, MPI, SlotAlign);
21946 SDValue Store2 = DAG.getStore(Store1, dl, DAG.getConstant(0, dl, MVT::i32),
21947 OffsetSlot, MPI.getWithOffset(4), SlotAlign);
21948 std::pair<SDValue, SDValue> Tmp =
21949 BuildFILD(DstVT, MVT::i64, dl, Store2, StackSlot, MPI, SlotAlign, DAG);
21950 if (IsStrict)
21951 return DAG.getMergeValues({Tmp.first, Tmp.second}, dl);
21952
21953 return Tmp.first;
21954 }
21955
21956 assert(SrcVT == MVT::i64 && "Unexpected type in UINT_TO_FP");
21957 SDValue ValueToStore = Src;
21958 if (isScalarFPTypeInSSEReg(Op.getValueType()) && !Subtarget.is64Bit()) {
21959 // Bitcasting to f64 here allows us to do a single 64-bit store from
21960 // an SSE register, avoiding the store forwarding penalty that would come
21961 // with two 32-bit stores.
21962 ValueToStore = DAG.getBitcast(MVT::f64, ValueToStore);
21963 }
21964 SDValue Store =
21965 DAG.getStore(Chain, dl, ValueToStore, StackSlot, MPI, SlotAlign);
21966 // For i64 source, we need to add the appropriate power of 2 if the input
21967 // was negative. We must be careful to do the computation in x87 extended
21968 // precision, not in SSE.
21969 SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other);
21970 SDValue Ops[] = { Store, StackSlot };
21971 SDValue Fild =
21972 DAG.getMemIntrinsicNode(X86ISD::FILD, dl, Tys, Ops, MVT::i64, MPI,
21973 SlotAlign, MachineMemOperand::MOLoad);
21974 Chain = Fild.getValue(1);
21975
21976
21977 // Check whether the sign bit is set.
21978 SDValue SignSet = DAG.getSetCC(
21979 dl, getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::i64),
21980 Op.getOperand(OpNo), DAG.getConstant(0, dl, MVT::i64), ISD::SETLT);
21981
21982 // Build a 64 bit pair (FF, 0) in the constant pool, with FF in the hi bits.
21983 APInt FF(64, 0x5F80000000000000ULL);
21984 SDValue FudgePtr = DAG.getConstantPool(
21985 ConstantInt::get(*DAG.getContext(), FF), PtrVT);
21986 Align CPAlignment = cast<ConstantPoolSDNode>(FudgePtr)->getAlign();
21987
21988 // Get a pointer to FF if the sign bit was set, or to 0 otherwise.
21989 SDValue Zero = DAG.getIntPtrConstant(0, dl);
21990 SDValue Four = DAG.getIntPtrConstant(4, dl);
21991 SDValue Offset = DAG.getSelect(dl, Zero.getValueType(), SignSet, Four, Zero);
21992 FudgePtr = DAG.getNode(ISD::ADD, dl, PtrVT, FudgePtr, Offset);
21993
21994 // Load the value out, extending it from f32 to f80.
21995 SDValue Fudge = DAG.getExtLoad(
21996 ISD::EXTLOAD, dl, MVT::f80, Chain, FudgePtr,
21997 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), MVT::f32,
21998 CPAlignment);
21999 Chain = Fudge.getValue(1);
22000 // Extend everything to 80 bits to force it to be done on x87.
22001 // TODO: Are there any fast-math-flags to propagate here?
22002 if (IsStrict) {
22003 SDValue Add = DAG.getNode(ISD::STRICT_FADD, dl, {MVT::f80, MVT::Other},
22004 {Chain, Fild, Fudge});
22005 // STRICT_FP_ROUND can't handle equal types.
22006 if (DstVT == MVT::f80)
22007 return Add;
22008 return DAG.getNode(ISD::STRICT_FP_ROUND, dl, {DstVT, MVT::Other},
22009 {Add.getValue(1), Add, DAG.getIntPtrConstant(0, dl)});
22010 }
22011 SDValue Add = DAG.getNode(ISD::FADD, dl, MVT::f80, Fild, Fudge);
22012 return DAG.getNode(ISD::FP_ROUND, dl, DstVT, Add,
22013 DAG.getIntPtrConstant(0, dl, /*isTarget=*/true));
22014}
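// Illustrative sketch (hypothetical scalar model, not part of the original
// source) of the FILD + "fudge" tail above: FILD interprets the 64-bit
// pattern as signed, so when the sign bit was set, the constant-pool value
// 0x5F800000 (2^64 as an f32, extended to f80) is added back before the final
// rounding to the destination type. long double stands in for the x87 format.
#include <cstdint>

static long double U64ToFPSketch(uint64_t X) {
  long double Fild = (long double)(int64_t)X;              // what FILD produces
  long double Fudge = ((int64_t)X < 0) ? 0x1.0p64L : 0.0L;
  return Fild + Fudge;                                     // then FP_ROUND to dst
}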
22015
22016// If the given FP_TO_SINT (IsSigned) or FP_TO_UINT (!IsSigned) operation
22017// is legal, or has an fp128 or f16 source (which needs to be promoted to f32),
22018// just return an SDValue().
22019// Otherwise it is assumed to be a conversion from one of f32, f64 or f80
22020// to i16, i32 or i64, and we lower it to a legal sequence and return the
22021// result.
22022SDValue
22023X86TargetLowering::FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG,
22024 bool IsSigned, SDValue &Chain) const {
22025 bool IsStrict = Op->isStrictFPOpcode();
22026 SDLoc DL(Op);
22027
22028 EVT DstTy = Op.getValueType();
22029 SDValue Value = Op.getOperand(IsStrict ? 1 : 0);
22030 EVT TheVT = Value.getValueType();
22031 auto PtrVT = getPointerTy(DAG.getDataLayout());
22032
22033 if (TheVT != MVT::f32 && TheVT != MVT::f64 && TheVT != MVT::f80) {
22034 // f16 must be promoted before using the lowering in this routine.
22035 // fp128 does not use this lowering.
22036 return SDValue();
22037 }
22038
22039 // If using FIST to compute an unsigned i64, we'll need some fixup
22040 // to handle values above the maximum signed i64. A FIST is always
22041 // used for the 32-bit subtarget, but also for f80 on a 64-bit target.
22042 bool UnsignedFixup = !IsSigned && DstTy == MVT::i64;
22043
22044 // FIXME: This does not generate an invalid exception if the input does not
22045 // fit in i32. PR44019
22046 if (!IsSigned && DstTy != MVT::i64) {
22047 // Replace the fp-to-uint32 operation with an fp-to-sint64 FIST.
22048 // The low 32 bits of the fist result will have the correct uint32 result.
22049 assert(DstTy == MVT::i32 && "Unexpected FP_TO_UINT");
22050 DstTy = MVT::i64;
22051 }
22052
22053 assert(DstTy.getSimpleVT() <= MVT::i64 &&
22054        DstTy.getSimpleVT() >= MVT::i16 &&
22055        "Unknown FP_TO_INT to lower!");
22056
22057 // We lower FP->int64 into FISTP64 followed by a load from a temporary
22058 // stack slot.
22059 MachineFunction &MF = DAG.getMachineFunction();
22060 unsigned MemSize = DstTy.getStoreSize();
22061 int SSFI =
22062 MF.getFrameInfo().CreateStackObject(MemSize, Align(MemSize), false);
22063 SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT);
22064
22065 Chain = IsStrict ? Op.getOperand(0) : DAG.getEntryNode();
22066
22067 SDValue Adjust; // 0x0 or 0x80000000, for result sign bit adjustment.
22068
22069 if (UnsignedFixup) {
22070 //
22071 // Conversion to unsigned i64 is implemented with a select,
22072 // depending on whether the source value fits in the range
22073 // of a signed i64. Let Thresh be the FP equivalent of
22074 // 0x8000000000000000ULL.
22075 //
22076 // Adjust = (Value >= Thresh) ? 0x80000000 : 0;
22077 //   FltOfs = (Value >= Thresh) ? Thresh : 0;
22078 // FistSrc = (Value - FltOfs);
22079 // Fist-to-mem64 FistSrc
22080 // Add 0 or 0x800...0ULL to the 64-bit result, which is equivalent
22081 // to XOR'ing the high 32 bits with Adjust.
22082 //
22083 // Being a power of 2, Thresh is exactly representable in all FP formats.
22084 // For X87 we'd like to use the smallest FP type for this constant, but
22085 // for DAG type consistency we have to match the FP operand type.
22086
22087 APFloat Thresh(APFloat::IEEEsingle(), APInt(32, 0x5f000000));
22088 LLVM_ATTRIBUTE_UNUSED APFloat::opStatus Status = APFloat::opOK;
22089 bool LosesInfo = false;
22090 if (TheVT == MVT::f64)
22091 // The rounding mode is irrelevant as the conversion should be exact.
22092 Status = Thresh.convert(APFloat::IEEEdouble(), APFloat::rmNearestTiesToEven,
22093 &LosesInfo);
22094 else if (TheVT == MVT::f80)
22095 Status = Thresh.convert(APFloat::x87DoubleExtended(),
22096 APFloat::rmNearestTiesToEven, &LosesInfo);
22097
22098 assert(Status == APFloat::opOK && !LosesInfo &&
22099        "FP conversion should have been exact");
22100
22101 SDValue ThreshVal = DAG.getConstantFP(Thresh, DL, TheVT);
22102
22103 EVT ResVT = getSetCCResultType(DAG.getDataLayout(),
22104 *DAG.getContext(), TheVT);
22105 SDValue Cmp;
22106 if (IsStrict) {
22107 Cmp = DAG.getSetCC(DL, ResVT, Value, ThreshVal, ISD::SETGE, Chain,
22108 /*IsSignaling*/ true);
22109 Chain = Cmp.getValue(1);
22110 } else {
22111 Cmp = DAG.getSetCC(DL, ResVT, Value, ThreshVal, ISD::SETGE);
22112 }
22113
22114 // Our preferred lowering of
22115 //
22116 // (Value >= Thresh) ? 0x8000000000000000ULL : 0
22117 //
22118 // is
22119 //
22120 // (Value >= Thresh) << 63
22121 //
22122 // but since we can get here after LegalOperations, DAGCombine might do the
22123 // wrong thing if we create a select. So, directly create the preferred
22124 // version.
22125 SDValue Zext = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, Cmp);
22126 SDValue Const63 = DAG.getConstant(63, DL, MVT::i8);
22127 Adjust = DAG.getNode(ISD::SHL, DL, MVT::i64, Zext, Const63);
22128
22129 SDValue FltOfs = DAG.getSelect(DL, TheVT, Cmp, ThreshVal,
22130 DAG.getConstantFP(0.0, DL, TheVT));
22131
22132 if (IsStrict) {
22133 Value = DAG.getNode(ISD::STRICT_FSUB, DL, { TheVT, MVT::Other},
22134 { Chain, Value, FltOfs });
22135 Chain = Value.getValue(1);
22136 } else
22137 Value = DAG.getNode(ISD::FSUB, DL, TheVT, Value, FltOfs);
22138 }
22139
22140 MachinePointerInfo MPI = MachinePointerInfo::getFixedStack(MF, SSFI);
22141
22142 // FIXME: This causes a redundant load/store if the SSE-class value is already
22143 // in memory, such as if it is on the call stack.
22144 if (isScalarFPTypeInSSEReg(TheVT)) {
22145 assert(DstTy == MVT::i64 && "Invalid FP_TO_SINT to lower!");
22146 Chain = DAG.getStore(Chain, DL, Value, StackSlot, MPI);
22147 SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other);
22148 SDValue Ops[] = { Chain, StackSlot };
22149
22150 unsigned FLDSize = TheVT.getStoreSize();
22151 assert(FLDSize <= MemSize && "Stack slot not big enough");
22152 MachineMemOperand *MMO = MF.getMachineMemOperand(
22153 MPI, MachineMemOperand::MOLoad, FLDSize, Align(FLDSize));
22154 Value = DAG.getMemIntrinsicNode(X86ISD::FLD, DL, Tys, Ops, TheVT, MMO);
22155 Chain = Value.getValue(1);
22156 }
22157
22158 // Build the FP_TO_INT*_IN_MEM
22159 MachineMemOperand *MMO = MF.getMachineMemOperand(
22160 MPI, MachineMemOperand::MOStore, MemSize, Align(MemSize));
22161 SDValue Ops[] = { Chain, Value, StackSlot };
22162 SDValue FIST = DAG.getMemIntrinsicNode(X86ISD::FP_TO_INT_IN_MEM, DL,
22163 DAG.getVTList(MVT::Other),
22164 Ops, DstTy, MMO);
22165
22166 SDValue Res = DAG.getLoad(Op.getValueType(), SDLoc(Op), FIST, StackSlot, MPI);
22167 Chain = Res.getValue(1);
22168
22169 // If we need an unsigned fixup, XOR the result with adjust.
22170 if (UnsignedFixup)
22171 Res = DAG.getNode(ISD::XOR, DL, MVT::i64, Res, Adjust);
22172
22173 return Res;
22174}
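// Illustrative sketch (hypothetical scalar model, not part of the original
// source) of the UnsignedFixup path above: values at or above 2^63 are
// shifted into signed range before the FIST-style conversion, and the 2^63
// offset is restored afterwards by XOR-ing the sign bit into the result.
#include <cstdint>

static uint64_t FPToU64Sketch(double Value) {
  const double Thresh = 0x1.0p63;                    // exactly representable
  uint64_t Adjust = (Value >= Thresh) ? (1ULL << 63) : 0;
  double FltOfs = (Value >= Thresh) ? Thresh : 0.0;
  int64_t Fist = (int64_t)(Value - FltOfs);          // stands in for FISTP64
  return (uint64_t)Fist ^ Adjust;                    // re-add 2^63 if needed
}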
22175
22176static SDValue LowerAVXExtend(SDValue Op, SelectionDAG &DAG,
22177 const X86Subtarget &Subtarget) {
22178 MVT VT = Op.getSimpleValueType();
22179 SDValue In = Op.getOperand(0);
22180 MVT InVT = In.getSimpleValueType();
22181 SDLoc dl(Op);
22182 unsigned Opc = Op.getOpcode();
22183
22184 assert(VT.isVector() && InVT.isVector() && "Expected vector type");
22185 assert((Opc == ISD::ANY_EXTEND || Opc == ISD::ZERO_EXTEND) &&
22186        "Unexpected extension opcode");
22187 assert(VT.getVectorNumElements() == InVT.getVectorNumElements() &&
22188        "Expected same number of elements");
22189 assert((VT.getVectorElementType() == MVT::i16 ||
22190         VT.getVectorElementType() == MVT::i32 ||
22191         VT.getVectorElementType() == MVT::i64) &&
22192        "Unexpected element type");
22193 assert((InVT.getVectorElementType() == MVT::i8 ||
22194         InVT.getVectorElementType() == MVT::i16 ||
22195         InVT.getVectorElementType() == MVT::i32) &&
22196        "Unexpected element type");
22197
22198 unsigned ExtendInVecOpc = DAG.getOpcode_EXTEND_VECTOR_INREG(Opc);
22199
22200 if (VT == MVT::v32i16 && !Subtarget.hasBWI()) {
22201 assert(InVT == MVT::v32i8 && "Unexpected VT!");
22202 return splitVectorIntUnary(Op, DAG);
22203 }
22204
22205 if (Subtarget.hasInt256())
22206 return Op;
22207
22208 // Optimize vectors in AVX mode:
22209 //
22210 // v8i16 -> v8i32
22211 // Use vpmovzwd for 4 lower elements v8i16 -> v4i32.
22212 // Use vpunpckhwd for 4 upper elements v8i16 -> v4i32.
22213 // Concat upper and lower parts.
22214 //
22215 // v4i32 -> v4i64
22216 // Use vpmovzdq for 4 lower elements v4i32 -> v2i64.
22217 // Use vpunpckhdq for 4 upper elements v4i32 -> v2i64.
22218 // Concat upper and lower parts.
22219 //
22220 MVT HalfVT = VT.getHalfNumVectorElementsVT();
22221 SDValue OpLo = DAG.getNode(ExtendInVecOpc, dl, HalfVT, In);
22222
22223 // Short-circuit if we can determine that each 128-bit half is the same value.
22224 // Otherwise, this is difficult to match and optimize.
22225 if (auto *Shuf = dyn_cast<ShuffleVectorSDNode>(In))
22226 if (hasIdenticalHalvesShuffleMask(Shuf->getMask()))
22227 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpLo);
22228
22229 SDValue ZeroVec = DAG.getConstant(0, dl, InVT);
22230 SDValue Undef = DAG.getUNDEF(InVT);
22231 bool NeedZero = Opc == ISD::ZERO_EXTEND;
22232 SDValue OpHi = getUnpackh(DAG, dl, InVT, In, NeedZero ? ZeroVec : Undef);
22233 OpHi = DAG.getBitcast(HalfVT, OpHi);
22234
22235 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpHi);
22236}
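// Illustrative sketch (hypothetical, not part of the original source) of the
// AVX1 strategy in the comment above for v8i16 -> v8i32 zero extension:
// pmovzxwd widens the low four elements, punpckhwd-with-zero widens the high
// four, and the two 128-bit halves are concatenated into a 256-bit result.
#include <immintrin.h> // AVX + SSE4.1 intrinsics (assumed for this sketch)

static __m256i ZExtV8I16ToV8I32Sketch(__m128i In) {
  __m128i Lo = _mm_cvtepu16_epi32(In);                      // vpmovzxwd
  __m128i Hi = _mm_unpackhi_epi16(In, _mm_setzero_si128()); // vpunpckhwd, zero
  return _mm256_insertf128_si256(_mm256_castsi128_si256(Lo), Hi, 1);
}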
22237
22238// Helper to split and extend a v16i1 mask to v16i8 or v16i16.
22239static SDValue SplitAndExtendv16i1(unsigned ExtOpc, MVT VT, SDValue In,
22240 const SDLoc &dl, SelectionDAG &DAG) {
22241 assert((VT == MVT::v16i8 || VT == MVT::v16i16) && "Unexpected VT.");
22242 SDValue Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v8i1, In,
22243 DAG.getIntPtrConstant(0, dl));
22244 SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v8i1, In,
22245 DAG.getIntPtrConstant(8, dl));
22246 Lo = DAG.getNode(ExtOpc, dl, MVT::v8i16, Lo);
22247 Hi = DAG.getNode(ExtOpc, dl, MVT::v8i16, Hi);
22248 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v16i16, Lo, Hi);
22249 return DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
22250}
22251
22252static SDValue LowerZERO_EXTEND_Mask(SDValue Op,
22253 const X86Subtarget &Subtarget,
22254 SelectionDAG &DAG) {
22255 MVT VT = Op->getSimpleValueType(0);
22256 SDValue In = Op->getOperand(0);
22257 MVT InVT = In.getSimpleValueType();
22258 assert(InVT.getVectorElementType() == MVT::i1 && "Unexpected input type!");
22259 SDLoc DL(Op);
22260 unsigned NumElts = VT.getVectorNumElements();
22261
22262 // For all vectors but vXi8, we can just emit a sign_extend and a shift. This
22263 // avoids a constant pool load.
22264 if (VT.getVectorElementType() != MVT::i8) {
22265 SDValue Extend = DAG.getNode(ISD::SIGN_EXTEND, DL, VT, In);
22266 return DAG.getNode(ISD::SRL, DL, VT, Extend,
22267 DAG.getConstant(VT.getScalarSizeInBits() - 1, DL, VT));
22268 }
22269
22270 // Extend VT if BWI is not supported.
22271 MVT ExtVT = VT;
22272 if (!Subtarget.hasBWI()) {
22273 // If v16i32 is to be avoided, we'll need to split and concatenate.
22274 if (NumElts == 16 && !Subtarget.canExtendTo512DQ())
22275 return SplitAndExtendv16i1(ISD::ZERO_EXTEND, VT, In, DL, DAG);
22276
22277 ExtVT = MVT::getVectorVT(MVT::i32, NumElts);
22278 }
22279
22280 // Widen to 512-bits if VLX is not supported.
22281 MVT WideVT = ExtVT;
22282 if (!ExtVT.is512BitVector() && !Subtarget.hasVLX()) {
22283 NumElts *= 512 / ExtVT.getSizeInBits();
22284 InVT = MVT::getVectorVT(MVT::i1, NumElts);
22285 In = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, InVT, DAG.getUNDEF(InVT),
22286 In, DAG.getIntPtrConstant(0, DL));
22287 WideVT = MVT::getVectorVT(ExtVT.getVectorElementType(),
22288 NumElts);
22289 }
22290
22291 SDValue One = DAG.getConstant(1, DL, WideVT);
22292 SDValue Zero = DAG.getConstant(0, DL, WideVT);
22293
22294 SDValue SelectedVal = DAG.getSelect(DL, WideVT, In, One, Zero);
22295
22296 // Truncate if we had to extend above.
22297 if (VT != ExtVT) {
22298 WideVT = MVT::getVectorVT(MVT::i8, NumElts);
22299 SelectedVal = DAG.getNode(ISD::TRUNCATE, DL, WideVT, SelectedVal);
22300 }
22301
22302 // Extract back to 128/256-bit if we widened.
22303 if (WideVT != VT)
22304 SelectedVal = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, SelectedVal,
22305 DAG.getIntPtrConstant(0, DL));
22306
22307 return SelectedVal;
22308}
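// Illustrative sketch (hypothetical scalar model of one lane, not part of the
// original source) of the sign_extend + srl trick in the comment above: a
// 1-bit mask element becomes 0 or 1 without loading a constant vector of
// ones from the constant pool.
#include <cstdint>

static uint32_t ZExtMaskBitSketch(bool M) {
  int32_t SExt = M ? -1 : 0;        // sign_extend i1 -> i32 (all ones or zero)
  return (uint32_t)SExt >> 31;      // srl by (bit width - 1) leaves 0 or 1
}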
22309
22310static SDValue LowerZERO_EXTEND(SDValue Op, const X86Subtarget &Subtarget,
22311 SelectionDAG &DAG) {
22312 SDValue In = Op.getOperand(0);
22313 MVT SVT = In.getSimpleValueType();
22314
22315 if (SVT.getVectorElementType() == MVT::i1)
22316 return LowerZERO_EXTEND_Mask(Op, Subtarget, DAG);
22317
22318 assert(Subtarget.hasAVX() && "Expected AVX support");
22319 return LowerAVXExtend(Op, DAG, Subtarget);
22320}
22321
22322/// Helper to recursively truncate vector elements in half with PACKSS/PACKUS.
22323/// It makes use of the fact that vectors with enough leading sign/zero bits
22324/// prevent the PACKSS/PACKUS from saturating the results.
22325/// AVX2 (Int256) sub-targets require extra shuffling as the PACK*S operates
22326/// within each 128-bit lane.
22327static SDValue truncateVectorWithPACK(unsigned Opcode, EVT DstVT, SDValue In,
22328 const SDLoc &DL, SelectionDAG &DAG,
22329 const X86Subtarget &Subtarget) {
22330 assert((Opcode == X86ISD::PACKSS || Opcode == X86ISD::PACKUS) &&
22331 "Unexpected PACK opcode");
22332 assert(DstVT.isVector() && "VT not a vector?");
22333
22334 // Requires SSE2 for PACKSS (SSE41 PACKUSDW is handled below).
22335 if (!Subtarget.hasSSE2())
22336 return SDValue();
22337
22338 EVT SrcVT = In.getValueType();
22339
22340 // No truncation required, we might get here due to recursive calls.
22341 if (SrcVT == DstVT)
22342 return In;
22343
22344 // We only support vector truncation to 64bits or greater from a
22345 // 128bits or greater source.
22346 unsigned DstSizeInBits = DstVT.getSizeInBits();
22347 unsigned SrcSizeInBits = SrcVT.getSizeInBits();
22348 if ((DstSizeInBits % 64) != 0 || (SrcSizeInBits % 128) != 0)
22349 return SDValue();
22350
22351 unsigned NumElems = SrcVT.getVectorNumElements();
22352 if (!isPowerOf2_32(NumElems))
22353 return SDValue();
22354
22355 LLVMContext &Ctx = *DAG.getContext();
22356 assert(DstVT.getVectorNumElements() == NumElems && "Illegal truncation");
22357 assert(SrcSizeInBits > DstSizeInBits && "Illegal truncation");
22358
22359 EVT PackedSVT = EVT::getIntegerVT(Ctx, SrcVT.getScalarSizeInBits() / 2);
22360
22361 // Pack to the largest type possible:
22362 // vXi64/vXi32 -> PACK*SDW and vXi16 -> PACK*SWB.
22363 EVT InVT = MVT::i16, OutVT = MVT::i8;
22364 if (SrcVT.getScalarSizeInBits() > 16 &&
22365 (Opcode == X86ISD::PACKSS || Subtarget.hasSSE41())) {
22366 InVT = MVT::i32;
22367 OutVT = MVT::i16;
22368 }
22369
22370 // 128bit -> 64bit truncate - PACK 128-bit src in the lower subvector.
22371 if (SrcVT.is128BitVector()) {
22372 InVT = EVT::getVectorVT(Ctx, InVT, 128 / InVT.getSizeInBits());
22373 OutVT = EVT::getVectorVT(Ctx, OutVT, 128 / OutVT.getSizeInBits());
22374 In = DAG.getBitcast(InVT, In);
22375 SDValue Res = DAG.getNode(Opcode, DL, OutVT, In, DAG.getUNDEF(InVT));
22376 Res = extractSubVector(Res, 0, DAG, DL, 64);
22377 return DAG.getBitcast(DstVT, Res);
22378 }
22379
22380 // Split lower/upper subvectors.
22381 SDValue Lo, Hi;
22382 std::tie(Lo, Hi) = splitVector(In, DAG, DL);
22383
22384 unsigned SubSizeInBits = SrcSizeInBits / 2;
22385 InVT = EVT::getVectorVT(Ctx, InVT, SubSizeInBits / InVT.getSizeInBits());
22386 OutVT = EVT::getVectorVT(Ctx, OutVT, SubSizeInBits / OutVT.getSizeInBits());
22387
22388 // 256bit -> 128bit truncate - PACK lower/upper 128-bit subvectors.
22389 if (SrcVT.is256BitVector() && DstVT.is128BitVector()) {
22390 Lo = DAG.getBitcast(InVT, Lo);
22391 Hi = DAG.getBitcast(InVT, Hi);
22392 SDValue Res = DAG.getNode(Opcode, DL, OutVT, Lo, Hi);
22393 return DAG.getBitcast(DstVT, Res);
22394 }
22395
22396 // AVX2: 512bit -> 256bit truncate - PACK lower/upper 256-bit subvectors.
22397 // AVX2: 512bit -> 128bit truncate - PACK(PACK, PACK).
22398 if (SrcVT.is512BitVector() && Subtarget.hasInt256()) {
22399 Lo = DAG.getBitcast(InVT, Lo);
22400 Hi = DAG.getBitcast(InVT, Hi);
22401 SDValue Res = DAG.getNode(Opcode, DL, OutVT, Lo, Hi);
22402
22403 // 256-bit PACK(ARG0, ARG1) leaves us with ((LO0,LO1),(HI0,HI1)),
22404 // so we need to shuffle to get ((LO0,HI0),(LO1,HI1)).
22405 // Scale shuffle mask to avoid bitcasts and help ComputeNumSignBits.
22406 SmallVector<int, 64> Mask;
22407 int Scale = 64 / OutVT.getScalarSizeInBits();
22408 narrowShuffleMaskElts(Scale, { 0, 2, 1, 3 }, Mask);
22409 Res = DAG.getVectorShuffle(OutVT, DL, Res, Res, Mask);
22410
22411 if (DstVT.is256BitVector())
22412 return DAG.getBitcast(DstVT, Res);
22413
22414 // If 512bit -> 128bit truncate another stage.
22415 EVT PackedVT = EVT::getVectorVT(Ctx, PackedSVT, NumElems);
22416 Res = DAG.getBitcast(PackedVT, Res);
22417 return truncateVectorWithPACK(Opcode, DstVT, Res, DL, DAG, Subtarget);
22418 }
22419
22420 // Recursively pack lower/upper subvectors, concat result and pack again.
22421 assert(SrcSizeInBits >= 256 && "Expected 256-bit vector or greater");
22422 EVT PackedVT = EVT::getVectorVT(Ctx, PackedSVT, NumElems / 2);
22423 Lo = truncateVectorWithPACK(Opcode, PackedVT, Lo, DL, DAG, Subtarget);
22424 Hi = truncateVectorWithPACK(Opcode, PackedVT, Hi, DL, DAG, Subtarget);
22425
22426 PackedVT = EVT::getVectorVT(Ctx, PackedSVT, NumElems);
22427 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, DL, PackedVT, Lo, Hi);
22428 return truncateVectorWithPACK(Opcode, DstVT, Res, DL, DAG, Subtarget);
22429}
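// Worked shape of the recursion (informal): truncating v16i32 -> v16i8 with
// PACKSS first packs the two v8i32 halves into v16i16 (per-lane PACKSSDW plus
// the {0,2,1,3} lane fix-up shuffle), then recurses to pack v16i16 -> v16i8
// with PACKSSWB. Callers must already guarantee enough leading sign/zero bits
// so that neither pack saturates.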
22430
22431static SDValue LowerTruncateVecI1(SDValue Op, SelectionDAG &DAG,
22432 const X86Subtarget &Subtarget) {
22433
22434 SDLoc DL(Op);
22435 MVT VT = Op.getSimpleValueType();
22436 SDValue In = Op.getOperand(0);
22437 MVT InVT = In.getSimpleValueType();
22438
22439 assert(VT.getVectorElementType() == MVT::i1 && "Unexpected vector type.");
22440
22441 // Shift LSB to MSB and use VPMOVB/W2M or TESTD/Q.
22442 unsigned ShiftInx = InVT.getScalarSizeInBits() - 1;
22443 if (InVT.getScalarSizeInBits() <= 16) {
22444 if (Subtarget.hasBWI()) {
22445 // legal, will go to VPMOVB2M, VPMOVW2M
22446 if (DAG.ComputeNumSignBits(In) < InVT.getScalarSizeInBits()) {
22447 // We need to shift to get the lsb into sign position.
22448 // Shifting packed bytes is not supported natively, so bitcast to words.
22449 MVT ExtVT = MVT::getVectorVT(MVT::i16, InVT.getSizeInBits()/16);
22450 In = DAG.getNode(ISD::SHL, DL, ExtVT,
22451 DAG.getBitcast(ExtVT, In),
22452 DAG.getConstant(ShiftInx, DL, ExtVT));
22453 In = DAG.getBitcast(InVT, In);
22454 }
22455 return DAG.getSetCC(DL, VT, DAG.getConstant(0, DL, InVT),
22456 In, ISD::SETGT);
22457 }
22458 // Use TESTD/Q, extended vector to packed dword/qword.
22459 assert((InVT.is256BitVector() || InVT.is128BitVector()) &&
22460 "Unexpected vector type.");
22461 unsigned NumElts = InVT.getVectorNumElements();
22462 assert((NumElts == 8 || NumElts == 16) && "Unexpected number of elements");
22463 // We need to change to a wider element type that we have support for.
22464 // For 8 element vectors this is easy, we either extend to v8i32 or v8i64.
22465 // For 16 element vectors we extend to v16i32 unless we are explicitly
22466 // trying to avoid 512-bit vectors. If we are avoiding 512-bit vectors
22467 // we need to split into two 8 element vectors which we can extend to v8i32,
22468 // truncate and concat the results. There's an additional complication if
22469 // the original type is v16i8. In that case we can't split the v16i8
22470 // directly, so we need to shuffle high elements to low and use
22471 // sign_extend_vector_inreg.
22472 if (NumElts == 16 && !Subtarget.canExtendTo512DQ()) {
22473 SDValue Lo, Hi;
22474 if (InVT == MVT::v16i8) {
22475 Lo = DAG.getNode(ISD::SIGN_EXTEND_VECTOR_INREG, DL, MVT::v8i32, In);
22476 Hi = DAG.getVectorShuffle(
22477 InVT, DL, In, In,
22478 {8, 9, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1});
22479 Hi = DAG.getNode(ISD::SIGN_EXTEND_VECTOR_INREG, DL, MVT::v8i32, Hi);
22480 } else {
22481 assert(InVT == MVT::v16i16 && "Unexpected VT!");
22482 Lo = extract128BitVector(In, 0, DAG, DL);
22483 Hi = extract128BitVector(In, 8, DAG, DL);
22484 }
22485 // We're split now, just emit two truncates and a concat. The two
22486 // truncates will trigger legalization to come back to this function.
22487 Lo = DAG.getNode(ISD::TRUNCATE, DL, MVT::v8i1, Lo);
22488 Hi = DAG.getNode(ISD::TRUNCATE, DL, MVT::v8i1, Hi);
22489 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi);
22490 }
22491 // We either have 8 elements or we're allowed to use 512-bit vectors.
22492 // If we have VLX, we want to use the narrowest vector that can get the
22493 // job done so we use vXi32.
22494 MVT EltVT = Subtarget.hasVLX() ? MVT::i32 : MVT::getIntegerVT(512/NumElts);
22495 MVT ExtVT = MVT::getVectorVT(EltVT, NumElts);
22496 In = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, In);
22497 InVT = ExtVT;
22498 ShiftInx = InVT.getScalarSizeInBits() - 1;
22499 }
22500
22501 if (DAG.ComputeNumSignBits(In) < InVT.getScalarSizeInBits()) {
22502 // We need to shift to get the lsb into sign position.
22503 In = DAG.getNode(ISD::SHL, DL, InVT, In,
22504 DAG.getConstant(ShiftInx, DL, InVT));
22505 }
22506 // If we have DQI, emit a pattern that will be iseled as vpmovq2m/vpmovd2m.
22507 if (Subtarget.hasDQI())
22508 return DAG.getSetCC(DL, VT, DAG.getConstant(0, DL, InVT), In, ISD::SETGT);
22509 return DAG.getSetCC(DL, VT, In, DAG.getConstant(0, DL, InVT), ISD::SETNE);
22510}
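// Informal example: truncating v8i32 -> v8i1 with AVX512DQ shifts the low bit
// into the sign position (shl by 31) unless ComputeNumSignBits already proves a
// sign-splat, then emits setcc(0, In, setgt), which isel matches to vpmovd2m.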
22511
22512SDValue X86TargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const {
22513 SDLoc DL(Op);
22514 MVT VT = Op.getSimpleValueType();
22515 SDValue In = Op.getOperand(0);
22516 MVT InVT = In.getSimpleValueType();
22517 unsigned InNumEltBits = InVT.getScalarSizeInBits();
22518
22519 assert(VT.getVectorNumElements() == InVT.getVectorNumElements() &&
22520 "Invalid TRUNCATE operation");
22521
22522 // If we're called by the type legalizer, handle a few cases.
22523 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
22524 if (!TLI.isTypeLegal(InVT)) {
22525 if ((InVT == MVT::v8i64 || InVT == MVT::v16i32 || InVT == MVT::v16i64) &&
22526 VT.is128BitVector()) {
22527 assert((InVT == MVT::v16i64 || Subtarget.hasVLX()) &&
22528 "Unexpected subtarget!");
22529 // The default behavior is to truncate one step, concatenate, and then
22530 // truncate the remainder. We'd rather produce two 64-bit results and
22531 // concatenate those.
22532 SDValue Lo, Hi;
22533 std::tie(Lo, Hi) = DAG.SplitVector(In, DL);
22534
22535 EVT LoVT, HiVT;
22536 std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
22537
22538 Lo = DAG.getNode(ISD::TRUNCATE, DL, LoVT, Lo);
22539 Hi = DAG.getNode(ISD::TRUNCATE, DL, HiVT, Hi);
22540 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi);
22541 }
22542
22543 // Otherwise let default legalization handle it.
22544 return SDValue();
22545 }
22546
22547 if (VT.getVectorElementType() == MVT::i1)
22548 return LowerTruncateVecI1(Op, DAG, Subtarget);
22549
22550 // vpmovqb/w/d, vpmovdb/w, vpmovwb
22551 if (Subtarget.hasAVX512()) {
22552 if (InVT == MVT::v32i16 && !Subtarget.hasBWI()) {
22553 assert(VT == MVT::v32i8 && "Unexpected VT!");
22554 return splitVectorIntUnary(Op, DAG);
22555 }
22556
22557 // word to byte only under BWI. Otherwise we have to promote to v16i32
22558 // and then truncate that. But we should only do that if we haven't been
22559 // asked to avoid 512-bit vectors. The actual promotion to v16i32 will be
22560 // handled by isel patterns.
22561 if (InVT != MVT::v16i16 || Subtarget.hasBWI() ||
22562 Subtarget.canExtendTo512DQ())
22563 return Op;
22564 }
22565
22566 unsigned NumPackedSignBits = std::min<unsigned>(VT.getScalarSizeInBits(), 16);
22567 unsigned NumPackedZeroBits = Subtarget.hasSSE41() ? NumPackedSignBits : 8;
22568
22569 // Truncate with PACKUS if we are truncating a vector with leading zero bits
22570 // that extend all the way to the packed/truncated value.
22571 // Pre-SSE41 we can only use PACKUSWB.
22572 KnownBits Known = DAG.computeKnownBits(In);
22573 if ((InNumEltBits - NumPackedZeroBits) <= Known.countMinLeadingZeros())
22574 if (SDValue V =
22575 truncateVectorWithPACK(X86ISD::PACKUS, VT, In, DL, DAG, Subtarget))
22576 return V;
22577
22578 // Truncate with PACKSS if we are truncating a vector with sign-bits that
22579 // extend all the way to the packed/truncated value.
22580 if ((InNumEltBits - NumPackedSignBits) < DAG.ComputeNumSignBits(In))
22581 if (SDValue V =
22582 truncateVectorWithPACK(X86ISD::PACKSS, VT, In, DL, DAG, Subtarget))
22583 return V;
22584
22585 // Handle truncation of V256 to V128 using shuffles.
22586 assert(VT.is128BitVector() && InVT.is256BitVector() && "Unexpected types!");
22587
22588 if ((VT == MVT::v4i32) && (InVT == MVT::v4i64)) {
22589 // On AVX2, v4i64 -> v4i32 becomes VPERMD.
22590 if (Subtarget.hasInt256()) {
22591 static const int ShufMask[] = {0, 2, 4, 6, -1, -1, -1, -1};
22592 In = DAG.getBitcast(MVT::v8i32, In);
22593 In = DAG.getVectorShuffle(MVT::v8i32, DL, In, In, ShufMask);
22594 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, In,
22595 DAG.getIntPtrConstant(0, DL));
22596 }
22597
22598 SDValue OpLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In,
22599 DAG.getIntPtrConstant(0, DL));
22600 SDValue OpHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In,
22601 DAG.getIntPtrConstant(2, DL));
22602 static const int ShufMask[] = {0, 2, 4, 6};
22603 return DAG.getVectorShuffle(VT, DL, DAG.getBitcast(MVT::v4i32, OpLo),
22604 DAG.getBitcast(MVT::v4i32, OpHi), ShufMask);
22605 }
22606
22607 if ((VT == MVT::v8i16) && (InVT == MVT::v8i32)) {
22608 // On AVX2, v8i32 -> v8i16 becomes PSHUFB.
22609 if (Subtarget.hasInt256()) {
22610 // The PSHUFB mask:
22611 static const int ShufMask1[] = { 0, 1, 4, 5, 8, 9, 12, 13,
22612 -1, -1, -1, -1, -1, -1, -1, -1,
22613 16, 17, 20, 21, 24, 25, 28, 29,
22614 -1, -1, -1, -1, -1, -1, -1, -1 };
22615 In = DAG.getBitcast(MVT::v32i8, In);
22616 In = DAG.getVectorShuffle(MVT::v32i8, DL, In, In, ShufMask1);
22617 In = DAG.getBitcast(MVT::v4i64, In);
22618
22619 static const int ShufMask2[] = {0, 2, -1, -1};
22620 In = DAG.getVectorShuffle(MVT::v4i64, DL, In, In, ShufMask2);
22621 In = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In,
22622 DAG.getIntPtrConstant(0, DL));
22623 return DAG.getBitcast(MVT::v8i16, In);
22624 }
22625
22626 SDValue OpLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i32, In,
22627 DAG.getIntPtrConstant(0, DL));
22628 SDValue OpHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i32, In,
22629 DAG.getIntPtrConstant(4, DL));
22630
22631 // The PSHUFB mask:
22632 static const int ShufMask1[] = {0, 2, 4, 6, -1, -1, -1, -1};
22633
22634 OpLo = DAG.getBitcast(MVT::v8i16, OpLo);
22635 OpHi = DAG.getBitcast(MVT::v8i16, OpHi);
22636
22637 OpLo = DAG.getVectorShuffle(MVT::v8i16, DL, OpLo, OpLo, ShufMask1);
22638 OpHi = DAG.getVectorShuffle(MVT::v8i16, DL, OpHi, OpHi, ShufMask1);
22639
22640 OpLo = DAG.getBitcast(MVT::v4i32, OpLo);
22641 OpHi = DAG.getBitcast(MVT::v4i32, OpHi);
22642
22643 // The MOVLHPS Mask:
22644 static const int ShufMask2[] = {0, 1, 4, 5};
22645 SDValue res = DAG.getVectorShuffle(MVT::v4i32, DL, OpLo, OpHi, ShufMask2);
22646 return DAG.getBitcast(MVT::v8i16, res);
22647 }
22648
22649 if (VT == MVT::v16i8 && InVT == MVT::v16i16) {
22650 // Use an AND to zero the upper bits for PACKUS.
22651 In = DAG.getNode(ISD::AND, DL, InVT, In, DAG.getConstant(255, DL, InVT));
22652
22653 SDValue InLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v8i16, In,
22654 DAG.getIntPtrConstant(0, DL));
22655 SDValue InHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v8i16, In,
22656 DAG.getIntPtrConstant(8, DL));
22657 return DAG.getNode(X86ISD::PACKUS, DL, VT, InLo, InHi);
22658 }
22659
22660 llvm_unreachable("All 256->128 cases should have been handled above!");
22661}
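// Shuffle fallback example (informal): when neither PACKUS nor PACKSS applies,
// v4i64 -> v4i32 on AVX2 is done by bitcasting to v8i32, shuffling with mask
// {0,2,4,6} (VPERMD) and extracting the low 128 bits; pre-AVX2 the input is
// split into two v2i64 halves whose even i32 elements are merged by one shuffle.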
22662
22663// We can leverage the specific way the "cvttps2dq/cvttpd2dq" instruction
22664// behaves on out of range inputs to generate optimized conversions.
22665static SDValue expandFP_TO_UINT_SSE(MVT VT, SDValue Src, const SDLoc &dl,
22666 SelectionDAG &DAG,
22667 const X86Subtarget &Subtarget) {
22668 MVT SrcVT = Src.getSimpleValueType();
22669 unsigned DstBits = VT.getScalarSizeInBits();
22670 assert(DstBits == 32 && "expandFP_TO_UINT_SSE - only vXi32 supported");
22671
22672 // Calculate the converted result for values in the range 0 to
22673 // 2^31-1 ("Small") and from 2^31 to 2^32-1 ("Big").
22674 SDValue Small = DAG.getNode(X86ISD::CVTTP2SI, dl, VT, Src);
22675 SDValue Big =
22676 DAG.getNode(X86ISD::CVTTP2SI, dl, VT,
22677 DAG.getNode(ISD::FSUB, dl, SrcVT, Src,
22678 DAG.getConstantFP(2147483648.0f, dl, SrcVT)));
22679
22680 // The "CVTTP2SI" instruction conveniently sets the sign bit if
22681 // and only if the value was out of range. So we can use that
22682 // as our indicator to use "Big" instead of "Small".
22683 //
22684 // Use "Small" if "IsOverflown" has all bits cleared
22685 // and "0x80000000 | Big" if all bits in "IsOverflown" are set.
22686
22687 // AVX1 can't use the signsplat masking for 256-bit vectors - we have to
22688 // use the slightly slower blendv select instead.
22689 if (VT == MVT::v8i32 && !Subtarget.hasAVX2()) {
22690 SDValue Overflow = DAG.getNode(ISD::OR, dl, VT, Small, Big);
22691 return DAG.getNode(X86ISD::BLENDV, dl, VT, Small, Overflow, Small);
22692 }
22693
22694 SDValue IsOverflown =
22695 DAG.getNode(X86ISD::VSRAI, dl, VT, Small,
22696 DAG.getTargetConstant(DstBits - 1, dl, MVT::i8));
22697 return DAG.getNode(ISD::OR, dl, VT, Small,
22698 DAG.getNode(ISD::AND, dl, VT, Big, IsOverflown));
22699}
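// Worked example (informal) of the signsplat trick above, for float 3.0e9:
//   Small = cvttps2si(3.0e9)         = 0x80000000   (out of range -> INDVAL)
//   Big   = cvttps2si(3.0e9 - 2^31)  = 0x32D05E00   (852516352)
//   IsOverflown = Small >>arith 31   = 0xFFFFFFFF
//   Small | (Big & IsOverflown)      = 0xB2D05E00   = 3000000000
// In-range inputs leave the sign bit of Small clear, so IsOverflown is 0 and
// Small is returned unchanged.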
22700
22701SDValue X86TargetLowering::LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const {
22702 bool IsStrict = Op->isStrictFPOpcode();
22703 bool IsSigned = Op.getOpcode() == ISD::FP_TO_SINT ||
22704 Op.getOpcode() == ISD::STRICT_FP_TO_SINT;
22705 MVT VT = Op->getSimpleValueType(0);
22706 SDValue Src = Op.getOperand(IsStrict ? 1 : 0);
22707 SDValue Chain = IsStrict ? Op->getOperand(0) : SDValue();
22708 MVT SrcVT = Src.getSimpleValueType();
22709 SDLoc dl(Op);
22710
22711 SDValue Res;
22712 if (isSoftFP16(SrcVT)) {
22713 MVT NVT = VT.isVector() ? VT.changeVectorElementType(MVT::f32) : MVT::f32;
22714 if (IsStrict)
22715 return DAG.getNode(Op.getOpcode(), dl, {VT, MVT::Other},
22716 {Chain, DAG.getNode(ISD::STRICT_FP_EXTEND, dl,
22717 {NVT, MVT::Other}, {Chain, Src})});
22718 return DAG.getNode(Op.getOpcode(), dl, VT,
22719 DAG.getNode(ISD::FP_EXTEND, dl, NVT, Src));
22720 } else if (isTypeLegal(SrcVT) && isLegalConversion(VT, IsSigned, Subtarget)) {
22721 return Op;
22722 }
22723
22724 if (VT.isVector()) {
22725 if (VT == MVT::v2i1 && SrcVT == MVT::v2f64) {
22726 MVT ResVT = MVT::v4i32;
22727 MVT TruncVT = MVT::v4i1;
22728 unsigned Opc;
22729 if (IsStrict)
22730 Opc = IsSigned ? X86ISD::STRICT_CVTTP2SI : X86ISD::STRICT_CVTTP2UI;
22731 else
22732 Opc = IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI;
22733
22734 if (!IsSigned && !Subtarget.hasVLX()) {
22735 assert(Subtarget.useAVX512Regs() && "Unexpected features!");
22736 // Widen to 512-bits.
22737 ResVT = MVT::v8i32;
22738 TruncVT = MVT::v8i1;
22739 Opc = Op.getOpcode();
22740 // Need to concat with zero vector for strict fp to avoid spurious
22741 // exceptions.
22742 // TODO: Should we just do this for non-strict as well?
22743 SDValue Tmp = IsStrict ? DAG.getConstantFP(0.0, dl, MVT::v8f64)
22744 : DAG.getUNDEF(MVT::v8f64);
22745 Src = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v8f64, Tmp, Src,
22746 DAG.getIntPtrConstant(0, dl));
22747 }
22748 if (IsStrict) {
22749 Res = DAG.getNode(Opc, dl, {ResVT, MVT::Other}, {Chain, Src});
22750 Chain = Res.getValue(1);
22751 } else {
22752 Res = DAG.getNode(Opc, dl, ResVT, Src);
22753 }
22754
22755 Res = DAG.getNode(ISD::TRUNCATE, dl, TruncVT, Res);
22756 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i1, Res,
22757 DAG.getIntPtrConstant(0, dl));
22758 if (IsStrict)
22759 return DAG.getMergeValues({Res, Chain}, dl);
22760 return Res;
22761 }
22762
22763 if (Subtarget.hasFP16() && SrcVT.getVectorElementType() == MVT::f16) {
22764 if (VT == MVT::v8i16 || VT == MVT::v16i16 || VT == MVT::v32i16)
22765 return Op;
22766
22767 MVT ResVT = VT;
22768 MVT EleVT = VT.getVectorElementType();
22769 if (EleVT != MVT::i64)
22770 ResVT = EleVT == MVT::i32 ? MVT::v4i32 : MVT::v8i16;
22771
22772 if (SrcVT != MVT::v8f16) {
22773 SDValue Tmp =
22774 IsStrict ? DAG.getConstantFP(0.0, dl, SrcVT) : DAG.getUNDEF(SrcVT);
22775 SmallVector<SDValue, 4> Ops(SrcVT == MVT::v2f16 ? 4 : 2, Tmp);
22776 Ops[0] = Src;
22777 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8f16, Ops);
22778 }
22779
22780 if (IsStrict) {
22781 Res = DAG.getNode(IsSigned ? X86ISD::STRICT_CVTTP2SI
22782 : X86ISD::STRICT_CVTTP2UI,
22783 dl, {ResVT, MVT::Other}, {Chain, Src});
22784 Chain = Res.getValue(1);
22785 } else {
22786 Res = DAG.getNode(IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI, dl,
22787 ResVT, Src);
22788 }
22789
22790 // TODO: Need to add exception check code for strict FP.
22791 if (EleVT.getSizeInBits() < 16) {
22792 ResVT = MVT::getVectorVT(EleVT, 8);
22793 Res = DAG.getNode(ISD::TRUNCATE, dl, ResVT, Res);
22794 }
22795
22796 if (ResVT != VT)
22797 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, Res,
22798 DAG.getIntPtrConstant(0, dl));
22799
22800 if (IsStrict)
22801 return DAG.getMergeValues({Res, Chain}, dl);
22802 return Res;
22803 }
22804
22805 if (VT == MVT::v8i16 && (SrcVT == MVT::v8f32 || SrcVT == MVT::v8f64)) {
22806 if (IsStrict) {
22807 Res = DAG.getNode(IsSigned ? ISD::STRICT_FP_TO_SINT
22808 : ISD::STRICT_FP_TO_UINT,
22809 dl, {MVT::v8i32, MVT::Other}, {Chain, Src});
22810 Chain = Res.getValue(1);
22811 } else {
22812 Res = DAG.getNode(IsSigned ? ISD::FP_TO_SINT : ISD::FP_TO_UINT, dl,
22813 MVT::v8i32, Src);
22814 }
22815
22816 // TODO: Need to add exception check code for strict FP.
22817 Res = DAG.getNode(ISD::TRUNCATE, dl, MVT::v8i16, Res);
22818
22819 if (IsStrict)
22820 return DAG.getMergeValues({Res, Chain}, dl);
22821 return Res;
22822 }
22823
22824 // v8f64->v8i32 is legal, but we need v8i32 to be custom for v8f32.
22825 if (VT == MVT::v8i32 && SrcVT == MVT::v8f64) {
22826 assert(!IsSigned && "Expected unsigned conversion!");
22827 assert(Subtarget.useAVX512Regs() && "Requires avx512f");
22828 return Op;
22829 }
22830
22831 // Widen vXi32 fp_to_uint with avx512f to 512-bit source.
22832 if ((VT == MVT::v4i32 || VT == MVT::v8i32) &&
22833 (SrcVT == MVT::v4f64 || SrcVT == MVT::v4f32 || SrcVT == MVT::v8f32) &&
22834 Subtarget.useAVX512Regs()) {
22835 assert(!IsSigned && "Expected unsigned conversion!");
22836 assert(!Subtarget.hasVLX() && "Unexpected features!");
22837 MVT WideVT = SrcVT == MVT::v4f64 ? MVT::v8f64 : MVT::v16f32;
22838 MVT ResVT = SrcVT == MVT::v4f64 ? MVT::v8i32 : MVT::v16i32;
22839 // Need to concat with zero vector for strict fp to avoid spurious
22840 // exceptions.
22841 // TODO: Should we just do this for non-strict as well?
22842 SDValue Tmp =
22843 IsStrict ? DAG.getConstantFP(0.0, dl, WideVT) : DAG.getUNDEF(WideVT);
22844 Src = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideVT, Tmp, Src,
22845 DAG.getIntPtrConstant(0, dl));
22846
22847 if (IsStrict) {
22848 Res = DAG.getNode(ISD::STRICT_FP_TO_UINT, dl, {ResVT, MVT::Other},
22849 {Chain, Src});
22850 Chain = Res.getValue(1);
22851 } else {
22852 Res = DAG.getNode(ISD::FP_TO_UINT, dl, ResVT, Src);
22853 }
22854
22855 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, Res,
22856 DAG.getIntPtrConstant(0, dl));
22857
22858 if (IsStrict)
22859 return DAG.getMergeValues({Res, Chain}, dl);
22860 return Res;
22861 }
22862
22863 // Widen vXi64 fp_to_uint/fp_to_sint with avx512dq to 512-bit source.
22864 if ((VT == MVT::v2i64 || VT == MVT::v4i64) &&
22865 (SrcVT == MVT::v2f64 || SrcVT == MVT::v4f64 || SrcVT == MVT::v4f32) &&
22866 Subtarget.useAVX512Regs() && Subtarget.hasDQI()) {
22867 assert(!Subtarget.hasVLX() && "Unexpected features!");
22868 MVT WideVT = SrcVT == MVT::v4f32 ? MVT::v8f32 : MVT::v8f64;
22869 // Need to concat with zero vector for strict fp to avoid spurious
22870 // exceptions.
22871 // TODO: Should we just do this for non-strict as well?
22872 SDValue Tmp =
22873 IsStrict ? DAG.getConstantFP(0.0, dl, WideVT) : DAG.getUNDEF(WideVT);
22874 Src = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideVT, Tmp, Src,
22875 DAG.getIntPtrConstant(0, dl));
22876
22877 if (IsStrict) {
22878 Res = DAG.getNode(Op.getOpcode(), dl, {MVT::v8i64, MVT::Other},
22879 {Chain, Src});
22880 Chain = Res.getValue(1);
22881 } else {
22882 Res = DAG.getNode(Op.getOpcode(), dl, MVT::v8i64, Src);
22883 }
22884
22885 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, Res,
22886 DAG.getIntPtrConstant(0, dl));
22887
22888 if (IsStrict)
22889 return DAG.getMergeValues({Res, Chain}, dl);
22890 return Res;
22891 }
22892
22893 if (VT == MVT::v2i64 && SrcVT == MVT::v2f32) {
22894 if (!Subtarget.hasVLX()) {
22895 // Non-strict nodes without VLX can be widened to v4f32->v4i64 by the type
22896 // legalizer and then widened again by vector op legalization.
22897 if (!IsStrict)
22898 return SDValue();
22899
22900 SDValue Zero = DAG.getConstantFP(0.0, dl, MVT::v2f32);
22901 SDValue Tmp = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8f32,
22902 {Src, Zero, Zero, Zero});
22903 Tmp = DAG.getNode(Op.getOpcode(), dl, {MVT::v8i64, MVT::Other},
22904 {Chain, Tmp});
22905 SDValue Chain = Tmp.getValue(1);
22906 Tmp = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i64, Tmp,
22907 DAG.getIntPtrConstant(0, dl));
22908 return DAG.getMergeValues({Tmp, Chain}, dl);
22909 }
22910
22911 assert(Subtarget.hasDQI() && Subtarget.hasVLX() && "Requires AVX512DQVL");
22912 SDValue Tmp = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, Src,
22913 DAG.getUNDEF(MVT::v2f32));
22914 if (IsStrict) {
22915 unsigned Opc = IsSigned ? X86ISD::STRICT_CVTTP2SI
22916 : X86ISD::STRICT_CVTTP2UI;
22917 return DAG.getNode(Opc, dl, {VT, MVT::Other}, {Op->getOperand(0), Tmp});
22918 }
22919 unsigned Opc = IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI;
22920 return DAG.getNode(Opc, dl, VT, Tmp);
22921 }
22922
22923 // Generate optimized instructions for pre AVX512 unsigned conversions from
22924 // vXf32 to vXi32.
22925 if ((VT == MVT::v4i32 && SrcVT == MVT::v4f32) ||
22926 (VT == MVT::v4i32 && SrcVT == MVT::v4f64) ||
22927 (VT == MVT::v8i32 && SrcVT == MVT::v8f32)) {
22928 assert(!IsSigned && "Expected unsigned conversion!");
22929 return expandFP_TO_UINT_SSE(VT, Src, dl, DAG, Subtarget);
22930 }
22931
22932 return SDValue();
22933 }
22934
22935 assert(!VT.isVector());
22936
22937 bool UseSSEReg = isScalarFPTypeInSSEReg(SrcVT);
22938
22939 if (!IsSigned && UseSSEReg) {
22940 // Conversions from f32/f64 with AVX512 should be legal.
22941 if (Subtarget.hasAVX512())
22942 return Op;
22943
22944 // We can leverage the specific way the "cvttss2si/cvttsd2si" instruction
22945 // behaves on out of range inputs to generate optimized conversions.
22946 if (!IsStrict && ((VT == MVT::i32 && !Subtarget.is64Bit()) ||
22947 (VT == MVT::i64 && Subtarget.is64Bit()))) {
22948 unsigned DstBits = VT.getScalarSizeInBits();
22949 APInt UIntLimit = APInt::getSignMask(DstBits);
22950 SDValue FloatOffset = DAG.getNode(ISD::UINT_TO_FP, dl, SrcVT,
22951 DAG.getConstant(UIntLimit, dl, VT));
22952 MVT SrcVecVT = MVT::getVectorVT(SrcVT, 128 / SrcVT.getScalarSizeInBits());
22953
22954 // Calculate the converted result for values in the range:
22955 // (i32) 0 to 2^31-1 ("Small") and from 2^31 to 2^32-1 ("Big").
22956 // (i64) 0 to 2^63-1 ("Small") and from 2^63 to 2^64-1 ("Big").
22957 SDValue Small =
22958 DAG.getNode(X86ISD::CVTTS2SI, dl, VT,
22959 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, SrcVecVT, Src));
22960 SDValue Big = DAG.getNode(
22961 X86ISD::CVTTS2SI, dl, VT,
22962 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, SrcVecVT,
22963 DAG.getNode(ISD::FSUB, dl, SrcVT, Src, FloatOffset)));
22964
22965 // The "CVTTS2SI" instruction conveniently sets the sign bit if
22966 // and only if the value was out of range. So we can use that
22967 // as our indicator to use "Big" instead of "Small".
22968 //
22969 // Use "Small" if "IsOverflown" has all bits cleared
22970 // and "0x80000000 | Big" if all bits in "IsOverflown" are set.
22971 SDValue IsOverflown = DAG.getNode(
22972 ISD::SRA, dl, VT, Small, DAG.getConstant(DstBits - 1, dl, MVT::i8));
22973 return DAG.getNode(ISD::OR, dl, VT, Small,
22974 DAG.getNode(ISD::AND, dl, VT, Big, IsOverflown));
22975 }
22976
22977 // Use default expansion for i64.
22978 if (VT == MVT::i64)
22979 return SDValue();
22980
22981 assert(VT == MVT::i32 && "Unexpected VT!");
22982
22983 // Promote i32 to i64 and use a signed operation on 64-bit targets.
22984 // FIXME: This does not generate an invalid exception if the input does not
22985 // fit in i32. PR44019
22986 if (Subtarget.is64Bit()) {
22987 if (IsStrict) {
22988 Res = DAG.getNode(ISD::STRICT_FP_TO_SINT, dl, {MVT::i64, MVT::Other},
22989 {Chain, Src});
22990 Chain = Res.getValue(1);
22991 } else
22992 Res = DAG.getNode(ISD::FP_TO_SINT, dl, MVT::i64, Src);
22993
22994 Res = DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
22995 if (IsStrict)
22996 return DAG.getMergeValues({Res, Chain}, dl);
22997 return Res;
22998 }
22999
23000 // Use default expansion for SSE1/2 targets without SSE3. With SSE3 we can
23001 // use fisttp which will be handled later.
23002 if (!Subtarget.hasSSE3())
23003 return SDValue();
23004 }
23005
23006 // Promote i16 to i32 if we can use an SSE operation or the type is f128.
23007 // FIXME: This does not generate an invalid exception if the input does not
23008 // fit in i16. PR44019
23009 if (VT == MVT::i16 && (UseSSEReg || SrcVT == MVT::f128)) {
23010 assert(IsSigned && "Expected i16 FP_TO_UINT to have been promoted!");
23011 if (IsStrict) {
23012 Res = DAG.getNode(ISD::STRICT_FP_TO_SINT, dl, {MVT::i32, MVT::Other},
23013 {Chain, Src});
23014 Chain = Res.getValue(1);
23015 } else
23016 Res = DAG.getNode(ISD::FP_TO_SINT, dl, MVT::i32, Src);
23017
23018 Res = DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
23019 if (IsStrict)
23020 return DAG.getMergeValues({Res, Chain}, dl);
23021 return Res;
23022 }
23023
23024 // If this is a FP_TO_SINT using SSEReg we're done.
23025 if (UseSSEReg && IsSigned)
23026 return Op;
23027
23028 // fp128 needs to use a libcall.
23029 if (SrcVT == MVT::f128) {
23030 RTLIB::Libcall LC;
23031 if (IsSigned)
23032 LC = RTLIB::getFPTOSINT(SrcVT, VT);
23033 else
23034 LC = RTLIB::getFPTOUINT(SrcVT, VT);
23035
23036 MakeLibCallOptions CallOptions;
23037 std::pair<SDValue, SDValue> Tmp = makeLibCall(DAG, LC, VT, Src, CallOptions,
23038 SDLoc(Op), Chain);
23039
23040 if (IsStrict)
23041 return DAG.getMergeValues({ Tmp.first, Tmp.second }, dl);
23042
23043 return Tmp.first;
23044 }
23045
23046 // Fall back to X87.
23047 if (SDValue V = FP_TO_INTHelper(Op, DAG, IsSigned, Chain)) {
23048 if (IsStrict)
23049 return DAG.getMergeValues({V, Chain}, dl);
23050 return V;
23051 }
23052
23053 llvm_unreachable("Expected FP_TO_INTHelper to handle all remaining cases.");
23054}
23055
23056SDValue X86TargetLowering::LowerLRINT_LLRINT(SDValue Op,
23057 SelectionDAG &DAG) const {
23058 SDValue Src = Op.getOperand(0);
23059 MVT SrcVT = Src.getSimpleValueType();
23060
23061 if (SrcVT == MVT::f16)
23062 return SDValue();
23063
23064 // If the source is in an SSE register, the node is Legal.
23065 if (isScalarFPTypeInSSEReg(SrcVT))
23066 return Op;
23067
23068 return LRINT_LLRINTHelper(Op.getNode(), DAG);
23069}
23070
23071SDValue X86TargetLowering::LRINT_LLRINTHelper(SDNode *N,
23072 SelectionDAG &DAG) const {
23073 EVT DstVT = N->getValueType(0);
23074 SDValue Src = N->getOperand(0);
23075 EVT SrcVT = Src.getValueType();
23076
23077 if (SrcVT != MVT::f32 && SrcVT != MVT::f64 && SrcVT != MVT::f80) {
23078 // f16 must be promoted before using the lowering in this routine.
23079 // fp128 does not use this lowering.
23080 return SDValue();
23081 }
23082
23083 SDLoc DL(N);
23084 SDValue Chain = DAG.getEntryNode();
23085
23086 bool UseSSE = isScalarFPTypeInSSEReg(SrcVT);
23087
23088 // If we're converting from SSE, the stack slot needs to hold both types.
23089 // Otherwise it only needs to hold the DstVT.
23090 EVT OtherVT = UseSSE ? SrcVT : DstVT;
23091 SDValue StackPtr = DAG.CreateStackTemporary(DstVT, OtherVT);
23092 int SPFI = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
23093 MachinePointerInfo MPI =
23094 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SPFI);
23095
23096 if (UseSSE) {
23097 assert(DstVT == MVT::i64 && "Invalid LRINT/LLRINT to lower!");
23098 Chain = DAG.getStore(Chain, DL, Src, StackPtr, MPI);
23099 SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other);
23100 SDValue Ops[] = { Chain, StackPtr };
23101
23102 Src = DAG.getMemIntrinsicNode(X86ISD::FLD, DL, Tys, Ops, SrcVT, MPI,
23103 /*Align*/ std::nullopt,
23104 MachineMemOperand::MOLoad);
23105 Chain = Src.getValue(1);
23106 }
23107
23108 SDValue StoreOps[] = { Chain, Src, StackPtr };
23109 Chain = DAG.getMemIntrinsicNode(X86ISD::FIST, DL, DAG.getVTList(MVT::Other),
23110 StoreOps, DstVT, MPI, /*Align*/ std::nullopt,
23111 MachineMemOperand::MOStore);
23112
23113 return DAG.getLoad(DstVT, DL, Chain, StackPtr, MPI);
23114}
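// Rough shape of the SSE case above (informal), e.g. f64 -> i64 llrint: the
// value is spilled to the stack slot, reloaded onto the x87 stack with FLD,
// converted and stored with FIST using the current rounding mode, and the i64
// result is loaded back from the same slot.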
23115
23116SDValue
23117X86TargetLowering::LowerFP_TO_INT_SAT(SDValue Op, SelectionDAG &DAG) const {
23118 // This is based on the TargetLowering::expandFP_TO_INT_SAT implementation,
23119 // but making use of X86 specifics to produce better instruction sequences.
23120 SDNode *Node = Op.getNode();
23121 bool IsSigned = Node->getOpcode() == ISD::FP_TO_SINT_SAT;
23122 unsigned FpToIntOpcode = IsSigned ? ISD::FP_TO_SINT : ISD::FP_TO_UINT;
23123 SDLoc dl(SDValue(Node, 0));
23124 SDValue Src = Node->getOperand(0);
23125
23126 // There are three types involved here: SrcVT is the source floating point
23127 // type, DstVT is the type of the result, and TmpVT is the result of the
23128 // intermediate FP_TO_*INT operation we'll use (which may be a promotion of
23129 // DstVT).
23130 EVT SrcVT = Src.getValueType();
23131 EVT DstVT = Node->getValueType(0);
23132 EVT TmpVT = DstVT;
23133
23134 // This code is only for floats and doubles. Fall back to generic code for
23135 // anything else.
23136 if (!isScalarFPTypeInSSEReg(SrcVT) || isSoftFP16(SrcVT))
23137 return SDValue();
23138
23139 EVT SatVT = cast<VTSDNode>(Node->getOperand(1))->getVT();
23140 unsigned SatWidth = SatVT.getScalarSizeInBits();
23141 unsigned DstWidth = DstVT.getScalarSizeInBits();
23142 unsigned TmpWidth = TmpVT.getScalarSizeInBits();
23143 assert(SatWidth <= DstWidth && SatWidth <= TmpWidth &&
23144 "Expected saturation width smaller than result width");
23145
23146 // Promote result of FP_TO_*INT to at least 32 bits.
23147 if (TmpWidth < 32) {
23148 TmpVT = MVT::i32;
23149 TmpWidth = 32;
23150 }
23151
23152 // Promote conversions to unsigned 32-bit to 64-bit, because it will allow
23153 // us to use a native signed conversion instead.
23154 if (SatWidth == 32 && !IsSigned && Subtarget.is64Bit()) {
23155 TmpVT = MVT::i64;
23156 TmpWidth = 64;
23157 }
23158
23159 // If the saturation width is smaller than the size of the temporary result,
23160 // we can always use signed conversion, which is native.
23161 if (SatWidth < TmpWidth)
23162 FpToIntOpcode = ISD::FP_TO_SINT;
23163
23164 // Determine minimum and maximum integer values and their corresponding
23165 // floating-point values.
23166 APInt MinInt, MaxInt;
23167 if (IsSigned) {
23168 MinInt = APInt::getSignedMinValue(SatWidth).sext(DstWidth);
23169 MaxInt = APInt::getSignedMaxValue(SatWidth).sext(DstWidth);
23170 } else {
23171 MinInt = APInt::getMinValue(SatWidth).zext(DstWidth);
23172 MaxInt = APInt::getMaxValue(SatWidth).zext(DstWidth);
23173 }
23174
23175 APFloat MinFloat(DAG.EVTToAPFloatSemantics(SrcVT));
23176 APFloat MaxFloat(DAG.EVTToAPFloatSemantics(SrcVT));
23177
23178 APFloat::opStatus MinStatus = MinFloat.convertFromAPInt(
23179 MinInt, IsSigned, APFloat::rmTowardZero);
23180 APFloat::opStatus MaxStatus = MaxFloat.convertFromAPInt(
23181 MaxInt, IsSigned, APFloat::rmTowardZero);
23182 bool AreExactFloatBounds = !(MinStatus & APFloat::opStatus::opInexact)
23183 && !(MaxStatus & APFloat::opStatus::opInexact);
23184
23185 SDValue MinFloatNode = DAG.getConstantFP(MinFloat, dl, SrcVT);
23186 SDValue MaxFloatNode = DAG.getConstantFP(MaxFloat, dl, SrcVT);
23187
23188 // If the integer bounds are exactly representable as floats, emit a
23189 // min+max+fptoi sequence. Otherwise use comparisons and selects.
23190 if (AreExactFloatBounds) {
23191 if (DstVT != TmpVT) {
23192 // Clamp by MinFloat from below. If Src is NaN, propagate NaN.
23193 SDValue MinClamped = DAG.getNode(
23194 X86ISD::FMAX, dl, SrcVT, MinFloatNode, Src);
23195 // Clamp by MaxFloat from above. If Src is NaN, propagate NaN.
23196 SDValue BothClamped = DAG.getNode(
23197 X86ISD::FMIN, dl, SrcVT, MaxFloatNode, MinClamped);
23198 // Convert clamped value to integer.
23199 SDValue FpToInt = DAG.getNode(FpToIntOpcode, dl, TmpVT, BothClamped);
23200
23201 // NaN will become INDVAL, with the top bit set and the rest zero.
23202 // Truncation will discard the top bit, resulting in zero.
23203 return DAG.getNode(ISD::TRUNCATE, dl, DstVT, FpToInt);
23204 }
23205
23206 // Clamp by MinFloat from below. If Src is NaN, the result is MinFloat.
23207 SDValue MinClamped = DAG.getNode(
23208 X86ISD::FMAX, dl, SrcVT, Src, MinFloatNode);
23209 // Clamp by MaxFloat from above. NaN cannot occur.
23210 SDValue BothClamped = DAG.getNode(
23211 X86ISD::FMINC, dl, SrcVT, MinClamped, MaxFloatNode);
23212 // Convert clamped value to integer.
23213 SDValue FpToInt = DAG.getNode(FpToIntOpcode, dl, DstVT, BothClamped);
23214
23215 if (!IsSigned) {
23216 // In the unsigned case we're done, because we mapped NaN to MinFloat,
23217 // which is zero.
23218 return FpToInt;
23219 }
23220
23221 // Otherwise, select zero if Src is NaN.
23222 SDValue ZeroInt = DAG.getConstant(0, dl, DstVT);
23223 return DAG.getSelectCC(
23224 dl, Src, Src, ZeroInt, FpToInt, ISD::CondCode::SETUO);
23225 }
23226
23227 SDValue MinIntNode = DAG.getConstant(MinInt, dl, DstVT);
23228 SDValue MaxIntNode = DAG.getConstant(MaxInt, dl, DstVT);
23229
23230 // Result of direct conversion, which may be selected away.
23231 SDValue FpToInt = DAG.getNode(FpToIntOpcode, dl, TmpVT, Src);
23232
23233 if (DstVT != TmpVT) {
23234 // NaN will become INDVAL, with the top bit set and the rest zero.
23235 // Truncation will discard the top bit, resulting in zero.
23236 FpToInt = DAG.getNode(ISD::TRUNCATE, dl, DstVT, FpToInt);
23237 }
23238
23239 SDValue Select = FpToInt;
23240 // For signed conversions where we saturate to the same size as the
23241 // result type of the fptoi instructions, INDVAL coincides with integer
23242 // minimum, so we don't need to explicitly check it.
23243 if (!IsSigned || SatWidth != TmpVT.getScalarSizeInBits()) {
23244 // If Src ULT MinFloat, select MinInt. In particular, this also selects
23245 // MinInt if Src is NaN.
23246 Select = DAG.getSelectCC(
23247 dl, Src, MinFloatNode, MinIntNode, Select, ISD::CondCode::SETULT);
23248 }
23249
23250 // If Src OGT MaxFloat, select MaxInt.
23251 Select = DAG.getSelectCC(
23252 dl, Src, MaxFloatNode, MaxIntNode, Select, ISD::CondCode::SETOGT);
23253
23254 // In the unsigned case we are done, because we mapped NaN to MinInt, which
23255 // is already zero. The promoted case was already handled above.
23256 if (!IsSigned || DstVT != TmpVT) {
23257 return Select;
23258 }
23259
23260 // Otherwise, select 0 if Src is NaN.
23261 SDValue ZeroInt = DAG.getConstant(0, dl, DstVT);
23262 return DAG.getSelectCC(
23263 dl, Src, Src, ZeroInt, Select, ISD::CondCode::SETUO);
23264}
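// Saturation example (informal): llvm.fptosi.sat.i8.f32(300.0) promotes the
// conversion to i32, clamps the source with FMAX(-128.0, x) then FMIN(127.0, x)
// so cvttss2si cannot overflow, converts, and truncates i32 -> i8, giving 127;
// a NaN source survives the clamps, converts to INDVAL (0x80000000) and
// truncates to 0.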
23265
23266SDValue X86TargetLowering::LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const {
23267 bool IsStrict = Op->isStrictFPOpcode();
23268
23269 SDLoc DL(Op);
23270 MVT VT = Op.getSimpleValueType();
23271 SDValue Chain = IsStrict ? Op.getOperand(0) : SDValue();
23272 SDValue In = Op.getOperand(IsStrict ? 1 : 0);
23273 MVT SVT = In.getSimpleValueType();
23274
23275 // Let f16->f80 get lowered to a libcall, except for darwin, where we should
23276 // lower it to an fp_extend via f32 (as only f16<>f32 libcalls are available)
23277 if (VT == MVT::f128 || (SVT == MVT::f16 && VT == MVT::f80 &&
23278 !Subtarget.getTargetTriple().isOSDarwin()))
23279 return SDValue();
23280
23281 if (SVT == MVT::f16) {
23282 if (Subtarget.hasFP16())
23283 return Op;
23284
23285 if (VT != MVT::f32) {
23286 if (IsStrict)
23287 return DAG.getNode(
23288 ISD::STRICT_FP_EXTEND, DL, {VT, MVT::Other},
23289 {Chain, DAG.getNode(ISD::STRICT_FP_EXTEND, DL,
23290 {MVT::f32, MVT::Other}, {Chain, In})});
23291
23292 return DAG.getNode(ISD::FP_EXTEND, DL, VT,
23293 DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, In));
23294 }
23295
23296 if (!Subtarget.hasF16C()) {
23297 if (!Subtarget.getTargetTriple().isOSDarwin())
23298 return SDValue();
23299
23300 assert(VT == MVT::f32 && SVT == MVT::f16 && "unexpected extend libcall");
23301
23302 // Need a libcall, but ABI for f16 is soft-float on MacOS.
23303 TargetLowering::CallLoweringInfo CLI(DAG);
23304 Chain = IsStrict ? Op.getOperand(0) : DAG.getEntryNode();
23305
23306 In = DAG.getBitcast(MVT::i16, In);
23307 TargetLowering::ArgListTy Args;
23308 TargetLowering::ArgListEntry Entry;
23309 Entry.Node = In;
23310 Entry.Ty = EVT(MVT::i16).getTypeForEVT(*DAG.getContext());
23311 Entry.IsSExt = false;
23312 Entry.IsZExt = true;
23313 Args.push_back(Entry);
23314
23315 SDValue Callee = DAG.getExternalSymbol(
23316 getLibcallName(RTLIB::FPEXT_F16_F32),
23317 getPointerTy(DAG.getDataLayout()));
23318 CLI.setDebugLoc(DL).setChain(Chain).setLibCallee(
23319 CallingConv::C, EVT(VT).getTypeForEVT(*DAG.getContext()), Callee,
23320 std::move(Args));
23321
23322 SDValue Res;
23323 std::tie(Res,Chain) = LowerCallTo(CLI);
23324 if (IsStrict)
23325 Res = DAG.getMergeValues({Res, Chain}, DL);
23326
23327 return Res;
23328 }
23329
23330 In = DAG.getBitcast(MVT::i16, In);
23331 In = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, MVT::v8i16,
23332 getZeroVector(MVT::v8i16, Subtarget, DAG, DL), In,
23333 DAG.getIntPtrConstant(0, DL));
23334 SDValue Res;
23335 if (IsStrict) {
23336 Res = DAG.getNode(X86ISD::STRICT_CVTPH2PS, DL, {MVT::v4f32, MVT::Other},
23337 {Chain, In});
23338 Chain = Res.getValue(1);
23339 } else {
23340 Res = DAG.getNode(X86ISD::CVTPH2PS, DL, MVT::v4f32, In,
23341 DAG.getTargetConstant(4, DL, MVT::i32));
23342 }
23343 Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Res,
23344 DAG.getIntPtrConstant(0, DL));
23345 if (IsStrict)
23346 return DAG.getMergeValues({Res, Chain}, DL);
23347 return Res;
23348 }
23349
23350 if (!SVT.isVector())
23351 return Op;
23352
23353 if (SVT.getVectorElementType() == MVT::f16) {
23354 assert(Subtarget.hasF16C() && "Unexpected features!");
23355 if (SVT == MVT::v2f16)
23356 In = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4f16, In,
23357 DAG.getUNDEF(MVT::v2f16));
23358 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v8f16, In,
23359 DAG.getUNDEF(MVT::v4f16));
23360 if (IsStrict)
23361 return DAG.getNode(X86ISD::STRICT_VFPEXT, DL, {VT, MVT::Other},
23362 {Op->getOperand(0), Res});
23363 return DAG.getNode(X86ISD::VFPEXT, DL, VT, Res);
23364 } else if (VT == MVT::v4f64 || VT == MVT::v8f64) {
23365 return Op;
23366 }
23367
23368 assert(SVT == MVT::v2f32 && "Only customize MVT::v2f32 type legalization!");
23369
23370 SDValue Res =
23371 DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4f32, In, DAG.getUNDEF(SVT));
23372 if (IsStrict)
23373 return DAG.getNode(X86ISD::STRICT_VFPEXT, DL, {VT, MVT::Other},
23374 {Op->getOperand(0), Res});
23375 return DAG.getNode(X86ISD::VFPEXT, DL, VT, Res);
23376}
23377
23378SDValue X86TargetLowering::LowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const {
23379 bool IsStrict = Op->isStrictFPOpcode();
23380
23381 SDLoc DL(Op);
23382 SDValue Chain = IsStrict ? Op.getOperand(0) : SDValue();
23383 SDValue In = Op.getOperand(IsStrict ? 1 : 0);
23384 MVT VT = Op.getSimpleValueType();
23385 MVT SVT = In.getSimpleValueType();
23386
23387 if (SVT == MVT::f128 || (VT == MVT::f16 && SVT == MVT::f80))
23388 return SDValue();
23389
23390 if (VT == MVT::f16 && (SVT == MVT::f64 || SVT == MVT::f32) &&
23391 !Subtarget.hasFP16() && (SVT == MVT::f64 || !Subtarget.hasF16C())) {
23392 if (!Subtarget.getTargetTriple().isOSDarwin())
23393 return SDValue();
23394
23395 // We need a libcall but the ABI for f16 libcalls on MacOS is soft.
23396 TargetLowering::CallLoweringInfo CLI(DAG);
23397 Chain = IsStrict ? Op.getOperand(0) : DAG.getEntryNode();
23398
23399 TargetLowering::ArgListTy Args;
23400 TargetLowering::ArgListEntry Entry;
23401 Entry.Node = In;
23402 Entry.Ty = EVT(SVT).getTypeForEVT(*DAG.getContext());
23403 Entry.IsSExt = false;
23404 Entry.IsZExt = true;
23405 Args.push_back(Entry);
23406
23407 SDValue Callee = DAG.getExternalSymbol(
23408 getLibcallName(SVT == MVT::f64 ? RTLIB::FPROUND_F64_F16
23409 : RTLIB::FPROUND_F32_F16),
23410 getPointerTy(DAG.getDataLayout()));
23411 CLI.setDebugLoc(DL).setChain(Chain).setLibCallee(
23412 CallingConv::C, EVT(MVT::i16).getTypeForEVT(*DAG.getContext()), Callee,
23413 std::move(Args));
23414
23415 SDValue Res;
23416 std::tie(Res, Chain) = LowerCallTo(CLI);
23417
23418 Res = DAG.getBitcast(MVT::f16, Res);
23419
23420 if (IsStrict)
23421 Res = DAG.getMergeValues({Res, Chain}, DL);
23422
23423 return Res;
23424 }
23425
23426 if (VT.getScalarType() == MVT::f16 && !Subtarget.hasFP16()) {
23427 if (!Subtarget.hasF16C() || SVT.getScalarType() != MVT::f32)
23428 return SDValue();
23429
23430 if (VT.isVector())
23431 return Op;
23432
23433 SDValue Res;
23434 SDValue Rnd = DAG.getTargetConstant(X86::STATIC_ROUNDING::CUR_DIRECTION, DL,
23435 MVT::i32);
23436 if (IsStrict) {
23437 Res = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, MVT::v4f32,
23438 DAG.getConstantFP(0, DL, MVT::v4f32), In,
23439 DAG.getIntPtrConstant(0, DL));
23440 Res = DAG.getNode(X86ISD::STRICT_CVTPS2PH, DL, {MVT::v8i16, MVT::Other},
23441 {Chain, Res, Rnd});
23442 Chain = Res.getValue(1);
23443 } else {
23444 // FIXME: Should we use zeros for upper elements for non-strict?
23445 Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v4f32, In);
23446 Res = DAG.getNode(X86ISD::CVTPS2PH, DL, MVT::v8i16, Res, Rnd);
23447 }
23448
23449 Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i16, Res,
23450 DAG.getIntPtrConstant(0, DL));
23451 Res = DAG.getBitcast(MVT::f16, Res);
23452
23453 if (IsStrict)
23454 return DAG.getMergeValues({Res, Chain}, DL);
23455
23456 return Res;
23457 }
23458
23459 return Op;
23460}
23461
23462static SDValue LowerFP16_TO_FP(SDValue Op, SelectionDAG &DAG) {
23463 bool IsStrict = Op->isStrictFPOpcode();
23464 SDValue Src = Op.getOperand(IsStrict ? 1 : 0);
23465 assert(Src.getValueType() == MVT::i16 && Op.getValueType() == MVT::f32 &&
23466 "Unexpected VT!");
23467
23468 SDLoc dl(Op);
23469 SDValue Res = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16,
23470 DAG.getConstant(0, dl, MVT::v8i16), Src,
23471 DAG.getIntPtrConstant(0, dl));
23472
23473 SDValue Chain;
23474 if (IsStrict) {
23475 Res = DAG.getNode(X86ISD::STRICT_CVTPH2PS, dl, {MVT::v4f32, MVT::Other},
23476 {Op.getOperand(0), Res});
23477 Chain = Res.getValue(1);
23478 } else {
23479 Res = DAG.getNode(X86ISD::CVTPH2PS, dl, MVT::v4f32, Res);
23480 }
23481
23482 Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f32, Res,
23483 DAG.getIntPtrConstant(0, dl));
23484
23485 if (IsStrict)
23486 return DAG.getMergeValues({Res, Chain}, dl);
23487
23488 return Res;
23489}
23490
23491static SDValue LowerFP_TO_FP16(SDValue Op, SelectionDAG &DAG) {
23492 bool IsStrict = Op->isStrictFPOpcode();
23493 SDValue Src = Op.getOperand(IsStrict ? 1 : 0);
23494 assert(Src.getValueType() == MVT::f32 && Op.getValueType() == MVT::i16 &&
23495 "Unexpected VT!");
23496
23497 SDLoc dl(Op);
23498 SDValue Res, Chain;
23499 if (IsStrict) {
23500 Res = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v4f32,
23501 DAG.getConstantFP(0, dl, MVT::v4f32), Src,
23502 DAG.getIntPtrConstant(0, dl));
23503 Res = DAG.getNode(
23504 X86ISD::STRICT_CVTPS2PH, dl, {MVT::v8i16, MVT::Other},
23505 {Op.getOperand(0), Res, DAG.getTargetConstant(4, dl, MVT::i32)});
23506 Chain = Res.getValue(1);
23507 } else {
23508 // FIXME: Should we use zeros for upper elements for non-strict?
23509 Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, Src);
23510 Res = DAG.getNode(X86ISD::CVTPS2PH, dl, MVT::v8i16, Res,
23511 DAG.getTargetConstant(4, dl, MVT::i32));
23512 }
23513
23514 Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, Res,
23515 DAG.getIntPtrConstant(0, dl));
23516
23517 if (IsStrict)
23518 return DAG.getMergeValues({Res, Chain}, dl);
23519
23520 return Res;
23521}
23522
23523SDValue X86TargetLowering::LowerFP_TO_BF16(SDValue Op,
23524 SelectionDAG &DAG) const {
23525 SDLoc DL(Op);
23526 MakeLibCallOptions CallOptions;
23527 RTLIB::Libcall LC =
23528 RTLIB::getFPROUND(Op.getOperand(0).getValueType(), MVT::bf16);
23529 SDValue Res =
23530 makeLibCall(DAG, LC, MVT::f32, Op.getOperand(0), CallOptions, DL).first;
23531 return DAG.getNode(ISD::TRUNCATE, DL, MVT::i16,
23532 DAG.getBitcast(MVT::i32, Res));
23533}
23534
23535/// Depending on uarch and/or optimizing for size, we might prefer to use a
23536/// vector operation in place of the typical scalar operation.
23537static SDValue lowerAddSubToHorizontalOp(SDValue Op, SelectionDAG &DAG,
23538 const X86Subtarget &Subtarget) {
23539 // If both operands have other uses, this is probably not profitable.
23540 SDValue LHS = Op.getOperand(0);
23541 SDValue RHS = Op.getOperand(1);
23542 if (!LHS.hasOneUse() && !RHS.hasOneUse())
23543 return Op;
23544
23545 // FP horizontal add/sub were added with SSE3. Integer with SSSE3.
23546 bool IsFP = Op.getSimpleValueType().isFloatingPoint();
23547 if (IsFP && !Subtarget.hasSSE3())
23548 return Op;
23549 if (!IsFP && !Subtarget.hasSSSE3())
23550 return Op;
23551
23552 // Extract from a common vector.
23553 if (LHS.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
23554 RHS.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
23555 LHS.getOperand(0) != RHS.getOperand(0) ||
23556 !isa<ConstantSDNode>(LHS.getOperand(1)) ||
23557 !isa<ConstantSDNode>(RHS.getOperand(1)) ||
23558 !shouldUseHorizontalOp(true, DAG, Subtarget))
23559 return Op;
23560
23561 // Allow commuted 'hadd' ops.
23562 // TODO: Allow commuted (f)sub by negating the result of (F)HSUB?
23563 unsigned HOpcode;
23564 switch (Op.getOpcode()) {
23565 case ISD::ADD: HOpcode = X86ISD::HADD; break;
23566 case ISD::SUB: HOpcode = X86ISD::HSUB; break;
23567 case ISD::FADD: HOpcode = X86ISD::FHADD; break;
23568 case ISD::FSUB: HOpcode = X86ISD::FHSUB; break;
23569 default:
23570 llvm_unreachable("Trying to lower unsupported opcode to horizontal op");
23571 }
23572 unsigned LExtIndex = LHS.getConstantOperandVal(1);
23573 unsigned RExtIndex = RHS.getConstantOperandVal(1);
23574 if ((LExtIndex & 1) == 1 && (RExtIndex & 1) == 0 &&
23575 (HOpcode == X86ISD::HADD || HOpcode == X86ISD::FHADD))
23576 std::swap(LExtIndex, RExtIndex);
23577
23578 if ((LExtIndex & 1) != 0 || RExtIndex != (LExtIndex + 1))
23579 return Op;
23580
23581 SDValue X = LHS.getOperand(0);
23582 EVT VecVT = X.getValueType();
23583 unsigned BitWidth = VecVT.getSizeInBits();
23584 unsigned NumLanes = BitWidth / 128;
23585 unsigned NumEltsPerLane = VecVT.getVectorNumElements() / NumLanes;
23586 assert((BitWidth == 128 || BitWidth == 256 || BitWidth == 512) &&
23587 "Not expecting illegal vector widths here");
23588
23589 // Creating a 256-bit horizontal op would be wasteful, and there is no 512-bit
23590 // equivalent, so extract the 256/512-bit source op to 128-bit if we can.
23591 SDLoc DL(Op);
23592 if (BitWidth == 256 || BitWidth == 512) {
23593 unsigned LaneIdx = LExtIndex / NumEltsPerLane;
23594 X = extract128BitVector(X, LaneIdx * NumEltsPerLane, DAG, DL);
23595 LExtIndex %= NumEltsPerLane;
23596 }
23597
23598 // add (extractelt (X, 0), extractelt (X, 1)) --> extractelt (hadd X, X), 0
23599 // add (extractelt (X, 1), extractelt (X, 0)) --> extractelt (hadd X, X), 0
23600 // add (extractelt (X, 2), extractelt (X, 3)) --> extractelt (hadd X, X), 1
23601 // sub (extractelt (X, 0), extractelt (X, 1)) --> extractelt (hsub X, X), 0
23602 SDValue HOp = DAG.getNode(HOpcode, DL, X.getValueType(), X, X);
23603 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, Op.getSimpleValueType(), HOp,
23604 DAG.getIntPtrConstant(LExtIndex / 2, DL));
23605}
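As an aside, a minimal standalone C++ sketch (not part of this file) of what a single-source horizontal add computes; it illustrates why extracting lane LExtIndex/2 of (hadd X, X) reproduces the original scalar add of adjacent elements.

#include <cstdio>

// Scalar model of HADDPS with both sources equal to X: the result lanes are
// {X0+X1, X2+X3, X0+X1, X2+X3}.
static void haddpsSameSource(const float (&X)[4], float (&R)[4]) {
  R[0] = X[0] + X[1];
  R[1] = X[2] + X[3];
  R[2] = X[0] + X[1]; // second source operand is X again
  R[3] = X[2] + X[3];
}

int main() {
  float X[4] = {1, 2, 3, 4}, R[4];
  haddpsSameSource(X, R);
  // add (extractelt X,0), (extractelt X,1) is lane 0; indices 2/3 map to lane 1.
  std::printf("%g %g\n", R[0], R[1]); // 3 7
  return 0;
}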
23606
23607/// Depending on uarch and/or optimizing for size, we might prefer to use a
23608/// vector operation in place of the typical scalar operation.
23609SDValue X86TargetLowering::lowerFaddFsub(SDValue Op, SelectionDAG &DAG) const {
23610 assert((Op.getValueType() == MVT::f32 || Op.getValueType() == MVT::f64) &&
23611 "Only expecting float/double");
23612 return lowerAddSubToHorizontalOp(Op, DAG, Subtarget);
23613}
23614
23615/// ISD::FROUND is defined to round to nearest with ties rounding away from 0.
23616/// This mode isn't supported in hardware on X86. But as long as we aren't
23617/// compiling with trapping math, we can emulate this with
23618/// trunc(X + copysign(nextafter(0.5, 0.0), X)).
23619static SDValue LowerFROUND(SDValue Op, SelectionDAG &DAG) {
23620 SDValue N0 = Op.getOperand(0);
23621 SDLoc dl(Op);
23622 MVT VT = Op.getSimpleValueType();
23623
23624 // N0 += copysign(nextafter(0.5, 0.0), N0)
23625 const fltSemantics &Sem = SelectionDAG::EVTToAPFloatSemantics(VT);
23626 bool Ignored;
23627 APFloat Point5Pred = APFloat(0.5f);
23628 Point5Pred.convert(Sem, APFloat::rmNearestTiesToEven, &Ignored);
23629 Point5Pred.next(/*nextDown*/true);
23630
23631 SDValue Adder = DAG.getNode(ISD::FCOPYSIGN, dl, VT,
23632 DAG.getConstantFP(Point5Pred, dl, VT), N0);
23633 N0 = DAG.getNode(ISD::FADD, dl, VT, N0, Adder);
23634
23635 // Truncate the result to remove fraction.
23636 return DAG.getNode(ISD::FTRUNC, dl, VT, N0);
23637}
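To see why the predecessor of 0.5 is used rather than 0.5 itself, here is a small standalone sketch (plain C float math, not part of this file): adding exactly 0.5 to the largest float below 0.5 would round the sum up to 1.0 under ties-to-even, giving the wrong FROUND result.

#include <cmath>
#include <cstdio>

// Emulation used above: trunc(X + copysign(nextafter(0.5, 0.0), X)).
static float roundTiesAway(float X) {
  float Pred = std::nextafterf(0.5f, 0.0f); // largest float < 0.5
  return std::truncf(X + std::copysignf(Pred, X));
}

int main() {
  float X = std::nextafterf(0.5f, 0.0f); // 0.49999997f, should round to 0
  float Naive = std::truncf(X + std::copysignf(0.5f, X));
  std::printf("naive=%g emulated=%g roundf=%g\n", Naive, roundTiesAway(X),
              std::roundf(X)); // naive=1, the other two are 0
  return 0;
}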
23638
23639/// The only differences between FABS and FNEG are the mask and the logic op.
23640/// FNEG also has a folding opportunity for FNEG(FABS(x)).
23641static SDValue LowerFABSorFNEG(SDValue Op, SelectionDAG &DAG) {
23642 assert((Op.getOpcode() == ISD::FABS || Op.getOpcode() == ISD::FNEG) &&
23643 "Wrong opcode for lowering FABS or FNEG.");
23644
23645 bool IsFABS = (Op.getOpcode() == ISD::FABS);
23646
23647 // If this is a FABS and it has an FNEG user, bail out to fold the combination
23648 // into an FNABS. We'll lower the FABS after that if it is still in use.
23649 if (IsFABS)
23650 for (SDNode *User : Op->uses())
23651 if (User->getOpcode() == ISD::FNEG)
23652 return Op;
23653
23654 SDLoc dl(Op);
23655 MVT VT = Op.getSimpleValueType();
23656
23657 bool IsF128 = (VT == MVT::f128);
23658 assert(VT.isFloatingPoint() && VT != MVT::f80 &&
23659 DAG.getTargetLoweringInfo().isTypeLegal(VT) &&
23660 "Unexpected type in LowerFABSorFNEG");
23661
23662 // FIXME: Use function attribute "OptimizeForSize" and/or CodeGenOpt::Level to
23663 // decide if we should generate a 16-byte constant mask when we only need 4 or
23664 // 8 bytes for the scalar case.
23665
23666 // There are no scalar bitwise logical SSE/AVX instructions, so we
23667 // generate a 16-byte vector constant and logic op even for the scalar case.
23668 // Using a 16-byte mask allows folding the load of the mask with
23669 // the logic op, so it can save ~4 bytes of code size.
23670 bool IsFakeVector = !VT.isVector() && !IsF128;
23671 MVT LogicVT = VT;
23672 if (IsFakeVector)
23673 LogicVT = (VT == MVT::f64) ? MVT::v2f64
23674 : (VT == MVT::f32) ? MVT::v4f32
23675 : MVT::v8f16;
23676
23677 unsigned EltBits = VT.getScalarSizeInBits();
23678 // For FABS, mask is 0x7f...; for FNEG, mask is 0x80...
23679 APInt MaskElt = IsFABS ? APInt::getSignedMaxValue(EltBits) :
23680 APInt::getSignMask(EltBits);
23681 const fltSemantics &Sem = SelectionDAG::EVTToAPFloatSemantics(VT);
23682 SDValue Mask = DAG.getConstantFP(APFloat(Sem, MaskElt), dl, LogicVT);
23683
23684 SDValue Op0 = Op.getOperand(0);
23685 bool IsFNABS = !IsFABS && (Op0.getOpcode() == ISD::FABS);
23686 unsigned LogicOp = IsFABS ? X86ISD::FAND :
23687 IsFNABS ? X86ISD::FOR :
23688 X86ISD::FXOR;
23689 SDValue Operand = IsFNABS ? Op0.getOperand(0) : Op0;
23690
23691 if (VT.isVector() || IsF128)
23692 return DAG.getNode(LogicOp, dl, LogicVT, Operand, Mask);
23693
23694 // For the scalar case extend to a 128-bit vector, perform the logic op,
23695 // and extract the scalar result back out.
23696 Operand = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LogicVT, Operand);
23697 SDValue LogicNode = DAG.getNode(LogicOp, dl, LogicVT, Operand, Mask);
23698 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, LogicNode,
23699 DAG.getIntPtrConstant(0, dl));
23700}
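A standalone scalar sketch (not part of this file) of the masks this lowering materializes: FABS is an AND that clears the sign bit, FNEG is an XOR that flips it.

#include <cstdint>
#include <cstdio>
#include <cstring>

static float fabsBitwise(float X) {
  uint32_t Bits;
  std::memcpy(&Bits, &X, sizeof(Bits));
  Bits &= 0x7fffffffu; // APInt::getSignedMaxValue(32), the FABS mask
  std::memcpy(&X, &Bits, sizeof(X));
  return X;
}

static float fnegBitwise(float X) {
  uint32_t Bits;
  std::memcpy(&Bits, &X, sizeof(Bits));
  Bits ^= 0x80000000u; // APInt::getSignMask(32), the FNEG mask
  std::memcpy(&X, &Bits, sizeof(X));
  return X;
}

int main() {
  std::printf("%g %g\n", fabsBitwise(-2.5f), fnegBitwise(-2.5f)); // 2.5 2.5
  return 0;
}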
23701
23702static SDValue LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) {
23703 SDValue Mag = Op.getOperand(0);
23704 SDValue Sign = Op.getOperand(1);
23705 SDLoc dl(Op);
23706
23707 // If the sign operand is smaller, extend it first.
23708 MVT VT = Op.getSimpleValueType();
23709 if (Sign.getSimpleValueType().bitsLT(VT))
23710 Sign = DAG.getNode(ISD::FP_EXTEND, dl, VT, Sign);
23711
23712 // And if it is bigger, shrink it first.
23713 if (Sign.getSimpleValueType().bitsGT(VT))
23714 Sign = DAG.getNode(ISD::FP_ROUND, dl, VT, Sign,
23715 DAG.getIntPtrConstant(0, dl, /*isTarget=*/true));
23716
23717 // At this point the operands and the result should have the same
23718 // type, and that won't be f80 since that is not custom lowered.
23719 bool IsF128 = (VT == MVT::f128);
23720 assert(VT.isFloatingPoint() && VT != MVT::f80 &&
23721 DAG.getTargetLoweringInfo().isTypeLegal(VT) &&
23722 "Unexpected type in LowerFCOPYSIGN");
23723
23724 const fltSemantics &Sem = SelectionDAG::EVTToAPFloatSemantics(VT);
23725
23726 // Perform all scalar logic operations as 16-byte vectors because there are no
23727 // scalar FP logic instructions in SSE.
23728 // TODO: This isn't necessary. If we used scalar types, we might avoid some
23729 // unnecessary splats, but we might miss load folding opportunities. Should
23730 // this decision be based on OptimizeForSize?
23731 bool IsFakeVector = !VT.isVector() && !IsF128;
23732 MVT LogicVT = VT;
23733 if (IsFakeVector)
23734 LogicVT = (VT == MVT::f64) ? MVT::v2f64
23735 : (VT == MVT::f32) ? MVT::v4f32
23736 : MVT::v8f16;
23737
23738 // The mask constants are automatically splatted for vector types.
23739 unsigned EltSizeInBits = VT.getScalarSizeInBits();
23740 SDValue SignMask = DAG.getConstantFP(
23741 APFloat(Sem, APInt::getSignMask(EltSizeInBits)), dl, LogicVT);
23742 SDValue MagMask = DAG.getConstantFP(
23743 APFloat(Sem, APInt::getSignedMaxValue(EltSizeInBits)), dl, LogicVT);
23744
23745 // First, clear all bits but the sign bit from the second operand (sign).
23746 if (IsFakeVector)
23747 Sign = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LogicVT, Sign);
23748 SDValue SignBit = DAG.getNode(X86ISD::FAND, dl, LogicVT, Sign, SignMask);
23749
23750 // Next, clear the sign bit from the first operand (magnitude).
23751 // TODO: If we had general constant folding for FP logic ops, this check
23752 // wouldn't be necessary.
23753 SDValue MagBits;
23754 if (ConstantFPSDNode *Op0CN = isConstOrConstSplatFP(Mag)) {
23755 APFloat APF = Op0CN->getValueAPF();
23756 APF.clearSign();
23757 MagBits = DAG.getConstantFP(APF, dl, LogicVT);
23758 } else {
23759 // If the magnitude operand wasn't a constant, we need to AND out the sign.
23760 if (IsFakeVector)
23761 Mag = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LogicVT, Mag);
23762 MagBits = DAG.getNode(X86ISD::FAND, dl, LogicVT, Mag, MagMask);
23763 }
23764
23765 // OR the magnitude value with the sign bit.
23766 SDValue Or = DAG.getNode(X86ISD::FOR, dl, LogicVT, MagBits, SignBit);
23767 return !IsFakeVector ? Or : DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Or,
23768 DAG.getIntPtrConstant(0, dl));
23769}
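Likewise, a standalone sketch (not part of this file) of the FAND/FAND/FOR sequence emitted above: keep the magnitude bits of the first operand, isolate the sign bit of the second, then OR them together.

#include <cstdint>
#include <cstdio>
#include <cstring>

static float copysignBitwise(float Mag, float Sign) {
  uint32_t M, S;
  std::memcpy(&M, &Mag, sizeof(M));
  std::memcpy(&S, &Sign, sizeof(S));
  uint32_t R = (M & 0x7fffffffu)  // clear the sign bit of the magnitude
             | (S & 0x80000000u); // keep only the sign bit of the sign operand
  float Res;
  std::memcpy(&Res, &R, sizeof(Res));
  return Res;
}

int main() {
  std::printf("%g %g\n", copysignBitwise(3.0f, -0.0f),
              copysignBitwise(-3.0f, 1.0f)); // -3 3
  return 0;
}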
23770
23771static SDValue LowerFGETSIGN(SDValue Op, SelectionDAG &DAG) {
23772 SDValue N0 = Op.getOperand(0);
23773 SDLoc dl(Op);
23774 MVT VT = Op.getSimpleValueType();
23775
23776 MVT OpVT = N0.getSimpleValueType();
23777 assert((OpVT == MVT::f32 || OpVT == MVT::f64) &&
23778 "Unexpected type for FGETSIGN");
23779
23780 // Lower ISD::FGETSIGN to (AND (X86ISD::MOVMSK ...) 1).
23781 MVT VecVT = (OpVT == MVT::f32 ? MVT::v4f32 : MVT::v2f64);
23782 SDValue Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, N0);
23783 Res = DAG.getNode(X86ISD::MOVMSK, dl, MVT::i32, Res);
23784 Res = DAG.getZExtOrTrunc(Res, dl, VT);
23785 Res = DAG.getNode(ISD::AND, dl, VT, Res, DAG.getConstant(1, dl, VT));
23786 return Res;
23787}
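A standalone model (not part of this file) of the MOVMSK-based lowering: MOVMSKPS packs the per-lane sign bits into an integer, so ANDing with 1 extracts the sign of element 0, which is exactly what FGETSIGN asks for.

#include <cstdint>
#include <cstdio>
#include <cstring>

// Scalar model of MOVMSKPS on a v4f32 value.
static unsigned movmskPS(const float (&V)[4]) {
  unsigned Mask = 0;
  for (int I = 0; I != 4; ++I) {
    uint32_t Bits;
    std::memcpy(&Bits, &V[I], sizeof(Bits));
    Mask |= (Bits >> 31) << I;
  }
  return Mask;
}

int main() {
  float V[4] = {-1.0f, 2.0f, -0.0f, 4.0f};
  std::printf("mask=%u fgetsign=%u\n", movmskPS(V), movmskPS(V) & 1u); // 5 1
  return 0;
}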
23788
23789/// Helper for attempting to create a X86ISD::BT node.
23790static SDValue getBT(SDValue Src, SDValue BitNo, const SDLoc &DL, SelectionDAG &DAG) {
23791 // If Src is i8, promote it to i32 with any_extend. There is no i8 BT
23792 // instruction. Since the shift amount is in-range-or-undefined, we know
23793 // that doing a bittest on the i32 value is ok. We extend to i32 because
23794 // the encoding for the i16 version is larger than the i32 version.
23795 // Also promote i16 to i32 for performance / code size reason.
23796 if (Src.getValueType().getScalarSizeInBits() < 32)
23797 Src = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Src);
23798
23799 // No legal type found, give up.
23800 if (!DAG.getTargetLoweringInfo().isTypeLegal(Src.getValueType()))
23801 return SDValue();
23802
23803 // See if we can use the 32-bit instruction instead of the 64-bit one for a
23804 // shorter encoding. Since the former takes the modulo 32 of BitNo and the
23805 // latter takes the modulo 64, this is only valid if the 5th bit of BitNo is
23806 // known to be zero.
23807 if (Src.getValueType() == MVT::i64 &&
23808 DAG.MaskedValueIsZero(BitNo, APInt(BitNo.getValueSizeInBits(), 32)))
23809 Src = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, Src);
23810
23811 // If the operand types disagree, extend the shift amount to match. Since
23812 // BT ignores high bits (like shifts) we can use anyextend.
23813 if (Src.getValueType() != BitNo.getValueType()) {
23814 // Peek through a mask/modulo operation.
23815 // TODO: DAGCombine fails to do this as it just checks isTruncateFree, but
23816 // we probably need a better IsDesirableToPromoteOp to handle this as well.
23817 if (BitNo.getOpcode() == ISD::AND && BitNo->hasOneUse())
23818 BitNo = DAG.getNode(ISD::AND, DL, Src.getValueType(),
23819 DAG.getNode(ISD::ANY_EXTEND, DL, Src.getValueType(),
23820 BitNo.getOperand(0)),
23821 DAG.getNode(ISD::ANY_EXTEND, DL, Src.getValueType(),
23822 BitNo.getOperand(1)));
23823 else
23824 BitNo = DAG.getNode(ISD::ANY_EXTEND, DL, Src.getValueType(), BitNo);
23825 }
23826
23827 return DAG.getNode(X86ISD::BT, DL, MVT::i32, Src, BitNo);
23828}
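A standalone sketch (not part of this file) of the BT semantics this helper relies on: the register form takes the bit index modulo the operand width, so the high bits of BitNo are ignored and any_extend of the index is safe.

#include <cstdint>
#include <cstdio>

// What "bt r32, r32" computes into CF.
static bool bt32(uint32_t Src, uint32_t BitNo) {
  return (Src >> (BitNo % 32u)) & 1u;
}

int main() {
  // Index 33 behaves like index 1 because only BitNo mod 32 matters.
  std::printf("%d %d\n", bt32(0x0au, 1), bt32(0x0au, 33)); // 1 1
  return 0;
}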
23829
23830/// Helper for creating a X86ISD::SETCC node.
23831static SDValue getSETCC(X86::CondCode Cond, SDValue EFLAGS, const SDLoc &dl,
23832 SelectionDAG &DAG) {
23833 return DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
23834 DAG.getTargetConstant(Cond, dl, MVT::i8), EFLAGS);
23835}
23836
23837/// Helper for matching OR(EXTRACTELT(X,0),OR(EXTRACTELT(X,1),...))
23838/// style scalarized (associative) reduction patterns. Partial reductions
23839/// are supported when the pointer SrcMask is non-null.
23840/// TODO - move this to SelectionDAG?
23841static bool matchScalarReduction(SDValue Op, ISD::NodeType BinOp,
23842 SmallVectorImpl<SDValue> &SrcOps,
23843 SmallVectorImpl<APInt> *SrcMask = nullptr) {
23844 SmallVector<SDValue, 8> Opnds;
23845 DenseMap<SDValue, APInt> SrcOpMap;
23846 EVT VT = MVT::Other;
23847
23848 // Recognize a special case where a vector is cast into a wide integer to
23849 // test all 0s.
23850 assert(Op.getOpcode() == unsigned(BinOp) &&
23851 "Unexpected bit reduction opcode");
23852 Opnds.push_back(Op.getOperand(0));
23853 Opnds.push_back(Op.getOperand(1));
23854
23855 for (unsigned Slot = 0, e = Opnds.size(); Slot < e; ++Slot) {
23856 SmallVectorImpl<SDValue>::const_iterator I = Opnds.begin() + Slot;
23857 // BFS traverse all BinOp operands.
23858 if (I->getOpcode() == unsigned(BinOp)) {
23859 Opnds.push_back(I->getOperand(0));
23860 Opnds.push_back(I->getOperand(1));
23861 // Re-evaluate the number of nodes to be traversed.
23862 e += 2; // 2 more nodes (LHS and RHS) are pushed.
23863 continue;
23864 }
23865
23866 // Quit if this is not an EXTRACT_VECTOR_ELT.
23867 if (I->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
23868 return false;
23869
23870 // Quit if there is no constant index.
23871 auto *Idx = dyn_cast<ConstantSDNode>(I->getOperand(1));
23872 if (!Idx)
23873 return false;
23874
23875 SDValue Src = I->getOperand(0);
23876 DenseMap<SDValue, APInt>::iterator M = SrcOpMap.find(Src);
23877 if (M == SrcOpMap.end()) {
23878 VT = Src.getValueType();
23879 // Quit if not the same type.
23880 if (!SrcOpMap.empty() && VT != SrcOpMap.begin()->first.getValueType())
23881 return false;
23882 unsigned NumElts = VT.getVectorNumElements();
23883 APInt EltCount = APInt::getZero(NumElts);
23884 M = SrcOpMap.insert(std::make_pair(Src, EltCount)).first;
23885 SrcOps.push_back(Src);
23886 }
23887
23888 // Quit if element already used.
23889 unsigned CIdx = Idx->getZExtValue();
23890 if (M->second[CIdx])
23891 return false;
23892 M->second.setBit(CIdx);
23893 }
23894
23895 if (SrcMask) {
23896 // Collect the source partial masks.
23897 for (SDValue &SrcOp : SrcOps)
23898 SrcMask->push_back(SrcOpMap[SrcOp]);
23899 } else {
23900 // Quit if not all elements are used.
23901 for (const auto &I : SrcOpMap)
23902 if (!I.second.isAllOnes())
23903 return false;
23904 }
23905
23906 return true;
23907}
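What the matched tree computes, as a standalone sketch (not part of this file): once every lane of one source vector has been OR'd in, the scalarized tree is just an "is any bit of the vector set" test, which the callers below turn into PTEST or MOVMSK+PCMPEQ.

#include <cstdint>
#include <cstdio>

// Scalarized OR-reduction over the elements of a 4 x i32 "vector".
static uint32_t orReduce(const uint32_t (&V)[4]) {
  return V[0] | V[1] | V[2] | V[3];
}

int main() {
  uint32_t AllZero[4] = {0, 0, 0, 0};
  uint32_t OneSet[4] = {0, 8, 0, 0};
  std::printf("%d %d\n", orReduce(AllZero) == 0, orReduce(OneSet) == 0); // 1 0
  return 0;
}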
23908
23909// Helper function for comparing all bits of a vector against zero.
23910static SDValue LowerVectorAllZero(const SDLoc &DL, SDValue V, ISD::CondCode CC,
23911 const APInt &Mask,
23912 const X86Subtarget &Subtarget,
23913 SelectionDAG &DAG, X86::CondCode &X86CC) {
23914 EVT VT = V.getValueType();
23915 unsigned ScalarSize = VT.getScalarSizeInBits();
23916 if (Mask.getBitWidth() != ScalarSize) {
23917 assert(ScalarSize == 1 && "Element Mask vs Vector bitwidth mismatch");
23918 return SDValue();
23919 }
23920
23921 assert((CC == ISD::SETEQ || CC == ISD::SETNE) && "Unsupported ISD::CondCode");
23922 X86CC = (CC == ISD::SETEQ ? X86::COND_E : X86::COND_NE);
23923
23924 auto MaskBits = [&](SDValue Src) {
23925 if (Mask.isAllOnes())
23926 return Src;
23927 EVT SrcVT = Src.getValueType();
23928 SDValue MaskValue = DAG.getConstant(Mask, DL, SrcVT);
23929 return DAG.getNode(ISD::AND, DL, SrcVT, Src, MaskValue);
23930 };
23931
23932 // For sub-128-bit vector, cast to (legal) integer and compare with zero.
23933 if (VT.getSizeInBits() < 128) {
23934 EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), VT.getSizeInBits());
23935 if (!DAG.getTargetLoweringInfo().isTypeLegal(IntVT))
23936 return SDValue();
23937 return DAG.getNode(X86ISD::CMP, DL, MVT::i32,
23938 DAG.getBitcast(IntVT, MaskBits(V)),
23939 DAG.getConstant(0, DL, IntVT));
23940 }
23941
23942 // Quit if not splittable to 128/256-bit vector.
23943 if (!isPowerOf2_32(VT.getSizeInBits()))
23944 return SDValue();
23945
23946 // Split down to 128/256-bit vector.
23947 unsigned TestSize = Subtarget.hasAVX() ? 256 : 128;
23948 while (VT.getSizeInBits() > TestSize) {
23949 auto Split = DAG.SplitVector(V, DL);
23950 VT = Split.first.getValueType();
23951 V = DAG.getNode(ISD::OR, DL, VT, Split.first, Split.second);
23952 }
23953
23954 bool UsePTEST = Subtarget.hasSSE41();
23955 if (UsePTEST) {
23956 MVT TestVT = VT.is128BitVector() ? MVT::v2i64 : MVT::v4i64;
23957 V = DAG.getBitcast(TestVT, MaskBits(V));
23958 return DAG.getNode(X86ISD::PTEST, DL, MVT::i32, V, V);
23959 }
23960
23961 // Without PTEST, a masked v2i64 or-reduction is not faster than
23962 // scalarization.
23963 if (!Mask.isAllOnes() && VT.getScalarSizeInBits() > 32)
23964 return SDValue();
23965
23966 V = DAG.getBitcast(MVT::v16i8, MaskBits(V));
23967 V = DAG.getNode(X86ISD::PCMPEQ, DL, MVT::v16i8, V,
23968 getZeroVector(MVT::v16i8, Subtarget, DAG, DL));
23969 V = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, V);
23970 return DAG.getNode(X86ISD::CMP, DL, MVT::i32, V,
23971 DAG.getConstant(0xFFFF, DL, MVT::i32));
23972}
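A standalone model (not part of this file) of the pre-SSE4.1 fallback emitted above: PCMPEQB against zero produces 0xFF for each zero byte, MOVMSK collects the byte sign bits, and the vector is all-zero exactly when that mask equals 0xFFFF.

#include <cstdint>
#include <cstdio>

static bool allZeroV16i8(const uint8_t (&V)[16]) {
  unsigned Mask = 0;
  for (int I = 0; I != 16; ++I)
    Mask |= (V[I] == 0 ? 1u : 0u) << I; // PCMPEQB with zero, then MOVMSK
  return Mask == 0xFFFFu;               // CMP against 0xFFFF
}

int main() {
  uint8_t Zero[16] = {};
  uint8_t NonZero[16] = {};
  NonZero[7] = 1;
  std::printf("%d %d\n", allZeroV16i8(Zero), allZeroV16i8(NonZero)); // 1 0
  return 0;
}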
23973
23974// Check whether an OR'd reduction tree is PTEST-able, or if we can fall back to
23975// CMP(MOVMSK(PCMPEQB(X,0))).
23976static SDValue MatchVectorAllZeroTest(SDValue Op, ISD::CondCode CC,
23977 const SDLoc &DL,
23978 const X86Subtarget &Subtarget,
23979 SelectionDAG &DAG, SDValue &X86CC) {
23980 assert((CC == ISD::SETEQ || CC == ISD::SETNE) && "Unsupported ISD::CondCode");
23981
23982 if (!Subtarget.hasSSE2() || !Op->hasOneUse())
23983 return SDValue();
23984
23985 // Check whether we're masking/truncating an OR-reduction result, in which
23986 // case track the masked bits.
23987 APInt Mask = APInt::getAllOnes(Op.getScalarValueSizeInBits());
23988 switch (Op.getOpcode()) {
23989 case ISD::TRUNCATE: {
23990 SDValue Src = Op.getOperand(0);
23991 Mask = APInt::getLowBitsSet(Src.getScalarValueSizeInBits(),
23992 Op.getScalarValueSizeInBits());
23993 Op = Src;
23994 break;
23995 }
23996 case ISD::AND: {
23997 if (auto *Cst = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
23998 Mask = Cst->getAPIntValue();
23999 Op = Op.getOperand(0);
24000 }
24001 break;
24002 }
24003 }
24004
24005 SmallVector<SDValue, 8> VecIns;
24006 if (Op.getOpcode() == ISD::OR && matchScalarReduction(Op, ISD::OR, VecIns)) {
24007 EVT VT = VecIns[0].getValueType();
24008 assert(llvm::all_of(VecIns,
24009 [VT](SDValue V) { return VT == V.getValueType(); }) &&
24010 "Reduction source vector mismatch");
24011
24012 // Quit if less than 128-bits or not splittable to 128/256-bit vector.
24013 if (VT.getSizeInBits() < 128 || !isPowerOf2_32(VT.getSizeInBits()))
24014 return SDValue();
24015
24016 // If more than one full vector is evaluated, OR them first before PTEST.
24017 for (unsigned Slot = 0, e = VecIns.size(); e - Slot > 1;
24018 Slot += 2, e += 1) {
24019 // Each iteration will OR 2 nodes and append the result until there is
24020 // only 1 node left, i.e. the final OR'd value of all vectors.
24021 SDValue LHS = VecIns[Slot];
24022 SDValue RHS = VecIns[Slot + 1];
24023 VecIns.push_back(DAG.getNode(ISD::OR, DL, VT, LHS, RHS));
24024 }
24025
24026 X86::CondCode CCode;
24027 if (SDValue V = LowerVectorAllZero(DL, VecIns.back(), CC, Mask, Subtarget,
24028 DAG, CCode)) {
24029 X86CC = DAG.getTargetConstant(CCode, DL, MVT::i8);
24030 return V;
24031 }
24032 }
24033
24034 if (Op.getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
24035 ISD::NodeType BinOp;
24036 if (SDValue Match =
24037 DAG.matchBinOpReduction(Op.getNode(), BinOp, {ISD::OR})) {
24038 X86::CondCode CCode;
24039 if (SDValue V =
24040 LowerVectorAllZero(DL, Match, CC, Mask, Subtarget, DAG, CCode)) {
24041 X86CC = DAG.getTargetConstant(CCode, DL, MVT::i8);
24042 return V;
24043 }
24044 }
24045 }
24046
24047 return SDValue();
24048}
24049
24050/// Return true if \c Op has a use that doesn't just read flags.
24051static bool hasNonFlagsUse(SDValue Op) {
24052 for (SDNode::use_iterator UI = Op->use_begin(), UE = Op->use_end(); UI != UE;
24053 ++UI) {
24054 SDNode *User = *UI;
24055 unsigned UOpNo = UI.getOperandNo();
24056 if (User->getOpcode() == ISD::TRUNCATE && User->hasOneUse()) {
24057 // Look past the truncate.
24058 UOpNo = User->use_begin().getOperandNo();
24059 User = *User->use_begin();
24060 }
24061
24062 if (User->getOpcode() != ISD::BRCOND && User->getOpcode() != ISD::SETCC &&
24063 !(User->getOpcode() == ISD::SELECT && UOpNo == 0))
24064 return true;
24065 }
24066 return false;
24067}
24068
24069// Transform to an x86-specific ALU node with flags if there is a chance of
24070// using an RMW op or only the flags are used. Otherwise, leave
24071// the node alone and emit a 'cmp' or 'test' instruction.
24072static bool isProfitableToUseFlagOp(SDValue Op) {
24073 for (SDNode *U : Op->uses())
24074 if (U->getOpcode() != ISD::CopyToReg &&
24075 U->getOpcode() != ISD::SETCC &&
24076 U->getOpcode() != ISD::STORE)
24077 return false;
24078
24079 return true;
24080}
24081
24082/// Emit nodes that will be selected as "test Op0,Op0", or something
24083/// equivalent.
24084static SDValue EmitTest(SDValue Op, unsigned X86CC, const SDLoc &dl,
24085 SelectionDAG &DAG, const X86Subtarget &Subtarget) {
24086 // CF and OF aren't always set the way we want. Determine which
24087 // of these we need.
24088 bool NeedCF = false;
24089 bool NeedOF = false;
24090 switch (X86CC) {
24091 default: break;
24092 case X86::COND_A: case X86::COND_AE:
24093 case X86::COND_B: case X86::COND_BE:
24094 NeedCF = true;
24095 break;
24096 case X86::COND_G: case X86::COND_GE:
24097 case X86::COND_L: case X86::COND_LE:
24098 case X86::COND_O: case X86::COND_NO: {
24099 // Check if we really need to set the Overflow flag.
24100 // If NoSignedWrap is present,
24101 // that is not actually needed.
24102 switch (Op->getOpcode()) {
24103 case ISD::ADD:
24104 case ISD::SUB:
24105 case ISD::MUL:
24106 case ISD::SHL:
24107 if (Op.getNode()->getFlags().hasNoSignedWrap())
24108 break;
24109 [[fallthrough]];
24110 default:
24111 NeedOF = true;
24112 break;
24113 }
24114 break;
24115 }
24116 }
24117 // See if we can use the EFLAGS value from the operand instead of
24118 // doing a separate TEST. TEST always sets OF and CF to 0, so unless
24119 // we prove that the arithmetic won't overflow, we can't use OF or CF.
24120 if (Op.getResNo() != 0 || NeedOF || NeedCF) {
24121 // Emit a CMP with 0, which is the TEST pattern.
24122 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
24123 DAG.getConstant(0, dl, Op.getValueType()));
24124 }
24125 unsigned Opcode = 0;
24126 unsigned NumOperands = 0;
24127
24128 SDValue ArithOp = Op;
24129
24130 // NOTICE: In the code below we use ArithOp to hold the arithmetic operation
24131 // which may be the result of a CAST. We use the variable 'Op', which is the
24132 // non-casted variable when we check for possible users.
24133 switch (ArithOp.getOpcode()) {
24134 case ISD::AND:
24135 // If the primary 'and' result isn't used, don't bother using X86ISD::AND,
24136 // because a TEST instruction will be better.
24137 if (!hasNonFlagsUse(Op))
24138 break;
24139
24140 [[fallthrough]];
24141 case ISD::ADD:
24142 case ISD::SUB:
24143 case ISD::OR:
24144 case ISD::XOR:
24145 if (!isProfitableToUseFlagOp(Op))
24146 break;
24147
24148 // Otherwise use a regular EFLAGS-setting instruction.
24149 switch (ArithOp.getOpcode()) {
24150 default: llvm_unreachable("unexpected operator!");
24151 case ISD::ADD: Opcode = X86ISD::ADD; break;
24152 case ISD::SUB: Opcode = X86ISD::SUB; break;
24153 case ISD::XOR: Opcode = X86ISD::XOR; break;
24154 case ISD::AND: Opcode = X86ISD::AND; break;
24155 case ISD::OR: Opcode = X86ISD::OR; break;
24156 }
24157
24158 NumOperands = 2;
24159 break;
24160 case X86ISD::ADD:
24161 case X86ISD::SUB:
24162 case X86ISD::OR:
24163 case X86ISD::XOR:
24164 case X86ISD::AND:
24165 return SDValue(Op.getNode(), 1);
24166 case ISD::SSUBO:
24167 case ISD::USUBO: {
24168 // USUBO/SSUBO will become an X86ISD::SUB and we can use its Z flag.
24169 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
24170 return DAG.getNode(X86ISD::SUB, dl, VTs, Op->getOperand(0),
24171 Op->getOperand(1)).getValue(1);
24172 }
24173 default:
24174 break;
24175 }
24176
24177 if (Opcode == 0) {
24178 // Emit a CMP with 0, which is the TEST pattern.
24179 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
24180 DAG.getConstant(0, dl, Op.getValueType()));
24181 }
24182 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
24183 SmallVector<SDValue, 4> Ops(Op->op_begin(), Op->op_begin() + NumOperands);
24184
24185 SDValue New = DAG.getNode(Opcode, dl, VTs, Ops);
24186 DAG.ReplaceAllUsesOfValueWith(SDValue(Op.getNode(), 0), New);
24187 return SDValue(New.getNode(), 1);
24188}
24189
24190/// Emit nodes that will be selected as "cmp Op0,Op1", or something
24191/// equivalent.
24192static SDValue EmitCmp(SDValue Op0, SDValue Op1, unsigned X86CC,
24193 const SDLoc &dl, SelectionDAG &DAG,
24194 const X86Subtarget &Subtarget) {
24195 if (isNullConstant(Op1))
24196 return EmitTest(Op0, X86CC, dl, DAG, Subtarget);
24197
24198 EVT CmpVT = Op0.getValueType();
24199
24200 assert((CmpVT == MVT::i8 || CmpVT == MVT::i16 ||
24201 CmpVT == MVT::i32 || CmpVT == MVT::i64) && "Unexpected VT!");
24202
24203 // Only promote the compare up to I32 if it is a 16 bit operation
24204 // with an immediate. 16 bit immediates are to be avoided.
24205 if (CmpVT == MVT::i16 && !Subtarget.isAtom() &&
24206 !DAG.getMachineFunction().getFunction().hasMinSize()) {
24207 ConstantSDNode *COp0 = dyn_cast<ConstantSDNode>(Op0);
24208 ConstantSDNode *COp1 = dyn_cast<ConstantSDNode>(Op1);
24209 // Don't do this if the immediate can fit in 8-bits.
24210 if ((COp0 && !COp0->getAPIntValue().isSignedIntN(8)) ||
24211 (COp1 && !COp1->getAPIntValue().isSignedIntN(8))) {
24212 unsigned ExtendOp =
24213 isX86CCSigned(X86CC) ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
24214 if (X86CC == X86::COND_E || X86CC == X86::COND_NE) {
24215 // For equality comparisons try to use SIGN_EXTEND if the input was
24216 // truncate from something with enough sign bits.
24217 if (Op0.getOpcode() == ISD::TRUNCATE) {
24218 if (DAG.ComputeMaxSignificantBits(Op0.getOperand(0)) <= 16)
24219 ExtendOp = ISD::SIGN_EXTEND;
24220 } else if (Op1.getOpcode() == ISD::TRUNCATE) {
24221 if (DAG.ComputeMaxSignificantBits(Op1.getOperand(0)) <= 16)
24222 ExtendOp = ISD::SIGN_EXTEND;
24223 }
24224 }
24225
24226 CmpVT = MVT::i32;
24227 Op0 = DAG.getNode(ExtendOp, dl, CmpVT, Op0);
24228 Op1 = DAG.getNode(ExtendOp, dl, CmpVT, Op1);
24229 }
24230 }
24231
24232 // Try to shrink i64 compares if the input has enough zero bits.
24233 // FIXME: Do this for non-constant compares for constant on LHS?
24234 if (CmpVT == MVT::i64 && isa<ConstantSDNode>(Op1) && !isX86CCSigned(X86CC) &&
24235 Op0.hasOneUse() && // Hacky way to not break CSE opportunities with sub.
24236 cast<ConstantSDNode>(Op1)->getAPIntValue().getActiveBits() <= 32 &&
24237 DAG.MaskedValueIsZero(Op0, APInt::getHighBitsSet(64, 32))) {
24238 CmpVT = MVT::i32;
24239 Op0 = DAG.getNode(ISD::TRUNCATE, dl, CmpVT, Op0);
24240 Op1 = DAG.getNode(ISD::TRUNCATE, dl, CmpVT, Op1);
24241 }
24242
24243 // 0-x == y --> x+y == 0
24244 // 0-x != y --> x+y != 0
24245 if (Op0.getOpcode() == ISD::SUB && isNullConstant(Op0.getOperand(0)) &&
24246 Op0.hasOneUse() && (X86CC == X86::COND_E || X86CC == X86::COND_NE)) {
24247 SDVTList VTs = DAG.getVTList(CmpVT, MVT::i32);
24248 SDValue Add = DAG.getNode(X86ISD::ADD, dl, VTs, Op0.getOperand(1), Op1);
24249 return Add.getValue(1);
24250 }
24251
24252 // x == 0-y --> x+y == 0
24253 // x != 0-y --> x+y != 0
24254 if (Op1.getOpcode() == ISD::SUB && isNullConstant(Op1.getOperand(0)) &&
24255 Op1.hasOneUse() && (X86CC == X86::COND_E || X86CC == X86::COND_NE)) {
24256 SDVTList VTs = DAG.getVTList(CmpVT, MVT::i32);
24257 SDValue Add = DAG.getNode(X86ISD::ADD, dl, VTs, Op0, Op1.getOperand(1));
24258 return Add.getValue(1);
24259 }
24260
24261 // Use SUB instead of CMP to enable CSE between SUB and CMP.
24262 SDVTList VTs = DAG.getVTList(CmpVT, MVT::i32);
24263 SDValue Sub = DAG.getNode(X86ISD::SUB, dl, VTs, Op0, Op1);
24264 return Sub.getValue(1);
24265}
24266
24267/// Check if replacement of SQRT with RSQRT should be disabled.
24268bool X86TargetLowering::isFsqrtCheap(SDValue Op, SelectionDAG &DAG) const {
24269 EVT VT = Op.getValueType();
24270
24271 // We don't need to replace SQRT with RSQRT for half type.
24272 if (VT.getScalarType() == MVT::f16)
24273 return true;
24274
24275 // We never want to use both SQRT and RSQRT instructions for the same input.
24276 if (DAG.doesNodeExist(X86ISD::FRSQRT, DAG.getVTList(VT), Op))
24277 return false;
24278
24279 if (VT.isVector())
24280 return Subtarget.hasFastVectorFSQRT();
24281 return Subtarget.hasFastScalarFSQRT();
24282}
24283
24284/// The minimum architected relative accuracy is 2^-12. We need one
24285/// Newton-Raphson step to have a good float result (24 bits of precision).
24286SDValue X86TargetLowering::getSqrtEstimate(SDValue Op,
24287 SelectionDAG &DAG, int Enabled,
24288 int &RefinementSteps,
24289 bool &UseOneConstNR,
24290 bool Reciprocal) const {
24291 SDLoc DL(Op);
24292 EVT VT = Op.getValueType();
24293
24294 // SSE1 has rsqrtss and rsqrtps. AVX adds a 256-bit variant for rsqrtps.
24295 // It is likely not profitable to do this for f64 because a double-precision
24296 // rsqrt estimate with refinement on x86 prior to FMA requires at least 16
24297 // instructions: convert to single, rsqrtss, convert back to double, refine
24298 // (3 steps = at least 13 insts). If an 'rsqrtsd' variant was added to the ISA
24299 // along with FMA, this could be a throughput win.
24300 // TODO: SQRT requires SSE2 to prevent the introduction of an illegal v4i32
24301 // after legalize types.
24302 if ((VT == MVT::f32 && Subtarget.hasSSE1()) ||
24303 (VT == MVT::v4f32 && Subtarget.hasSSE1() && Reciprocal) ||
24304 (VT == MVT::v4f32 && Subtarget.hasSSE2() && !Reciprocal) ||
24305 (VT == MVT::v8f32 && Subtarget.hasAVX()) ||
24306 (VT == MVT::v16f32 && Subtarget.useAVX512Regs())) {
24307 if (RefinementSteps == ReciprocalEstimate::Unspecified)
24308 RefinementSteps = 1;
24309
24310 UseOneConstNR = false;
24311 // There is no 512-bit FRSQRT, but there is RSQRT14.
24312 unsigned Opcode = VT == MVT::v16f32 ? X86ISD::RSQRT14 : X86ISD::FRSQRT;
24313 SDValue Estimate = DAG.getNode(Opcode, DL, VT, Op);
24314 if (RefinementSteps == 0 && !Reciprocal)
24315 Estimate = DAG.getNode(ISD::FMUL, DL, VT, Op, Estimate);
24316 return Estimate;
24317 }
24318
24319 if (VT.getScalarType() == MVT::f16 && isTypeLegal(VT) &&
24320 Subtarget.hasFP16()) {
24321 assert(Reciprocal && "Don't replace SQRT with RSQRT for half type");
24322 if (RefinementSteps == ReciprocalEstimate::Unspecified)
24323 RefinementSteps = 0;
24324
24325 if (VT == MVT::f16) {
24326 SDValue Zero = DAG.getIntPtrConstant(0, DL);
24327 SDValue Undef = DAG.getUNDEF(MVT::v8f16);
24328 Op = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v8f16, Op);
24329 Op = DAG.getNode(X86ISD::RSQRT14S, DL, MVT::v8f16, Undef, Op);
24330 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f16, Op, Zero);
24331 }
24332
24333 return DAG.getNode(X86ISD::RSQRT14, DL, VT, Op);
24334 }
24335 return SDValue();
24336}
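A standalone sketch (not part of this file) of the refinement the caller applies to the returned estimate: one Newton-Raphson step for 1/sqrt(a) is x1 = x0 * (1.5 - 0.5 * a * x0 * x0), which takes the ~2^-12-accurate hardware estimate to roughly full float precision, hence the RefinementSteps = 1 default.

#include <cmath>
#include <cstdio>

static float refineRsqrt(float A, float Est) {
  return Est * (1.5f - 0.5f * A * Est * Est); // one Newton-Raphson step
}

int main() {
  float A = 2.0f;
  float Est = 0.707f; // stand-in for the RSQRTSS estimate (illustrative value)
  std::printf("%.8f vs %.8f\n", refineRsqrt(A, Est), 1.0f / std::sqrt(A));
  return 0;
}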
24337
24338/// The minimum architected relative accuracy is 2^-12. We need one
24339/// Newton-Raphson step to have a good float result (24 bits of precision).
24340SDValue X86TargetLowering::getRecipEstimate(SDValue Op, SelectionDAG &DAG,
24341 int Enabled,
24342 int &RefinementSteps) const {
24343 SDLoc DL(Op);
24344 EVT VT = Op.getValueType();
24345
24346 // SSE1 has rcpss and rcpps. AVX adds a 256-bit variant for rcpps.
24347 // It is likely not profitable to do this for f64 because a double-precision
24348 // reciprocal estimate with refinement on x86 prior to FMA requires
24349 // 15 instructions: convert to single, rcpss, convert back to double, refine
24350 // (3 steps = 12 insts). If an 'rcpsd' variant was added to the ISA
24351 // along with FMA, this could be a throughput win.
24352
24353 if ((VT == MVT::f32 && Subtarget.hasSSE1()) ||
24354 (VT == MVT::v4f32 && Subtarget.hasSSE1()) ||
24355 (VT == MVT::v8f32 && Subtarget.hasAVX()) ||
24356 (VT == MVT::v16f32 && Subtarget.useAVX512Regs())) {
24357 // Enable estimate codegen with 1 refinement step for vector division.
24358 // Scalar division estimates are disabled because they break too much
24359 // real-world code. These defaults are intended to match GCC behavior.
24360 if (VT == MVT::f32 && Enabled == ReciprocalEstimate::Unspecified)
24361 return SDValue();
24362
24363 if (RefinementSteps == ReciprocalEstimate::Unspecified)
24364 RefinementSteps = 1;
24365
24366 // There is no 512-bit FRCP, but there is RCP14.
24367 unsigned Opcode = VT == MVT::v16f32 ? X86ISD::RCP14 : X86ISD::FRCP;
24368 return DAG.getNode(Opcode, DL, VT, Op);
24369 }
24370
24371 if (VT.getScalarType() == MVT::f16 && isTypeLegal(VT) &&
24372 Subtarget.hasFP16()) {
24373 if (RefinementSteps == ReciprocalEstimate::Unspecified)
24374 RefinementSteps = 0;
24375
24376 if (VT == MVT::f16) {
24377 SDValue Zero = DAG.getIntPtrConstant(0, DL);
24378 SDValue Undef = DAG.getUNDEF(MVT::v8f16);
24379 Op = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v8f16, Op);
24380 Op = DAG.getNode(X86ISD::RCP14S, DL, MVT::v8f16, Undef, Op);
24381 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f16, Op, Zero);
24382 }
24383
24384 return DAG.getNode(X86ISD::RCP14, DL, VT, Op);
24385 }
24386 return SDValue();
24387}
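The matching sketch for the reciprocal (standalone, not part of this file): one Newton-Raphson step for 1/a is x1 = x0 * (2 - a * x0), again enough to refine the ~2^-12 RCPPS estimate to roughly float precision.

#include <cstdio>

static float refineRecip(float A, float Est) {
  return Est * (2.0f - A * Est); // one Newton-Raphson step
}

int main() {
  float A = 3.0f;
  float Est = 0.333f; // stand-in for the RCPPS estimate (illustrative value)
  std::printf("%.8f vs %.8f\n", refineRecip(A, Est), 1.0f / A);
  return 0;
}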
24388
24389/// If we have at least two divisions that use the same divisor, convert to
24390/// multiplication by a reciprocal. This may need to be adjusted for a given
24391/// CPU if a division's cost is not at least twice the cost of a multiplication.
24392/// This is because we still need one division to calculate the reciprocal and
24393/// then we need two multiplies by that reciprocal as replacements for the
24394/// original divisions.
24395unsigned X86TargetLowering::combineRepeatedFPDivisors() const {
24396 return 2;
24397}
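As an illustration (standalone sketch, not part of this file) of the rewrite this threshold enables: with two divisions sharing a divisor, one reciprocal plus two multiplies replaces two divides. The reassociation is only legal under the usual fast-math style constraints, and results may differ in the last ulp.

#include <cstdio>

static void divideBoth(float A, float B, float D, float &X, float &Y) {
  float Recip = 1.0f / D; // the single remaining divide
  X = A * Recip;          // replaces A / D
  Y = B * Recip;          // replaces B / D
}

int main() {
  float X, Y;
  divideBoth(10.0f, 20.0f, 4.0f, X, Y);
  std::printf("%g %g\n", X, Y); // 2.5 5
  return 0;
}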
24398
24399SDValue
24400X86TargetLowering::BuildSDIVPow2(SDNode *N, const APInt &Divisor,
24401 SelectionDAG &DAG,
24402 SmallVectorImpl<SDNode *> &Created) const {
24403 AttributeList Attr = DAG.getMachineFunction().getFunction().getAttributes();
24404 if (isIntDivCheap(N->getValueType(0), Attr))
24405 return SDValue(N,0); // Lower SDIV as SDIV
24406
24407 assert((Divisor.isPowerOf2() || Divisor.isNegatedPowerOf2()) &&
24408 "Unexpected divisor!");
24409
24410 // Only perform this transform if CMOV is supported otherwise the select
24411 // below will become a branch.
24412 if (!Subtarget.canUseCMOV())
24413 return SDValue();
24414
24415 // fold (sdiv X, pow2)
24416 EVT VT = N->getValueType(0);
24417 // FIXME: Support i8.
24418 if (VT != MVT::i16 && VT != MVT::i32 &&
24419 !(Subtarget.is64Bit() && VT == MVT::i64))
24420 return SDValue();
24421
24422 unsigned Lg2 = Divisor.countTrailingZeros();
24423
24424 // If the divisor is 2 or -2, the default expansion is better.
24425 if (Lg2 == 1)
24426 return SDValue();
24427
24428 SDLoc DL(N);
24429 SDValue N0 = N->getOperand(0);
24430 SDValue Zero = DAG.getConstant(0, DL, VT);
24431 APInt Lg2Mask = APInt::getLowBitsSet(VT.getSizeInBits(), Lg2);
24432 SDValue Pow2MinusOne = DAG.getConstant(Lg2Mask, DL, VT);
24433
24434 // If N0 is negative, we need to add (Pow2 - 1) to it before shifting right.
24435 SDValue Cmp = DAG.getSetCC(DL, MVT::i8, N0, Zero, ISD::SETLT);
24436 SDValue Add = DAG.getNode(ISD::ADD, DL, VT, N0, Pow2MinusOne);
24437 SDValue CMov = DAG.getNode(ISD::SELECT, DL, VT, Cmp, Add, N0);
24438
24439 Created.push_back(Cmp.getNode());
24440 Created.push_back(Add.getNode());
24441 Created.push_back(CMov.getNode());
24442
24443 // Divide by pow2.
24444 SDValue SRA =
24445 DAG.getNode(ISD::SRA, DL, VT, CMov, DAG.getConstant(Lg2, DL, MVT::i8));
24446
24447 // If we're dividing by a positive value, we're done. Otherwise, we must
24448 // negate the result.
24449 if (Divisor.isNonNegative())
24450 return SRA;
24451
24452 Created.push_back(SRA.getNode());
24453 return DAG.getNode(ISD::SUB, DL, VT, Zero, SRA);
24454}
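A standalone scalar sketch (not part of this file) of the sequence built above: signed division by a power of two rounds toward zero, so negative inputs get (2^Lg2 - 1) added (the CMOV) before the arithmetic shift, and the quotient is negated when the divisor was negative.

#include <cstdio>

static int sdivPow2(int N, unsigned Lg2, bool NegativeDivisor) {
  int Adjusted = N < 0 ? N + ((1 << Lg2) - 1) : N; // SETCC + ADD + CMOV
  int Quotient = Adjusted >> Lg2;                  // SRA
  return NegativeDivisor ? -Quotient : Quotient;   // SUB 0, Quotient if needed
}

int main() {
  // -7 / 4 must be -1 (round toward zero); a bare arithmetic shift gives -2.
  std::printf("%d %d %d\n", sdivPow2(-7, 2, false), -7 / 4, -7 >> 2);
  return 0;
}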
24455
24456/// Result of 'and' is compared against zero. Change to a BT node if possible.
24457/// Returns the BT node and the condition code needed to use it.
24458static SDValue LowerAndToBT(SDValue And, ISD::CondCode CC, const SDLoc &dl,
24459 SelectionDAG &DAG, X86::CondCode &X86CC) {
24460 assert(And.getOpcode() == ISD::AND && "Expected AND node!");
24461 SDValue Op0 = And.getOperand(0);
24462 SDValue Op1 = And.getOperand(1);
24463 if (Op0.getOpcode() == ISD::TRUNCATE)
24464 Op0 = Op0.getOperand(0);
24465 if (Op1.getOpcode() == ISD::TRUNCATE)
24466 Op1 = Op1.getOperand(0);
24467
24468 SDValue Src, BitNo;
24469 if (Op1.getOpcode() == ISD::SHL)
24470 std::swap(Op0, Op1);
24471 if (Op0.getOpcode() == ISD::SHL) {
24472 if (isOneConstant(Op0.getOperand(0))) {
24473 // If we looked past a truncate, check that it's only truncating away
24474 // known zeros.
24475 unsigned BitWidth = Op0.getValueSizeInBits();
24476 unsigned AndBitWidth = And.getValueSizeInBits();
24477 if (BitWidth > AndBitWidth) {
24478 KnownBits Known = DAG.computeKnownBits(Op0);
24479 if (Known.countMinLeadingZeros() < BitWidth - AndBitWidth)
24480 return SDValue();
24481 }
24482 Src = Op1;
24483 BitNo = Op0.getOperand(1);
24484 }
24485 } else if (Op1.getOpcode() == ISD::Constant) {
24486 ConstantSDNode *AndRHS = cast<ConstantSDNode>(Op1);
24487 uint64_t AndRHSVal = AndRHS->getZExtValue();
24488 SDValue AndLHS = Op0;
24489
24490 if (AndRHSVal == 1 && AndLHS.getOpcode() == ISD::SRL) {
24491 Src = AndLHS.getOperand(0);
24492 BitNo = AndLHS.getOperand(1);
24493 } else {
24494 // Use BT if the immediate can't be encoded in a TEST instruction or we
24495 // are optimizing for size and the immediate won't fit in a byte.
24496 bool OptForSize = DAG.shouldOptForSize();
24497 if ((!isUInt<32>(AndRHSVal) || (OptForSize && !isUInt<8>(AndRHSVal))) &&
24498 isPowerOf2_64(AndRHSVal)) {
24499 Src = AndLHS;
24500 BitNo = DAG.getConstant(Log2_64_Ceil(AndRHSVal), dl,
24501 Src.getValueType());
24502 }
24503 }
24504 }
24505
24506 // No patterns found, give up.
24507 if (!Src.getNode())
24508 return SDValue();
24509
24510 // Remove any bit flip.
24511 if (isBitwiseNot(Src)) {
24512 Src = Src.getOperand(0);
24513 CC = CC == ISD::SETEQ ? ISD::SETNE : ISD::SETEQ;
24514 }
24515
24516 // Attempt to create the X86ISD::BT node.
24517 if (SDValue BT = getBT(Src, BitNo, dl, DAG)) {
24518 X86CC = CC == ISD::SETEQ ? X86::COND_AE : X86::COND_B;
24519 return BT;
24520 }
24521
24522 return SDValue();
24523}
24524
24525// Check if a pre-AVX condition code can be handled by a single FCMP op.
24526static bool cheapX86FSETCC_SSE(ISD::CondCode SetCCOpcode) {
24527 return (SetCCOpcode != ISD::SETONE) && (SetCCOpcode != ISD::SETUEQ);
24528}
24529
24530/// Turns an ISD::CondCode into a value suitable for SSE floating-point mask
24531/// CMPs.
24532static unsigned translateX86FSETCC(ISD::CondCode SetCCOpcode, SDValue &Op0,
24533 SDValue &Op1, bool &IsAlwaysSignaling) {
24534 unsigned SSECC;
24535 bool Swap = false;
24536
24537 // SSE Condition code mapping:
24538 // 0 - EQ
24539 // 1 - LT
24540 // 2 - LE
24541 // 3 - UNORD
24542 // 4 - NEQ
24543 // 5 - NLT
24544 // 6 - NLE
24545 // 7 - ORD
24546 switch (SetCCOpcode) {
24547 default: llvm_unreachable("Unexpected SETCC condition");
24548 case ISD::SETOEQ:
24549 case ISD::SETEQ: SSECC = 0; break;
24550 case ISD::SETOGT:
24551 case ISD::SETGT: Swap = true; [[fallthrough]];
24552 case ISD::SETLT:
24553 case ISD::SETOLT: SSECC = 1; break;
24554 case ISD::SETOGE:
24555 case ISD::SETGE: Swap = true; [[fallthrough]];
24556 case ISD::SETLE:
24557 case ISD::SETOLE: SSECC = 2; break;
24558 case ISD::SETUO: SSECC = 3; break;
24559 case ISD::SETUNE:
24560 case ISD::SETNE: SSECC = 4; break;
24561 case ISD::SETULE: Swap = true; [[fallthrough]];
24562 case ISD::SETUGE: SSECC = 5; break;
24563 case ISD::SETULT: Swap = true; [[fallthrough]];
24564 case ISD::SETUGT: SSECC = 6; break;
24565 case ISD::SETO: SSECC = 7; break;
24566 case ISD::SETUEQ: SSECC = 8; break;
24567 case ISD::SETONE: SSECC = 12; break;
24568 }
24569 if (Swap)
24570 std::swap(Op0, Op1);
24571
24572 switch (SetCCOpcode) {
24573 default:
24574 IsAlwaysSignaling = true;
24575 break;
24576 case ISD::SETEQ:
24577 case ISD::SETOEQ:
24578 case ISD::SETUEQ:
24579 case ISD::SETNE:
24580 case ISD::SETONE:
24581 case ISD::SETUNE:
24582 case ISD::SETO:
24583 case ISD::SETUO:
24584 IsAlwaysSignaling = false;
24585 break;
24586 }
24587
24588 return SSECC;
24589}
24590
24591/// Break a 256-bit integer VSETCC into two new 128-bit ones and then
24592/// concatenate the result back.
24593static SDValue splitIntVSETCC(EVT VT, SDValue LHS, SDValue RHS,
24594 ISD::CondCode Cond, SelectionDAG &DAG,
24595 const SDLoc &dl) {
24596 assert(VT.isInteger() && VT == LHS.getValueType() &&
24597 VT == RHS.getValueType() && "Unsupported VTs!");
24598
24599 SDValue CC = DAG.getCondCode(Cond);
24600
24601 // Extract the LHS Lo/Hi vectors
24602 SDValue LHS1, LHS2;
24603 std::tie(LHS1, LHS2) = splitVector(LHS, DAG, dl);
24604
24605 // Extract the RHS Lo/Hi vectors
24606 SDValue RHS1, RHS2;
24607 std::tie(RHS1, RHS2) = splitVector(RHS, DAG, dl);
24608
24609 // Issue the operation on the smaller types and concatenate the result back
24610 EVT LoVT, HiVT;
24611 std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
24612 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
24613 DAG.getNode(ISD::SETCC, dl, LoVT, LHS1, RHS1, CC),
24614 DAG.getNode(ISD::SETCC, dl, HiVT, LHS2, RHS2, CC));
24615}
24616
24617static SDValue LowerIntVSETCC_AVX512(SDValue Op, SelectionDAG &DAG) {
24618
24619 SDValue Op0 = Op.getOperand(0);
24620 SDValue Op1 = Op.getOperand(1);
24621 SDValue CC = Op.getOperand(2);
24622 MVT VT = Op.getSimpleValueType();
24623 SDLoc dl(Op);
24624
24625 assert(VT.getVectorElementType() == MVT::i1 &&
24626 "Cannot set masked compare for this operation");
24627
24628 ISD::CondCode SetCCOpcode = cast<CondCodeSDNode>(CC)->get();
24629
24630 // Prefer SETGT over SETLT.
24631 if (SetCCOpcode == ISD::SETLT) {
24632 SetCCOpcode = ISD::getSetCCSwappedOperands(SetCCOpcode);
24633 std::swap(Op0, Op1);
24634 }
24635
24636 return DAG.getSetCC(dl, VT, Op0, Op1, SetCCOpcode);
24637}
24638
24639/// Given a buildvector constant, return a new vector constant with each element
24640/// incremented or decremented. If incrementing or decrementing would result in
24641/// unsigned overflow or underflow or this is not a simple vector constant,
24642/// return an empty value.
24643static SDValue incDecVectorConstant(SDValue V, SelectionDAG &DAG, bool IsInc) {
24644 auto *BV = dyn_cast<BuildVectorSDNode>(V.getNode());
24645 if (!BV)
24646 return SDValue();
24647
24648 MVT VT = V.getSimpleValueType();
24649 MVT EltVT = VT.getVectorElementType();
24650 unsigned NumElts = VT.getVectorNumElements();
24651 SmallVector<SDValue, 8> NewVecC;
24652 SDLoc DL(V);
24653 for (unsigned i = 0; i < NumElts; ++i) {
24654 auto *Elt = dyn_cast<ConstantSDNode>(BV->getOperand(i));
24655 if (!Elt || Elt->isOpaque() || Elt->getSimpleValueType(0) != EltVT)
24656 return SDValue();
24657
24658 // Avoid overflow/underflow.
24659 const APInt &EltC = Elt->getAPIntValue();
24660 if ((IsInc && EltC.isMaxValue()) || (!IsInc && EltC.isZero()))
24661 return SDValue();
24662
24663 NewVecC.push_back(DAG.getConstant(EltC + (IsInc ? 1 : -1), DL, EltVT));
24664 }
24665
24666 return DAG.getBuildVector(VT, DL, NewVecC);
24667}
24668
24669/// As another special case, use PSUBUS[BW] when it's profitable. E.g. for
24670/// Op0 u<= Op1:
24671/// t = psubus Op0, Op1
24672/// pcmpeq t, <0..0>
24673static SDValue LowerVSETCCWithSUBUS(SDValue Op0, SDValue Op1, MVT VT,
24674 ISD::CondCode Cond, const SDLoc &dl,
24675 const X86Subtarget &Subtarget,
24676 SelectionDAG &DAG) {
24677 if (!Subtarget.hasSSE2())
24678 return SDValue();
24679
24680 MVT VET = VT.getVectorElementType();
24681 if (VET != MVT::i8 && VET != MVT::i16)
24682 return SDValue();
24683
24684 switch (Cond) {
24685 default:
24686 return SDValue();
24687 case ISD::SETULT: {
24688 // If the comparison is against a constant we can turn this into a
24689 // setule. With psubus, setule does not require a swap. This is
24690 // beneficial because the constant in the register is no longer
24691     // clobbered as the destination, so it can be hoisted out of a loop.
24692 // Only do this pre-AVX since vpcmp* is no longer destructive.
24693 if (Subtarget.hasAVX())
24694 return SDValue();
24695 SDValue ULEOp1 = incDecVectorConstant(Op1, DAG, /*IsInc*/false);
24696 if (!ULEOp1)
24697 return SDValue();
24698 Op1 = ULEOp1;
24699 break;
24700 }
24701 case ISD::SETUGT: {
24702 // If the comparison is against a constant, we can turn this into a setuge.
24703 // This is beneficial because materializing a constant 0 for the PCMPEQ is
24704 // probably cheaper than XOR+PCMPGT using 2 different vector constants:
24705 // cmpgt (xor X, SignMaskC) CmpC --> cmpeq (usubsat (CmpC+1), X), 0
24706 SDValue UGEOp1 = incDecVectorConstant(Op1, DAG, /*IsInc*/true);
24707 if (!UGEOp1)
24708 return SDValue();
24709 Op1 = Op0;
24710 Op0 = UGEOp1;
24711 break;
24712 }
24713 // Psubus is better than flip-sign because it requires no inversion.
24714 case ISD::SETUGE:
24715 std::swap(Op0, Op1);
24716 break;
24717 case ISD::SETULE:
24718 break;
24719 }
24720
24721 SDValue Result = DAG.getNode(ISD::USUBSAT, dl, VT, Op0, Op1);
24722 return DAG.getNode(X86ISD::PCMPEQ, dl, VT, Result,
24723 DAG.getConstant(0, dl, VT));
24724}
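
The PSUBUS lowering rests on a simple identity: for unsigned values, a <= b holds exactly when the saturating difference a -sat b is zero, which is what the USUBSAT + PCMPEQ pair computes per lane. A small standalone sketch, assuming 8-bit lanes; the helper names are hypothetical and the code is not taken from the LLVM tree.

#include <cstdint>

// Illustrative sketch of the PSUBUS/USUBSAT equivalence, 8-bit lanes assumed.
constexpr uint8_t usubsat8(uint8_t A, uint8_t B) {
  return A > B ? uint8_t(A - B) : uint8_t(0);
}
constexpr bool uleViaSubus(uint8_t A, uint8_t B) {
  return usubsat8(A, B) == 0;  // a <=u b  <=>  usubsat(a, b) == 0
}
static_assert(uleViaSubus(3, 7) == (3 <= 7), "");
static_assert(uleViaSubus(7, 3) == (7 <= 3), "");
static_assert(uleViaSubus(255, 255), "");
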
24725
24726static SDValue LowerVSETCC(SDValue Op, const X86Subtarget &Subtarget,
24727 SelectionDAG &DAG) {
24728 bool IsStrict = Op.getOpcode() == ISD::STRICT_FSETCC ||
24729 Op.getOpcode() == ISD::STRICT_FSETCCS;
24730 SDValue Op0 = Op.getOperand(IsStrict ? 1 : 0);
24731 SDValue Op1 = Op.getOperand(IsStrict ? 2 : 1);
24732 SDValue CC = Op.getOperand(IsStrict ? 3 : 2);
24733 MVT VT = Op->getSimpleValueType(0);
24734 ISD::CondCode Cond = cast<CondCodeSDNode>(CC)->get();
24735 bool isFP = Op1.getSimpleValueType().isFloatingPoint();
24736 SDLoc dl(Op);
24737
24738 if (isFP) {
24739 MVT EltVT = Op0.getSimpleValueType().getVectorElementType();
24740     assert(EltVT == MVT::f16 || EltVT == MVT::f32 || EltVT == MVT::f64);
24741 if (isSoftFP16(EltVT, Subtarget))
24742 return SDValue();
24743
24744 bool IsSignaling = Op.getOpcode() == ISD::STRICT_FSETCCS;
24745 SDValue Chain = IsStrict ? Op.getOperand(0) : SDValue();
24746
24747 // If we have a strict compare with a vXi1 result and the input is 128/256
24748 // bits we can't use a masked compare unless we have VLX. If we use a wider
24749 // compare like we do for non-strict, we might trigger spurious exceptions
24750 // from the upper elements. Instead emit a AVX compare and convert to mask.
24751 unsigned Opc;
24752 if (Subtarget.hasAVX512() && VT.getVectorElementType() == MVT::i1 &&
24753 (!IsStrict || Subtarget.hasVLX() ||
24754 Op0.getSimpleValueType().is512BitVector())) {
24755#ifndef NDEBUG
24756 unsigned Num = VT.getVectorNumElements();
24757       assert(Num <= 16 || (Num == 32 && EltVT == MVT::f16));
24758#endif
24759 Opc = IsStrict ? X86ISD::STRICT_CMPM : X86ISD::CMPM;
24760 } else {
24761 Opc = IsStrict ? X86ISD::STRICT_CMPP : X86ISD::CMPP;
24762 // The SSE/AVX packed FP comparison nodes are defined with a
24763 // floating-point vector result that matches the operand type. This allows
24764 // them to work with an SSE1 target (integer vector types are not legal).
24765 VT = Op0.getSimpleValueType();
24766 }
24767
24768 SDValue Cmp;
24769 bool IsAlwaysSignaling;
24770 unsigned SSECC = translateX86FSETCC(Cond, Op0, Op1, IsAlwaysSignaling);
24771 if (!Subtarget.hasAVX()) {
24772 // TODO: We could use following steps to handle a quiet compare with
24773 // signaling encodings.
24774 // 1. Get ordered masks from a quiet ISD::SETO
24775 // 2. Use the masks to mask potential unordered elements in operand A, B
24776 // 3. Get the compare results of masked A, B
24777 // 4. Calculating final result using the mask and result from 3
24778 // But currently, we just fall back to scalar operations.
24779 if (IsStrict && IsAlwaysSignaling && !IsSignaling)
24780 return SDValue();
24781
24782 // Insert an extra signaling instruction to raise exception.
24783 if (IsStrict && !IsAlwaysSignaling && IsSignaling) {
24784 SDValue SignalCmp = DAG.getNode(
24785 Opc, dl, {VT, MVT::Other},
24786 {Chain, Op0, Op1, DAG.getTargetConstant(1, dl, MVT::i8)}); // LT_OS
24787 // FIXME: It seems we need to update the flags of all new strict nodes.
24788 // Otherwise, mayRaiseFPException in MI will return false due to
24789 // NoFPExcept = false by default. However, I didn't find it in other
24790 // patches.
24791 SignalCmp->setFlags(Op->getFlags());
24792 Chain = SignalCmp.getValue(1);
24793 }
24794
24795 // In the two cases not handled by SSE compare predicates (SETUEQ/SETONE),
24796 // emit two comparisons and a logic op to tie them together.
24797 if (!cheapX86FSETCC_SSE(Cond)) {
24798 // LLVM predicate is SETUEQ or SETONE.
24799 unsigned CC0, CC1;
24800 unsigned CombineOpc;
24801 if (Cond == ISD::SETUEQ) {
24802 CC0 = 3; // UNORD
24803 CC1 = 0; // EQ
24804 CombineOpc = X86ISD::FOR;
24805 } else {
24806         assert(Cond == ISD::SETONE);
24807 CC0 = 7; // ORD
24808 CC1 = 4; // NEQ
24809 CombineOpc = X86ISD::FAND;
24810 }
24811
24812 SDValue Cmp0, Cmp1;
24813 if (IsStrict) {
24814 Cmp0 = DAG.getNode(
24815 Opc, dl, {VT, MVT::Other},
24816 {Chain, Op0, Op1, DAG.getTargetConstant(CC0, dl, MVT::i8)});
24817 Cmp1 = DAG.getNode(
24818 Opc, dl, {VT, MVT::Other},
24819 {Chain, Op0, Op1, DAG.getTargetConstant(CC1, dl, MVT::i8)});
24820 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Cmp0.getValue(1),
24821 Cmp1.getValue(1));
24822 } else {
24823 Cmp0 = DAG.getNode(
24824 Opc, dl, VT, Op0, Op1, DAG.getTargetConstant(CC0, dl, MVT::i8));
24825 Cmp1 = DAG.getNode(
24826 Opc, dl, VT, Op0, Op1, DAG.getTargetConstant(CC1, dl, MVT::i8));
24827 }
24828 Cmp = DAG.getNode(CombineOpc, dl, VT, Cmp0, Cmp1);
24829 } else {
24830 if (IsStrict) {
24831 Cmp = DAG.getNode(
24832 Opc, dl, {VT, MVT::Other},
24833 {Chain, Op0, Op1, DAG.getTargetConstant(SSECC, dl, MVT::i8)});
24834 Chain = Cmp.getValue(1);
24835 } else
24836 Cmp = DAG.getNode(
24837 Opc, dl, VT, Op0, Op1, DAG.getTargetConstant(SSECC, dl, MVT::i8));
24838 }
24839 } else {
24840 // Handle all other FP comparisons here.
24841 if (IsStrict) {
24842 // Make a flip on already signaling CCs before setting bit 4 of AVX CC.
24843 SSECC |= (IsAlwaysSignaling ^ IsSignaling) << 4;
24844 Cmp = DAG.getNode(
24845 Opc, dl, {VT, MVT::Other},
24846 {Chain, Op0, Op1, DAG.getTargetConstant(SSECC, dl, MVT::i8)});
24847 Chain = Cmp.getValue(1);
24848 } else
24849 Cmp = DAG.getNode(
24850 Opc, dl, VT, Op0, Op1, DAG.getTargetConstant(SSECC, dl, MVT::i8));
24851 }
24852
24853 if (VT.getFixedSizeInBits() >
24854 Op.getSimpleValueType().getFixedSizeInBits()) {
24855 // We emitted a compare with an XMM/YMM result. Finish converting to a
24856 // mask register using a vptestm.
24857 EVT CastVT = EVT(VT).changeVectorElementTypeToInteger();
24858 Cmp = DAG.getBitcast(CastVT, Cmp);
24859 Cmp = DAG.getSetCC(dl, Op.getSimpleValueType(), Cmp,
24860 DAG.getConstant(0, dl, CastVT), ISD::SETNE);
24861 } else {
24862 // If this is SSE/AVX CMPP, bitcast the result back to integer to match
24863 // the result type of SETCC. The bitcast is expected to be optimized
24864 // away during combining/isel.
24865 Cmp = DAG.getBitcast(Op.getSimpleValueType(), Cmp);
24866 }
24867
24868 if (IsStrict)
24869 return DAG.getMergeValues({Cmp, Chain}, dl);
24870
24871 return Cmp;
24872 }
24873
24874   assert(!IsStrict && "Strict SETCC only handles FP operands.");
24875
24876 MVT VTOp0 = Op0.getSimpleValueType();
24877 (void)VTOp0;
24878   assert(VTOp0 == Op1.getSimpleValueType() &&
24879          "Expected operands with same type!");
24880   assert(VT.getVectorNumElements() == VTOp0.getVectorNumElements() &&
24881          "Invalid number of packed elements for source and destination!");
24882
24883 // The non-AVX512 code below works under the assumption that source and
24884 // destination types are the same.
24885   assert((Subtarget.hasAVX512() || (VT == VTOp0)) &&
24886          "Value types for source and destination must be the same!");
24887
24888 // The result is boolean, but operands are int/float
24889 if (VT.getVectorElementType() == MVT::i1) {
24890 // In AVX-512 architecture setcc returns mask with i1 elements,
24891 // But there is no compare instruction for i8 and i16 elements in KNL.
24892     assert((VTOp0.getScalarSizeInBits() >= 32 || Subtarget.hasBWI()) &&
24893            "Unexpected operand type");
24894 return LowerIntVSETCC_AVX512(Op, DAG);
24895 }
24896
24897 // Lower using XOP integer comparisons.
24898 if (VT.is128BitVector() && Subtarget.hasXOP()) {
24899 // Translate compare code to XOP PCOM compare mode.
24900 unsigned CmpMode = 0;
24901 switch (Cond) {
24902     default: llvm_unreachable("Unexpected SETCC condition");
24903 case ISD::SETULT:
24904 case ISD::SETLT: CmpMode = 0x00; break;
24905 case ISD::SETULE:
24906 case ISD::SETLE: CmpMode = 0x01; break;
24907 case ISD::SETUGT:
24908 case ISD::SETGT: CmpMode = 0x02; break;
24909 case ISD::SETUGE:
24910 case ISD::SETGE: CmpMode = 0x03; break;
24911 case ISD::SETEQ: CmpMode = 0x04; break;
24912 case ISD::SETNE: CmpMode = 0x05; break;
24913 }
24914
24915 // Are we comparing unsigned or signed integers?
24916 unsigned Opc =
24917 ISD::isUnsignedIntSetCC(Cond) ? X86ISD::VPCOMU : X86ISD::VPCOM;
24918
24919 return DAG.getNode(Opc, dl, VT, Op0, Op1,
24920 DAG.getTargetConstant(CmpMode, dl, MVT::i8));
24921 }
24922
24923 // (X & Y) != 0 --> (X & Y) == Y iff Y is power-of-2.
24924 // Revert part of the simplifySetCCWithAnd combine, to avoid an invert.
24925 if (Cond == ISD::SETNE && ISD::isBuildVectorAllZeros(Op1.getNode())) {
24926 SDValue BC0 = peekThroughBitcasts(Op0);
24927 if (BC0.getOpcode() == ISD::AND) {
24928 APInt UndefElts;
24929 SmallVector<APInt, 64> EltBits;
24930 if (getTargetConstantBitsFromNode(BC0.getOperand(1),
24931 VT.getScalarSizeInBits(), UndefElts,
24932 EltBits, false, false)) {
24933 if (llvm::all_of(EltBits, [](APInt &V) { return V.isPowerOf2(); })) {
24934 Cond = ISD::SETEQ;
24935 Op1 = DAG.getBitcast(VT, BC0.getOperand(1));
24936 }
24937 }
24938 }
24939 }
24940
24941 // ICMP_EQ(AND(X,C),C) -> SRA(SHL(X,LOG2(C)),BW-1) iff C is power-of-2.
24942 if (Cond == ISD::SETEQ && Op0.getOpcode() == ISD::AND &&
24943 Op0.getOperand(1) == Op1 && Op0.hasOneUse()) {
24944 ConstantSDNode *C1 = isConstOrConstSplat(Op1);
24945 if (C1 && C1->getAPIntValue().isPowerOf2()) {
24946 unsigned BitWidth = VT.getScalarSizeInBits();
24947 unsigned ShiftAmt = BitWidth - C1->getAPIntValue().logBase2() - 1;
24948
24949 SDValue Result = Op0.getOperand(0);
24950 Result = DAG.getNode(ISD::SHL, dl, VT, Result,
24951 DAG.getConstant(ShiftAmt, dl, VT));
24952 Result = DAG.getNode(ISD::SRA, dl, VT, Result,
24953 DAG.getConstant(BitWidth - 1, dl, VT));
24954 return Result;
24955 }
24956 }
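
The transform above exploits that when C has a single bit set, (x & C) == C is just a test of that one bit, which can be materialized as an all-ones/all-zeros lane mask by shifting the bit into the sign position and arithmetic-shifting it back. A scalar sketch, assuming two's-complement arithmetic right shifts (guaranteed from C++20); illustrative only, not LLVM code.

#include <cstdint>

// Illustrative sketch: mask = sra(shl(x, BW - log2(C) - 1), BW - 1).
constexpr uint32_t maskIfBitSet(uint32_t X, unsigned Log2C) {
  const unsigned BW = 32;
  int32_t Shifted = (int32_t)(X << (BW - Log2C - 1)); // bit -> sign position
  return (uint32_t)(Shifted >> (BW - 1));             // sign -> 0 or ~0
}
static_assert(maskIfBitSet(0b100, 2) == 0xFFFFFFFFu, "");
static_assert(maskIfBitSet(0b011, 2) == 0u, "");
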
24957
24958 // Break 256-bit integer vector compare into smaller ones.
24959 if (VT.is256BitVector() && !Subtarget.hasInt256())
24960 return splitIntVSETCC(VT, Op0, Op1, Cond, DAG, dl);
24961
24962 // Break 512-bit integer vector compare into smaller ones.
24963 // TODO: Try harder to use VPCMPx + VPMOV2x?
24964 if (VT.is512BitVector())
24965 return splitIntVSETCC(VT, Op0, Op1, Cond, DAG, dl);
24966
24967 // If we have a limit constant, try to form PCMPGT (signed cmp) to avoid
24968 // not-of-PCMPEQ:
24969 // X != INT_MIN --> X >s INT_MIN
24970 // X != INT_MAX --> X <s INT_MAX --> INT_MAX >s X
24971 // +X != 0 --> +X >s 0
24972 APInt ConstValue;
24973 if (Cond == ISD::SETNE &&
24974 ISD::isConstantSplatVector(Op1.getNode(), ConstValue)) {
24975 if (ConstValue.isMinSignedValue())
24976 Cond = ISD::SETGT;
24977 else if (ConstValue.isMaxSignedValue())
24978 Cond = ISD::SETLT;
24979 else if (ConstValue.isZero() && DAG.SignBitIsZero(Op0))
24980 Cond = ISD::SETGT;
24981 }
24982
24983 // If both operands are known non-negative, then an unsigned compare is the
24984 // same as a signed compare and there's no need to flip signbits.
24985 // TODO: We could check for more general simplifications here since we're
24986 // computing known bits.
24987 bool FlipSigns = ISD::isUnsignedIntSetCC(Cond) &&
24988 !(DAG.SignBitIsZero(Op0) && DAG.SignBitIsZero(Op1));
24989
24990 // Special case: Use min/max operations for unsigned compares.
24991 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
24992 if (ISD::isUnsignedIntSetCC(Cond) &&
24993 (FlipSigns || ISD::isTrueWhenEqual(Cond)) &&
24994 TLI.isOperationLegal(ISD::UMIN, VT)) {
24995 // If we have a constant operand, increment/decrement it and change the
24996 // condition to avoid an invert.
24997 if (Cond == ISD::SETUGT) {
24998 // X > C --> X >= (C+1) --> X == umax(X, C+1)
24999 if (SDValue UGTOp1 = incDecVectorConstant(Op1, DAG, /*IsInc*/true)) {
25000 Op1 = UGTOp1;
25001 Cond = ISD::SETUGE;
25002 }
25003 }
25004 if (Cond == ISD::SETULT) {
25005 // X < C --> X <= (C-1) --> X == umin(X, C-1)
25006 if (SDValue ULTOp1 = incDecVectorConstant(Op1, DAG, /*IsInc*/false)) {
25007 Op1 = ULTOp1;
25008 Cond = ISD::SETULE;
25009 }
25010 }
25011 bool Invert = false;
25012 unsigned Opc;
25013 switch (Cond) {
25014     default: llvm_unreachable("Unexpected condition code");
25015 case ISD::SETUGT: Invert = true; [[fallthrough]];
25016 case ISD::SETULE: Opc = ISD::UMIN; break;
25017 case ISD::SETULT: Invert = true; [[fallthrough]];
25018 case ISD::SETUGE: Opc = ISD::UMAX; break;
25019 }
25020
25021 SDValue Result = DAG.getNode(Opc, dl, VT, Op0, Op1);
25022 Result = DAG.getNode(X86ISD::PCMPEQ, dl, VT, Op0, Result);
25023
25024 // If the logical-not of the result is required, perform that now.
25025 if (Invert)
25026 Result = DAG.getNOT(dl, Result, VT);
25027
25028 return Result;
25029 }
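
The min/max rewrite works because a <=u b holds exactly when a equals umin(a, b), and a >=u b exactly when a equals umax(a, b), which maps onto PMINU*/PMAXU* followed by PCMPEQ. A scalar sketch of those identities; illustrative only.

#include <algorithm>
#include <cstdint>

// Illustrative sketch of the UMIN/UMAX + PCMPEQ rewrite.
constexpr bool uleViaMin(uint32_t A, uint32_t B) {
  return A == std::min(A, B);   // a <=u b  <=>  a == umin(a, b)
}
constexpr bool ugeViaMax(uint32_t A, uint32_t B) {
  return A == std::max(A, B);   // a >=u b  <=>  a == umax(a, b)
}
static_assert(uleViaMin(2, 9) && !uleViaMin(9, 2) && uleViaMin(5, 5), "");
static_assert(ugeViaMax(9, 2) && !ugeViaMax(2, 9) && ugeViaMax(5, 5), "");
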
25030
25031 // Try to use SUBUS and PCMPEQ.
25032 if (FlipSigns)
25033 if (SDValue V =
25034 LowerVSETCCWithSUBUS(Op0, Op1, VT, Cond, dl, Subtarget, DAG))
25035 return V;
25036
25037 // We are handling one of the integer comparisons here. Since SSE only has
25038 // GT and EQ comparisons for integer, swapping operands and multiple
25039 // operations may be required for some comparisons.
25040 unsigned Opc = (Cond == ISD::SETEQ || Cond == ISD::SETNE) ? X86ISD::PCMPEQ
25041 : X86ISD::PCMPGT;
25042 bool Swap = Cond == ISD::SETLT || Cond == ISD::SETULT ||
25043 Cond == ISD::SETGE || Cond == ISD::SETUGE;
25044 bool Invert = Cond == ISD::SETNE ||
25045 (Cond != ISD::SETEQ && ISD::isTrueWhenEqual(Cond));
25046
25047 if (Swap)
25048 std::swap(Op0, Op1);
25049
25050 // Check that the operation in question is available (most are plain SSE2,
25051 // but PCMPGTQ and PCMPEQQ have different requirements).
25052 if (VT == MVT::v2i64) {
25053 if (Opc == X86ISD::PCMPGT && !Subtarget.hasSSE42()) {
25054       assert(Subtarget.hasSSE2() && "Don't know how to lower!");
25055
25056 // Special case for sign bit test. We can use a v4i32 PCMPGT and shuffle
25057 // the odd elements over the even elements.
25058 if (!FlipSigns && !Invert && ISD::isBuildVectorAllZeros(Op0.getNode())) {
25059 Op0 = DAG.getConstant(0, dl, MVT::v4i32);
25060 Op1 = DAG.getBitcast(MVT::v4i32, Op1);
25061
25062 SDValue GT = DAG.getNode(X86ISD::PCMPGT, dl, MVT::v4i32, Op0, Op1);
25063 static const int MaskHi[] = { 1, 1, 3, 3 };
25064 SDValue Result = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskHi);
25065
25066 return DAG.getBitcast(VT, Result);
25067 }
25068
25069 if (!FlipSigns && !Invert && ISD::isBuildVectorAllOnes(Op1.getNode())) {
25070 Op0 = DAG.getBitcast(MVT::v4i32, Op0);
25071 Op1 = DAG.getConstant(-1, dl, MVT::v4i32);
25072
25073 SDValue GT = DAG.getNode(X86ISD::PCMPGT, dl, MVT::v4i32, Op0, Op1);
25074 static const int MaskHi[] = { 1, 1, 3, 3 };
25075 SDValue Result = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskHi);
25076
25077 return DAG.getBitcast(VT, Result);
25078 }
25079
25080 // Since SSE has no unsigned integer comparisons, we need to flip the sign
25081 // bits of the inputs before performing those operations. The lower
25082 // compare is always unsigned.
25083 SDValue SB = DAG.getConstant(FlipSigns ? 0x8000000080000000ULL
25084 : 0x0000000080000000ULL,
25085 dl, MVT::v2i64);
25086
25087 Op0 = DAG.getNode(ISD::XOR, dl, MVT::v2i64, Op0, SB);
25088 Op1 = DAG.getNode(ISD::XOR, dl, MVT::v2i64, Op1, SB);
25089
25090 // Cast everything to the right type.
25091 Op0 = DAG.getBitcast(MVT::v4i32, Op0);
25092 Op1 = DAG.getBitcast(MVT::v4i32, Op1);
25093
25094 // Emulate PCMPGTQ with (hi1 > hi2) | ((hi1 == hi2) & (lo1 > lo2))
25095 SDValue GT = DAG.getNode(X86ISD::PCMPGT, dl, MVT::v4i32, Op0, Op1);
25096 SDValue EQ = DAG.getNode(X86ISD::PCMPEQ, dl, MVT::v4i32, Op0, Op1);
25097
25098 // Create masks for only the low parts/high parts of the 64 bit integers.
25099 static const int MaskHi[] = { 1, 1, 3, 3 };
25100 static const int MaskLo[] = { 0, 0, 2, 2 };
25101 SDValue EQHi = DAG.getVectorShuffle(MVT::v4i32, dl, EQ, EQ, MaskHi);
25102 SDValue GTLo = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskLo);
25103 SDValue GTHi = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskHi);
25104
25105 SDValue Result = DAG.getNode(ISD::AND, dl, MVT::v4i32, EQHi, GTLo);
25106 Result = DAG.getNode(ISD::OR, dl, MVT::v4i32, Result, GTHi);
25107
25108 if (Invert)
25109 Result = DAG.getNOT(dl, Result, MVT::v4i32);
25110
25111 return DAG.getBitcast(VT, Result);
25112 }
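
The PCMPGTQ emulation above follows the identity (hi1 > hi2) | ((hi1 == hi2) & (lo1 >u lo2)), with the sign-bit XORs arranging for the dword compares to have the right signedness. A scalar sketch of the signed 64-bit case, assuming arithmetic shifts of negative values (guaranteed from C++20); illustrative only, not LLVM code.

#include <cstdint>

// Illustrative sketch: 64-bit signed greater-than from 32-bit halves.
constexpr bool sgt64Via32(int64_t A, int64_t B) {
  int32_t AHi = (int32_t)(A >> 32), BHi = (int32_t)(B >> 32);
  uint32_t ALo = (uint32_t)A, BLo = (uint32_t)B;
  return AHi > BHi || (AHi == BHi && ALo > BLo);
}
static_assert(sgt64Via32(-1, -2) && !sgt64Via32(-2, -1), "");
static_assert(sgt64Via32(0x100000000LL, 0xFFFFFFFFLL), "");
static_assert(!sgt64Via32(5, 5), "");
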
25113
25114 if (Opc == X86ISD::PCMPEQ && !Subtarget.hasSSE41()) {
25115 // If pcmpeqq is missing but pcmpeqd is available synthesize pcmpeqq with
25116 // pcmpeqd + pshufd + pand.
25117       assert(Subtarget.hasSSE2() && !FlipSigns && "Don't know how to lower!");
25118
25119 // First cast everything to the right type.
25120 Op0 = DAG.getBitcast(MVT::v4i32, Op0);
25121 Op1 = DAG.getBitcast(MVT::v4i32, Op1);
25122
25123 // Do the compare.
25124 SDValue Result = DAG.getNode(Opc, dl, MVT::v4i32, Op0, Op1);
25125
25126 // Make sure the lower and upper halves are both all-ones.
25127 static const int Mask[] = { 1, 0, 3, 2 };
25128 SDValue Shuf = DAG.getVectorShuffle(MVT::v4i32, dl, Result, Result, Mask);
25129 Result = DAG.getNode(ISD::AND, dl, MVT::v4i32, Result, Shuf);
25130
25131 if (Invert)
25132 Result = DAG.getNOT(dl, Result, MVT::v4i32);
25133
25134 return DAG.getBitcast(VT, Result);
25135 }
25136 }
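
The PCMPEQQ synthesis works because a 64-bit lane compares equal exactly when both of its 32-bit halves do; the PSHUFD with mask {1,0,3,2} swaps the halves of the PCMPEQD result so the PAND combines them. A scalar sketch of the identity; illustrative only.

#include <cstdint>

// Illustrative sketch: 64-bit equality from the two 32-bit half compares.
constexpr bool eq64Via32(uint64_t A, uint64_t B) {
  bool LoEq = (uint32_t)A == (uint32_t)B;
  bool HiEq = (uint32_t)(A >> 32) == (uint32_t)(B >> 32);
  return LoEq && HiEq;  // the vector form swaps halves and ANDs
}
static_assert(eq64Via32(0x1122334455667788ULL, 0x1122334455667788ULL), "");
static_assert(!eq64Via32(0x1122334455667788ULL, 0x1122334455667789ULL), "");
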
25137
25138 // Since SSE has no unsigned integer comparisons, we need to flip the sign
25139 // bits of the inputs before performing those operations.
25140 if (FlipSigns) {
25141 MVT EltVT = VT.getVectorElementType();
25142 SDValue SM = DAG.getConstant(APInt::getSignMask(EltVT.getSizeInBits()), dl,
25143 VT);
25144 Op0 = DAG.getNode(ISD::XOR, dl, VT, Op0, SM);
25145 Op1 = DAG.getNode(ISD::XOR, dl, VT, Op1, SM);
25146 }
25147
25148 SDValue Result = DAG.getNode(Opc, dl, VT, Op0, Op1);
25149
25150 // If the logical-not of the result is required, perform that now.
25151 if (Invert)
25152 Result = DAG.getNOT(dl, Result, VT);
25153
25154 return Result;
25155}
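
The FlipSigns path that closes out LowerVSETCC uses the classic trick that XORing the sign bit turns an unsigned comparison into the signed one SSE provides. A scalar sketch; the out-of-range integer conversions assume two's complement (guaranteed from C++20), and the code is illustrative only.

#include <cstdint>

// Illustrative sketch: a >u b  <=>  (a ^ SignMask) >s (b ^ SignMask).
constexpr bool ugtViaSignedGT(uint32_t A, uint32_t B) {
  int32_t AS = (int32_t)(A ^ 0x80000000u);
  int32_t BS = (int32_t)(B ^ 0x80000000u);
  return AS > BS;
}
static_assert(ugtViaSignedGT(0xFFFFFFFFu, 1u), "");
static_assert(!ugtViaSignedGT(1u, 0xFFFFFFFFu), "");
static_assert(!ugtViaSignedGT(7u, 7u), "");
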
25156
25157// Try to select this as a KORTEST+SETCC or KTEST+SETCC if possible.
25158static SDValue EmitAVX512Test(SDValue Op0, SDValue Op1, ISD::CondCode CC,
25159 const SDLoc &dl, SelectionDAG &DAG,
25160 const X86Subtarget &Subtarget,
25161 SDValue &X86CC) {
25162 // Only support equality comparisons.
25163 if (CC != ISD::SETEQ && CC != ISD::SETNE)
25164 return SDValue();
25165
25166 // Must be a bitcast from vXi1.
25167 if (Op0.getOpcode() != ISD::BITCAST)
25168 return SDValue();
25169
25170 Op0 = Op0.getOperand(0);
25171 MVT VT = Op0.getSimpleValueType();
25172 if (!(Subtarget.hasAVX512() && VT == MVT::v16i1) &&
25173 !(Subtarget.hasDQI() && VT == MVT::v8i1) &&
25174 !(Subtarget.hasBWI() && (VT == MVT::v32i1 || VT == MVT::v64i1)))
25175 return SDValue();
25176
25177 X86::CondCode X86Cond;
25178 if (isNullConstant(Op1)) {
25179 X86Cond = CC == ISD::SETEQ ? X86::COND_E : X86::COND_NE;
25180 } else if (isAllOnesConstant(Op1)) {
25181 // C flag is set for all ones.
25182 X86Cond = CC == ISD::SETEQ ? X86::COND_B : X86::COND_AE;
25183 } else
25184 return SDValue();
25185
25186   // If the input is an AND, we can combine its operands into the KTEST.
25187 bool KTestable = false;
25188 if (Subtarget.hasDQI() && (VT == MVT::v8i1 || VT == MVT::v16i1))
25189 KTestable = true;
25190 if (Subtarget.hasBWI() && (VT == MVT::v32i1 || VT == MVT::v64i1))
25191 KTestable = true;
25192 if (!isNullConstant(Op1))
25193 KTestable = false;
25194 if (KTestable && Op0.getOpcode() == ISD::AND && Op0.hasOneUse()) {
25195 SDValue LHS = Op0.getOperand(0);
25196 SDValue RHS = Op0.getOperand(1);
25197 X86CC = DAG.getTargetConstant(X86Cond, dl, MVT::i8);
25198 return DAG.getNode(X86ISD::KTEST, dl, MVT::i32, LHS, RHS);
25199 }
25200
25201   // If the input is an OR, we can combine its operands into the KORTEST.
25202 SDValue LHS = Op0;
25203 SDValue RHS = Op0;
25204 if (Op0.getOpcode() == ISD::OR && Op0.hasOneUse()) {
25205 LHS = Op0.getOperand(0);
25206 RHS = Op0.getOperand(1);
25207 }
25208
25209 X86CC = DAG.getTargetConstant(X86Cond, dl, MVT::i8);
25210 return DAG.getNode(X86ISD::KORTEST, dl, MVT::i32, LHS, RHS);
25211}
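
The KORTEST selection relies on the instruction ORing its two mask operands and setting ZF when the result is zero and CF when it is all ones, so both the mask == 0 and mask == all-ones tests collapse to a single flag check. A small model of that flag behaviour, assuming 16-bit masks; illustrative only, not LLVM code.

#include <cstdint>

// Illustrative sketch of KORTESTW's flag outputs.
struct KFlags { bool ZF, CF; };
constexpr KFlags kortest16(uint16_t K1, uint16_t K2) {
  uint16_t Or = K1 | K2;
  return {Or == 0, Or == 0xFFFF};
}
static_assert(kortest16(0, 0).ZF && !kortest16(0, 0).CF, "");
static_assert(kortest16(0xFF00, 0x00FF).CF, "");
static_assert(!kortest16(0x0001, 0).ZF, "");
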
25212
25213/// Emit flags for the given setcc condition and operands. Also returns the
25214/// corresponding X86 condition code constant in X86CC.
25215SDValue X86TargetLowering::emitFlagsForSetcc(SDValue Op0, SDValue Op1,
25216 ISD::CondCode CC, const SDLoc &dl,
25217 SelectionDAG &DAG,
25218 SDValue &X86CC) const {
25219 // Optimize to BT if possible.
25220 // Lower (X & (1 << N)) == 0 to BT(X, N).
25221 // Lower ((X >>u N) & 1) != 0 to BT(X, N).
25222 // Lower ((X >>s N) & 1) != 0 to BT(X, N).
25223 if (Op0.getOpcode() == ISD::AND && Op0.hasOneUse() && isNullConstant(Op1) &&
25224 (CC == ISD::SETEQ || CC == ISD::SETNE)) {
25225 X86::CondCode X86CondCode;
25226 if (SDValue BT = LowerAndToBT(Op0, CC, dl, DAG, X86CondCode)) {
25227 X86CC = DAG.getTargetConstant(X86CondCode, dl, MVT::i8);
25228 return BT;
25229 }
25230 }
25231
25232   // Try to use PTEST/PMOVMSKB for a tree of ORs compared for equality with 0.
25233 // TODO: We could do AND tree with all 1s as well by using the C flag.
25234 if (isNullConstant(Op1) && (CC == ISD::SETEQ || CC == ISD::SETNE))
25235 if (SDValue CmpZ =
25236 MatchVectorAllZeroTest(Op0, CC, dl, Subtarget, DAG, X86CC))
25237 return CmpZ;
25238
25239 // Try to lower using KORTEST or KTEST.
25240 if (SDValue Test = EmitAVX512Test(Op0, Op1, CC, dl, DAG, Subtarget, X86CC))
25241 return Test;
25242
25243 // Look for X == 0, X == 1, X != 0, or X != 1. We can simplify some forms of
25244 // these.
25245 if ((isOneConstant(Op1) || isNullConstant(Op1)) &&
25246 (CC == ISD::SETEQ || CC == ISD::SETNE)) {
25247 // If the input is a setcc, then reuse the input setcc or use a new one with
25248 // the inverted condition.
25249 if (Op0.getOpcode() == X86ISD::SETCC) {
25250 bool Invert = (CC == ISD::SETNE) ^ isNullConstant(Op1);
25251
25252 X86CC = Op0.getOperand(0);
25253 if (Invert) {
25254 X86::CondCode CCode = (X86::CondCode)Op0.getConstantOperandVal(0);
25255 CCode = X86::GetOppositeBranchCondition(CCode);
25256 X86CC = DAG.getTargetConstant(CCode, dl, MVT::i8);
25257 }
25258
25259 return Op0.getOperand(1);
25260 }
25261 }
25262
25263   // Try to use the carry flag from the add in place of a separate CMP for:
25264 // (seteq (add X, -1), -1). Similar for setne.
25265 if (isAllOnesConstant(Op1) && Op0.getOpcode() == ISD::ADD &&
25266 Op0.getOperand(1) == Op1 && (CC == ISD::SETEQ || CC == ISD::SETNE)) {
25267 if (isProfitableToUseFlagOp(Op0)) {
25268 SDVTList VTs = DAG.getVTList(Op0.getValueType(), MVT::i32);
25269
25270 SDValue New = DAG.getNode(X86ISD::ADD, dl, VTs, Op0.getOperand(0),
25271 Op0.getOperand(1));
25272 DAG.ReplaceAllUsesOfValueWith(SDValue(Op0.getNode(), 0), New);
25273 X86::CondCode CCode = CC == ISD::SETEQ ? X86::COND_AE : X86::COND_B;
25274 X86CC = DAG.getTargetConstant(CCode, dl, MVT::i8);
25275 return SDValue(New.getNode(), 1);
25276 }
25277 }
25278
25279 X86::CondCode CondCode =
25280 TranslateX86CC(CC, dl, /*IsFP*/ false, Op0, Op1, DAG);
25281   assert(CondCode != X86::COND_INVALID && "Unexpected condition code!");
25282
25283 SDValue EFLAGS = EmitCmp(Op0, Op1, CondCode, dl, DAG, Subtarget);
25284 X86CC = DAG.getTargetConstant(CondCode, dl, MVT::i8);
25285 return EFLAGS;
25286}
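
The flag-reuse case near the end of emitFlagsForSetcc works because adding -1 to X produces a carry out exactly when X != 0, so (X + -1) == -1, i.e. X == 0, can be answered from the carry flag (COND_AE) without a separate CMP. A scalar sketch of that carry behaviour; illustrative only.

#include <cstdint>

// Illustrative sketch: carry out of X + 0xFFFFFFFF is set iff X != 0.
constexpr bool carryOfAddMinusOne(uint32_t X) {
  uint64_t Wide = (uint64_t)X + 0xFFFFFFFFull; // X + (-1) as an unsigned add
  return (Wide >> 32) != 0;                    // the carry bit
}
static_assert(carryOfAddMinusOne(0) == false, "");  // X == 0 -> carry clear (AE)
static_assert(carryOfAddMinusOne(1) == true, "");
static_assert(carryOfAddMinusOne(0xFFFFFFFFu) == true, "");
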
25287
25288SDValue X86TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
25289
25290 bool IsStrict = Op.getOpcode() == ISD::STRICT_FSETCC ||
25291 Op.getOpcode() == ISD::STRICT_FSETCCS;
25292 MVT VT = Op->getSimpleValueType(0);
25293
25294 if (VT.isVector()) return LowerVSETCC(Op, Subtarget, DAG);
25295
25296   assert(VT == MVT::i8 && "SetCC type must be 8-bit integer");
25297 SDValue Chain = IsStrict ? Op.getOperand(0) : SDValue();
25298 SDValue Op0 = Op.getOperand(IsStrict ? 1 : 0);
25299 SDValue Op1 = Op.getOperand(IsStrict ? 2 : 1);
25300 SDLoc dl(Op);
25301 ISD::CondCode CC =
25302 cast<CondCodeSDNode>(Op.getOperand(IsStrict ? 3 : 2))->get();
25303
25304 if (isSoftFP16(Op0.getValueType()))
25305 return SDValue();
25306
25307 // Handle f128 first, since one possible outcome is a normal integer
25308 // comparison which gets handled by emitFlagsForSetcc.
25309 if (Op0.getValueType() == MVT::f128) {
25310 softenSetCCOperands(DAG, MVT::f128, Op0, Op1, CC, dl, Op0, Op1, Chain,
25311 Op.getOpcode() == ISD::STRICT_FSETCCS);
25312
25313 // If softenSetCCOperands returned a scalar, use it.
25314 if (!Op1.getNode()) {
25315       assert(Op0.getValueType() == Op.getValueType() &&
25316              "Unexpected setcc expansion!");
25317 if (IsStrict)
25318 return DAG.getMergeValues({Op0, Chain}, dl);
25319 return Op0;
25320 }
25321 }
25322
25323 if (Op0.getSimpleValueType().isInteger()) {
25324     // Attempt to canonicalize SGT/UGT -> SGE/UGE compares with a constant, which
25325     // reduces the number of EFLAGS bit reads (the GE conditions don't read ZF);
25326     // this may translate to fewer uops depending on the uarch implementation. The
25327 // equivalent for SLE/ULE -> SLT/ULT isn't likely to happen as we already
25328 // canonicalize to that CondCode.
25329 // NOTE: Only do this if incrementing the constant doesn't increase the bit
25330 // encoding size - so it must either already be a i8 or i32 immediate, or it
25331     // encoding size - so it must either already be an i8 or i32 immediate, or it
25332 // constant materializations.
25333 // TODO: Can we move this to TranslateX86CC to handle jumps/branches too?
25334 if (auto *Op1C = dyn_cast<ConstantSDNode>(Op1)) {
25335 const APInt &Op1Val = Op1C->getAPIntValue();
25336 if (!Op1Val.isZero()) {
25337 // Ensure the constant+1 doesn't overflow.
25338 if ((CC == ISD::CondCode::SETGT && !Op1Val.isMaxSignedValue()) ||
25339 (CC == ISD::CondCode::SETUGT && !Op1Val.isMaxValue())) {
25340 APInt Op1ValPlusOne = Op1Val + 1;
25341 if (Op1ValPlusOne.isSignedIntN(32) &&
25342 (!Op1Val.isSignedIntN(8) || Op1ValPlusOne.isSignedIntN(8))) {
25343 Op1 = DAG.getConstant(Op1ValPlusOne, dl, Op0.getValueType());
25344 CC = CC == ISD::CondCode::SETGT ? ISD::CondCode::SETGE
25345 : ISD::CondCode::SETUGE;
25346 }
25347 }
25348 }
25349 }
25350
25351 SDValue X86CC;
25352 SDValue EFLAGS = emitFlagsForSetcc(Op0, Op1, CC, dl, DAG, X86CC);
25353 SDValue Res = DAG.getNode(X86ISD::SETCC, dl, MVT::i8, X86CC, EFLAGS);
25354 return IsStrict ? DAG.getMergeValues({Res, Chain}, dl) : Res;
25355 }
25356
25357 // Handle floating point.
25358 X86::CondCode CondCode = TranslateX86CC(CC, dl, /*IsFP*/ true, Op0, Op1, DAG);
25359 if (CondCode == X86::COND_INVALID)
25360 return SDValue();
25361
25362 SDValue EFLAGS;
25363 if (IsStrict) {
25364 bool IsSignaling = Op.getOpcode() == ISD::STRICT_FSETCCS;
25365 EFLAGS =
25366 DAG.getNode(IsSignaling ? X86ISD::STRICT_FCMPS : X86ISD::STRICT_FCMP,
25367 dl, {MVT::i32, MVT::Other}, {Chain, Op0, Op1});
25368 Chain = EFLAGS.getValue(1);
25369 } else {
25370 EFLAGS = DAG.getNode(X86ISD::FCMP, dl, MVT::i32, Op0, Op1);
25371 }
25372
25373 SDValue X86CC = DAG.getTargetConstant(CondCode, dl, MVT::i8);
25374 SDValue Res = DAG.getNode(X86ISD::SETCC, dl, MVT::i8, X86CC, EFLAGS);
25375 return IsStrict ? DAG.getMergeValues({Res, Chain}, dl) : Res;
25376}
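
The canonicalization in the integer path of LowerSETCC relies on x > C and x >= C + 1 agreeing whenever C is not the maximum representable value, which is exactly what the isMaxSignedValue/isMaxValue guards check before bumping the constant. A scalar sketch of the signed case; illustrative only, with the helper name chosen for the example.

#include <cstdint>

// Illustrative sketch: x > C  <=>  x >= C + 1, provided C != INT32_MAX.
constexpr bool gtAsGe(int32_t X, int32_t C) {
  return X >= C + 1;
}
static_assert(gtAsGe(5, 4) == (5 > 4), "");
static_assert(gtAsGe(4, 4) == (4 > 4), "");
static_assert(gtAsGe(-7, -3) == (-7 > -3), "");
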
25377
25378SDValue X86TargetLowering::LowerSETCCCARRY(SDValue Op, SelectionDAG &DAG) const {
25379 SDValue LHS = Op.getOperand(0);
25380 SDValue RHS = Op.getOperand(1);
25381 SDValue Carry = Op.getOperand(2);
25382 SDValue Cond = Op.getOperand(3);
25383 SDLoc DL(Op);
25384
25385   assert(LHS.getSimpleValueType().isInteger() && "SETCCCARRY is integer only.");
25386 X86::CondCode CC = TranslateIntegerX86CC(cast<CondCodeSDNode>(Cond)->get());
25387
25388 // Recreate the carry if needed.
25389 EVT CarryVT = Carry.getValueType();
25390 Carry = DAG.getNode(X86ISD::ADD, DL, DAG.getVTList(CarryVT, MVT::i32),
25391 Carry, DAG.getAllOnesConstant(DL, CarryVT));
25392
25393 SDVTList VTs = DAG.getVTList(LHS.getValueType(), MVT::i32);
25394 SDValue Cmp = DAG.getNode(X86ISD::SBB, DL, VTs, LHS, RHS, Carry.getValue(1));
25395 return getSETCC(CC, Cmp.getValue(1), DL, DAG);
25396}
25397
25398// This function returns three things: the arithmetic computation itself
25399// (Value), an EFLAGS result (Overflow), and a condition code (Cond). The
25400// flag and the condition code define the case in which the arithmetic
25401// computation overflows.
25402static std::pair<SDValue, SDValue>
25403getX86XALUOOp(X86::CondCode &Cond, SDValue Op, SelectionDAG &DAG) {
25404   assert(Op.getResNo() == 0 && "Unexpected result number!");
25405 SDValue Value, Overflow;
25406 SDValue LHS = Op.getOperand(0);
25407 SDValue RHS = Op.getOperand(1);
25408 unsigned BaseOp = 0;
25409 SDLoc DL(Op);
25410 switch (Op.getOpcode()) {
25411   default: llvm_unreachable("Unknown ovf instruction!");
25412 case ISD::SADDO:
25413 BaseOp = X86ISD::ADD;
25414 Cond = X86::COND_O;
25415 break;
25416 case ISD::UADDO:
25417 BaseOp = X86ISD::ADD;
25418 Cond = isOneConstant(RHS) ? X86::COND_E : X86::COND_B;
25419 break;
25420 case ISD::SSUBO:
25421 BaseOp = X86ISD::SUB;
25422 Cond = X86::COND_O;
25423 break;
25424 case ISD::USUBO:
25425 BaseOp = X86ISD::SUB;
25426 Cond = X86::COND_B;
25427 break;
25428 case ISD::SMULO:
25429 BaseOp = X86ISD::SMUL;
25430 Cond = X86::COND_O;
25431 break;
25432 case ISD::UMULO:
25433 BaseOp = X86ISD::UMUL;
25434 Cond = X86::COND_O;
25435 break;
25436 }
25437
25438 if (BaseOp) {
25439 // Also sets EFLAGS.
25440 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
25441 Value = DAG.getNode(BaseOp, DL, VTs, LHS, RHS);
25442 Overflow = Value.getValue(1);
25443 }
25444
25445 return std::make_pair(Value, Overflow);
25446}
25447
25448static SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG) {
25449 // Lower the "add/sub/mul with overflow" instruction into a regular ins plus
25450 // a "setcc" instruction that checks the overflow flag. The "brcond" lowering
25451 // looks for this combo and may remove the "setcc" instruction if the "setcc"
25452 // has only one use.
25453 SDLoc DL(Op);
25454 X86::CondCode Cond;
25455 SDValue Value, Overflow;
25456 std::tie(Value, Overflow) = getX86XALUOOp(Cond, Op, DAG);
25457
25458 SDValue SetCC = getSETCC(Cond, Overflow, DL, DAG);
25459   assert(Op->getValueType(1) == MVT::i8 && "Unexpected VT!");
25460 return DAG.getNode(ISD::MERGE_VALUES, DL, Op->getVTList(), Value, SetCC);
25461}
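
LowerXALUO hands back the arithmetic result together with an overflow bit derived from EFLAGS, merged as a two-value node. A rough standalone analogue for 32-bit signed addition, using the GCC/Clang __builtin_add_overflow intrinsic rather than anything from SelectionDAG; the helper name is hypothetical and the code is illustrative only.

#include <cstdint>
#include <utility>

// Illustrative sketch: value plus overflow flag, as SADDO produces.
inline std::pair<int32_t, bool> saddo32(int32_t A, int32_t B) {
  int32_t Sum = 0;
  bool Ovf = __builtin_add_overflow(A, B, &Sum);  // GCC/Clang builtin
  return {Sum, Ovf};
}
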
25462
25463/// Return true if opcode is a X86 logical comparison.
25464static bool isX86LogicalCmp(SDValue Op) {
25465 unsigned Opc = Op.getOpcode();
25466 if (Opc == X86ISD::CMP || Opc == X86ISD::COMI || Opc == X86ISD::UCOMI ||
25467 Opc == X86ISD::FCMP)
25468 return true;
25469 if (Op.getResNo() == 1 &&
25470 (Opc == X86ISD::ADD || Opc == X86ISD::SUB || Opc == X86ISD::ADC ||
25471 Opc == X86ISD::SBB || Opc == X86ISD::SMUL || Opc == X86ISD::UMUL ||
25472 Opc == X86ISD::OR || Opc == X86ISD::XOR || Opc == X86ISD::AND))
25473 return true;
25474
25475 return false;
25476}
25477
25478static bool isTruncWithZeroHighBitsInput(SDValue V, SelectionDAG &DAG) {
25479 if (V.getOpcode() != ISD::TRUNCATE)
25480 return false;
25481
25482 SDValue VOp0 = V.getOperand(0);
25483 unsigned InBits = VOp0.getValueSizeInBits();
25484 unsigned Bits = V.getValueSizeInBits();
25485 return DAG.MaskedValueIsZero(VOp0, APInt::getHighBitsSet(InBits,InBits-Bits));
25486}
25487
25488SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
25489 bool AddTest = true;
25490 SDValue Cond = Op.getOperand(0);
25491 SDValue Op1 = Op.getOperand(1);
25492 SDValue Op2 = Op.getOperand(2);
25493 SDLoc DL(Op);
25494 MVT VT = Op1.getSimpleValueType();
25495 SDValue CC;
25496
25497 if (isSoftFP16(VT)) {
25498 MVT NVT = VT.changeTypeToInteger();
25499 return DAG.getBitcast(VT, DAG.getNode(ISD::SELECT, DL, NVT, Cond,
25500 DAG.getBitcast(NVT, Op1),
25501 DAG.getBitcast(NVT, Op2)));
25502 }
25503
25504 // Lower FP selects into a CMP/AND/ANDN/OR sequence when the necessary SSE ops
25505 // are available or VBLENDV if AVX is available.
25506 // Otherwise FP cmovs get lowered into a less efficient branch sequence later.
25507 if (Cond.getOpcode() == ISD::SETCC && isScalarFPTypeInSSEReg(VT) &&
25508 VT == Cond.getOperand(0).getSimpleValueType() && Cond->hasOneUse()) {
25509 SDValue CondOp0 = Cond.getOperand(0), CondOp1 = Cond.getOperand(1);
25510 bool IsAlwaysSignaling;
25511 unsigned SSECC =
25512 translateX86FSETCC(cast<CondCodeSDNode>(Cond.getOperand(2))->get(),
25513 CondOp0, CondOp1, IsAlwaysSignaling);
25514
25515 if (Subtarget.hasAVX512()) {
25516 SDValue Cmp =
25517 DAG.getNode(X86ISD::FSETCCM, DL, MVT::v1i1, CondOp0, CondOp1,
25518 DAG.getTargetConstant(SSECC, DL, MVT::i8));
25519       assert(!VT.isVector() && "Not a scalar type?");
25520 return DAG.getNode(X86ISD::SELECTS, DL, VT, Cmp, Op1, Op2);
25521 }
25522
25523 if (SSECC < 8 || Subtarget.hasAVX()) {
25524 SDValue Cmp = DAG.getNode(X86ISD::FSETCC, DL, VT, CondOp0, CondOp1,
25525 DAG.getTargetConstant(SSECC, DL, MVT::i8));
25526
25527 // If we have AVX, we can use a variable vector select (VBLENDV) instead
25528 // of 3 logic instructions for size savings and potentially speed.
25529 // Unfortunately, there is no scalar form of VBLENDV.
25530
25531 // If either operand is a +0.0 constant, don't try this. We can expect to
25532 // optimize away at least one of the logic instructions later in that
25533 // case, so that sequence would be faster than a variable blend.
25534
25535 // BLENDV was introduced with SSE 4.1, but the 2 register form implicitly
25536 // uses XMM0 as the selection register. That may need just as many
25537 // instructions as the AND/ANDN/OR sequence due to register moves, so
25538 // don't bother.
25539 if (Subtarget.hasAVX() && !isNullFPConstant(Op1) &&
25540 !isNullFPConstant(Op2)) {
25541 // Convert to vectors, do a VSELECT, and convert back to scalar.
25542 // All of the conversions should be optimized away.
25543 MVT VecVT = VT == MVT::f32 ? MVT::v4f32 : MVT::v2f64;
25544 SDValue VOp1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Op1);
25545 SDValue VOp2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Op2);
25546 SDValue VCmp = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Cmp);
25547
25548 MVT VCmpVT = VT == MVT::f32 ? MVT::v4i32 : MVT::v2i64;
25549 VCmp = DAG.getBitcast(VCmpVT, VCmp);
25550
25551 SDValue VSel = DAG.getSelect(DL, VecVT, VCmp, VOp1, VOp2);
25552
25553 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT,
25554 VSel, DAG.getIntPtrConstant(0, DL));
25555 }
25556 SDValue AndN = DAG.getNode(X86ISD::FANDN, DL, VT, Cmp, Op2);
25557 SDValue And = DAG.getNode(X86ISD::FAND, DL, VT, Cmp, Op1);
25558 return DAG.getNode(X86ISD::FOR, DL, VT, AndN, And);
25559 }
25560 }
25561
25562 // AVX512 fallback is to lower selects of scalar floats to masked moves.
25563 if (isScalarFPTypeInSSEReg(VT) && Subtarget.hasAVX512()) {
25564 SDValue Cmp = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v1i1, Cond);
25565 return DAG.getNode(X86ISD::SELECTS, DL, VT, Cmp, Op1, Op2);
25566 }
25567
25568 if (Cond.getOpcode() == ISD::SETCC &&
25569 !isSoftFP16(Cond.getOperand(0).getSimpleValueType())) {
25570 if (SDValue NewCond = LowerSETCC(Cond, DAG)) {
25571 Cond = NewCond;
25572 // If the condition was updated, it's possible that the operands of the
25573 // select were also updated (for example, EmitTest has a RAUW). Refresh
25574 // the local references to the select operands in case they got stale.
25575 Op1 = Op.getOperand(1);
25576 Op2 = Op.getOperand(2);
25577 }
25578 }
25579
25580 // (select (x == 0), -1, y) -> (sign_bit (x - 1)) | y
25581 // (select (x == 0), y, -1) -> ~(sign_bit (x - 1)) | y
25582 // (select (x != 0), y, -1) -> (sign_bit (x - 1)) | y
25583 // (select (x != 0), -1, y) -> ~(sign_bit (x - 1)) | y
25584 // (select (and (x , 0x1) == 0), y, (z ^ y) ) -> (-(and (x , 0x1)) & z ) ^ y
25585 // (select (and (x , 0x1) == 0), y, (z | y) ) -> (-(and (x , 0x1)) & z ) | y
25586 // (select (x > 0), x, 0) -> (~(x >> (size_in_bits(x)-1))) & x
25587 // (select (x < 0), x, 0) -> ((x >> (size_in_bits(x)-1))) & x
25588 if (Cond.getOpcode() == X86ISD::SETCC &&
25589 Cond.getOperand(1).getOpcode() == X86ISD::CMP &&
25590 isNullConstant(Cond.getOperand(1).getOperand(1))) {
25591 SDValue Cmp = Cond.getOperand(1);
25592 SDValue CmpOp0 = Cmp.getOperand(0);
25593 unsigned CondCode = Cond.getConstantOperandVal(0);
25594
25595 // Special handling for __builtin_ffs(X) - 1 pattern which looks like
25596 // (select (seteq X, 0), -1, (cttz_zero_undef X)). Disable the special
25597     // handling to keep the CMP with 0. This should be removed by
25598 // optimizeCompareInst by using the flags from the BSR/TZCNT used for the
25599 // cttz_zero_undef.
25600 auto MatchFFSMinus1 = [&](SDValue Op1, SDValue Op2) {
25601 return (Op1.getOpcode() == ISD::CTTZ_ZERO_UNDEF && Op1.hasOneUse() &&
25602 Op1.getOperand(0) == CmpOp0 && isAllOnesConstant(Op2));
25603 };
25604 if (Subtarget.canUseCMOV() && (VT == MVT::i32 || VT == MVT::i64) &&
25605 ((CondCode == X86::COND_NE && MatchFFSMinus1(Op1, Op2)) ||
25606 (CondCode == X86::COND_E && MatchFFSMinus1(Op2, Op1)))) {
25607 // Keep Cmp.
25608 } else if ((isAllOnesConstant(Op1) || isAllOnesConstant(Op2)) &&
25609 (CondCode == X86::COND_E || CondCode == X86::COND_NE)) {
25610 SDValue Y = isAllOnesConstant(Op2) ? Op1 : Op2;
25611 SDVTList CmpVTs = DAG.getVTList(CmpOp0.getValueType(), MVT::i32);
25612
25613 // 'X - 1' sets the carry flag if X == 0.
25614 // '0 - X' sets the carry flag if X != 0.
25615 // Convert the carry flag to a -1/0 mask with sbb:
25616 // select (X != 0), -1, Y --> 0 - X; or (sbb), Y
25617 // select (X == 0), Y, -1 --> 0 - X; or (sbb), Y
25618 // select (X != 0), Y, -1 --> X - 1; or (sbb), Y
25619 // select (X == 0), -1, Y --> X - 1; or (sbb), Y
25620 SDValue Sub;
25621 if (isAllOnesConstant(Op1) == (CondCode == X86::COND_NE)) {
25622 SDValue Zero = DAG.getConstant(0, DL, CmpOp0.getValueType());
25623 Sub = DAG.getNode(X86ISD::SUB, DL, CmpVTs, Zero, CmpOp0);
25624 } else {
25625 SDValue One = DAG.getConstant(1, DL, CmpOp0.getValueType());
25626 Sub = DAG.getNode(X86ISD::SUB, DL, CmpVTs, CmpOp0, One);
25627 }
25628 SDValue SBB = DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
25629 DAG.getTargetConstant(X86::COND_B, DL, MVT::i8),
25630 Sub.getValue(1));
25631 return DAG.getNode(ISD::OR, DL, VT, SBB, Y);
25632 } else if (!Subtarget.canUseCMOV() && CondCode == X86::COND_E &&
25633 CmpOp0.getOpcode() == ISD::AND &&
25634 isOneConstant(CmpOp0.getOperand(1))) {
25635 SDValue Src1, Src2;
25636       // true if Op2 is an XOR or OR operator and one of its operands
25637 // is equal to Op1
25638 // ( a , a op b) || ( b , a op b)
25639 auto isOrXorPattern = [&]() {
25640 if ((Op2.getOpcode() == ISD::XOR || Op2.getOpcode() == ISD::OR) &&
25641 (Op2.getOperand(0) == Op1 || Op2.getOperand(1) == Op1)) {
25642 Src1 =
25643 Op2.getOperand(0) == Op1 ? Op2.getOperand(1) : Op2.getOperand(0);
25644 Src2 = Op1;
25645 return true;
25646 }
25647 return false;
25648 };
25649
25650 if (isOrXorPattern()) {
25651 SDValue Neg;
25652 unsigned int CmpSz = CmpOp0.getSimpleValueType().getSizeInBits();
25653         // We need a mask of all zeros or all ones with the same size as the
25654         // other operands.
25655 if (CmpSz > VT.getSizeInBits())
25656 Neg = DAG.getNode(ISD::TRUNCATE, DL, VT, CmpOp0);
25657 else if (CmpSz < VT.getSizeInBits())
25658 Neg = DAG.getNode(ISD::AND, DL, VT,
25659 DAG.getNode(ISD::ANY_EXTEND, DL, VT, CmpOp0.getOperand(0)),
25660 DAG.getConstant(1, DL, VT));
25661 else
25662 Neg = CmpOp0;
25663 SDValue Mask = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT),
25664 Neg); // -(and (x, 0x1))
25665 SDValue And = DAG.getNode(ISD::AND, DL, VT, Mask, Src1); // Mask & z
25666 return DAG.getNode(Op2.getOpcode(), DL, VT, And, Src2); // And Op y
25667 }
25668 } else if ((VT == MVT::i32 || VT == MVT::i64) && isNullConstant(Op2) &&
25669 Cmp.getNode()->hasOneUse() && (CmpOp0 == Op1) &&
25670 ((CondCode == X86::COND_S) || // smin(x, 0)
25671 (CondCode == X86::COND_G && hasAndNot(Op1)))) { // smax(x, 0)
25672 // (select (x < 0), x, 0) -> ((x >> (size_in_bits(x)-1))) & x
25673 //
25674 // If the comparison is testing for a positive value, we have to invert
25675 // the sign bit mask, so only do that transform if the target has a
25676 // bitwise 'and not' instruction (the invert is free).
25677 // (select (x > 0), x, 0) -> (~(x >> (size_in_bits(x)-1))) & x
25678 unsigned ShCt = VT.getSizeInBits() - 1;
25679 SDValue ShiftAmt = DAG.getConstant(ShCt, DL, VT);
25680 SDValue Shift = DAG.getNode(ISD::SRA, DL, VT, Op1, ShiftAmt);
25681 if (CondCode == X86::COND_G)
25682 Shift = DAG.getNOT(DL, Shift, VT);
25683 return DAG.getNode(ISD::AND, DL, VT, Shift, Op1);
25684 }
25685 }
25686
25687 // Look past (and (setcc_carry (cmp ...)), 1).
25688 if (Cond.getOpcode() == ISD::AND &&
25689 Cond.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY &&
25690 isOneConstant(Cond.getOperand(1)))
25691 Cond = Cond.getOperand(0);
25692
25693 // If condition flag is set by a X86ISD::CMP, then use it as the condition
25694 // setting operand in place of the X86ISD::SETCC.
25695 unsigned CondOpcode = Cond.getOpcode();
25696 if (CondOpcode == X86ISD::SETCC ||
25697 CondOpcode == X86ISD::SETCC_CARRY) {
25698 CC = Cond.getOperand(0);
25699
25700 SDValue Cmp = Cond.getOperand(1);
25701 bool IllegalFPCMov = false;
25702 if (VT.isFloatingPoint() && !VT.isVector() &&
25703 !isScalarFPTypeInSSEReg(VT) && Subtarget.canUseCMOV()) // FPStack?
25704 IllegalFPCMov = !hasFPCMov(cast<ConstantSDNode>(CC)->getSExtValue());
25705
25706 if ((isX86LogicalCmp(Cmp) && !IllegalFPCMov) ||
25707 Cmp.getOpcode() == X86ISD::BT) { // FIXME
25708 Cond = Cmp;
25709 AddTest = false;
25710 }
25711 } else if (CondOpcode == ISD::USUBO || CondOpcode == ISD::SSUBO ||
25712 CondOpcode == ISD::UADDO || CondOpcode == ISD::SADDO ||
25713 CondOpcode == ISD::UMULO || CondOpcode == ISD::SMULO) {
25714 SDValue Value;
25715 X86::CondCode X86Cond;
25716 std::tie(Value, Cond) = getX86XALUOOp(X86Cond, Cond.getValue(0), DAG);
25717
25718 CC = DAG.getTargetConstant(X86Cond, DL, MVT::i8);
25719 AddTest = false;
25720 }
25721
25722 if (AddTest) {
25723 // Look past the truncate if the high bits are known zero.
25724 if (isTruncWithZeroHighBitsInput(Cond, DAG))
25725 Cond = Cond.getOperand(0);
25726
25727 // We know the result of AND is compared against zero. Try to match
25728 // it to BT.
25729 if (Cond.getOpcode() == ISD::AND && Cond.hasOneUse()) {
25730 X86::CondCode X86CondCode;
25731 if (SDValue BT = LowerAndToBT(Cond, ISD::SETNE, DL, DAG, X86CondCode)) {
25732 CC = DAG.getTargetConstant(X86CondCode, DL, MVT::i8);
25733 Cond = BT;
25734 AddTest = false;
25735 }
25736 }
25737 }
25738
25739 if (AddTest) {
25740 CC = DAG.getTargetConstant(X86::COND_NE, DL, MVT::i8);
25741 Cond = EmitTest(Cond, X86::COND_NE, DL, DAG, Subtarget);
25742 }
25743
25744 // a < b ? -1 : 0 -> RES = ~setcc_carry
25745 // a < b ? 0 : -1 -> RES = setcc_carry
25746 // a >= b ? -1 : 0 -> RES = setcc_carry
25747 // a >= b ? 0 : -1 -> RES = ~setcc_carry
25748 if (Cond.getOpcode() == X86ISD::SUB) {
25749 unsigned CondCode = cast<ConstantSDNode>(CC)->getZExtValue();
25750
25751 if ((CondCode == X86::COND_AE || CondCode == X86::COND_B) &&
25752 (isAllOnesConstant(Op1) || isAllOnesConstant(Op2)) &&
25753 (isNullConstant(Op1) || isNullConstant(Op2))) {
25754 SDValue Res =
25755 DAG.getNode(X86ISD::SETCC_CARRY, DL, Op.getValueType(),
25756 DAG.getTargetConstant(X86::COND_B, DL, MVT::i8), Cond);
25757 if (isAllOnesConstant(Op1) != (CondCode == X86::COND_B))
25758 return DAG.getNOT(DL, Res, Res.getValueType());
25759 return Res;
25760 }
25761 }
25762
25763 // X86 doesn't have an i8 cmov. If both operands are the result of a truncate
25764 // widen the cmov and push the truncate through. This avoids introducing a new
25765 // branch during isel and doesn't add any extensions.
25766 if (Op.getValueType() == MVT::i8 &&
25767 Op1.getOpcode() == ISD::TRUNCATE && Op2.getOpcode() == ISD::TRUNCATE) {
25768 SDValue T1 = Op1.getOperand(0), T2 = Op2.getOperand(0);
25769 if (T1.getValueType() == T2.getValueType() &&
25770 // Exclude CopyFromReg to avoid partial register stalls.
25771 T1.getOpcode() != ISD::CopyFromReg && T2.getOpcode()!=ISD::CopyFromReg){
25772 SDValue Cmov = DAG.getNode(X86ISD::CMOV, DL, T1.getValueType(), T2, T1,
25773 CC, Cond);
25774 return DAG.getNode(ISD::TRUNCATE, DL, Op.getValueType(), Cmov);
25775 }
25776 }
25777
25778 // Or finally, promote i8 cmovs if we have CMOV,
25779 // or i16 cmovs if it won't prevent folding a load.
25780 // FIXME: we should not limit promotion of i8 case to only when the CMOV is
25781 // legal, but EmitLoweredSelect() can not deal with these extensions
25782 // being inserted between two CMOV's. (in i16 case too TBN)
25783 // https://bugs.llvm.org/show_bug.cgi?id=40974
25784 if ((Op.getValueType() == MVT::i8 && Subtarget.canUseCMOV()) ||
25785 (Op.getValueType() == MVT::i16 && !X86::mayFoldLoad(Op1, Subtarget) &&
25786 !X86::mayFoldLoad(Op2, Subtarget))) {
25787 Op1 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Op1);
25788 Op2 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Op2);
25789 SDValue Ops[] = { Op2, Op1, CC, Cond };
25790 SDValue Cmov = DAG.getNode(X86ISD::CMOV, DL, MVT::i32, Ops);
25791 return DAG.getNode(ISD::TRUNCATE, DL, Op.getValueType(), Cmov);
25792 }
25793
25794 // X86ISD::CMOV means set the result (which is operand 1) to the RHS if
25795 // condition is true.
25796 SDValue Ops[] = { Op2, Op1, CC, Cond };
25797 return DAG.getNode(X86ISD::CMOV, DL, Op.getValueType(), Ops);
25798}
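
The SBB-based path in LowerSELECT turns (x == 0) ? -1 : y into pure arithmetic: X - 1 borrows exactly when X == 0, SBB sign-extends that borrow into a 0/-1 mask, and an OR with y completes the select without a branch or CMOV. A scalar sketch of that sequence; illustrative only, not LLVM code.

#include <cstdint>

// Illustrative sketch: (x == 0) ? ~0u : y via borrow -> mask -> OR.
constexpr uint32_t selectAllOnesIfZero(uint32_t X, uint32_t Y) {
  uint32_t Borrow = (X < 1) ? 1u : 0u;  // CF of "X - 1"
  uint32_t Mask = 0u - Borrow;          // SBB: 0 or 0xFFFFFFFF
  return Mask | Y;                      // -1 when X == 0, else Y
}
static_assert(selectAllOnesIfZero(0, 123) == 0xFFFFFFFFu, "");
static_assert(selectAllOnesIfZero(7, 123) == 123u, "");
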
25799
25800static SDValue LowerSIGN_EXTEND_Mask(SDValue Op,
25801 const X86Subtarget &Subtarget,
25802 SelectionDAG &DAG) {
25803 MVT VT = Op->getSimpleValueType(0);
25804 SDValue In = Op->getOperand(0);
25805 MVT InVT = In.getSimpleValueType();
25806   assert(InVT.getVectorElementType() == MVT::i1 && "Unexpected input type!");
25807 MVT VTElt = VT.getVectorElementType();
25808 SDLoc dl(Op);
25809
25810 unsigned NumElts = VT.getVectorNumElements();
25811
25812 // Extend VT if the scalar type is i8/i16 and BWI is not supported.
25813 MVT ExtVT = VT;
25814 if (!Subtarget.hasBWI() && VTElt.getSizeInBits() <= 16) {
25815 // If v16i32 is to be avoided, we'll need to split and concatenate.
25816 if (NumElts == 16 && !Subtarget.canExtendTo512DQ())
25817 return SplitAndExtendv16i1(Op.getOpcode(), VT, In, dl, DAG);
25818
25819 ExtVT = MVT::getVectorVT(MVT::i32, NumElts);
25820 }
25821
25822 // Widen to 512-bits if VLX is not supported.
25823 MVT WideVT = ExtVT;
25824 if (!ExtVT.is512BitVector() && !Subtarget.hasVLX()) {
25825 NumElts *= 512 / ExtVT.getSizeInBits();
25826 InVT = MVT::getVectorVT(MVT::i1, NumElts);
25827 In = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, InVT, DAG.getUNDEF(InVT),
25828 In, DAG.getIntPtrConstant(0, dl));
25829 WideVT = MVT::getVectorVT(ExtVT.getVectorElementType(), NumElts);
25830 }
25831
25832 SDValue V;
25833 MVT WideEltVT = WideVT.getVectorElementType();
25834 if ((Subtarget.hasDQI() && WideEltVT.getSizeInBits() >= 32) ||
25835 (Subtarget.hasBWI() && WideEltVT.getSizeInBits() <= 16)) {
25836 V = DAG.getNode(Op.getOpcode(), dl, WideVT, In);
25837 } else {
25838 SDValue NegOne = DAG.getConstant(-1, dl, WideVT);
25839 SDValue Zero = DAG.getConstant(0, dl, WideVT);
25840 V = DAG.getSelect(dl, WideVT, In, NegOne, Zero);
25841 }
25842
25843 // Truncate if we had to extend i16/i8 above.
25844 if (VT != ExtVT) {
25845 WideVT = MVT::getVectorVT(VTElt, NumElts);
25846 V = DAG.getNode(ISD::TRUNCATE, dl, WideVT, V);
25847 }
25848
25849 // Extract back to 128/256-bit if we widened.
25850 if (WideVT != VT)
25851 V = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, V,
25852 DAG.getIntPtrConstant(0, dl));
25853
25854 return V;
25855}
25856
25857static SDValue LowerANY_EXTEND(SDValue Op, const X86Subtarget &Subtarget,
25858 SelectionDAG &DAG) {
25859 SDValue In = Op->getOperand(0);
25860 MVT InVT = In.getSimpleValueType();
25861
25862 if (InVT.getVectorElementType() == MVT::i1)
25863 return LowerSIGN_EXTEND_Mask(Op, Subtarget, DAG);
25864
25865 assert(Subtarget.hasAVX() && "Expected AVX support");
25866 return LowerAVXExtend(Op, DAG, Subtarget);
25867}
25868
25869// Lowering for SIGN_EXTEND_VECTOR_INREG and ZERO_EXTEND_VECTOR_INREG.
25870// For sign extend this needs to handle all vector sizes and SSE4.1 and
25871// non-SSE4.1 targets. For zero extend this should only handle inputs of
25872// MVT::v64i8 when BWI is not supported, but AVX512 is.
25873static SDValue LowerEXTEND_VECTOR_INREG(SDValue Op,
25874 const X86Subtarget &Subtarget,
25875 SelectionDAG &DAG) {
25876 SDValue In = Op->getOperand(0);
25877 MVT VT = Op->getSimpleValueType(0);
25878 MVT InVT = In.getSimpleValueType();
25879
25880 MVT SVT = VT.getVectorElementType();
25881 MVT InSVT = InVT.getVectorElementType();
25882 assert(SVT.getFixedSizeInBits() > InSVT.getFixedSizeInBits());
25883
25884 if (SVT != MVT::i64 && SVT != MVT::i32 && SVT != MVT::i16)
25885 return SDValue();
25886 if (InSVT != MVT::i32 && InSVT != MVT::i16 && InSVT != MVT::i8)
25887 return SDValue();
25888 if (!(VT.is128BitVector() && Subtarget.hasSSE2()) &&
25889 !(VT.is256BitVector() && Subtarget.hasAVX()) &&
25890 !(VT.is512BitVector() && Subtarget.hasAVX512()))
25891 return SDValue();
25892
25893 SDLoc dl(Op);
25894 unsigned Opc = Op.getOpcode();
25895 unsigned NumElts = VT.getVectorNumElements();
25896
25897 // For 256-bit vectors, we only need the lower (128-bit) half of the input.
25898 // For 512-bit vectors, we need 128-bits or 256-bits.
25899 if (InVT.getSizeInBits() > 128) {
25900 // Input needs to be at least the same number of elements as output, and
25901 // at least 128-bits.
25902 int InSize = InSVT.getSizeInBits() * NumElts;
25903 In = extractSubVector(In, 0, DAG, dl, std::max(InSize, 128));
25904 InVT = In.getSimpleValueType();
25905 }
25906
25907 // SSE41 targets can use the pmov[sz]x* instructions directly for 128-bit
25908 // results, so those are legal and shouldn't occur here. AVX2/AVX512 pmovsx*
25909 // instructions still need to be handled here for 256/512-bit results.
25910 if (Subtarget.hasInt256()) {
25911 assert(VT.getSizeInBits() > 128 && "Unexpected 128-bit vector extension");
25912
25913 if (InVT.getVectorNumElements() != NumElts)
25914 return DAG.getNode(Op.getOpcode(), dl, VT, In);
25915
25916 // FIXME: Apparently we create inreg operations that could be regular
25917 // extends.
25918 unsigned ExtOpc =
25919 Opc == ISD::SIGN_EXTEND_VECTOR_INREG ? ISD::SIGN_EXTEND
25920 : ISD::ZERO_EXTEND;
25921 return DAG.getNode(ExtOpc, dl, VT, In);
25922 }
25923
25924 // pre-AVX2 256-bit extensions need to be split into 128-bit instructions.
25925 if (Subtarget.hasAVX()) {
25926 assert(VT.is256BitVector() && "256-bit vector expected");
25927 MVT HalfVT = VT.getHalfNumVectorElementsVT();
25928 int HalfNumElts = HalfVT.getVectorNumElements();
25929
25930 unsigned NumSrcElts = InVT.getVectorNumElements();
25931 SmallVector<int, 16> HiMask(NumSrcElts, SM_SentinelUndef);
25932 for (int i = 0; i != HalfNumElts; ++i)
25933 HiMask[i] = HalfNumElts + i;
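// E.g. for a v16i8 input and v8i32 result, HalfNumElts is 4 and HiMask starts
// with {4, 5, 6, 7}, moving source elements 4-7 into the low lane before they
// are extended.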
25934
25935 SDValue Lo = DAG.getNode(Opc, dl, HalfVT, In);
25936 SDValue Hi = DAG.getVectorShuffle(InVT, dl, In, DAG.getUNDEF(InVT), HiMask);
25937 Hi = DAG.getNode(Opc, dl, HalfVT, Hi);
25938 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
25939 }
25940
25941 // We should only get here for sign extend.
25942 assert(Opc == ISD::SIGN_EXTEND_VECTOR_INREG && "Unexpected opcode!");
25943 assert(VT.is128BitVector() && InVT.is128BitVector() && "Unexpected VTs");
25944
25945 // pre-SSE41 targets unpack lower lanes and then sign-extend using SRAI.
25946 SDValue Curr = In;
25947 SDValue SignExt = Curr;
25948
25949 // As SRAI is only available on i16/i32 types, we expand only up to i32
25950 // and handle i64 separately.
25951 if (InVT != MVT::v4i32) {
25952 MVT DestVT = VT == MVT::v2i64 ? MVT::v4i32 : VT;
25953
25954 unsigned DestWidth = DestVT.getScalarSizeInBits();
25955 unsigned Scale = DestWidth / InSVT.getSizeInBits();
25956
25957 unsigned InNumElts = InVT.getVectorNumElements();
25958 unsigned DestElts = DestVT.getVectorNumElements();
25959
25960 // Build a shuffle mask that takes each input element and places it in the
25961 // MSBs of the new element size.
25962 SmallVector<int, 16> Mask(InNumElts, SM_SentinelUndef);
25963 for (unsigned i = 0; i != DestElts; ++i)
25964 Mask[i * Scale + (Scale - 1)] = i;
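// E.g. for v16i8 -> v4i32, Scale is 4 and the mask is
// {-1,-1,-1,0, -1,-1,-1,1, -1,-1,-1,2, -1,-1,-1,3}, placing each source byte
// in the top byte of a 32-bit lane before the arithmetic shift right by 24 below.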
25965
25966 Curr = DAG.getVectorShuffle(InVT, dl, In, In, Mask);
25967 Curr = DAG.getBitcast(DestVT, Curr);
25968
25969 unsigned SignExtShift = DestWidth - InSVT.getSizeInBits();
25970 SignExt = DAG.getNode(X86ISD::VSRAI, dl, DestVT, Curr,
25971 DAG.getTargetConstant(SignExtShift, dl, MVT::i8));
25972 }
25973
25974 if (VT == MVT::v2i64) {
25975 assert(Curr.getValueType() == MVT::v4i32 && "Unexpected input VT");
25976 SDValue Zero = DAG.getConstant(0, dl, MVT::v4i32);
25977 SDValue Sign = DAG.getSetCC(dl, MVT::v4i32, Zero, Curr, ISD::SETGT);
25978 SignExt = DAG.getVectorShuffle(MVT::v4i32, dl, SignExt, Sign, {0, 4, 1, 5});
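// The {0, 4, 1, 5} mask interleaves each 32-bit value with its computed sign
// word, so every i64 lane ends up with the value in its low half and the sign
// bits in its high half.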
25979 SignExt = DAG.getBitcast(VT, SignExt);
25980 }
25981
25982 return SignExt;
25983}
25984
25985static SDValue LowerSIGN_EXTEND(SDValue Op, const X86Subtarget &Subtarget,
25986 SelectionDAG &DAG) {
25987 MVT VT = Op->getSimpleValueType(0);
25988 SDValue In = Op->getOperand(0);
25989 MVT InVT = In.getSimpleValueType();
25990 SDLoc dl(Op);
25991
25992 if (InVT.getVectorElementType() == MVT::i1)
25993 return LowerSIGN_EXTEND_Mask(Op, Subtarget, DAG);
25994
25995 assert(VT.isVector() && InVT.isVector() && "Expected vector type");
25996 assert(VT.getVectorNumElements() == InVT.getVectorNumElements() &&
25997        "Expected same number of elements");
25998 assert((VT.getVectorElementType() == MVT::i16 ||
25999         VT.getVectorElementType() == MVT::i32 ||
26000         VT.getVectorElementType() == MVT::i64) &&
26001        "Unexpected element type");
26002 assert((InVT.getVectorElementType() == MVT::i8 ||
26003         InVT.getVectorElementType() == MVT::i16 ||
26004         InVT.getVectorElementType() == MVT::i32) &&
26005        "Unexpected element type");
26006
26007 if (VT == MVT::v32i16 && !Subtarget.hasBWI()) {
26008 assert(InVT == MVT::v32i8 && "Unexpected VT!");
26009 return splitVectorIntUnary(Op, DAG);
26010 }
26011
26012 if (Subtarget.hasInt256())
26013 return Op;
26014
26015 // Optimize vectors in AVX mode:
26016 // sign extend v8i16 to v8i32 and
26017 // v4i32 to v4i64.
26018 //
26019 // Divide the input vector into two parts;
26020 // for v4i32 the high shuffle mask will be {2, 3, -1, -1}.
26021 // Use the vpmovsx instruction to extend v4i32 -> v2i64 and v8i16 -> v4i32,
26022 // then concat the vectors back to the original VT.
26023 MVT HalfVT = VT.getHalfNumVectorElementsVT();
26024 SDValue OpLo = DAG.getNode(ISD::SIGN_EXTEND_VECTOR_INREG, dl, HalfVT, In);
26025
26026 unsigned NumElems = InVT.getVectorNumElements();
26027 SmallVector<int,8> ShufMask(NumElems, -1);
26028 for (unsigned i = 0; i != NumElems/2; ++i)
26029 ShufMask[i] = i + NumElems/2;
26030
26031 SDValue OpHi = DAG.getVectorShuffle(InVT, dl, In, In, ShufMask);
26032 OpHi = DAG.getNode(ISD::SIGN_EXTEND_VECTOR_INREG, dl, HalfVT, OpHi);
26033
26034 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpHi);
26035}
26036
26037/// Change a vector store into a pair of half-size vector stores.
26038static SDValue splitVectorStore(StoreSDNode *Store, SelectionDAG &DAG) {
26039 SDValue StoredVal = Store->getValue();
26040 assert((StoredVal.getValueType().is256BitVector() ||
26041         StoredVal.getValueType().is512BitVector()) &&
26042        "Expecting 256/512-bit op");
26043
26044 // Splitting volatile memory ops is not allowed unless the operation was not
26045 // legal to begin with. Assume the input store is legal (this transform is
26046 // only used for targets with AVX). Note: It is possible that we have an
26047 // illegal type like v2i128, and so we could allow splitting a volatile store
26048 // in that case if that is important.
26049 if (!Store->isSimple())
26050 return SDValue();
26051
26052 SDLoc DL(Store);
26053 SDValue Value0, Value1;
26054 std::tie(Value0, Value1) = splitVector(StoredVal, DAG, DL);
26055 unsigned HalfOffset = Value0.getValueType().getStoreSize();
26056 SDValue Ptr0 = Store->getBasePtr();
26057 SDValue Ptr1 =
26058 DAG.getMemBasePlusOffset(Ptr0, TypeSize::Fixed(HalfOffset), DL);
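// E.g. a 256-bit store at Ptr becomes two 128-bit stores, at Ptr and at
// Ptr + 16, joined by the TokenFactor below.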
26059 SDValue Ch0 =
26060 DAG.getStore(Store->getChain(), DL, Value0, Ptr0, Store->getPointerInfo(),
26061 Store->getOriginalAlign(),
26062 Store->getMemOperand()->getFlags());
26063 SDValue Ch1 = DAG.getStore(Store->getChain(), DL, Value1, Ptr1,
26064 Store->getPointerInfo().getWithOffset(HalfOffset),
26065 Store->getOriginalAlign(),
26066 Store->getMemOperand()->getFlags());
26067 return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Ch0, Ch1);
26068}
26069
26070/// Scalarize a vector store, bitcasting to TargetVT to determine the scalar
26071/// type.
26072static SDValue scalarizeVectorStore(StoreSDNode *Store, MVT StoreVT,
26073 SelectionDAG &DAG) {
26074 SDValue StoredVal = Store->getValue();
26075 assert(StoreVT.is128BitVector() &&
26076        StoredVal.getValueType().is128BitVector() && "Expecting 128-bit op");
26077 StoredVal = DAG.getBitcast(StoreVT, StoredVal);
26078
26079 // Splitting volatile memory ops is not allowed unless the operation was not
26080 // legal to begin with. We are assuming the input op is legal (this transform
26081 // is only used for targets with AVX).
26082 if (!Store->isSimple())
26083 return SDValue();
26084
26085 MVT StoreSVT = StoreVT.getScalarType();
26086 unsigned NumElems = StoreVT.getVectorNumElements();
26087 unsigned ScalarSize = StoreSVT.getStoreSize();
26088
26089 SDLoc DL(Store);
26090 SmallVector<SDValue, 4> Stores;
26091 for (unsigned i = 0; i != NumElems; ++i) {
26092 unsigned Offset = i * ScalarSize;
26093 SDValue Ptr = DAG.getMemBasePlusOffset(Store->getBasePtr(),
26094 TypeSize::Fixed(Offset), DL);
26095 SDValue Scl = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, StoreSVT, StoredVal,
26096 DAG.getIntPtrConstant(i, DL));
26097 SDValue Ch = DAG.getStore(Store->getChain(), DL, Scl, Ptr,
26098 Store->getPointerInfo().getWithOffset(Offset),
26099 Store->getOriginalAlign(),
26100 Store->getMemOperand()->getFlags());
26101 Stores.push_back(Ch);
26102 }
26103 return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Stores);
26104}
26105
26106static SDValue LowerStore(SDValue Op, const X86Subtarget &Subtarget,
26107 SelectionDAG &DAG) {
26108 StoreSDNode *St = cast<StoreSDNode>(Op.getNode());
26109 SDLoc dl(St);
26110 SDValue StoredVal = St->getValue();
26111
26112 // Without AVX512DQ, we need to use a scalar type for v2i1/v4i1/v8i1 stores.
26113 if (StoredVal.getValueType().isVector() &&
26114 StoredVal.getValueType().getVectorElementType() == MVT::i1) {
26115 unsigned NumElts = StoredVal.getValueType().getVectorNumElements();
26116 assert(NumElts <= 8 && "Unexpected VT");
26117 assert(!St->isTruncatingStore() && "Expected non-truncating store");
26118 assert(Subtarget.hasAVX512() && !Subtarget.hasDQI() &&
26119        "Expected AVX512F without AVX512DQI");
26120
26121 // We must pad with zeros to ensure we store zeroes to any unused bits.
26122 StoredVal = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v16i1,
26123 DAG.getUNDEF(MVT::v16i1), StoredVal,
26124 DAG.getIntPtrConstant(0, dl));
26125 StoredVal = DAG.getBitcast(MVT::i16, StoredVal);
26126 StoredVal = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, StoredVal);
26127 // Make sure we store zeros in the extra bits.
26128 if (NumElts < 8)
26129 StoredVal = DAG.getZeroExtendInReg(
26130 StoredVal, dl, EVT::getIntegerVT(*DAG.getContext(), NumElts));
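// E.g. a v4i1 store becomes: insert into v16i1, bitcast to i16, truncate to
// i8, clear bits 4-7, then store a single byte.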
26131
26132 return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(),
26133 St->getPointerInfo(), St->getOriginalAlign(),
26134 St->getMemOperand()->getFlags());
26135 }
26136
26137 if (St->isTruncatingStore())
26138 return SDValue();
26139
26140 // If this is a 256-bit store of concatenated ops, we are better off splitting
26141 // that store into two 128-bit stores. This avoids spurious use of 256-bit ops
26142 // and each half can execute independently. Some cores would split the op into
26143 // halves anyway, so the concat (vinsertf128) is purely an extra op.
26144 MVT StoreVT = StoredVal.getSimpleValueType();
26145 if (StoreVT.is256BitVector() ||
26146 ((StoreVT == MVT::v32i16 || StoreVT == MVT::v64i8) &&
26147 !Subtarget.hasBWI())) {
26148 SmallVector<SDValue, 4> CatOps;
26149 if (StoredVal.hasOneUse() &&
26150 collectConcatOps(StoredVal.getNode(), CatOps, DAG))
26151 return splitVectorStore(St, DAG);
26152 return SDValue();
26153 }
26154
26155 if (StoreVT.is32BitVector())
26156 return SDValue();
26157
26158 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
26159 assert(StoreVT.is64BitVector() && "Unexpected VT");
26160 assert(TLI.getTypeAction(*DAG.getContext(), StoreVT) ==
26161            TargetLowering::TypeWidenVector &&
26162        "Unexpected type action!");
26163
26164 EVT WideVT = TLI.getTypeToTransformTo(*DAG.getContext(), StoreVT);
26165 StoredVal = DAG.getNode(ISD::CONCAT_VECTORS, dl, WideVT, StoredVal,
26166 DAG.getUNDEF(StoreVT));
26167
26168 if (Subtarget.hasSSE2()) {
26169 // Widen the vector, cast to a v2x64 type, extract the single 64-bit element
26170 // and store it.
26171 MVT StVT = Subtarget.is64Bit() && StoreVT.isInteger() ? MVT::i64 : MVT::f64;
26172 MVT CastVT = MVT::getVectorVT(StVT, 2);
26173 StoredVal = DAG.getBitcast(CastVT, StoredVal);
26174 StoredVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, StVT, StoredVal,
26175 DAG.getIntPtrConstant(0, dl));
26176
26177 return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(),
26178 St->getPointerInfo(), St->getOriginalAlign(),
26179 St->getMemOperand()->getFlags());
26180 }
26181 assert(Subtarget.hasSSE1() && "Expected SSE");
26182 SDVTList Tys = DAG.getVTList(MVT::Other);
26183 SDValue Ops[] = {St->getChain(), StoredVal, St->getBasePtr()};
26184 return DAG.getMemIntrinsicNode(X86ISD::VEXTRACT_STORE, dl, Tys, Ops, MVT::i64,
26185 St->getMemOperand());
26186}
26187
26188// Lower vector extended loads using a shuffle. If SSSE3 is not available we
26189// may emit an illegal shuffle but the expansion is still better than scalar
26190// code. We generate sext/sext_invec for SEXTLOADs if it's available, otherwise
26191 // we'll emit a shuffle and an arithmetic shift.
26192// FIXME: Is the expansion actually better than scalar code? It doesn't seem so.
26193// TODO: It is possible to support ZExt by zeroing the undef values during
26194// the shuffle phase or after the shuffle.
26195static SDValue LowerLoad(SDValue Op, const X86Subtarget &Subtarget,
26196 SelectionDAG &DAG) {
26197 MVT RegVT = Op.getSimpleValueType();
26198 assert(RegVT.isVector() && "We only custom lower vector loads.");
26199 assert(RegVT.isInteger() &&
26200        "We only custom lower integer vector loads.");
26201
26202 LoadSDNode *Ld = cast<LoadSDNode>(Op.getNode());
26203 SDLoc dl(Ld);
26204
26205 // Without AVX512DQ, we need to use a scalar type for v2i1/v4i1/v8i1 loads.
26206 if (RegVT.getVectorElementType() == MVT::i1) {
26207 assert(EVT(RegVT) == Ld->getMemoryVT() && "Expected non-extending load");
26208 assert(RegVT.getVectorNumElements() <= 8 && "Unexpected VT");
26209 assert(Subtarget.hasAVX512() && !Subtarget.hasDQI() &&
26210        "Expected AVX512F without AVX512DQI");
26211
26212 SDValue NewLd = DAG.getLoad(MVT::i8, dl, Ld->getChain(), Ld->getBasePtr(),
26213 Ld->getPointerInfo(), Ld->getOriginalAlign(),
26214 Ld->getMemOperand()->getFlags());
26215
26216 // Replace chain users with the new chain.
26217 assert(NewLd->getNumValues() == 2 && "Loads must carry a chain!");
26218
26219 SDValue Val = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i16, NewLd);
26220 Val = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, RegVT,
26221 DAG.getBitcast(MVT::v16i1, Val),
26222 DAG.getIntPtrConstant(0, dl));
26223 return DAG.getMergeValues({Val, NewLd.getValue(1)}, dl);
26224 }
26225
26226 return SDValue();
26227}
26228
26229/// Return true if node is an ISD::AND or ISD::OR of two X86ISD::SETCC nodes
26230/// each of which has no other use apart from the AND / OR.
26231static bool isAndOrOfSetCCs(SDValue Op, unsigned &Opc) {
26232 Opc = Op.getOpcode();
26233 if (Opc != ISD::OR && Opc != ISD::AND)
26234 return false;
26235 return (Op.getOperand(0).getOpcode() == X86ISD::SETCC &&
26236 Op.getOperand(0).hasOneUse() &&
26237 Op.getOperand(1).getOpcode() == X86ISD::SETCC &&
26238 Op.getOperand(1).hasOneUse());
26239}
26240
26241SDValue X86TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const {
26242 SDValue Chain = Op.getOperand(0);
26243 SDValue Cond = Op.getOperand(1);
26244 SDValue Dest = Op.getOperand(2);
26245 SDLoc dl(Op);
26246
26247 // Bail out when we don't have native compare instructions.
26248 if (Cond.getOpcode() == ISD::SETCC &&
26249 Cond.getOperand(0).getValueType() != MVT::f128 &&
26250 !isSoftFP16(Cond.getOperand(0).getValueType())) {
26251 SDValue LHS = Cond.getOperand(0);
26252 SDValue RHS = Cond.getOperand(1);
26253 ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
26254
26255 // Special case for
26256 // setcc([su]{add,sub,mul}o == 0)
26257 // setcc([su]{add,sub,mul}o != 1)
26258 if (ISD::isOverflowIntrOpRes(LHS) &&
26259 (CC == ISD::SETEQ || CC == ISD::SETNE) &&
26260 (isNullConstant(RHS) || isOneConstant(RHS))) {
26261 SDValue Value, Overflow;
26262 X86::CondCode X86Cond;
26263 std::tie(Value, Overflow) = getX86XALUOOp(X86Cond, LHS.getValue(0), DAG);
26264
26265 if ((CC == ISD::SETEQ) == isNullConstant(RHS))
26266 X86Cond = X86::GetOppositeBranchCondition(X86Cond);
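// i.e. the condition is inverted exactly when the setcc tests for the
// no-overflow case ('o == 0' or 'o != 1').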
26267
26268 SDValue CCVal = DAG.getTargetConstant(X86Cond, dl, MVT::i8);
26269 return DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
26270 Overflow);
26271 }
26272
26273 if (LHS.getSimpleValueType().isInteger()) {
26274 SDValue CCVal;
26275 SDValue EFLAGS = emitFlagsForSetcc(LHS, RHS, CC, SDLoc(Cond), DAG, CCVal);
26276 return DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
26277 EFLAGS);
26278 }
26279
26280 if (CC == ISD::SETOEQ) {
26281 // For FCMP_OEQ, we can emit
26282 // two branches instead of an explicit AND instruction with a
26283 // separate test. However, we only do this if this block doesn't
26284 // have a fall-through edge, because this requires an explicit
26285 // jmp when the condition is false.
26286 if (Op.getNode()->hasOneUse()) {
26287 SDNode *User = *Op.getNode()->use_begin();
26288 // Look for an unconditional branch following this conditional branch.
26289 // We need this because we need to reverse the successors in order
26290 // to implement FCMP_OEQ.
26291 if (User->getOpcode() == ISD::BR) {
26292 SDValue FalseBB = User->getOperand(1);
26293 SDNode *NewBR =
26294 DAG.UpdateNodeOperands(User, User->getOperand(0), Dest);
26295 assert(NewBR == User);
26296 (void)NewBR;
26297 Dest = FalseBB;
26298
26299 SDValue Cmp =
26300 DAG.getNode(X86ISD::FCMP, SDLoc(Cond), MVT::i32, LHS, RHS);
26301 SDValue CCVal = DAG.getTargetConstant(X86::COND_NE, dl, MVT::i8);
26302 Chain = DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest,
26303 CCVal, Cmp);
26304 CCVal = DAG.getTargetConstant(X86::COND_P, dl, MVT::i8);
26305 return DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
26306 Cmp);
26307 }
26308 }
26309 } else if (CC == ISD::SETUNE) {
26310 // For FCMP_UNE, we can emit
26311 // two branches instead of an explicit OR instruction with a
26312 // separate test.
26313 SDValue Cmp = DAG.getNode(X86ISD::FCMP, SDLoc(Cond), MVT::i32, LHS, RHS);
26314 SDValue CCVal = DAG.getTargetConstant(X86::COND_NE, dl, MVT::i8);
26315 Chain =
26316 DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal, Cmp);
26317 CCVal = DAG.getTargetConstant(X86::COND_P, dl, MVT::i8);
26318 return DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
26319 Cmp);
26320 } else {
26321 X86::CondCode X86Cond =
26322 TranslateX86CC(CC, dl, /*IsFP*/ true, LHS, RHS, DAG);
26323 SDValue Cmp = DAG.getNode(X86ISD::FCMP, SDLoc(Cond), MVT::i32, LHS, RHS);
26324 SDValue CCVal = DAG.getTargetConstant(X86Cond, dl, MVT::i8);
26325 return DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
26326 Cmp);
26327 }
26328 }
26329
26330 if (ISD::isOverflowIntrOpRes(Cond)) {
26331 SDValue Value, Overflow;
26332 X86::CondCode X86Cond;
26333 std::tie(Value, Overflow) = getX86XALUOOp(X86Cond, Cond.getValue(0), DAG);
26334
26335 SDValue CCVal = DAG.getTargetConstant(X86Cond, dl, MVT::i8);
26336 return DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
26337 Overflow);
26338 }
26339
26340 // Look past the truncate if the high bits are known zero.
26341 if (isTruncWithZeroHighBitsInput(Cond, DAG))
26342 Cond = Cond.getOperand(0);
26343
26344 EVT CondVT = Cond.getValueType();
26345
26346 // Add an AND with 1 if we don't already have one.
26347 if (!(Cond.getOpcode() == ISD::AND && isOneConstant(Cond.getOperand(1))))
26348 Cond =
26349 DAG.getNode(ISD::AND, dl, CondVT, Cond, DAG.getConstant(1, dl, CondVT));
26350
26351 SDValue LHS = Cond;
26352 SDValue RHS = DAG.getConstant(0, dl, CondVT);
26353
26354 SDValue CCVal;
26355 SDValue EFLAGS = emitFlagsForSetcc(LHS, RHS, ISD::SETNE, dl, DAG, CCVal);
26356 return DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
26357 EFLAGS);
26358}
26359
26360// Lower dynamic stack allocation to _alloca call for Cygwin/Mingw targets.
26361// Calls to _alloca are needed to probe the stack when allocating more than 4k
26362// bytes in one go. Touching the stack at 4K increments is necessary to ensure
26363// that the guard pages used by the OS virtual memory manager are allocated in
26364 // the correct sequence.
26365SDValue
26366X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
26367 SelectionDAG &DAG) const {
26368 MachineFunction &MF = DAG.getMachineFunction();
26369 bool SplitStack = MF.shouldSplitStack();
26370 bool EmitStackProbeCall = hasStackProbeSymbol(MF);
26371 bool Lower = (Subtarget.isOSWindows() && !Subtarget.isTargetMachO()) ||
26372 SplitStack || EmitStackProbeCall;
26373 SDLoc dl(Op);
26374
26375 // Get the inputs.
26376 SDNode *Node = Op.getNode();
26377 SDValue Chain = Op.getOperand(0);
26378 SDValue Size = Op.getOperand(1);
26379 MaybeAlign Alignment(Op.getConstantOperandVal(2));
26380 EVT VT = Node->getValueType(0);
26381
26382 // Chain the dynamic stack allocation so that it doesn't modify the stack
26383 // pointer when other instructions are using the stack.
26384 Chain = DAG.getCALLSEQ_START(Chain, 0, 0, dl);
26385
26386 bool Is64Bit = Subtarget.is64Bit();
26387 MVT SPTy = getPointerTy(DAG.getDataLayout());
26388
26389 SDValue Result;
26390 if (!Lower) {
26391 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
26392 Register SPReg = TLI.getStackPointerRegisterToSaveRestore();
26393 assert(SPReg && "Target cannot require DYNAMIC_STACKALLOC expansion and"(static_cast <bool> (SPReg && "Target cannot require DYNAMIC_STACKALLOC expansion and"
" not tell us which reg is the stack pointer!") ? void (0) :
__assert_fail ("SPReg && \"Target cannot require DYNAMIC_STACKALLOC expansion and\" \" not tell us which reg is the stack pointer!\""
, "llvm/lib/Target/X86/X86ISelLowering.cpp", 26394, __extension__
__PRETTY_FUNCTION__))
26394 " not tell us which reg is the stack pointer!")(static_cast <bool> (SPReg && "Target cannot require DYNAMIC_STACKALLOC expansion and"
" not tell us which reg is the stack pointer!") ? void (0) :
__assert_fail ("SPReg && \"Target cannot require DYNAMIC_STACKALLOC expansion and\" \" not tell us which reg is the stack pointer!\""
, "llvm/lib/Target/X86/X86ISelLowering.cpp", 26394, __extension__
__PRETTY_FUNCTION__))
;
26395
26396 const TargetFrameLowering &TFI = *Subtarget.getFrameLowering();
26397 const Align StackAlign = TFI.getStackAlign();
26398 if (hasInlineStackProbe(MF)) {
26399 MachineRegisterInfo &MRI = MF.getRegInfo();
26400
26401 const TargetRegisterClass *AddrRegClass = getRegClassFor(SPTy);
26402 Register Vreg = MRI.createVirtualRegister(AddrRegClass);
26403 Chain = DAG.getCopyToReg(Chain, dl, Vreg, Size);
26404 Result = DAG.getNode(X86ISD::PROBED_ALLOCA, dl, SPTy, Chain,
26405 DAG.getRegister(Vreg, SPTy));
26406 } else {
26407 SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, VT);
26408 Chain = SP.getValue(1);
26409 Result = DAG.getNode(ISD::SUB, dl, VT, SP, Size); // Value
26410 }
26411 if (Alignment && *Alignment > StackAlign)
26412 Result =
26413 DAG.getNode(ISD::AND, dl, VT, Result,
26414 DAG.getConstant(~(Alignment->value() - 1ULL), dl, VT));
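// E.g. an alignment of 32 applies the mask ~31ULL, rounding the new stack
// pointer down to a 32-byte boundary.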
26415 Chain = DAG.getCopyToReg(Chain, dl, SPReg, Result); // Output chain
26416 } else if (SplitStack) {
26417 MachineRegisterInfo &MRI = MF.getRegInfo();
26418
26419 if (Is64Bit) {
26420 // The 64 bit implementation of segmented stacks needs to clobber both r10
26421 // and r11. This makes it impossible to use it along with nested parameters.
26422 const Function &F = MF.getFunction();
26423 for (const auto &A : F.args()) {
26424 if (A.hasNestAttr())
26425 report_fatal_error("Cannot use segmented stacks with functions that "
26426 "have nested arguments.");
26427 }
26428 }
26429
26430 const TargetRegisterClass *AddrRegClass = getRegClassFor(SPTy);
26431 Register Vreg = MRI.createVirtualRegister(AddrRegClass);
26432 Chain = DAG.getCopyToReg(Chain, dl, Vreg, Size);
26433 Result = DAG.getNode(X86ISD::SEG_ALLOCA, dl, SPTy, Chain,
26434 DAG.getRegister(Vreg, SPTy));
26435 } else {
26436 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
26437 Chain = DAG.getNode(X86ISD::DYN_ALLOCA, dl, NodeTys, Chain, Size);
26438 MF.getInfo<X86MachineFunctionInfo>()->setHasDynAlloca(true);
26439
26440 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
26441 Register SPReg = RegInfo->getStackRegister();
26442 SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, SPTy);
26443 Chain = SP.getValue(1);
26444
26445 if (Alignment) {
26446 SP = DAG.getNode(ISD::AND, dl, VT, SP.getValue(0),
26447 DAG.getConstant(~(Alignment->value() - 1ULL), dl, VT));
26448 Chain = DAG.getCopyToReg(Chain, dl, SPReg, SP);
26449 }
26450
26451 Result = SP;
26452 }
26453
26454 Chain = DAG.getCALLSEQ_END(Chain, 0, 0, SDValue(), dl);
26455
26456 SDValue Ops[2] = {Result, Chain};
26457 return DAG.getMergeValues(Ops, dl);
26458}
26459
26460SDValue X86TargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const {
26461 MachineFunction &MF = DAG.getMachineFunction();
26462 auto PtrVT = getPointerTy(MF.getDataLayout());
26463 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
26464
26465 const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
26466 SDLoc DL(Op);
26467
26468 if (!Subtarget.is64Bit() ||
26469 Subtarget.isCallingConvWin64(MF.getFunction().getCallingConv())) {
26470 // vastart just stores the address of the VarArgsFrameIndex slot into the
26471 // memory location argument.
26472 SDValue FR = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT);
26473 return DAG.getStore(Op.getOperand(0), DL, FR, Op.getOperand(1),
26474 MachinePointerInfo(SV));
26475 }
26476
26477 // __va_list_tag:
26478 // gp_offset (0 - 6 * 8)
26479 // fp_offset (48 - 48 + 8 * 16)
26480 // overflow_arg_area (points to parameters passed in memory).
26481 // reg_save_area
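// Spelled out, the layout materialized below corresponds to:
//   struct __va_list_tag {
//     unsigned gp_offset;       // offset 0
//     unsigned fp_offset;       // offset 4
//     void *overflow_arg_area;  // offset 8
//     void *reg_save_area;      // offset 16 (LP64) or 12 (x32)
//   };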
26482 SmallVector<SDValue, 8> MemOps;
26483 SDValue FIN = Op.getOperand(1);
26484 // Store gp_offset
26485 SDValue Store = DAG.getStore(
26486 Op.getOperand(0), DL,
26487 DAG.getConstant(FuncInfo->getVarArgsGPOffset(), DL, MVT::i32), FIN,
26488 MachinePointerInfo(SV));
26489 MemOps.push_back(Store);
26490
26491 // Store fp_offset
26492 FIN = DAG.getMemBasePlusOffset(FIN, TypeSize::Fixed(4), DL);
26493 Store = DAG.getStore(
26494 Op.getOperand(0), DL,
26495 DAG.getConstant(FuncInfo->getVarArgsFPOffset(), DL, MVT::i32), FIN,
26496 MachinePointerInfo(SV, 4));
26497 MemOps.push_back(Store);
26498
26499 // Store ptr to overflow_arg_area
26500 FIN = DAG.getNode(ISD::ADD, DL, PtrVT, FIN, DAG.getIntPtrConstant(4, DL));
26501 SDValue OVFIN = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT);
26502 Store =
26503 DAG.getStore(Op.getOperand(0), DL, OVFIN, FIN, MachinePointerInfo(SV, 8));
26504 MemOps.push_back(Store);
26505
26506 // Store ptr to reg_save_area.
26507 FIN = DAG.getNode(ISD::ADD, DL, PtrVT, FIN, DAG.getIntPtrConstant(
26508 Subtarget.isTarget64BitLP64() ? 8 : 4, DL));
26509 SDValue RSFIN = DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(), PtrVT);
26510 Store = DAG.getStore(
26511 Op.getOperand(0), DL, RSFIN, FIN,
26512 MachinePointerInfo(SV, Subtarget.isTarget64BitLP64() ? 16 : 12));
26513 MemOps.push_back(Store);
26514 return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOps);
26515}
26516
26517SDValue X86TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const {
26518 assert(Subtarget.is64Bit() &&
26519        "LowerVAARG only handles 64-bit va_arg!");
26520 assert(Op.getNumOperands() == 4);
26521
26522 MachineFunction &MF = DAG.getMachineFunction();
26523 if (Subtarget.isCallingConvWin64(MF.getFunction().getCallingConv()))
26524 // The Win64 ABI uses char* instead of a structure.
26525 return DAG.expandVAArg(Op.getNode());
26526
26527 SDValue Chain = Op.getOperand(0);
26528 SDValue SrcPtr = Op.getOperand(1);
26529 const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
26530 unsigned Align = Op.getConstantOperandVal(3);
26531 SDLoc dl(Op);
26532
26533 EVT ArgVT = Op.getNode()->getValueType(0);
26534 Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
26535 uint32_t ArgSize = DAG.getDataLayout().getTypeAllocSize(ArgTy);
26536 uint8_t ArgMode;
26537
26538 // Decide which area this value should be read from.
26539 // TODO: Implement the AMD64 ABI in its entirety. This simple
26540 // selection mechanism works only for the basic types.
26541 assert(ArgVT != MVT::f80 && "va_arg for f80 not yet implemented");
26542 if (ArgVT.isFloatingPoint() && ArgSize <= 16 /*bytes*/) {
26543 ArgMode = 2; // Argument passed in XMM register. Use fp_offset.
26544 } else {
26545 assert(ArgVT.isInteger() && ArgSize <= 32 /*bytes*/ &&
26546        "Unhandled argument type in LowerVAARG");
26547 ArgMode = 1; // Argument passed in GPR64 register(s). Use gp_offset.
26548 }
26549
26550 if (ArgMode == 2) {
26551 // Make sure using fp_offset makes sense.
26552 assert(!Subtarget.useSoftFloat() &&
26553        !(MF.getFunction().hasFnAttribute(Attribute::NoImplicitFloat)) &&
26554        Subtarget.hasSSE1());
26555 }
26556
26557 // Insert a VAARG node into the DAG.
26558 // VAARG returns two values: the variable argument address and the chain.
26559 SDValue InstOps[] = {Chain, SrcPtr,
26560 DAG.getTargetConstant(ArgSize, dl, MVT::i32),
26561 DAG.getTargetConstant(ArgMode, dl, MVT::i8),
26562 DAG.getTargetConstant(Align, dl, MVT::i32)};
26563 SDVTList VTs = DAG.getVTList(getPointerTy(DAG.getDataLayout()), MVT::Other);
26564 SDValue VAARG = DAG.getMemIntrinsicNode(
26565 Subtarget.isTarget64BitLP64() ? X86ISD::VAARG_64 : X86ISD::VAARG_X32, dl,
26566 VTs, InstOps, MVT::i64, MachinePointerInfo(SV),
26567 /*Alignment=*/std::nullopt,
26568 MachineMemOperand::MOLoad | MachineMemOperand::MOStore);
26569 Chain = VAARG.getValue(1);
26570
26571 // Load the next argument and return it
26572 return DAG.getLoad(ArgVT, dl, Chain, VAARG, MachinePointerInfo());
26573}
26574
26575static SDValue LowerVACOPY(SDValue Op, const X86Subtarget &Subtarget,
26576 SelectionDAG &DAG) {
26577 // X86-64 va_list is a struct { i32, i32, i8*, i8* }, except on Windows,
26578 // where a va_list is still an i8*.
26579 assert(Subtarget.is64Bit() && "This code only handles 64-bit va_copy!")(static_cast <bool> (Subtarget.is64Bit() && "This code only handles 64-bit va_copy!"
) ? void (0) : __assert_fail ("Subtarget.is64Bit() && \"This code only handles 64-bit va_copy!\""
, "llvm/lib/Target/X86/X86ISelLowering.cpp", 26579, __extension__
__PRETTY_FUNCTION__))
;
26580 if (Subtarget.isCallingConvWin64(
26581 DAG.getMachineFunction().getFunction().getCallingConv()))
26582 // Probably a Win64 va_copy.
26583 return DAG.expandVACopy(Op.getNode());
26584
26585 SDValue Chain = Op.getOperand(0);
26586 SDValue DstPtr = Op.getOperand(1);
26587 SDValue SrcPtr = Op.getOperand(2);
26588 const Value *DstSV = cast<SrcValueSDNode>(Op.getOperand(3))->getValue();
26589 const Value *SrcSV = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
26590 SDLoc DL(Op);
26591
26592 return DAG.getMemcpy(
26593 Chain, DL, DstPtr, SrcPtr,
26594 DAG.getIntPtrConstant(Subtarget.isTarget64BitLP64() ? 24 : 16, DL),
26595 Align(Subtarget.isTarget64BitLP64() ? 8 : 4), /*isVolatile*/ false, false,
26596 false, MachinePointerInfo(DstSV), MachinePointerInfo(SrcSV));
26597}
26598
26599// Helper to get immediate/variable SSE shift opcode from other shift opcodes.
26600static unsigned getTargetVShiftUniformOpcode(unsigned Opc, bool IsVariable) {
26601 switch (Opc) {
26602 case ISD::SHL:
26603 case X86ISD::VSHL:
26604 case X86ISD::VSHLI:
26605 return IsVariable ? X86ISD::VSHL : X86ISD::VSHLI;
26606 case ISD::SRL:
26607 case X86ISD::VSRL:
26608 case X86ISD::VSRLI:
26609 return IsVariable ? X86ISD::VSRL : X86ISD::VSRLI;
26610 case ISD::SRA:
26611 case X86ISD::VSRA:
26612 case X86ISD::VSRAI:
26613 return IsVariable ? X86ISD::VSRA : X86ISD::VSRAI;
26614 }
26615 llvm_unreachable("Unknown target vector shift node")::llvm::llvm_unreachable_internal("Unknown target vector shift node"
, "llvm/lib/Target/X86/X86ISelLowering.cpp", 26615)
;
26616}
26617
26618/// Handle vector element shifts where the shift amount is a constant.
26619/// Takes immediate version of shift as input.
26620static SDValue getTargetVShiftByConstNode(unsigned Opc, const SDLoc &dl, MVT VT,
26621 SDValue SrcOp, uint64_t ShiftAmt,
26622 SelectionDAG &DAG) {
26623 MVT ElementType = VT.getVectorElementType();
26624
26625 // Bitcast the source vector to the output type, this is mainly necessary for
26626 // vXi8/vXi64 shifts.
26627 if (VT != SrcOp.getSimpleValueType())
26628 SrcOp = DAG.getBitcast(VT, SrcOp);
26629
26630 // Fold this packed shift into its first operand if ShiftAmt is 0.
26631 if (ShiftAmt == 0)
26632 return SrcOp;
26633
26634 // Check for ShiftAmt >= element width
26635 if (ShiftAmt >= ElementType.getSizeInBits()) {
26636 if (Opc == X86ISD::VSRAI)
26637 ShiftAmt = ElementType.getSizeInBits() - 1;
26638 else
26639 return DAG.getConstant(0, dl, VT);
26640 }
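// E.g. shifting v4i32 elements by 32 or more yields zero for logical shifts,
// while an arithmetic shift is clamped to 31 so each lane keeps its sign fill.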
26641
26642 assert((Opc == X86ISD::VSHLI || Opc == X86ISD::VSRLI || Opc == X86ISD::VSRAI)
26643        && "Unknown target vector shift-by-constant node");
26644
26645 // Fold this packed vector shift into a build vector if SrcOp is a
26646 // vector of Constants or UNDEFs.
26647 if (ISD::isBuildVectorOfConstantSDNodes(SrcOp.getNode())) {
26648 unsigned ShiftOpc;
26649 switch (Opc) {
26650 default: llvm_unreachable("Unknown opcode!");
26651 case X86ISD::VSHLI:
26652 ShiftOpc = ISD::SHL;
26653 break;
26654 case X86ISD::VSRLI:
26655 ShiftOpc = ISD::SRL;
26656 break;
26657 case X86ISD::VSRAI:
26658 ShiftOpc = ISD::SRA;
26659 break;
26660 }
26661
26662 SDValue Amt = DAG.getConstant(ShiftAmt, dl, VT);
26663 if (SDValue C = DAG.FoldConstantArithmetic(ShiftOpc, dl, VT, {SrcOp, Amt}))
26664 return C;
26665 }
26666
26667 return DAG.getNode(Opc, dl, VT, SrcOp,
26668 DAG.getTargetConstant(ShiftAmt, dl, MVT::i8));
26669}
26670
26671/// Handle vector element shifts by a splat shift amount
26672static SDValue getTargetVShiftNode(unsigned Opc, const SDLoc &dl, MVT VT,
26673 SDValue SrcOp, SDValue ShAmt, int ShAmtIdx,
26674 const X86Subtarget &Subtarget,
26675 SelectionDAG &DAG) {
26676 MVT AmtVT = ShAmt.getSimpleValueType();
26677 assert(AmtVT.isVector() && "Vector shift type mismatch");
26678 assert(0 <= ShAmtIdx && ShAmtIdx < (int)AmtVT.getVectorNumElements() &&
26679        "Illegal vector splat index");
26680
26681 // Move the splat element to the bottom element.
26682 if (ShAmtIdx != 0) {
26683 SmallVector<int> Mask(AmtVT.getVectorNumElements(), -1);
26684 Mask[0] = ShAmtIdx;
26685 ShAmt = DAG.getVectorShuffle(AmtVT, dl, ShAmt, DAG.getUNDEF(AmtVT), Mask);
26686 }
26687
26688 // Peek through any zext node if we can get back to a 128-bit source.
26689 if (AmtVT.getScalarSizeInBits() == 64 &&
26690 (ShAmt.getOpcode() == ISD::ZERO_EXTEND ||
26691 ShAmt.getOpcode() == ISD::ZERO_EXTEND_VECTOR_INREG) &&
26692 ShAmt.getOperand(0).getValueType().isSimple() &&
26693 ShAmt.getOperand(0).getValueType().is128BitVector()) {
26694 ShAmt = ShAmt.getOperand(0);
26695 AmtVT = ShAmt.getSimpleValueType();
26696 }
26697
26698 // See if we can mask off the upper elements using the existing source node.
26699 // The shift uses the entire lower 64-bits of the amount vector, so no need to
26700 // do this for vXi64 types.
26701 bool IsMasked = false;
26702 if (AmtVT.getScalarSizeInBits() < 64) {
26703 if (ShAmt.getOpcode() == ISD::BUILD_VECTOR ||
26704 ShAmt.getOpcode() == ISD::SCALAR_TO_VECTOR) {
26705 // If the shift amount has come from a scalar, then zero-extend the scalar
26706 // before moving to the vector.
26707 ShAmt = DAG.getZExtOrTrunc(ShAmt.getOperand(0), dl, MVT::i32);
26708 ShAmt = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, ShAmt);
26709 ShAmt = DAG.getNode(X86ISD::VZEXT_MOVL, dl, MVT::v4i32, ShAmt);
26710 AmtVT = MVT::v4i32;
26711 IsMasked = true;
26712 } else if (ShAmt.getOpcode() == ISD::AND) {
26713 // See if the shift amount is already masked (e.g. for rotation modulo),
26714 // then we can zero-extend it by setting all the other mask elements to
26715 // zero.
26716 SmallVector<SDValue> MaskElts(
26717 AmtVT.getVectorNumElements(),
26718 DAG.getConstant(0, dl, AmtVT.getScalarType()));
26719 MaskElts[0] = DAG.getAllOnesConstant(dl, AmtVT.getScalarType());
26720 SDValue Mask = DAG.getBuildVector(AmtVT, dl, MaskElts);
26721 if ((Mask = DAG.FoldConstantArithmetic(ISD::AND, dl, AmtVT,
26722 {ShAmt.getOperand(1), Mask}))) {
26723 ShAmt = DAG.getNode(ISD::AND, dl, AmtVT, ShAmt.getOperand(0), Mask);
26724 IsMasked = true;
26725 }
26726 }
26727 }
26728
26729 // Extract if the shift amount vector is larger than 128-bits.
26730 if (AmtVT.getSizeInBits() > 128) {
26731 ShAmt = extract128BitVector(ShAmt, 0, DAG, dl);
26732 AmtVT = ShAmt.getSimpleValueType();
26733 }
26734
26735 // Zero-extend bottom element to v2i64 vector type, either by extension or
26736 // shuffle masking.
26737 if (!IsMasked && AmtVT.getScalarSizeInBits() < 64) {
26738 if (AmtVT == MVT::v4i32 && (ShAmt.getOpcode() == X86ISD::VBROADCAST ||
26739 ShAmt.getOpcode() == X86ISD::VBROADCAST_LOAD)) {
26740 ShAmt = DAG.getNode(X86ISD::VZEXT_MOVL, SDLoc(ShAmt), MVT::v4i32, ShAmt);
26741 } else if (Subtarget.hasSSE41()) {
26742 ShAmt = DAG.getNode(ISD::ZERO_EXTEND_VECTOR_INREG, SDLoc(ShAmt),
26743 MVT::v2i64, ShAmt);
26744 } else {
26745 SDValue ByteShift = DAG.getTargetConstant(
26746 (128 - AmtVT.getScalarSizeInBits()) / 8, SDLoc(ShAmt), MVT::i8);
26747 ShAmt = DAG.getBitcast(MVT::v16i8, ShAmt);
26748 ShAmt = DAG.getNode(X86ISD::VSHLDQ, SDLoc(ShAmt), MVT::v16i8, ShAmt,
26749 ByteShift);
26750 ShAmt = DAG.getNode(X86ISD::VSRLDQ, SDLoc(ShAmt), MVT::v16i8, ShAmt,
26751 ByteShift);
26752 }
26753 }
26754
26755 // Change opcode to non-immediate version.
26756 Opc = getTargetVShiftUniformOpcode(Opc, true);
26757
26758 // The return type has to be a 128-bit type with the same element
26759 // type as the input type.
26760 MVT EltVT = VT.getVectorElementType();
26761 MVT ShVT = MVT::getVectorVT(EltVT, 128 / EltVT.getSizeInBits());
26762
26763 ShAmt = DAG.getBitcast(ShVT, ShAmt);
26764 return DAG.getNode(Opc, dl, VT, SrcOp, ShAmt);
26765}
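The pre-SSE4.1 fallback above zero-extends the bottom shift-amount element with a pair of byte shifts: VSHLDQ by (128 - EltBits)/8 bytes followed by VSRLDQ by the same count clears every byte above the low element. A minimal standalone sketch of that byte-count arithmetic (the helper name is illustrative, not an LLVM API):

#include <cassert>

// Illustrative helper: byte count that VSHLDQ/VSRLDQ must use so that only the
// low element of a 128-bit shift-amount vector survives the round trip.
static unsigned zeroExtendByteShift(unsigned EltBits) {
  assert(EltBits < 128 && 128 % EltBits == 0 && "expected a 128-bit vector element");
  return (128 - EltBits) / 8;
}

int main() {
  assert(zeroExtendByteShift(32) == 12); // v4i32 amount: keep the low i32
  assert(zeroExtendByteShift(16) == 14); // v8i16 amount: keep the low i16
  return 0;
}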
26766
26767/// Return Mask with the necessary casting or extending
26768/// for \p Mask according to \p MaskVT when lowering masking intrinsics
26769static SDValue getMaskNode(SDValue Mask, MVT MaskVT,
26770 const X86Subtarget &Subtarget, SelectionDAG &DAG,
26771 const SDLoc &dl) {
26772
26773 if (isAllOnesConstant(Mask))
26774 return DAG.getConstant(1, dl, MaskVT);
26775 if (X86::isZeroNode(Mask))
26776 return DAG.getConstant(0, dl, MaskVT);
26777
26778 assert(MaskVT.bitsLE(Mask.getSimpleValueType()) && "Unexpected mask size!");
26779
26780 if (Mask.getSimpleValueType() == MVT::i64 && Subtarget.is32Bit()) {
26781 assert(MaskVT == MVT::v64i1 && "Expected v64i1 mask!");
26782 assert(Subtarget.hasBWI() && "Expected AVX512BW target!");
26783 // In 32-bit mode a bitcast of i64 is illegal, so extend/split it.
26784 SDValue Lo, Hi;
26785 Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Mask,
26786 DAG.getConstant(0, dl, MVT::i32));
26787 Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Mask,
26788 DAG.getConstant(1, dl, MVT::i32));
26789
26790 Lo = DAG.getBitcast(MVT::v32i1, Lo);
26791 Hi = DAG.getBitcast(MVT::v32i1, Hi);
26792
26793 return DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v64i1, Lo, Hi);
26794 } else {
26795 MVT BitcastVT = MVT::getVectorVT(MVT::i1,
26796 Mask.getSimpleValueType().getSizeInBits());
26797 // When MaskVT is v2i1 or v4i1, the low 2 or 4 elements
26798 // are extracted by EXTRACT_SUBVECTOR.
26799 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MaskVT,
26800 DAG.getBitcast(BitcastVT, Mask),
26801 DAG.getIntPtrConstant(0, dl));
26802 }
26803}
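Semantically, getMaskNode treats the scalar mask as one bit per lane: bit i of the i8/i16/i32/i64 mask drives lane i, and narrowing to a smaller MaskVT just keeps the low bits, which is what the bitcast-to-vXi1 plus EXTRACT_SUBVECTOR-at-0 above expresses. A small runnable model of that semantics (plain C++, not LLVM API):

#include <cassert>
#include <cstdint>
#include <vector>

// Illustrative model: expand the low NumElts bits of a scalar mask into a
// per-lane boolean vector, mirroring bitcast-to-vXi1 + EXTRACT_SUBVECTOR at 0.
static std::vector<bool> expandMask(uint64_t ScalarMask, unsigned NumElts) {
  std::vector<bool> Lanes(NumElts);
  for (unsigned I = 0; I != NumElts; ++I)
    Lanes[I] = (ScalarMask >> I) & 1;
  return Lanes;
}

int main() {
  // An i8 mask of 0x05 used with a v4i1 MaskVT enables lanes 0 and 2 only.
  std::vector<bool> M = expandMask(0x05, 4);
  assert(M[0] && !M[1] && M[2] && !M[3]);
  return 0;
}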
26804
26805/// Return (and \p Op, \p Mask) for compare instructions or
26806/// (vselect \p Mask, \p Op, \p PreservedSrc) for others along with the
26807/// necessary casting or extending for \p Mask when lowering masking intrinsics
26808static SDValue getVectorMaskingNode(SDValue Op, SDValue Mask,
26809 SDValue PreservedSrc,
26810 const X86Subtarget &Subtarget,
26811 SelectionDAG &DAG) {
26812 MVT VT = Op.getSimpleValueType();
26813 MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
26814 unsigned OpcodeSelect = ISD::VSELECT;
26815 SDLoc dl(Op);
26816
26817 if (isAllOnesConstant(Mask))
26818 return Op;
26819
26820 SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
26821
26822 if (PreservedSrc.isUndef())
26823 PreservedSrc = getZeroVector(VT, Subtarget, DAG, dl);
26824 return DAG.getNode(OpcodeSelect, dl, VT, VMask, Op, PreservedSrc);
26825}
26826
26827/// Creates an SDNode for a predicated scalar operation.
26828/// \returns (X86vselect \p Mask, \p Op, \p PreservedSrc).
26829 /// The mask comes in as MVT::i8 and is transformed
26830 /// to MVT::v1i1 while lowering masking intrinsics.
26831/// The main difference between ScalarMaskingNode and VectorMaskingNode is using
26832/// "X86select" instead of "vselect". We just can't create the "vselect" node
26833/// for a scalar instruction.
26834static SDValue getScalarMaskingNode(SDValue Op, SDValue Mask,
26835 SDValue PreservedSrc,
26836 const X86Subtarget &Subtarget,
26837 SelectionDAG &DAG) {
26838
26839 if (auto *MaskConst = dyn_cast<ConstantSDNode>(Mask))
26840 if (MaskConst->getZExtValue() & 0x1)
26841 return Op;
26842
26843 MVT VT = Op.getSimpleValueType();
26844 SDLoc dl(Op);
26845
26846 assert(Mask.getValueType() == MVT::i8 && "Unexpect type");
26847 SDValue IMask = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v1i1,
26848 DAG.getBitcast(MVT::v8i1, Mask),
26849 DAG.getIntPtrConstant(0, dl));
26850 if (Op.getOpcode() == X86ISD::FSETCCM ||
26851 Op.getOpcode() == X86ISD::FSETCCM_SAE ||
26852 Op.getOpcode() == X86ISD::VFPCLASSS)
26853 return DAG.getNode(ISD::AND, dl, VT, Op, IMask);
26854
26855 if (PreservedSrc.isUndef())
26856 PreservedSrc = getZeroVector(VT, Subtarget, DAG, dl);
26857 return DAG.getNode(X86ISD::SELECTS, dl, VT, IMask, Op, PreservedSrc);
26858}
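For the FSETCCM/VFPCLASSS opcodes the single mask bit is simply ANDed into the compare result; for everything else bit 0 of the mask selects between the freshly computed scalar and the preserved source (zero if the pass-through was undef). A tiny runnable model of that selection rule (plain C++, not LLVM API):

#include <cassert>
#include <cstdint>

// Illustrative model of the scalar-masking selection: bit 0 of the i8 mask
// keeps either the freshly computed lane or the preserved (pass-through) lane.
static double selectScalar(uint8_t Mask, double Op, double PreservedSrc) {
  return (Mask & 1) ? Op : PreservedSrc;
}

int main() {
  assert(selectScalar(0x01, 2.0, 7.0) == 2.0); // bit 0 set: take the new result
  assert(selectScalar(0xFE, 2.0, 7.0) == 7.0); // bit 0 clear: keep pass-through
  return 0;
}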
26859
26860static int getSEHRegistrationNodeSize(const Function *Fn) {
26861 if (!Fn->hasPersonalityFn())
26862 report_fatal_error(
26863 "querying registration node size for function without personality");
26864 // The RegNodeSize is 6 32-bit words for SEH and 4 for C++ EH. See
26865 // WinEHStatePass for the full struct definition.
26866 switch (classifyEHPersonality(Fn->getPersonalityFn())) {
26867 case EHPersonality::MSVC_X86SEH: return 24;
26868 case EHPersonality::MSVC_CXX: return 16;
26869 default: break;
26870 }
26871 report_fatal_error(
26872 "can only recover FP for 32-bit MSVC EH personality functions");
26873}
26874
26875/// When the MSVC runtime transfers control to us, either to an outlined
26876/// function or when returning to a parent frame after catching an exception, we
26877/// recover the parent frame pointer by doing arithmetic on the incoming EBP.
26878/// Here's the math:
26879/// RegNodeBase = EntryEBP - RegNodeSize
26880/// ParentFP = RegNodeBase - ParentFrameOffset
26881/// Subtracting RegNodeSize takes us to the offset of the registration node, and
26882/// subtracting the offset (negative on x86) takes us back to the parent FP.
26883static SDValue recoverFramePointer(SelectionDAG &DAG, const Function *Fn,
26884 SDValue EntryEBP) {
26885 MachineFunction &MF = DAG.getMachineFunction();
26886 SDLoc dl;
26887
26888 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
26889 MVT PtrVT = TLI.getPointerTy(DAG.getDataLayout());
26890
26891 // It's possible that the parent function no longer has a personality function
26892 // if the exceptional code was optimized away, in which case we just return
26893 // the incoming EBP.
26894 if (!Fn->hasPersonalityFn())
26895 return EntryEBP;
26896
26897 // Get an MCSymbol that will ultimately resolve to the frame offset of the EH
26898 // registration, or the .set_setframe offset.
26899 MCSymbol *OffsetSym =
26900 MF.getMMI().getContext().getOrCreateParentFrameOffsetSymbol(
26901 GlobalValue::dropLLVMManglingEscape(Fn->getName()));
26902 SDValue OffsetSymVal = DAG.getMCSymbol(OffsetSym, PtrVT);
26903 SDValue ParentFrameOffset =
26904 DAG.getNode(ISD::LOCAL_RECOVER, dl, PtrVT, OffsetSymVal);
26905
26906 // Return EntryEBP + ParentFrameOffset for x64. This adjusts from RSP after
26907 // prologue to RBP in the parent function.
26908 const X86Subtarget &Subtarget = DAG.getSubtarget<X86Subtarget>();
26909 if (Subtarget.is64Bit())
26910 return DAG.getNode(ISD::ADD, dl, PtrVT, EntryEBP, ParentFrameOffset);
26911
26912 int RegNodeSize = getSEHRegistrationNodeSize(Fn);
26913 // RegNodeBase = EntryEBP - RegNodeSize
26914 // ParentFP = RegNodeBase - ParentFrameOffset
26915 SDValue RegNodeBase = DAG.getNode(ISD::SUB, dl, PtrVT, EntryEBP,
26916 DAG.getConstant(RegNodeSize, dl, PtrVT));
26917 return DAG.getNode(ISD::SUB, dl, PtrVT, RegNodeBase, ParentFrameOffset);
26918}
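On 32-bit x86 the recovery above is two subtractions. A worked example with made-up numbers (the EntryEBP and ParentFrameOffset values are hypothetical; the 24 is the MSVC_X86SEH registration node size returned by getSEHRegistrationNodeSize):

#include <cassert>
#include <cstdint>

int main() {
  // Hypothetical values for illustration only.
  uint32_t EntryEBP = 0x0018FF70;    // EBP handed to the outlined funclet
  int32_t ParentFrameOffset = -64;   // negative on x86, read via LOCAL_RECOVER
  uint32_t RegNodeSize = 24;         // MSVC_X86SEH registration node size

  uint32_t RegNodeBase = EntryEBP - RegNodeSize;        // 0x0018FF58
  uint32_t ParentFP = RegNodeBase - ParentFrameOffset;  // 0x0018FF98
  assert(RegNodeBase == 0x0018FF58 && ParentFP == 0x0018FF98);
  return 0;
}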
26919
26920SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
26921 SelectionDAG &DAG) const {
26922 // Helper to detect if the operand is CUR_DIRECTION rounding mode.
26923 auto isRoundModeCurDirection = [](SDValue Rnd) {
26924 if (auto *C = dyn_cast<ConstantSDNode>(Rnd))
26925 return C->getAPIntValue() == X86::STATIC_ROUNDING::CUR_DIRECTION;
26926
26927 return false;
26928 };
26929 auto isRoundModeSAE = [](SDValue Rnd) {
26930 if (auto *C = dyn_cast<ConstantSDNode>(Rnd)) {
26931 unsigned RC = C->getZExtValue();
26932 if (RC & X86::STATIC_ROUNDING::NO_EXC) {
26933 // Clear the NO_EXC bit and check remaining bits.
26934 RC ^= X86::STATIC_ROUNDING::NO_EXC;
26935 // As a convenience we allow no other bits or explicitly
26936 // current direction.
26937 return RC == 0 || RC == X86::STATIC_ROUNDING::CUR_DIRECTION;
26938 }
26939 }
26940
26941 return false;
26942 };
26943 auto isRoundModeSAEToX = [](SDValue Rnd, unsigned &RC) {
26944 if (auto *C = dyn_cast<ConstantSDNode>(Rnd)) {
26945 RC = C->getZExtValue();
26946 if (RC & X86::STATIC_ROUNDING::NO_EXC) {
26947 // Clear the NO_EXC bit and check remaining bits.
26948 RC ^= X86::STATIC_ROUNDING::NO_EXC;
26949 return RC == X86::STATIC_ROUNDING::TO_NEAREST_INT ||
26950 RC == X86::STATIC_ROUNDING::TO_NEG_INF ||
26951 RC == X86::STATIC_ROUNDING::TO_POS_INF ||
26952 RC == X86::STATIC_ROUNDING::TO_ZERO;
26953 }
26954 }
26955
26956 return false;
26957 };
26958
26959 SDLoc dl(Op);
26960 unsigned IntNo = Op.getConstantOperandVal(0);
26961 MVT VT = Op.getSimpleValueType();
26962 const IntrinsicData* IntrData = getIntrinsicWithoutChain(IntNo);
26963
26964 // Propagate flags from original node to transformed node(s).
26965 SelectionDAG::FlagInserter FlagsInserter(DAG, Op->getFlags());
26966
26967 if (IntrData) {
26968 switch(IntrData->Type) {
26969 case INTR_TYPE_1OP: {
26970 // We specify 2 possible opcodes for intrinsics with rounding modes.
26971 // First, we check if the intrinsic may have non-default rounding mode,
26972 // (IntrData->Opc1 != 0), then we check the rounding mode operand.
26973 unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
26974 if (IntrWithRoundingModeOpcode != 0) {
26975 SDValue Rnd = Op.getOperand(2);
26976 unsigned RC = 0;
26977 if (isRoundModeSAEToX(Rnd, RC))
26978 return DAG.getNode(IntrWithRoundingModeOpcode, dl, Op.getValueType(),
26979 Op.getOperand(1),
26980 DAG.getTargetConstant(RC, dl, MVT::i32));
26981 if (!isRoundModeCurDirection(Rnd))
26982 return SDValue();
26983 }
26984 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
26985 Op.getOperand(1));
26986 }
26987 case INTR_TYPE_1OP_SAE: {
26988 SDValue Sae = Op.getOperand(2);
26989
26990 unsigned Opc;
26991 if (isRoundModeCurDirection(Sae))
26992 Opc = IntrData->Opc0;
26993 else if (isRoundModeSAE(Sae))
26994 Opc = IntrData->Opc1;
26995 else
26996 return SDValue();
26997
26998 return DAG.getNode(Opc, dl, Op.getValueType(), Op.getOperand(1));
26999 }
27000 case INTR_TYPE_2OP: {
27001 SDValue Src2 = Op.getOperand(2);
27002
27003 // We specify 2 possible opcodes for intrinsics with rounding modes.
27004 // First, we check if the intrinsic may have non-default rounding mode,
27005 // (IntrData->Opc1 != 0), then we check the rounding mode operand.
27006 unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
27007 if (IntrWithRoundingModeOpcode != 0) {
27008 SDValue Rnd = Op.getOperand(3);
27009 unsigned RC = 0;
27010 if (isRoundModeSAEToX(Rnd, RC))
27011 return DAG.getNode(IntrWithRoundingModeOpcode, dl, Op.getValueType(),
27012 Op.getOperand(1), Src2,
27013 DAG.getTargetConstant(RC, dl, MVT::i32));
27014 if (!isRoundModeCurDirection(Rnd))
27015 return SDValue();
27016 }
27017
27018 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
27019 Op.getOperand(1), Src2);
27020 }
27021 case INTR_TYPE_2OP_SAE: {
27022 SDValue Sae = Op.getOperand(3);
27023
27024 unsigned Opc;
27025 if (isRoundModeCurDirection(Sae))
27026 Opc = IntrData->Opc0;
27027 else if (isRoundModeSAE(Sae))
27028 Opc = IntrData->Opc1;
27029 else
27030 return SDValue();
27031
27032 return DAG.getNode(Opc, dl, Op.getValueType(), Op.getOperand(1),
27033 Op.getOperand(2));
27034 }
27035 case INTR_TYPE_3OP:
27036 case INTR_TYPE_3OP_IMM8: {
27037 SDValue Src1 = Op.getOperand(1);
27038 SDValue Src2 = Op.getOperand(2);
27039 SDValue Src3 = Op.getOperand(3);
27040
27041 if (IntrData->Type == INTR_TYPE_3OP_IMM8 &&
27042 Src3.getValueType() != MVT::i8) {
27043 Src3 = DAG.getTargetConstant(
27044 cast<ConstantSDNode>(Src3)->getZExtValue() & 0xff, dl, MVT::i8);
27045 }
27046
27047 // We specify 2 possible opcodes for intrinsics with rounding modes.
27048 // First, we check if the intrinsic may have non-default rounding mode,
27049 // (IntrData->Opc1 != 0), then we check the rounding mode operand.
27050 unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
27051 if (IntrWithRoundingModeOpcode != 0) {
27052 SDValue Rnd = Op.getOperand(4);
27053 unsigned RC = 0;
27054 if (isRoundModeSAEToX(Rnd, RC))
27055 return DAG.getNode(IntrWithRoundingModeOpcode, dl, Op.getValueType(),
27056 Src1, Src2, Src3,
27057 DAG.getTargetConstant(RC, dl, MVT::i32));
27058 if (!isRoundModeCurDirection(Rnd))
27059 return SDValue();
27060 }
27061
27062 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
27063 {Src1, Src2, Src3});
27064 }
27065 case INTR_TYPE_4OP_IMM8: {
27066 assert(Op.getOperand(4)->getOpcode() == ISD::TargetConstant);
27067 SDValue Src4 = Op.getOperand(4);
27068 if (Src4.getValueType() != MVT::i8) {
27069 Src4 = DAG.getTargetConstant(
27070 cast<ConstantSDNode>(Src4)->getZExtValue() & 0xff, dl, MVT::i8);
27071 }
27072
27073 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
27074 Op.getOperand(1), Op.getOperand(2), Op.getOperand(3),
27075 Src4);
27076 }
27077 case INTR_TYPE_1OP_MASK: {
27078 SDValue Src = Op.getOperand(1);
27079 SDValue PassThru = Op.getOperand(2);
27080 SDValue Mask = Op.getOperand(3);
27081 // We add rounding mode to the Node when
27082 // - RC Opcode is specified and
27083 // - RC is not "current direction".
27084 unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
27085 if (IntrWithRoundingModeOpcode != 0) {
27086 SDValue Rnd = Op.getOperand(4);
27087 unsigned RC = 0;
27088 if (isRoundModeSAEToX(Rnd, RC))
27089 return getVectorMaskingNode(
27090 DAG.getNode(IntrWithRoundingModeOpcode, dl, Op.getValueType(),
27091 Src, DAG.getTargetConstant(RC, dl, MVT::i32)),
27092 Mask, PassThru, Subtarget, DAG);
27093 if (!isRoundModeCurDirection(Rnd))
27094 return SDValue();
27095 }
27096 return getVectorMaskingNode(
27097 DAG.getNode(IntrData->Opc0, dl, VT, Src), Mask, PassThru,
27098 Subtarget, DAG);
27099 }
27100 case INTR_TYPE_1OP_MASK_SAE: {
27101 SDValue Src = Op.getOperand(1);
27102 SDValue PassThru = Op.getOperand(2);
27103 SDValue Mask = Op.getOperand(3);
27104 SDValue Rnd = Op.getOperand(4);
27105
27106 unsigned Opc;
27107 if (isRoundModeCurDirection(Rnd))
27108 Opc = IntrData->Opc0;
27109 else if (isRoundModeSAE(Rnd))
27110 Opc = IntrData->Opc1;
27111 else
27112 return SDValue();
27113
27114 return getVectorMaskingNode(DAG.getNode(Opc, dl, VT, Src), Mask, PassThru,
27115 Subtarget, DAG);
27116 }
27117 case INTR_TYPE_SCALAR_MASK: {
27118 SDValue Src1 = Op.getOperand(1);
27119 SDValue Src2 = Op.getOperand(2);
27120 SDValue passThru = Op.getOperand(3);
27121 SDValue Mask = Op.getOperand(4);
27122 unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
27123 // There are 2 kinds of intrinsics in this group:
27124 // (1) With suppress-all-exceptions (sae) or rounding mode - 6 operands
27125 // (2) With rounding mode and sae - 7 operands.
27126 bool HasRounding = IntrWithRoundingModeOpcode != 0;
27127 if (Op.getNumOperands() == (5U + HasRounding)) {
27128 if (HasRounding) {
27129 SDValue Rnd = Op.getOperand(5);
27130 unsigned RC = 0;
27131 if (isRoundModeSAEToX(Rnd, RC))
27132 return getScalarMaskingNode(
27133 DAG.getNode(IntrWithRoundingModeOpcode, dl, VT, Src1, Src2,
27134 DAG.getTargetConstant(RC, dl, MVT::i32)),
27135 Mask, passThru, Subtarget, DAG);
27136 if (!isRoundModeCurDirection(Rnd))
27137 return SDValue();
27138 }
27139 return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src1,
27140 Src2),
27141 Mask, passThru, Subtarget, DAG);
27142 }
27143
27144 assert(Op.getNumOperands() == (6U + HasRounding) &&
27145 "Unexpected intrinsic form");
27146 SDValue RoundingMode = Op.getOperand(5);
27147 unsigned Opc = IntrData->Opc0;
27148 if (HasRounding) {
27149 SDValue Sae = Op.getOperand(6);
27150 if (isRoundModeSAE(Sae))
27151 Opc = IntrWithRoundingModeOpcode;
27152 else if (!isRoundModeCurDirection(Sae))
27153 return SDValue();
27154 }
27155 return getScalarMaskingNode(DAG.getNode(Opc, dl, VT, Src1,
27156 Src2, RoundingMode),
27157 Mask, passThru, Subtarget, DAG);
27158 }
27159 case INTR_TYPE_SCALAR_MASK_RND: {
27160 SDValue Src1 = Op.getOperand(1);
27161 SDValue Src2 = Op.getOperand(2);
27162 SDValue passThru = Op.getOperand(3);
27163 SDValue Mask = Op.getOperand(4);
27164 SDValue Rnd = Op.getOperand(5);
27165
27166 SDValue NewOp;
27167 unsigned RC = 0;
27168 if (isRoundModeCurDirection(Rnd))
27169 NewOp = DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2);
27170 else if (isRoundModeSAEToX(Rnd, RC))
27171 NewOp = DAG.getNode(IntrData->Opc1, dl, VT, Src1, Src2,
27172 DAG.getTargetConstant(RC, dl, MVT::i32));
27173 else
27174 return SDValue();
27175
27176 return getScalarMaskingNode(NewOp, Mask, passThru, Subtarget, DAG);
27177 }
27178 case INTR_TYPE_SCALAR_MASK_SAE: {
27179 SDValue Src1 = Op.getOperand(1);
27180 SDValue Src2 = Op.getOperand(2);
27181 SDValue passThru = Op.getOperand(3);
27182 SDValue Mask = Op.getOperand(4);
27183 SDValue Sae = Op.getOperand(5);
27184 unsigned Opc;
27185 if (isRoundModeCurDirection(Sae))
27186 Opc = IntrData->Opc0;
27187 else if (isRoundModeSAE(Sae))
27188 Opc = IntrData->Opc1;
27189 else
27190 return SDValue();
27191
27192 return getScalarMaskingNode(DAG.getNode(Opc, dl, VT, Src1, Src2),
27193 Mask, passThru, Subtarget, DAG);
27194 }
27195 case INTR_TYPE_2OP_MASK: {
27196 SDValue Src1 = Op.getOperand(1);
27197 SDValue Src2 = Op.getOperand(2);
27198 SDValue PassThru = Op.getOperand(3);
27199 SDValue Mask = Op.getOperand(4);
27200 SDValue NewOp;
27201 if (IntrData->Opc1 != 0) {
27202 SDValue Rnd = Op.getOperand(5);
27203 unsigned RC = 0;
27204 if (isRoundModeSAEToX(Rnd, RC))
27205 NewOp = DAG.getNode(IntrData->Opc1, dl, VT, Src1, Src2,
27206 DAG.getTargetConstant(RC, dl, MVT::i32));
27207 else if (!isRoundModeCurDirection(Rnd))
27208 return SDValue();
27209 }
27210 if (!NewOp)
27211 NewOp = DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2);
27212 return getVectorMaskingNode(NewOp, Mask, PassThru, Subtarget, DAG);
27213 }
27214 case INTR_TYPE_2OP_MASK_SAE: {
27215 SDValue Src1 = Op.getOperand(1);
27216 SDValue Src2 = Op.getOperand(2);
27217 SDValue PassThru = Op.getOperand(3);
27218 SDValue Mask = Op.getOperand(4);
27219
27220 unsigned Opc = IntrData->Opc0;
27221 if (IntrData->Opc1 != 0) {
27222 SDValue Sae = Op.getOperand(5);
27223 if (isRoundModeSAE(Sae))
27224 Opc = IntrData->Opc1;
27225 else if (!isRoundModeCurDirection(Sae))
27226 return SDValue();
27227 }
27228
27229 return getVectorMaskingNode(DAG.getNode(Opc, dl, VT, Src1, Src2),
27230 Mask, PassThru, Subtarget, DAG);
27231 }
27232 case INTR_TYPE_3OP_SCALAR_MASK_SAE: {
27233 SDValue Src1 = Op.getOperand(1);
27234 SDValue Src2 = Op.getOperand(2);
27235 SDValue Src3 = Op.getOperand(3);
27236 SDValue PassThru = Op.getOperand(4);
27237 SDValue Mask = Op.getOperand(5);
27238 SDValue Sae = Op.getOperand(6);
27239 unsigned Opc;
27240 if (isRoundModeCurDirection(Sae))
27241 Opc = IntrData->Opc0;
27242 else if (isRoundModeSAE(Sae))
27243 Opc = IntrData->Opc1;
27244 else
27245 return SDValue();
27246
27247 return getScalarMaskingNode(DAG.getNode(Opc, dl, VT, Src1, Src2, Src3),
27248 Mask, PassThru, Subtarget, DAG);
27249 }
27250 case INTR_TYPE_3OP_MASK_SAE: {
27251 SDValue Src1 = Op.getOperand(1);
27252 SDValue Src2 = Op.getOperand(2);
27253 SDValue Src3 = Op.getOperand(3);
27254 SDValue PassThru = Op.getOperand(4);
27255 SDValue Mask = Op.getOperand(5);
27256
27257 unsigned Opc = IntrData->Opc0;
27258 if (IntrData->Opc1 != 0) {
27259 SDValue Sae = Op.getOperand(6);
27260 if (isRoundModeSAE(Sae))
27261 Opc = IntrData->Opc1;
27262 else if (!isRoundModeCurDirection(Sae))
27263 return SDValue();
27264 }
27265 return getVectorMaskingNode(DAG.getNode(Opc, dl, VT, Src1, Src2, Src3),
27266 Mask, PassThru, Subtarget, DAG);
27267 }
27268 case BLENDV: {
27269 SDValue Src1 = Op.getOperand(1);
27270 SDValue Src2 = Op.getOperand(2);
27271 SDValue Src3 = Op.getOperand(3);
27272
27273 EVT MaskVT = Src3.getValueType().changeVectorElementTypeToInteger();
27274 Src3 = DAG.getBitcast(MaskVT, Src3);
27275
27276 // Reverse the operands to match VSELECT order.
27277 return DAG.getNode(IntrData->Opc0, dl, VT, Src3, Src2, Src1);
27278 }
27279 case VPERM_2OP : {
27280 SDValue Src1 = Op.getOperand(1);
27281 SDValue Src2 = Op.getOperand(2);
27282
27283 // Swap Src1 and Src2 in the node creation
27284 return DAG.getNode(IntrData->Opc0, dl, VT, Src2, Src1);
27285 }
27286 case CFMA_OP_MASKZ:
27287 case CFMA_OP_MASK: {
27288 SDValue Src1 = Op.getOperand(1);
27289 SDValue Src2 = Op.getOperand(2);
27290 SDValue Src3 = Op.getOperand(3);
27291 SDValue Mask = Op.getOperand(4);
27292 MVT VT = Op.getSimpleValueType();
27293
27294 SDValue PassThru = Src3;
27295 if (IntrData->Type == CFMA_OP_MASKZ)
27296 PassThru = getZeroVector(VT, Subtarget, DAG, dl);
27297
27298 // We add rounding mode to the Node when
27299 // - RC Opcode is specified and
27300 // - RC is not "current direction".
27301 SDValue NewOp;
27302 if (IntrData->Opc1 != 0) {
27303 SDValue Rnd = Op.getOperand(5);
27304 unsigned RC = 0;
27305 if (isRoundModeSAEToX(Rnd, RC))
27306 NewOp = DAG.getNode(IntrData->Opc1, dl, VT, Src1, Src2, Src3,
27307 DAG.getTargetConstant(RC, dl, MVT::i32));
27308 else if (!isRoundModeCurDirection(Rnd))
27309 return SDValue();
27310 }
27311 if (!NewOp)
27312 NewOp = DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2, Src3);
27313 return getVectorMaskingNode(NewOp, Mask, PassThru, Subtarget, DAG);
27314 }
27315 case IFMA_OP:
27316 // NOTE: We need to swizzle the operands to pass the multiply operands
27317 // first.
27318 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
27319 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
27320 case FPCLASSS: {
27321 SDValue Src1 = Op.getOperand(1);
27322 SDValue Imm = Op.getOperand(2);
27323 SDValue Mask = Op.getOperand(3);
27324 SDValue FPclass = DAG.getNode(IntrData->Opc0, dl, MVT::v1i1, Src1, Imm);
27325 SDValue FPclassMask = getScalarMaskingNode(FPclass, Mask, SDValue(),
27326 Subtarget, DAG);
27327 // Need to fill with zeros to ensure the bitcast will produce zeroes
27328 // for the upper bits. An EXTRACT_ELEMENT here wouldn't guarantee that.
27329 SDValue Ins = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v8i1,
27330 DAG.getConstant(0, dl, MVT::v8i1),
27331 FPclassMask, DAG.getIntPtrConstant(0, dl));
27332 return DAG.getBitcast(MVT::i8, Ins);
27333 }
27334
27335 case CMP_MASK_CC: {
27336 MVT MaskVT = Op.getSimpleValueType();
27337 SDValue CC = Op.getOperand(3);
27338 SDValue Mask = Op.getOperand(4);
27339 // We specify 2 possible opcodes for intrinsics with rounding modes.
27340 // First, we check if the intrinsic may have non-default rounding mode,
27341 // (IntrData->Opc1 != 0), then we check the rounding mode operand.
27342 if (IntrData->Opc1 != 0) {
27343 SDValue Sae = Op.getOperand(5);
27344 if (isRoundModeSAE(Sae))
27345 return DAG.getNode(IntrData->Opc1, dl, MaskVT, Op.getOperand(1),
27346 Op.getOperand(2), CC, Mask, Sae);
27347 if (!isRoundModeCurDirection(Sae))
27348 return SDValue();
27349 }
27350 // Default rounding mode.
27351 return DAG.getNode(IntrData->Opc0, dl, MaskVT,
27352 {Op.getOperand(1), Op.getOperand(2), CC, Mask});
27353 }
27354 case CMP_MASK_SCALAR_CC: {
27355 SDValue Src1 = Op.getOperand(1);
27356 SDValue Src2 = Op.getOperand(2);
27357 SDValue CC = Op.getOperand(3);
27358 SDValue Mask = Op.getOperand(4);
27359
27360 SDValue Cmp;
27361 if (IntrData->Opc1 != 0) {
27362 SDValue Sae = Op.getOperand(5);
27363 if (isRoundModeSAE(Sae))
27364 Cmp = DAG.getNode(IntrData->Opc1, dl, MVT::v1i1, Src1, Src2, CC, Sae);
27365 else if (!isRoundModeCurDirection(Sae))
27366 return SDValue();
27367 }
27368 // Default rounding mode.
27369 if (!Cmp.getNode())
27370 Cmp = DAG.getNode(IntrData->Opc0, dl, MVT::v1i1, Src1, Src2, CC);
27371
27372 SDValue CmpMask = getScalarMaskingNode(Cmp, Mask, SDValue(),
27373 Subtarget, DAG);
27374 // Need to fill with zeros to ensure the bitcast will produce zeroes
27375 // for the upper bits. An EXTRACT_ELEMENT here wouldn't guarantee that.
27376 SDValue Ins = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v8i1,
27377 DAG.getConstant(0, dl, MVT::v8i1),
27378 CmpMask, DAG.getIntPtrConstant(0, dl));
27379 return DAG.getBitcast(MVT::i8, Ins);
27380 }
27381 case COMI: { // Comparison intrinsics
27382 ISD::CondCode CC = (ISD::CondCode)IntrData->Opc1;
27383 SDValue LHS = Op.getOperand(1);
27384 SDValue RHS = Op.getOperand(2);
27385 // Some conditions require the operands to be swapped.
27386 if (CC == ISD::SETLT || CC == ISD::SETLE)
27387 std::swap(LHS, RHS);
27388
27389 SDValue Comi = DAG.getNode(IntrData->Opc0, dl, MVT::i32, LHS, RHS);
27390 SDValue SetCC;
27391 switch (CC) {
27392 case ISD::SETEQ: { // (ZF = 0 and PF = 0)
27393 SetCC = getSETCC(X86::COND_E, Comi, dl, DAG);
27394 SDValue SetNP = getSETCC(X86::COND_NP, Comi, dl, DAG);
27395 SetCC = DAG.getNode(ISD::AND, dl, MVT::i8, SetCC, SetNP);
27396 break;
27397 }
27398 case ISD::SETNE: { // (ZF = 1 or PF = 1)
27399 SetCC = getSETCC(X86::COND_NE, Comi, dl, DAG);
27400 SDValue SetP = getSETCC(X86::COND_P, Comi, dl, DAG);
27401 SetCC = DAG.getNode(ISD::OR, dl, MVT::i8, SetCC, SetP);
27402 break;
27403 }
27404 case ISD::SETGT: // (CF = 0 and ZF = 0)
27405 case ISD::SETLT: { // Condition opposite to GT. Operands swapped above.
27406 SetCC = getSETCC(X86::COND_A, Comi, dl, DAG);
27407 break;
27408 }
27409 case ISD::SETGE: // CF = 0
27410 case ISD::SETLE: // Condition opposite to GE. Operands swapped above.
27411 SetCC = getSETCC(X86::COND_AE, Comi, dl, DAG);
27412 break;
27413 default:
27414 llvm_unreachable("Unexpected illegal condition!");
27415 }
27416 return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
27417 }
27418 case COMI_RM: { // Comparison intrinsics with Sae
27419 SDValue LHS = Op.getOperand(1);
27420 SDValue RHS = Op.getOperand(2);
27421 unsigned CondVal = Op.getConstantOperandVal(3);
27422 SDValue Sae = Op.getOperand(4);
27423
27424 SDValue FCmp;
27425 if (isRoundModeCurDirection(Sae))
27426 FCmp = DAG.getNode(X86ISD::FSETCCM, dl, MVT::v1i1, LHS, RHS,
27427 DAG.getTargetConstant(CondVal, dl, MVT::i8));
27428 else if (isRoundModeSAE(Sae))
27429 FCmp = DAG.getNode(X86ISD::FSETCCM_SAE, dl, MVT::v1i1, LHS, RHS,
27430 DAG.getTargetConstant(CondVal, dl, MVT::i8), Sae);
27431 else
27432 return SDValue();
27433 // Need to fill with zeros to ensure the bitcast will produce zeroes
27434 // for the upper bits. An EXTRACT_ELEMENT here wouldn't guarantee that.
27435 SDValue Ins = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v16i1,
27436 DAG.getConstant(0, dl, MVT::v16i1),
27437 FCmp, DAG.getIntPtrConstant(0, dl));
27438 return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32,
27439 DAG.getBitcast(MVT::i16, Ins));
27440 }
27441 case VSHIFT: {
27442 SDValue SrcOp = Op.getOperand(1);
27443 SDValue ShAmt = Op.getOperand(2);
27444 assert(ShAmt.getValueType() == MVT::i32 &&
27445 "Unexpected VSHIFT amount type");
27446
27447 // Catch shift-by-constant.
27448 if (auto *CShAmt = dyn_cast<ConstantSDNode>(ShAmt))
27449 return getTargetVShiftByConstNode(IntrData->Opc0, dl,
27450 Op.getSimpleValueType(), SrcOp,
27451 CShAmt->getZExtValue(), DAG);
27452
27453 ShAmt = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, ShAmt);
27454 return getTargetVShiftNode(IntrData->Opc0, dl, Op.getSimpleValueType(),
27455 SrcOp, ShAmt, 0, Subtarget, DAG);
27456 }
27457 case COMPRESS_EXPAND_IN_REG: {
27458 SDValue Mask = Op.getOperand(3);
27459 SDValue DataToCompress = Op.getOperand(1);
27460 SDValue PassThru = Op.getOperand(2);
27461 if (ISD::isBuildVectorAllOnes(Mask.getNode())) // return data as is
27462 return Op.getOperand(1);
27463
27464 // Avoid false dependency.
27465 if (PassThru.isUndef())
27466 PassThru = getZeroVector(VT, Subtarget, DAG, dl);
27467
27468 return DAG.getNode(IntrData->Opc0, dl, VT, DataToCompress, PassThru,
27469 Mask);
27470 }
27471 case FIXUPIMM:
27472 case FIXUPIMM_MASKZ: {
27473 SDValue Src1 = Op.getOperand(1);
27474 SDValue Src2 = Op.getOperand(2);
27475 SDValue Src3 = Op.getOperand(3);
27476 SDValue Imm = Op.getOperand(4);
27477 SDValue Mask = Op.getOperand(5);
27478 SDValue Passthru = (IntrData->Type == FIXUPIMM)
27479 ? Src1
27480 : getZeroVector(VT, Subtarget, DAG, dl);
27481
27482 unsigned Opc = IntrData->Opc0;
27483 if (IntrData->Opc1 != 0) {
27484 SDValue Sae = Op.getOperand(6);
27485 if (isRoundModeSAE(Sae))
27486 Opc = IntrData->Opc1;
27487 else if (!isRoundModeCurDirection(Sae))
27488 return SDValue();
27489 }
27490
27491 SDValue FixupImm = DAG.getNode(Opc, dl, VT, Src1, Src2, Src3, Imm);
27492
27493 if (Opc == X86ISD::VFIXUPIMM || Opc == X86ISD::VFIXUPIMM_SAE)
27494 return getVectorMaskingNode(FixupImm, Mask, Passthru, Subtarget, DAG);
27495
27496 return getScalarMaskingNode(FixupImm, Mask, Passthru, Subtarget, DAG);
27497 }
27498 case ROUNDP: {
27499 assert(IntrData->Opc0 == X86ISD::VRNDSCALE && "Unexpected opcode");
27500 // Clear the upper bits of the rounding immediate so that the legacy
27501 // intrinsic can't trigger the scaling behavior of VRNDSCALE.
27502 auto Round = cast<ConstantSDNode>(Op.getOperand(2));
27503 SDValue RoundingMode =
27504 DAG.getTargetConstant(Round->getZExtValue() & 0xf, dl, MVT::i32);
27505 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
27506 Op.getOperand(1), RoundingMode);
27507 }
27508 case ROUNDS: {
27509 assert(IntrData->Opc0 == X86ISD::VRNDSCALES && "Unexpected opcode");
27510 // Clear the upper bits of the rounding immediate so that the legacy
27511 // intrinsic can't trigger the scaling behavior of VRNDSCALE.
27512 auto Round = cast<ConstantSDNode>(Op.getOperand(3));
27513 SDValue RoundingMode =
27514 DAG.getTargetConstant(Round->getZExtValue() & 0xf, dl, MVT::i32);
27515 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
27516 Op.getOperand(1), Op.getOperand(2), RoundingMode);
27517 }
27518 case BEXTRI: {
27519 assert(IntrData->Opc0 == X86ISD::BEXTRI && "Unexpected opcode");
27520
27521 uint64_t Imm = Op.getConstantOperandVal(2);
27522 SDValue Control = DAG.getTargetConstant(Imm & 0xffff, dl,
27523 Op.getValueType());
27524 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
27525 Op.getOperand(1), Control);
27526 }
27527 // ADC/ADCX/SBB
27528 case ADX: {
27529 SDVTList CFVTs = DAG.getVTList(Op->getValueType(0), MVT::i32);
27530 SDVTList VTs = DAG.getVTList(Op.getOperand(2).getValueType(), MVT::i32);
27531
27532 SDValue Res;
27533 // If the carry in is zero, then we should just use ADD/SUB instead of
27534 // ADC/SBB.
27535 if (isNullConstant(Op.getOperand(1))) {
27536 Res = DAG.getNode(IntrData->Opc1, dl, VTs, Op.getOperand(2),
27537 Op.getOperand(3));
27538 } else {
27539 SDValue GenCF = DAG.getNode(X86ISD::ADD, dl, CFVTs, Op.getOperand(1),
27540 DAG.getConstant(-1, dl, MVT::i8));
27541 Res = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(2),
27542 Op.getOperand(3), GenCF.getValue(1));
27543 }
27544 SDValue SetCC = getSETCC(X86::COND_B, Res.getValue(1), dl, DAG);
27545 SDValue Results[] = { SetCC, Res };
27546 return DAG.getMergeValues(Results, dl);
27547 }
27548 case CVTPD2PS_MASK:
27549 case CVTPD2DQ_MASK:
27550 case CVTQQ2PS_MASK:
27551 case TRUNCATE_TO_REG: {
27552 SDValue Src = Op.getOperand(1);
27553 SDValue PassThru = Op.getOperand(2);
27554 SDValue Mask = Op.getOperand(3);
27555
27556 if (isAllOnesConstant(Mask))
27557 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Src);
27558
27559 MVT SrcVT = Src.getSimpleValueType();
27560 MVT MaskVT = MVT::getVectorVT(MVT::i1, SrcVT.getVectorNumElements());
27561 Mask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
27562 return DAG.getNode(IntrData->Opc1, dl, Op.getValueType(),
27563 {Src, PassThru, Mask});
27564 }
27565 case CVTPS2PH_MASK: {
27566 SDValue Src = Op.getOperand(1);
27567 SDValue Rnd = Op.getOperand(2);
27568 SDValue PassThru = Op.getOperand(3);
27569 SDValue Mask = Op.getOperand(4);
27570
27571 unsigned RC = 0;
27572 unsigned Opc = IntrData->Opc0;
27573 bool SAE = Src.getValueType().is512BitVector() &&
27574 (isRoundModeSAEToX(Rnd, RC) || isRoundModeSAE(Rnd));
27575 if (SAE) {
27576 Opc = X86ISD::CVTPS2PH_SAE;
27577 Rnd = DAG.getTargetConstant(RC, dl, MVT::i32);
27578 }
27579
27580 if (isAllOnesConstant(Mask))
27581 return DAG.getNode(Opc, dl, Op.getValueType(), Src, Rnd);
27582
27583 if (SAE)
27584 Opc = X86ISD::MCVTPS2PH_SAE;
27585 else
27586 Opc = IntrData->Opc1;
27587 MVT SrcVT = Src.getSimpleValueType();
27588 MVT MaskVT = MVT::getVectorVT(MVT::i1, SrcVT.getVectorNumElements());
27589 Mask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
27590 return DAG.getNode(Opc, dl, Op.getValueType(), Src, Rnd, PassThru, Mask);
27591 }
27592 case CVTNEPS2BF16_MASK: {
27593 SDValue Src = Op.getOperand(1);
27594 SDValue PassThru = Op.getOperand(2);
27595 SDValue Mask = Op.getOperand(3);
27596
27597 if (ISD::isBuildVectorAllOnes(Mask.getNode()))
27598 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Src);
27599
27600 // Break false dependency.
27601 if (PassThru.isUndef())
27602 PassThru = DAG.getConstant(0, dl, PassThru.getValueType());
27603
27604 return DAG.getNode(IntrData->Opc1, dl, Op.getValueType(), Src, PassThru,
27605 Mask);
27606 }
27607 default:
27608 break;
27609 }
27610 }
27611
27612 switch (IntNo) {
27613 default: return SDValue(); // Don't custom lower most intrinsics.
27614
27615 // ptest and testp intrinsics. The intrinsics these come from are designed to
27616 // return an integer value, not just an instruction, so lower it to the ptest
27617 // or testp pattern and a setcc for the result.
27618 case Intrinsic::x86_avx512_ktestc_b:
27619 case Intrinsic::x86_avx512_ktestc_w:
27620 case Intrinsic::x86_avx512_ktestc_d:
27621 case Intrinsic::x86_avx512_ktestc_q:
27622 case Intrinsic::x86_avx512_ktestz_b:
27623 case Intrinsic::x86_avx512_ktestz_w:
27624 case Intrinsic::x86_avx512_ktestz_d:
27625 case Intrinsic::x86_avx512_ktestz_q:
27626 case Intrinsic::x86_sse41_ptestz:
27627 case Intrinsic::x86_sse41_ptestc:
27628 case Intrinsic::x86_sse41_ptestnzc:
27629 case Intrinsic::x86_avx_ptestz_256:
27630 case Intrinsic::x86_avx_ptestc_256:
27631 case Intrinsic::x86_avx_ptestnzc_256:
27632 case Intrinsic::x86_avx_vtestz_ps:
27633 case Intrinsic::x86_avx_vtestc_ps:
27634 case Intrinsic::x86_avx_vtestnzc_ps:
27635 case Intrinsic::x86_avx_vtestz_pd:
27636 case Intrinsic::x86_avx_vtestc_pd:
27637 case Intrinsic::x86_avx_vtestnzc_pd:
27638 case Intrinsic::x86_avx_vtestz_ps_256:
27639 case Intrinsic::x86_avx_vtestc_ps_256:
27640 case Intrinsic::x86_avx_vtestnzc_ps_256:
27641 case Intrinsic::x86_avx_vtestz_pd_256:
27642 case Intrinsic::x86_avx_vtestc_pd_256:
27643 case Intrinsic::x86_avx_vtestnzc_pd_256: {
27644 unsigned TestOpc = X86ISD::PTEST;
27645 X86::CondCode X86CC;
27646 switch (IntNo) {
27647 default: llvm_unreachable("Bad fallthrough in Intrinsic lowering.");
27648 case Intrinsic::x86_avx512_ktestc_b:
27649 case Intrinsic::x86_avx512_ktestc_w:
27650 case Intrinsic::x86_avx512_ktestc_d:
27651 case Intrinsic::x86_avx512_ktestc_q:
27652 // CF = 1
27653 TestOpc = X86ISD::KTEST;
27654 X86CC = X86::COND_B;
27655 break;
27656 case Intrinsic::x86_avx512_ktestz_b:
27657 case Intrinsic::x86_avx512_ktestz_w:
27658 case Intrinsic::x86_avx512_ktestz_d:
27659 case Intrinsic::x86_avx512_ktestz_q:
27660 TestOpc = X86ISD::KTEST;
27661 X86CC = X86::COND_E;
27662 break;
27663 case Intrinsic::x86_avx_vtestz_ps:
27664 case Intrinsic::x86_avx_vtestz_pd:
27665 case Intrinsic::x86_avx_vtestz_ps_256:
27666 case Intrinsic::x86_avx_vtestz_pd_256:
27667 TestOpc = X86ISD::TESTP;
27668 [[fallthrough]];
27669 case Intrinsic::x86_sse41_ptestz:
27670 case Intrinsic::x86_avx_ptestz_256:
27671 // ZF = 1
27672 X86CC = X86::COND_E;
27673 break;
27674 case Intrinsic::x86_avx_vtestc_ps:
27675 case Intrinsic::x86_avx_vtestc_pd:
27676 case Intrinsic::x86_avx_vtestc_ps_256:
27677 case Intrinsic::x86_avx_vtestc_pd_256:
27678 TestOpc = X86ISD::TESTP;
27679 [[fallthrough]];
27680 case Intrinsic::x86_sse41_ptestc:
27681 case Intrinsic::x86_avx_ptestc_256:
27682 // CF = 1
27683 X86CC = X86::COND_B;
27684 break;
27685 case Intrinsic::x86_avx_vtestnzc_ps:
27686 case Intrinsic::x86_avx_vtestnzc_pd:
27687 case Intrinsic::x86_avx_vtestnzc_ps_256:
27688 case Intrinsic::x86_avx_vtestnzc_pd_256:
27689 TestOpc = X86ISD::TESTP;
27690 [[fallthrough]];
27691 case Intrinsic::x86_sse41_ptestnzc:
27692 case Intrinsic::x86_avx_ptestnzc_256:
27693 // ZF and CF = 0
27694 X86CC = X86::COND_A;
27695 break;
27696 }
27697
27698 SDValue LHS = Op.getOperand(1);
27699 SDValue RHS = Op.getOperand(2);
27700 SDValue Test = DAG.getNode(TestOpc, dl, MVT::i32, LHS, RHS);
27701 SDValue SetCC = getSETCC(X86CC, Test, dl, DAG);
27702 return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
27703 }
27704
27705 case Intrinsic::x86_sse42_pcmpistria128:
27706 case Intrinsic::x86_sse42_pcmpestria128:
27707 case Intrinsic::x86_sse42_pcmpistric128:
27708 case Intrinsic::x86_sse42_pcmpestric128:
27709 case Intrinsic::x86_sse42_pcmpistrio128:
27710 case Intrinsic::x86_sse42_pcmpestrio128:
27711 case Intrinsic::x86_sse42_pcmpistris128:
27712 case Intrinsic::x86_sse42_pcmpestris128:
27713 case Intrinsic::x86_sse42_pcmpistriz128:
27714 case Intrinsic::x86_sse42_pcmpestriz128: {
27715 unsigned Opcode;
27716 X86::CondCode X86CC;
27717 switch (IntNo) {
27718 default: llvm_unreachable("Impossible intrinsic"); // Can't reach here.
27719 case Intrinsic::x86_sse42_pcmpistria128:
27720 Opcode = X86ISD::PCMPISTR;
27721 X86CC = X86::COND_A;
27722 break;
27723 case Intrinsic::x86_sse42_pcmpestria128:
27724 Opcode = X86ISD::PCMPESTR;
27725 X86CC = X86::COND_A;
27726 break;
27727 case Intrinsic::x86_sse42_pcmpistric128:
27728 Opcode = X86ISD::PCMPISTR;
27729 X86CC = X86::COND_B;
27730 break;
27731 case Intrinsic::x86_sse42_pcmpestric128:
27732 Opcode = X86ISD::PCMPESTR;
27733 X86CC = X86::COND_B;
27734 break;
27735 case Intrinsic::x86_sse42_pcmpistrio128:
27736 Opcode = X86ISD::PCMPISTR;
27737 X86CC = X86::COND_O;
27738 break;
27739 case Intrinsic::x86_sse42_pcmpestrio128:
27740 Opcode = X86ISD::PCMPESTR;
27741 X86CC = X86::COND_O;
27742 break;
27743 case Intrinsic::x86_sse42_pcmpistris128:
27744 Opcode = X86ISD::PCMPISTR;
27745 X86CC = X86::COND_S;
27746 break;
27747 case Intrinsic::x86_sse42_pcmpestris128:
27748 Opcode = X86ISD::PCMPESTR;
27749 X86CC = X86::COND_S;
27750 break;
27751 case Intrinsic::x86_sse42_pcmpistriz128:
27752 Opcode = X86ISD::PCMPISTR;
27753 X86CC = X86::COND_E;
27754 break;
27755 case Intrinsic::x86_sse42_pcmpestriz128:
27756 Opcode = X86ISD::PCMPESTR;
27757 X86CC = X86::COND_E;
27758 break;
27759 }
27760 SmallVector<SDValue, 5> NewOps(llvm::drop_begin(Op->ops()));
27761 SDVTList VTs = DAG.getVTList(MVT::i32, MVT::v16i8, MVT::i32);
27762 SDValue PCMP = DAG.getNode(Opcode, dl, VTs, NewOps).getValue(2);
27763 SDValue SetCC = getSETCC(X86CC, PCMP, dl, DAG);
27764 return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
27765 }
27766
27767 case Intrinsic::x86_sse42_pcmpistri128:
27768 case Intrinsic::x86_sse42_pcmpestri128: {
27769 unsigned Opcode;
27770 if (IntNo == Intrinsic::x86_sse42_pcmpistri128)
27771 Opcode = X86ISD::PCMPISTR;
27772 else
27773 Opcode = X86ISD::PCMPESTR;
27774
27775 SmallVector<SDValue, 5> NewOps(llvm::drop_begin(Op->ops()));
27776 SDVTList VTs = DAG.getVTList(MVT::i32, MVT::v16i8, MVT::i32);
27777 return DAG.getNode(Opcode, dl, VTs, NewOps);
27778 }
27779
27780 case Intrinsic::x86_sse42_pcmpistrm128:
27781 case Intrinsic::x86_sse42_pcmpestrm128: {
27782 unsigned Opcode;
27783 if (IntNo == Intrinsic::x86_sse42_pcmpistrm128)
27784 Opcode = X86ISD::PCMPISTR;
27785 else
27786 Opcode = X86ISD::PCMPESTR;
27787
27788 SmallVector<SDValue, 5> NewOps(llvm::drop_begin(Op->ops()));
27789 SDVTList VTs = DAG.getVTList(MVT::i32, MVT::v16i8, MVT::i32);
27790 return DAG.getNode(Opcode, dl, VTs, NewOps).getValue(1);
27791 }
27792
27793 case Intrinsic::eh_sjlj_lsda: {
27794 MachineFunction &MF = DAG.getMachineFunction();
27795 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
27796 MVT PtrVT = TLI.getPointerTy(DAG.getDataLayout());
27797 auto &Context = MF.getMMI().getContext();
27798 MCSymbol *S = Context.getOrCreateSymbol(Twine("GCC_except_table") +
27799 Twine(MF.getFunctionNumber()));
27800 return DAG.getNode(getGlobalWrapperKind(), dl, VT,
27801 DAG.getMCSymbol(S, PtrVT));
27802 }
27803
27804 case Intrinsic::x86_seh_lsda: {
27805 // Compute the symbol for the LSDA. We know it'll get emitted later.
27806 MachineFunction &MF = DAG.getMachineFunction();
27807 SDValue Op1 = Op.getOperand(1);
27808 auto *Fn = cast<Function>(cast<GlobalAddressSDNode>(Op1)->getGlobal());
27809 MCSymbol *LSDASym = MF.getMMI().getContext().getOrCreateLSDASymbol(
27810 GlobalValue::dropLLVMManglingEscape(Fn->getName()));
27811
27812 // Generate a simple absolute symbol reference. This intrinsic is only
27813 // supported on 32-bit Windows, which isn't PIC.
27814 SDValue Result = DAG.getMCSymbol(LSDASym, VT);
27815 return DAG.getNode(X86ISD::Wrapper, dl, VT, Result);
27816 }
27817
27818 case Intrinsic::eh_recoverfp: {
27819 SDValue FnOp = Op.getOperand(1);
27820 SDValue IncomingFPOp = Op.getOperand(2);
27821 GlobalAddressSDNode *GSD = dyn_cast<GlobalAddressSDNode>(FnOp);
27822 auto *Fn = dyn_cast_or_null<Function>(GSD ? GSD->getGlobal() : nullptr);
27823 if (!Fn)
27824 report_fatal_error(
27825 "llvm.eh.recoverfp must take a function as the first argument");
27826 return recoverFramePointer(DAG, Fn, IncomingFPOp);
27827 }
27828
27829 case Intrinsic::localaddress: {
27830 // Returns one of the stack, base, or frame pointer registers, depending on
27831 // which is used to reference local variables.
27832 MachineFunction &MF = DAG.getMachineFunction();
27833 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
27834 unsigned Reg;
27835 if (RegInfo->hasBasePointer(MF))
27836 Reg = RegInfo->getBaseRegister();
27837 else { // Handles the SP or FP case.
27838 bool CantUseFP = RegInfo->hasStackRealignment(MF);
27839 if (CantUseFP)
27840 Reg = RegInfo->getPtrSizedStackRegister(MF);
27841 else
27842 Reg = RegInfo->getPtrSizedFrameRegister(MF);
27843 }
27844 return DAG.getCopyFromReg(DAG.getEntryNode(), dl, Reg, VT);
27845 }
27846 case Intrinsic::x86_avx512_vp2intersect_q_512:
27847 case Intrinsic::x86_avx512_vp2intersect_q_256:
27848 case Intrinsic::x86_avx512_vp2intersect_q_128:
27849 case Intrinsic::x86_avx512_vp2intersect_d_512:
27850 case Intrinsic::x86_avx512_vp2intersect_d_256:
27851 case Intrinsic::x86_avx512_vp2intersect_d_128: {
27852 MVT MaskVT = Op.getSimpleValueType();
27853
27854 SDVTList VTs = DAG.getVTList(MVT::Untyped, MVT::Other);
27855 SDLoc DL(Op);
27856
27857 SDValue Operation =
27858 DAG.getNode(X86ISD::VP2INTERSECT, DL, VTs,
27859 Op->getOperand(1), Op->getOperand(2));
27860
27861 SDValue Result0 = DAG.getTargetExtractSubreg(X86::sub_mask_0, DL,
27862 MaskVT, Operation);
27863 SDValue Result1 = DAG.getTargetExtractSubreg(X86::sub_mask_1, DL,
27864 MaskVT, Operation);
27865 return DAG.getMergeValues({Result0, Result1}, DL);
27866 }
27867 case Intrinsic::x86_mmx_pslli_w:
27868 case Intrinsic::x86_mmx_pslli_d:
27869 case Intrinsic::x86_mmx_pslli_q:
27870 case Intrinsic::x86_mmx_psrli_w:
27871 case Intrinsic::x86_mmx_psrli_d:
27872 case Intrinsic::x86_mmx_psrli_q:
27873 case Intrinsic::x86_mmx_psrai_w:
27874 case Intrinsic::x86_mmx_psrai_d: {
27875 SDLoc DL(Op);
27876 SDValue ShAmt = Op.getOperand(2);
27877 // If the argument is a constant, convert it to a target constant.
27878 if (auto *C = dyn_cast<ConstantSDNode>(ShAmt)) {
27879 // Clamp out-of-bounds shift amounts, since otherwise they are masked
27880 // to 8 bits, which may bring them back in bounds.
27881 unsigned ShiftAmount = C->getAPIntValue().getLimitedValue(255);
27882 if (ShiftAmount == 0)
27883 return Op.getOperand(1);
27884
27885 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, Op.getValueType(),
27886 Op.getOperand(0), Op.getOperand(1),
27887 DAG.getTargetConstant(ShiftAmount, DL, MVT::i32));
27888 }
27889
27890 unsigned NewIntrinsic;
27891 switch (IntNo) {
27892 default: llvm_unreachable("Impossible intrinsic"); // Can't reach here.
27893 case Intrinsic::x86_mmx_pslli_w:
27894 NewIntrinsic = Intrinsic::x86_mmx_psll_w;
27895 break;
27896 case Intrinsic::x86_mmx_pslli_d:
27897 NewIntrinsic = Intrinsic::x86_mmx_psll_d;
27898 break;
27899 case Intrinsic::x86_mmx_pslli_q:
27900 NewIntrinsic = Intrinsic::x86_mmx_psll_q;
27901 break;
27902 case Intrinsic::x86_mmx_psrli_w:
27903 NewIntrinsic = Intrinsic::x86_mmx_psrl_w;
27904 break;
27905 case Intrinsic::x86_mmx_psrli_d:
27906 NewIntrinsic = Intrinsic::x86_mmx_psrl_d;
27907 break;
27908 case Intrinsic::x86_mmx_psrli_q:
27909 NewIntrinsic = Intrinsic::x86_mmx_psrl_q;
27910 break;
27911 case Intrinsic::x86_mmx_psrai_w:
27912 NewIntrinsic = Intrinsic::x86_mmx_psra_w;
27913 break;
27914 case Intrinsic::x86_mmx_psrai_d:
27915 NewIntrinsic = Intrinsic::x86_mmx_psra_d;
27916 break;
27917 }
27918
27919 // The vector shift intrinsics with scalar amounts use 32-bit shift amounts,
27920 // but the SSE2/MMX shift instructions read 64 bits. Copy the 32 bits to an
27921 // MMX register.
27922 ShAmt = DAG.getNode(X86ISD::MMX_MOVW2D, DL, MVT::x86mmx, ShAmt);
27923 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, Op.getValueType(),
27924 DAG.getTargetConstant(NewIntrinsic, DL,
27925 getPointerTy(DAG.getDataLayout())),
27926 Op.getOperand(1), ShAmt);
27927 }
27928 case Intrinsic::thread_pointer: {
27929 if (Subtarget.isTargetELF()) {
27930 SDLoc dl(Op);
27931 EVT PtrVT = getPointerTy(DAG.getDataLayout());
27932 // Get the Thread Pointer, which is %gs:0 (32-bit) or %fs:0 (64-bit).
27933 Value *Ptr = Constant::getNullValue(Type::getInt8PtrTy(
27934 *DAG.getContext(), Subtarget.is64Bit() ? X86AS::FS : X86AS::GS));
27935 return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(),
27936 DAG.getIntPtrConstant(0, dl), MachinePointerInfo(Ptr));
27937 }
27938 report_fatal_error(
27939 "Target OS doesn't support __builtin_thread_pointer() yet.");
27940 }
27941 }
27942}
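The rounding-mode helpers defined at the top of LowerINTRINSIC_WO_CHAIN all reason about the AVX-512 embedded rounding immediate: NO_EXC marks suppress-all-exceptions, and the remaining low bits pick one of the four explicit rounding modes or the current direction. A standalone sketch of the isRoundModeSAEToX check, with constants mirroring X86::STATIC_ROUNDING (the numeric values here are assumed for illustration; the authoritative definitions live in X86BaseInfo.h):

#include <cassert>

// Illustrative constants mirroring X86::STATIC_ROUNDING.
enum RoundingControl : unsigned {
  TO_NEAREST_INT = 0, TO_NEG_INF = 1, TO_POS_INF = 2, TO_ZERO = 3,
  CUR_DIRECTION = 4, NO_EXC = 8
};

// Same shape as the isRoundModeSAEToX lambda: accept only "suppress all
// exceptions" combined with one of the four explicit rounding modes.
static bool isSAEWithExplicitRounding(unsigned Imm, unsigned &RC) {
  if (!(Imm & NO_EXC))
    return false;
  RC = Imm ^ NO_EXC; // clear the NO_EXC bit, keep the rounding bits
  return RC == TO_NEAREST_INT || RC == TO_NEG_INF ||
         RC == TO_POS_INF || RC == TO_ZERO;
}

int main() {
  unsigned RC = 0;
  assert(isSAEWithExplicitRounding(NO_EXC | TO_ZERO, RC) && RC == TO_ZERO);
  assert(!isSAEWithExplicitRounding(CUR_DIRECTION, RC)); // NO_EXC bit not set
  return 0;
}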
27943
27944static SDValue getAVX2GatherNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
27945 SDValue Src, SDValue Mask, SDValue Base,
27946 SDValue Index, SDValue ScaleOp, SDValue Chain,
27947 const X86Subtarget &Subtarget) {
27948 SDLoc dl(Op);
27949 auto *C = dyn_cast<ConstantSDNode>(ScaleOp);
27950 // Scale must be constant.
27951 if (!C)
27952 return SDValue();
27953 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
27954 SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl,
27955 TLI.getPointerTy(DAG.getDataLayout()));
27956 EVT MaskVT = Mask.getValueType().changeVectorElementTypeToInteger();
27957 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::Other);
27958 // If source is undef or we know it won't be used, use a zero vector
27959 // to break register dependency.
27960 // TODO: use undef instead and let BreakFalseDeps deal with it?
27961 if (Src.isUndef() || ISD::isBuildVectorAllOnes(Mask.getNode()))
27962 Src = getZeroVector(Op.getSimpleValueType(), Subtarget, DAG, dl);
27963
27964 // Cast mask to an integer type.
27965 Mask = DAG.getBitcast(MaskVT, Mask);
27966
27967 MemIntrinsicSDNode *MemIntr = cast<MemIntrinsicSDNode>(Op);
27968
27969 SDValue Ops[] = {Chain, Src, Mask, Base, Index, Scale };
27970 SDValue Res =
27971 DAG.getMemIntrinsicNode(X86ISD::MGATHER, dl, VTs, Ops,
27972 MemIntr->getMemoryVT(), MemIntr->getMemOperand());
27973 return DAG.getMergeValues({Res, Res.getValue(1)}, dl);
27974}
27975
27976static SDValue getGatherNode(SDValue Op, SelectionDAG &DAG,
27977 SDValue Src, SDValue Mask, SDValue Base,
27978 SDValue Index, SDValue ScaleOp, SDValue Chain,
27979 const X86Subtarget &Subtarget) {
27980 MVT VT = Op.getSimpleValueType();
27981 SDLoc dl(Op);
27982 auto *C = dyn_cast<ConstantSDNode>(ScaleOp);
27983 // Scale must be constant.
27984 if (!C)
27985 return SDValue();
27986 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
27987 SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl,
27988 TLI.getPointerTy(DAG.getDataLayout()));
27989 unsigned MinElts = std::min(Index.getSimpleValueType().getVectorNumElements(),
27990 VT.getVectorNumElements());
27991 MVT MaskVT = MVT::getVectorVT(MVT::i1, MinElts);
27992
27993 // We support two versions of the gather intrinsics. One with scalar mask and
27994 // one with vXi1 mask. Convert scalar to vXi1 if necessary.
27995 if (Mask.getValueType() != MaskVT)
27996 Mask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
27997
27998 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::Other);
27999 // If source is undef or we know it won't be used, use a zero vector
28000 // to break register dependency.
28001 // TODO: use undef instead and let BreakFalseDeps deal with it?
28002 if (Src.isUndef() || ISD::isBuildVectorAllOnes(Mask.getNode()))
28003 Src = getZeroVector(Op.getSimpleValueType(), Subtarget, DAG, dl);
28004
28005 MemIntrinsicSDNode *MemIntr = cast<MemIntrinsicSDNode>(Op);
28006
28007 SDValue Ops[] = {Chain, Src, Mask, Base, Index, Scale };
28008 SDValue Res =
28009 DAG.getMemIntrinsicNode(X86ISD::MGATHER, dl, VTs, Ops,
28010 MemIntr->getMemoryVT(), MemIntr->getMemOperand());
28011 return DAG.getMergeValues({Res, Res.getValue(1)}, dl);
28012}
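The vXi1 mask width chosen above is the number of lanes actually gathered, i.e. the smaller of the index and result element counts. A sketch of that arithmetic only:

#include <algorithm>
#include <cassert>

int main() {
  // Gathering 32-bit elements into a v4i32 result using a v2i64 index vector:
  // only min(2, 4) = 2 lanes are gathered, so the mask type becomes v2i1.
  unsigned IndexElts = 2, ResultElts = 4;
  assert(std::min(IndexElts, ResultElts) == 2u);
  return 0;
}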
28013
28014static SDValue getScatterNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
28015 SDValue Src, SDValue Mask, SDValue Base,
28016 SDValue Index, SDValue ScaleOp, SDValue Chain,
28017 const X86Subtarget &Subtarget) {
28018 SDLoc dl(Op);
28019 auto *C = dyn_cast<ConstantSDNode>(ScaleOp);
28020 // Scale must be constant.
28021 if (!C)
28022 return SDValue();
28023 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
28024 SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl,
28025 TLI.getPointerTy(DAG.getDataLayout()));
28026 unsigned MinElts = std::min(Index.getSimpleValueType().getVectorNumElements(),
28027 Src.getSimpleValueType().getVectorNumElements());
28028 MVT MaskVT = MVT::getVectorVT(MVT::i1, MinElts);
28029
28030 // We support two versions of the scatter intrinsics. One with scalar mask and
28031 // one with vXi1 mask. Convert scalar to vXi1 if necessary.
28032 if (Mask.getValueType() != MaskVT)
28033 Mask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
28034
28035 MemIntrinsicSDNode *MemIntr = cast<MemIntrinsicSDNode>(Op);
28036
28037 SDVTList VTs = DAG.getVTList(MVT::Other);
28038 SDValue Ops[] = {Chain, Src, Mask, Base, Index, Scale};
28039 SDValue Res =
28040 DAG.getMemIntrinsicNode(X86ISD::MSCATTER, dl, VTs, Ops,
28041 MemIntr->getMemoryVT(), MemIntr->getMemOperand());
28042 return Res;
28043}
28044
28045static SDValue getPrefetchNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
28046 SDValue Mask, SDValue Base, SDValue Index,
28047 SDValue ScaleOp, SDValue Chain,
28048 const X86Subtarget &Subtarget) {
28049 SDLoc dl(Op);
28050 auto *C = dyn_cast<ConstantSDNode>(ScaleOp);
28051 // Scale must be constant.
28052 if (!C)
28053 return SDValue();
28054 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
28055 SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl,
28056 TLI.getPointerTy(DAG.getDataLayout()));
28057 SDValue Disp = DAG.getTargetConstant(0, dl, MVT::i32);
28058 SDValue Segment = DAG.getRegister(0, MVT::i32);
28059 MVT MaskVT =
28060 MVT::getVectorVT(MVT::i1, Index.getSimpleValueType().getVectorNumElements());
28061 SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
28062 SDValue Ops[] = {VMask, Base, Scale, Index, Disp, Segment, Chain};
28063 SDNode *Res = DAG.getMachineNode(Opc, dl, MVT::Other, Ops);
28064 return SDValue(Res, 0);
28065}
28066
28067/// Handles the lowering of builtin intrinsics with chain that return their
28068/// value into registers EDX:EAX.
28069 /// If operand SrcReg is a valid register identifier, then operand 2 of N is
28070/// copied to SrcReg. The assumption is that SrcReg is an implicit input to
28071/// TargetOpcode.
28072/// Returns a Glue value which can be used to add extra copy-from-reg if the
28073/// expanded intrinsics implicitly defines extra registers (i.e. not just
28074/// EDX:EAX).
28075static SDValue expandIntrinsicWChainHelper(SDNode *N, const SDLoc &DL,
28076 SelectionDAG &DAG,
28077 unsigned TargetOpcode,
28078 unsigned SrcReg,
28079 const X86Subtarget &Subtarget,
28080 SmallVectorImpl<SDValue> &Results) {
28081 SDValue Chain = N->getOperand(0);
28082 SDValue Glue;
28083
28084 if (SrcReg) {
28085 assert(N->getNumOperands() == 3 && "Unexpected number of operands!");
28086 Chain = DAG.getCopyToReg(Chain, DL, SrcReg, N->getOperand(2), Glue);
28087 Glue = Chain.getValue(1);
28088 }
28089
28090 SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
28091 SDValue N1Ops[] = {Chain, Glue};
28092 SDNode *N1 = DAG.getMachineNode(
28093 TargetOpcode, DL, Tys, ArrayRef<SDValue>(N1Ops, Glue.getNode() ? 2 : 1));
28094 Chain = SDValue(N1, 0);
28095
28096  // Read the result back out of registers EDX:EAX (RAX/RDX on 64-bit targets).
28097 SDValue LO, HI;
28098 if (Subtarget.is64Bit()) {
28099 LO = DAG.getCopyFromReg(Chain, DL, X86::RAX, MVT::i64, SDValue(N1, 1));
28100 HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::RDX, MVT::i64,
28101 LO.getValue(2));
28102 } else {
28103 LO = DAG.getCopyFromReg(Chain, DL, X86::EAX, MVT::i32, SDValue(N1, 1));
28104 HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::EDX, MVT::i32,
28105 LO.getValue(2));
28106 }
28107 Chain = HI.getValue(1);
28108 Glue = HI.getValue(2);
28109
28110 if (Subtarget.is64Bit()) {
28111 // Merge the two 32-bit values into a 64-bit one.
28112 SDValue Tmp = DAG.getNode(ISD::SHL, DL, MVT::i64, HI,
28113 DAG.getConstant(32, DL, MVT::i8));
28114 Results.push_back(DAG.getNode(ISD::OR, DL, MVT::i64, LO, Tmp));
28115 Results.push_back(Chain);
28116 return Glue;
28117 }
28118
28119 // Use a buildpair to merge the two 32-bit values into a 64-bit one.
28120 SDValue Ops[] = { LO, HI };
28121 SDValue Pair = DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Ops);
28122 Results.push_back(Pair);
28123 Results.push_back(Chain);
28124 return Glue;
28125}
28126
28127/// Handles the lowering of builtin intrinsics that read the time stamp counter
28128/// (x86_rdtsc and x86_rdtscp). This function is also used to custom lower
28129/// READCYCLECOUNTER nodes.
28130static void getReadTimeStampCounter(SDNode *N, const SDLoc &DL, unsigned Opcode,
28131 SelectionDAG &DAG,
28132 const X86Subtarget &Subtarget,
28133 SmallVectorImpl<SDValue> &Results) {
28134 // The processor's time-stamp counter (a 64-bit MSR) is stored into the
28135 // EDX:EAX registers. EDX is loaded with the high-order 32 bits of the MSR
28136 // and the EAX register is loaded with the low-order 32 bits.
28137 SDValue Glue = expandIntrinsicWChainHelper(N, DL, DAG, Opcode,
28138 /* NoRegister */0, Subtarget,
28139 Results);
28140 if (Opcode != X86::RDTSCP)
28141 return;
28142
28143 SDValue Chain = Results[1];
28144   // Instruction RDTSCP loads the IA32_TSC_AUX MSR (address C000_0103H) into
28145 // the ECX register. Add 'ecx' explicitly to the chain.
28146 SDValue ecx = DAG.getCopyFromReg(Chain, DL, X86::ECX, MVT::i32, Glue);
28147 Results[1] = ecx;
28148 Results.push_back(ecx.getValue(1));
28149}
28150
28151static SDValue LowerREADCYCLECOUNTER(SDValue Op, const X86Subtarget &Subtarget,
28152 SelectionDAG &DAG) {
28153 SmallVector<SDValue, 3> Results;
28154 SDLoc DL(Op);
28155 getReadTimeStampCounter(Op.getNode(), DL, X86::RDTSC, DAG, Subtarget,
28156 Results);
28157 return DAG.getMergeValues(Results, DL);
28158}
28159
28160static SDValue MarkEHRegistrationNode(SDValue Op, SelectionDAG &DAG) {
28161 MachineFunction &MF = DAG.getMachineFunction();
28162 SDValue Chain = Op.getOperand(0);
28163 SDValue RegNode = Op.getOperand(2);
28164 WinEHFuncInfo *EHInfo = MF.getWinEHFuncInfo();
28165 if (!EHInfo)
28166 report_fatal_error("EH registrations only live in functions using WinEH");
28167
28168 // Cast the operand to an alloca, and remember the frame index.
28169 auto *FINode = dyn_cast<FrameIndexSDNode>(RegNode);
28170 if (!FINode)
28171 report_fatal_error("llvm.x86.seh.ehregnode expects a static alloca");
28172 EHInfo->EHRegNodeFrameIndex = FINode->getIndex();
28173
28174 // Return the chain operand without making any DAG nodes.
28175 return Chain;
28176}
28177
28178static SDValue MarkEHGuard(SDValue Op, SelectionDAG &DAG) {
28179 MachineFunction &MF = DAG.getMachineFunction();
28180 SDValue Chain = Op.getOperand(0);
28181 SDValue EHGuard = Op.getOperand(2);
28182 WinEHFuncInfo *EHInfo = MF.getWinEHFuncInfo();
28183 if (!EHInfo)
28184 report_fatal_error("EHGuard only live in functions using WinEH");
28185
28186 // Cast the operand to an alloca, and remember the frame index.
28187 auto *FINode = dyn_cast<FrameIndexSDNode>(EHGuard);
28188 if (!FINode)
28189 report_fatal_error("llvm.x86.seh.ehguard expects a static alloca");
28190 EHInfo->EHGuardFrameIndex = FINode->getIndex();
28191
28192 // Return the chain operand without making any DAG nodes.
28193 return Chain;
28194}
28195
28196/// Emit Truncating Store with signed or unsigned saturation.
28197static SDValue
28198EmitTruncSStore(bool SignedSat, SDValue Chain, const SDLoc &Dl, SDValue Val,
28199 SDValue Ptr, EVT MemVT, MachineMemOperand *MMO,
28200 SelectionDAG &DAG) {
28201 SDVTList VTs = DAG.getVTList(MVT::Other);
28202 SDValue Undef = DAG.getUNDEF(Ptr.getValueType());
28203 SDValue Ops[] = { Chain, Val, Ptr, Undef };
28204 unsigned Opc = SignedSat ? X86ISD::VTRUNCSTORES : X86ISD::VTRUNCSTOREUS;
28205 return DAG.getMemIntrinsicNode(Opc, Dl, VTs, Ops, MemVT, MMO);
28206}
28207
28208/// Emit Masked Truncating Store with signed or unsigned saturation.
28209static SDValue
28210EmitMaskedTruncSStore(bool SignedSat, SDValue Chain, const SDLoc &Dl,
28211 SDValue Val, SDValue Ptr, SDValue Mask, EVT MemVT,
28212 MachineMemOperand *MMO, SelectionDAG &DAG) {
28213 SDVTList VTs = DAG.getVTList(MVT::Other);
28214 SDValue Ops[] = { Chain, Val, Ptr, Mask };
28215 unsigned Opc = SignedSat ? X86ISD::VMTRUNCSTORES : X86ISD::VMTRUNCSTOREUS;
28216 return DAG.getMemIntrinsicNode(Opc, Dl, VTs, Ops, MemVT, MMO);
28217}
28218
28219static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget &Subtarget,
28220 SelectionDAG &DAG) {
28221 unsigned IntNo = Op.getConstantOperandVal(1);
28222 const IntrinsicData *IntrData = getIntrinsicWithChain(IntNo);
28223 if (!IntrData) {
28224 switch (IntNo) {
28225
28226 case Intrinsic::swift_async_context_addr: {
28227 SDLoc dl(Op);
28228 auto &MF = DAG.getMachineFunction();
28229 auto X86FI = MF.getInfo<X86MachineFunctionInfo>();
28230 if (Subtarget.is64Bit()) {
28231 MF.getFrameInfo().setFrameAddressIsTaken(true);
28232 X86FI->setHasSwiftAsyncContext(true);
28233 SDValue Chain = Op->getOperand(0);
28234 SDValue CopyRBP = DAG.getCopyFromReg(Chain, dl, X86::RBP, MVT::i64);
28235 SDValue Result =
28236 SDValue(DAG.getMachineNode(X86::SUB64ri8, dl, MVT::i64, CopyRBP,
28237 DAG.getTargetConstant(8, dl, MVT::i32)),
28238 0);
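            // The returned address is %rbp - 8, the slot where the Swift
            // extended frame is expected to hold the async context pointer.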
28239 // Return { result, chain }.
28240 return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), Result,
28241 CopyRBP.getValue(1));
28242 } else {
28243         // 32-bit, so there is no special extended frame; create or reuse an
28244         // existing stack slot.
28245 if (!X86FI->getSwiftAsyncContextFrameIdx())
28246 X86FI->setSwiftAsyncContextFrameIdx(
28247 MF.getFrameInfo().CreateStackObject(4, Align(4), false));
28248 SDValue Result =
28249 DAG.getFrameIndex(*X86FI->getSwiftAsyncContextFrameIdx(), MVT::i32);
28250 // Return { result, chain }.
28251 return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), Result,
28252 Op->getOperand(0));
28253 }
28254 }
28255
28256 case llvm::Intrinsic::x86_seh_ehregnode:
28257 return MarkEHRegistrationNode(Op, DAG);
28258 case llvm::Intrinsic::x86_seh_ehguard:
28259 return MarkEHGuard(Op, DAG);
28260 case llvm::Intrinsic::x86_rdpkru: {
28261 SDLoc dl(Op);
28262 SDVTList VTs = DAG.getVTList(MVT::i32, MVT::Other);
28263 // Create a RDPKRU node and pass 0 to the ECX parameter.
28264 return DAG.getNode(X86ISD::RDPKRU, dl, VTs, Op.getOperand(0),
28265 DAG.getConstant(0, dl, MVT::i32));
28266 }
28267 case llvm::Intrinsic::x86_wrpkru: {
28268 SDLoc dl(Op);
28269 // Create a WRPKRU node, pass the input to the EAX parameter, and pass 0
28270 // to the EDX and ECX parameters.
28271 return DAG.getNode(X86ISD::WRPKRU, dl, MVT::Other,
28272 Op.getOperand(0), Op.getOperand(2),
28273 DAG.getConstant(0, dl, MVT::i32),
28274 DAG.getConstant(0, dl, MVT::i32));
28275 }
28276 case llvm::Intrinsic::asan_check_memaccess: {
28277 // Mark this as adjustsStack because it will be lowered to a call.
28278 DAG.getMachineFunction().getFrameInfo().setAdjustsStack(true);
28279 // Don't do anything here, we will expand these intrinsics out later.
28280 return Op;
28281 }
28282 case llvm::Intrinsic::x86_flags_read_u32:
28283 case llvm::Intrinsic::x86_flags_read_u64:
28284 case llvm::Intrinsic::x86_flags_write_u32:
28285 case llvm::Intrinsic::x86_flags_write_u64: {
28286 // We need a frame pointer because this will get lowered to a PUSH/POP
28287 // sequence.
28288 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
28289 MFI.setHasCopyImplyingStackAdjustment(true);
28290 // Don't do anything here, we will expand these intrinsics out later
28291 // during FinalizeISel in EmitInstrWithCustomInserter.
28292 return Op;
28293 }
28294 case Intrinsic::x86_lwpins32:
28295 case Intrinsic::x86_lwpins64:
28296 case Intrinsic::x86_umwait:
28297 case Intrinsic::x86_tpause: {
28298 SDLoc dl(Op);
28299 SDValue Chain = Op->getOperand(0);
28300 SDVTList VTs = DAG.getVTList(MVT::i32, MVT::Other);
28301 unsigned Opcode;
28302
28303 switch (IntNo) {
28304       default: llvm_unreachable("Impossible intrinsic");
28305 case Intrinsic::x86_umwait:
28306 Opcode = X86ISD::UMWAIT;
28307 break;
28308 case Intrinsic::x86_tpause:
28309 Opcode = X86ISD::TPAUSE;
28310 break;
28311 case Intrinsic::x86_lwpins32:
28312 case Intrinsic::x86_lwpins64:
28313 Opcode = X86ISD::LWPINS;
28314 break;
28315 }
28316
28317 SDValue Operation =
28318 DAG.getNode(Opcode, dl, VTs, Chain, Op->getOperand(2),
28319 Op->getOperand(3), Op->getOperand(4));
28320 SDValue SetCC = getSETCC(X86::COND_B, Operation.getValue(0), dl, DAG);
28321 return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), SetCC,
28322 Operation.getValue(1));
28323 }
28324 case Intrinsic::x86_enqcmd:
28325 case Intrinsic::x86_enqcmds: {
28326 SDLoc dl(Op);
28327 SDValue Chain = Op.getOperand(0);
28328 SDVTList VTs = DAG.getVTList(MVT::i32, MVT::Other);
28329 unsigned Opcode;
28330 switch (IntNo) {
28331       default: llvm_unreachable("Impossible intrinsic!");
28332 case Intrinsic::x86_enqcmd:
28333 Opcode = X86ISD::ENQCMD;
28334 break;
28335 case Intrinsic::x86_enqcmds:
28336 Opcode = X86ISD::ENQCMDS;
28337 break;
28338 }
28339 SDValue Operation = DAG.getNode(Opcode, dl, VTs, Chain, Op.getOperand(2),
28340 Op.getOperand(3));
28341 SDValue SetCC = getSETCC(X86::COND_E, Operation.getValue(0), dl, DAG);
28342 return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), SetCC,
28343 Operation.getValue(1));
28344 }
28345 case Intrinsic::x86_aesenc128kl:
28346 case Intrinsic::x86_aesdec128kl:
28347 case Intrinsic::x86_aesenc256kl:
28348 case Intrinsic::x86_aesdec256kl: {
28349 SDLoc DL(Op);
28350 SDVTList VTs = DAG.getVTList(MVT::v2i64, MVT::i32, MVT::Other);
28351 SDValue Chain = Op.getOperand(0);
28352 unsigned Opcode;
28353
28354 switch (IntNo) {
28355       default: llvm_unreachable("Impossible intrinsic");
28356 case Intrinsic::x86_aesenc128kl:
28357 Opcode = X86ISD::AESENC128KL;
28358 break;
28359 case Intrinsic::x86_aesdec128kl:
28360 Opcode = X86ISD::AESDEC128KL;
28361 break;
28362 case Intrinsic::x86_aesenc256kl:
28363 Opcode = X86ISD::AESENC256KL;
28364 break;
28365 case Intrinsic::x86_aesdec256kl:
28366 Opcode = X86ISD::AESDEC256KL;
28367 break;
28368 }
28369
28370 MemIntrinsicSDNode *MemIntr = cast<MemIntrinsicSDNode>(Op);
28371 MachineMemOperand *MMO = MemIntr->getMemOperand();
28372 EVT MemVT = MemIntr->getMemoryVT();
28373 SDValue Operation = DAG.getMemIntrinsicNode(
28374 Opcode, DL, VTs, {Chain, Op.getOperand(2), Op.getOperand(3)}, MemVT,
28375 MMO);
28376 SDValue ZF = getSETCC(X86::COND_E, Operation.getValue(1), DL, DAG);
28377
28378 return DAG.getNode(ISD::MERGE_VALUES, DL, Op->getVTList(),
28379 {ZF, Operation.getValue(0), Operation.getValue(2)});
28380 }
28381 case Intrinsic::x86_aesencwide128kl:
28382 case Intrinsic::x86_aesdecwide128kl:
28383 case Intrinsic::x86_aesencwide256kl:
28384 case Intrinsic::x86_aesdecwide256kl: {
28385 SDLoc DL(Op);
28386 SDVTList VTs = DAG.getVTList(
28387 {MVT::i32, MVT::v2i64, MVT::v2i64, MVT::v2i64, MVT::v2i64, MVT::v2i64,
28388 MVT::v2i64, MVT::v2i64, MVT::v2i64, MVT::Other});
28389 SDValue Chain = Op.getOperand(0);
28390 unsigned Opcode;
28391
28392 switch (IntNo) {
28393       default: llvm_unreachable("Impossible intrinsic");
28394 case Intrinsic::x86_aesencwide128kl:
28395 Opcode = X86ISD::AESENCWIDE128KL;
28396 break;
28397 case Intrinsic::x86_aesdecwide128kl:
28398 Opcode = X86ISD::AESDECWIDE128KL;
28399 break;
28400 case Intrinsic::x86_aesencwide256kl:
28401 Opcode = X86ISD::AESENCWIDE256KL;
28402 break;
28403 case Intrinsic::x86_aesdecwide256kl:
28404 Opcode = X86ISD::AESDECWIDE256KL;
28405 break;
28406 }
28407
28408 MemIntrinsicSDNode *MemIntr = cast<MemIntrinsicSDNode>(Op);
28409 MachineMemOperand *MMO = MemIntr->getMemOperand();
28410 EVT MemVT = MemIntr->getMemoryVT();
28411 SDValue Operation = DAG.getMemIntrinsicNode(
28412 Opcode, DL, VTs,
28413 {Chain, Op.getOperand(2), Op.getOperand(3), Op.getOperand(4),
28414 Op.getOperand(5), Op.getOperand(6), Op.getOperand(7),
28415 Op.getOperand(8), Op.getOperand(9), Op.getOperand(10)},
28416 MemVT, MMO);
28417 SDValue ZF = getSETCC(X86::COND_E, Operation.getValue(0), DL, DAG);
28418
28419 return DAG.getNode(ISD::MERGE_VALUES, DL, Op->getVTList(),
28420 {ZF, Operation.getValue(1), Operation.getValue(2),
28421 Operation.getValue(3), Operation.getValue(4),
28422 Operation.getValue(5), Operation.getValue(6),
28423 Operation.getValue(7), Operation.getValue(8),
28424 Operation.getValue(9)});
28425 }
28426 case Intrinsic::x86_testui: {
28427 SDLoc dl(Op);
28428 SDValue Chain = Op.getOperand(0);
28429 SDVTList VTs = DAG.getVTList(MVT::i32, MVT::Other);
28430 SDValue Operation = DAG.getNode(X86ISD::TESTUI, dl, VTs, Chain);
28431 SDValue SetCC = getSETCC(X86::COND_B, Operation.getValue(0), dl, DAG);
28432 return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), SetCC,
28433 Operation.getValue(1));
28434 }
28435 case Intrinsic::x86_atomic_bts_rm:
28436 case Intrinsic::x86_atomic_btc_rm:
28437 case Intrinsic::x86_atomic_btr_rm: {
28438 SDLoc DL(Op);
28439 MVT VT = Op.getSimpleValueType();
28440 SDValue Chain = Op.getOperand(0);
28441 SDValue Op1 = Op.getOperand(2);
28442 SDValue Op2 = Op.getOperand(3);
28443 unsigned Opc = IntNo == Intrinsic::x86_atomic_bts_rm ? X86ISD::LBTS_RM
28444 : IntNo == Intrinsic::x86_atomic_btc_rm ? X86ISD::LBTC_RM
28445 : X86ISD::LBTR_RM;
28446 MachineMemOperand *MMO = cast<MemIntrinsicSDNode>(Op)->getMemOperand();
28447 SDValue Res =
28448 DAG.getMemIntrinsicNode(Opc, DL, DAG.getVTList(MVT::i32, MVT::Other),
28449 {Chain, Op1, Op2}, VT, MMO);
28450 Chain = Res.getValue(1);
28451 Res = DAG.getZExtOrTrunc(getSETCC(X86::COND_B, Res, DL, DAG), DL, VT);
28452 return DAG.getNode(ISD::MERGE_VALUES, DL, Op->getVTList(), Res, Chain);
28453 }
28454 case Intrinsic::x86_atomic_bts:
28455 case Intrinsic::x86_atomic_btc:
28456 case Intrinsic::x86_atomic_btr: {
28457 SDLoc DL(Op);
28458 MVT VT = Op.getSimpleValueType();
28459 SDValue Chain = Op.getOperand(0);
28460 SDValue Op1 = Op.getOperand(2);
28461 SDValue Op2 = Op.getOperand(3);
28462 unsigned Opc = IntNo == Intrinsic::x86_atomic_bts ? X86ISD::LBTS
28463 : IntNo == Intrinsic::x86_atomic_btc ? X86ISD::LBTC
28464 : X86ISD::LBTR;
28465 SDValue Size = DAG.getConstant(VT.getScalarSizeInBits(), DL, MVT::i32);
28466 MachineMemOperand *MMO = cast<MemIntrinsicSDNode>(Op)->getMemOperand();
28467 SDValue Res =
28468 DAG.getMemIntrinsicNode(Opc, DL, DAG.getVTList(MVT::i32, MVT::Other),
28469 {Chain, Op1, Op2, Size}, VT, MMO);
28470 Chain = Res.getValue(1);
28471 Res = DAG.getZExtOrTrunc(getSETCC(X86::COND_B, Res, DL, DAG), DL, VT);
28472 unsigned Imm = cast<ConstantSDNode>(Op2)->getZExtValue();
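          // The 0/1 carry result below is shifted back to bit position Imm, so
          // the value returned to the caller carries the original bit in place
          // (e.g. a set bit 5 comes back as 0x20 rather than 1).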
28473 if (Imm)
28474 Res = DAG.getNode(ISD::SHL, DL, VT, Res,
28475 DAG.getShiftAmountConstant(Imm, VT, DL));
28476 return DAG.getNode(ISD::MERGE_VALUES, DL, Op->getVTList(), Res, Chain);
28477 }
28478 case Intrinsic::x86_cmpccxadd32:
28479 case Intrinsic::x86_cmpccxadd64: {
28480 SDLoc DL(Op);
28481 SDValue Chain = Op.getOperand(0);
28482 SDValue Addr = Op.getOperand(2);
28483 SDValue Src1 = Op.getOperand(3);
28484 SDValue Src2 = Op.getOperand(4);
28485 SDValue CC = Op.getOperand(5);
28486 MachineMemOperand *MMO = cast<MemIntrinsicSDNode>(Op)->getMemOperand();
28487 SDValue Operation = DAG.getMemIntrinsicNode(
28488 X86ISD::CMPCCXADD, DL, Op->getVTList(), {Chain, Addr, Src1, Src2, CC},
28489 MVT::i32, MMO);
28490 return Operation;
28491 }
28492 case Intrinsic::x86_aadd32:
28493 case Intrinsic::x86_aadd64:
28494 case Intrinsic::x86_aand32:
28495 case Intrinsic::x86_aand64:
28496 case Intrinsic::x86_aor32:
28497 case Intrinsic::x86_aor64:
28498 case Intrinsic::x86_axor32:
28499 case Intrinsic::x86_axor64: {
28500 SDLoc DL(Op);
28501 SDValue Chain = Op.getOperand(0);
28502 SDValue Op1 = Op.getOperand(2);
28503 SDValue Op2 = Op.getOperand(3);
28504 MVT VT = Op2.getSimpleValueType();
28505 unsigned Opc = 0;
28506 switch (IntNo) {
28507 default:
28508         llvm_unreachable("Unknown Intrinsic");
28509 case Intrinsic::x86_aadd32:
28510 case Intrinsic::x86_aadd64:
28511 Opc = X86ISD::AADD;
28512 break;
28513 case Intrinsic::x86_aand32:
28514 case Intrinsic::x86_aand64:
28515 Opc = X86ISD::AAND;
28516 break;
28517 case Intrinsic::x86_aor32:
28518 case Intrinsic::x86_aor64:
28519 Opc = X86ISD::AOR;
28520 break;
28521 case Intrinsic::x86_axor32:
28522 case Intrinsic::x86_axor64:
28523 Opc = X86ISD::AXOR;
28524 break;
28525 }
28526 MachineMemOperand *MMO = cast<MemSDNode>(Op)->getMemOperand();
28527 return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(),
28528 {Chain, Op1, Op2}, VT, MMO);
28529 }
28530 case Intrinsic::x86_atomic_add_cc:
28531 case Intrinsic::x86_atomic_sub_cc:
28532 case Intrinsic::x86_atomic_or_cc:
28533 case Intrinsic::x86_atomic_and_cc:
28534 case Intrinsic::x86_atomic_xor_cc: {
28535 SDLoc DL(Op);
28536 SDValue Chain = Op.getOperand(0);
28537 SDValue Op1 = Op.getOperand(2);
28538 SDValue Op2 = Op.getOperand(3);
28539 X86::CondCode CC = (X86::CondCode)Op.getConstantOperandVal(4);
28540 MVT VT = Op2.getSimpleValueType();
28541 unsigned Opc = 0;
28542 switch (IntNo) {
28543 default:
28544         llvm_unreachable("Unknown Intrinsic");
28545 case Intrinsic::x86_atomic_add_cc:
28546 Opc = X86ISD::LADD;
28547 break;
28548 case Intrinsic::x86_atomic_sub_cc:
28549 Opc = X86ISD::LSUB;
28550 break;
28551 case Intrinsic::x86_atomic_or_cc:
28552 Opc = X86ISD::LOR;
28553 break;
28554 case Intrinsic::x86_atomic_and_cc:
28555 Opc = X86ISD::LAND;
28556 break;
28557 case Intrinsic::x86_atomic_xor_cc:
28558 Opc = X86ISD::LXOR;
28559 break;
28560 }
28561 MachineMemOperand *MMO = cast<MemIntrinsicSDNode>(Op)->getMemOperand();
28562 SDValue LockArith =
28563 DAG.getMemIntrinsicNode(Opc, DL, DAG.getVTList(MVT::i32, MVT::Other),
28564 {Chain, Op1, Op2}, VT, MMO);
28565 Chain = LockArith.getValue(1);
28566 return DAG.getMergeValues({getSETCC(CC, LockArith, DL, DAG), Chain}, DL);
28567 }
28568 }
28569 return SDValue();
28570 }
28571
28572 SDLoc dl(Op);
28573 switch(IntrData->Type) {
28574   default: llvm_unreachable("Unknown Intrinsic Type");
28575 case RDSEED:
28576 case RDRAND: {
28577 // Emit the node with the right value type.
28578 SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::i32, MVT::Other);
28579 SDValue Result = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(0));
28580
28581 // If the value returned by RDRAND/RDSEED was valid (CF=1), return 1.
28582     // Otherwise return the random value (always 0 in that case), cast to i32.
28583 SDValue Ops[] = {DAG.getZExtOrTrunc(Result, dl, Op->getValueType(1)),
28584 DAG.getConstant(1, dl, Op->getValueType(1)),
28585 DAG.getTargetConstant(X86::COND_B, dl, MVT::i8),
28586 SDValue(Result.getNode(), 1)};
28587 SDValue isValid = DAG.getNode(X86ISD::CMOV, dl, Op->getValueType(1), Ops);
28588
28589 // Return { result, isValid, chain }.
28590 return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), Result, isValid,
28591 SDValue(Result.getNode(), 2));
28592 }
28593 case GATHER_AVX2: {
28594 SDValue Chain = Op.getOperand(0);
28595 SDValue Src = Op.getOperand(2);
28596 SDValue Base = Op.getOperand(3);
28597 SDValue Index = Op.getOperand(4);
28598 SDValue Mask = Op.getOperand(5);
28599 SDValue Scale = Op.getOperand(6);
28600 return getAVX2GatherNode(IntrData->Opc0, Op, DAG, Src, Mask, Base, Index,
28601 Scale, Chain, Subtarget);
28602 }
28603 case GATHER: {
28604     // gather(v1, mask, index, base, scale);
28605 SDValue Chain = Op.getOperand(0);
28606 SDValue Src = Op.getOperand(2);
28607 SDValue Base = Op.getOperand(3);
28608 SDValue Index = Op.getOperand(4);
28609 SDValue Mask = Op.getOperand(5);
28610 SDValue Scale = Op.getOperand(6);
28611 return getGatherNode(Op, DAG, Src, Mask, Base, Index, Scale,
28612 Chain, Subtarget);
28613 }
28614 case SCATTER: {
28615     // scatter(base, mask, index, v1, scale);
28616 SDValue Chain = Op.getOperand(0);
28617 SDValue Base = Op.getOperand(2);
28618 SDValue Mask = Op.getOperand(3);
28619 SDValue Index = Op.getOperand(4);
28620 SDValue Src = Op.getOperand(5);
28621 SDValue Scale = Op.getOperand(6);
28622 return getScatterNode(IntrData->Opc0, Op, DAG, Src, Mask, Base, Index,
28623 Scale, Chain, Subtarget);
28624 }
28625 case PREFETCH: {
28626 const APInt &HintVal = Op.getConstantOperandAPInt(6);
28627     assert((HintVal == 2 || HintVal == 3) &&
28628            "Wrong prefetch hint in intrinsic: should be 2 or 3");
28629 unsigned Opcode = (HintVal == 2 ? IntrData->Opc1 : IntrData->Opc0);
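          // Following the _MM_HINT_* numbering, a hint of 3 (_MM_HINT_T0)
          // selects Opc0 and a hint of 2 (_MM_HINT_T1) selects Opc1.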
28630 SDValue Chain = Op.getOperand(0);
28631 SDValue Mask = Op.getOperand(2);
28632 SDValue Index = Op.getOperand(3);
28633 SDValue Base = Op.getOperand(4);
28634 SDValue Scale = Op.getOperand(5);
28635 return getPrefetchNode(Opcode, Op, DAG, Mask, Base, Index, Scale, Chain,
28636 Subtarget);
28637 }
28638 // Read Time Stamp Counter (RDTSC) and Processor ID (RDTSCP).
28639 case RDTSC: {
28640 SmallVector<SDValue, 2> Results;
28641 getReadTimeStampCounter(Op.getNode(), dl, IntrData->Opc0, DAG, Subtarget,
28642 Results);
28643 return DAG.getMergeValues(Results, dl);
28644 }
28645 // Read Performance Monitoring Counters.
28646 case RDPMC:
28647 // Read Processor Register.
28648 case RDPRU:
28649   // Get Extended Control Register.
28650 case XGETBV: {
28651 SmallVector<SDValue, 2> Results;
28652
28653 // RDPMC uses ECX to select the index of the performance counter to read.
28654 // RDPRU uses ECX to select the processor register to read.
28655 // XGETBV uses ECX to select the index of the XCR register to return.
28656 // The result is stored into registers EDX:EAX.
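        // For example, XGETBV with ECX == 0 reads XCR0; the helper below then
        // merges the EDX:EAX halves into the single i64 result.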
28657 expandIntrinsicWChainHelper(Op.getNode(), dl, DAG, IntrData->Opc0, X86::ECX,
28658 Subtarget, Results);
28659 return DAG.getMergeValues(Results, dl);
28660 }
28661 // XTEST intrinsics.
28662 case XTEST: {
28663 SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::Other);
28664 SDValue InTrans = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(0));
28665
28666 SDValue SetCC = getSETCC(X86::COND_NE, InTrans, dl, DAG);
28667 SDValue Ret = DAG.getNode(ISD::ZERO_EXTEND, dl, Op->getValueType(0), SetCC);
28668 return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(),
28669 Ret, SDValue(InTrans.getNode(), 1));
28670 }
28671 case TRUNCATE_TO_MEM_VI8:
28672 case TRUNCATE_TO_MEM_VI16:
28673 case TRUNCATE_TO_MEM_VI32: {
28674 SDValue Mask = Op.getOperand(4);
28675 SDValue DataToTruncate = Op.getOperand(3);
28676 SDValue Addr = Op.getOperand(2);
28677 SDValue Chain = Op.getOperand(0);
28678
28679 MemIntrinsicSDNode *MemIntr = dyn_cast<MemIntrinsicSDNode>(Op);
28680     assert(MemIntr && "Expected MemIntrinsicSDNode!");
28681
28682 EVT MemVT = MemIntr->getMemoryVT();
28683
28684 uint16_t TruncationOp = IntrData->Opc0;
28685 switch (TruncationOp) {
28686 case X86ISD::VTRUNC: {
28687 if (isAllOnesConstant(Mask)) // return just a truncate store
28688 return DAG.getTruncStore(Chain, dl, DataToTruncate, Addr, MemVT,
28689 MemIntr->getMemOperand());
28690
28691 MVT MaskVT = MVT::getVectorVT(MVT::i1, MemVT.getVectorNumElements());
28692 SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
28693 SDValue Offset = DAG.getUNDEF(VMask.getValueType());
28694
28695 return DAG.getMaskedStore(Chain, dl, DataToTruncate, Addr, Offset, VMask,
28696 MemVT, MemIntr->getMemOperand(), ISD::UNINDEXED,
28697 true /* truncating */);
28698 }
28699 case X86ISD::VTRUNCUS:
28700 case X86ISD::VTRUNCS: {
28701 bool IsSigned = (TruncationOp == X86ISD::VTRUNCS);
28702 if (isAllOnesConstant(Mask))
28703 return EmitTruncSStore(IsSigned, Chain, dl, DataToTruncate, Addr, MemVT,
28704 MemIntr->getMemOperand(), DAG);
28705
28706 MVT MaskVT = MVT::getVectorVT(MVT::i1, MemVT.getVectorNumElements());
28707 SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
28708
28709 return EmitMaskedTruncSStore(IsSigned, Chain, dl, DataToTruncate, Addr,
28710 VMask, MemVT, MemIntr->getMemOperand(), DAG);
28711 }
28712 default:
28713       llvm_unreachable("Unsupported truncstore intrinsic");
28714 }
28715 }
28716 }
28717}
28718
28719SDValue X86TargetLowering::LowerRETURNADDR(SDValue Op,
28720 SelectionDAG &DAG) const {
28721 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
28722 MFI.setReturnAddressIsTaken(true);
28723
28724 if (verifyReturnAddressArgumentIsConstant(Op, DAG))
28725 return SDValue();
28726
28727 unsigned Depth = Op.getConstantOperandVal(0);
28728 SDLoc dl(Op);
28729 EVT PtrVT = getPointerTy(DAG.getDataLayout());
28730
28731 if (Depth > 0) {
28732 SDValue FrameAddr = LowerFRAMEADDR(Op, DAG);
28733 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
28734 SDValue Offset = DAG.getConstant(RegInfo->getSlotSize(), dl, PtrVT);
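        // With a frame pointer, the caller's return address sits one pointer
        // slot above the saved frame pointer, hence the load from
        // FrameAddr + Offset.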
28735 return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(),
28736 DAG.getNode(ISD::ADD, dl, PtrVT, FrameAddr, Offset),
28737 MachinePointerInfo());
28738 }
28739
28740 // Just load the return address.
28741 SDValue RetAddrFI = getReturnAddressFrameIndex(DAG);
28742 return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), RetAddrFI,
28743 MachinePointerInfo());
28744}
28745
28746SDValue X86TargetLowering::LowerADDROFRETURNADDR(SDValue Op,
28747 SelectionDAG &DAG) const {
28748 DAG.getMachineFunction().getFrameInfo().setReturnAddressIsTaken(true);
28749 return getReturnAddressFrameIndex(DAG);
28750}
28751
28752SDValue X86TargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const {
28753 MachineFunction &MF = DAG.getMachineFunction();
28754 MachineFrameInfo &MFI = MF.getFrameInfo();
28755 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
28756 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
28757 EVT VT = Op.getValueType();
28758
28759 MFI.setFrameAddressIsTaken(true);
28760
28761 if (MF.getTarget().getMCAsmInfo()->usesWindowsCFI()) {
28762 // Depth > 0 makes no sense on targets which use Windows unwind codes. It
28763 // is not possible to crawl up the stack without looking at the unwind codes
28764 // simultaneously.
28765 int FrameAddrIndex = FuncInfo->getFAIndex();
28766 if (!FrameAddrIndex) {
28767 // Set up a frame object for the return address.
28768 unsigned SlotSize = RegInfo->getSlotSize();
28769 FrameAddrIndex = MF.getFrameInfo().CreateFixedObject(
28770 SlotSize, /*SPOffset=*/0, /*IsImmutable=*/false);
28771 FuncInfo->setFAIndex(FrameAddrIndex);
28772 }
28773 return DAG.getFrameIndex(FrameAddrIndex, VT);
28774 }
28775
28776 unsigned FrameReg =
28777 RegInfo->getPtrSizedFrameRegister(DAG.getMachineFunction());
28778 SDLoc dl(Op); // FIXME probably not meaningful
28779 unsigned Depth = Op.getConstantOperandVal(0);
28780   assert(((FrameReg == X86::RBP && VT == MVT::i64) ||
28781           (FrameReg == X86::EBP && VT == MVT::i32)) &&
28782          "Invalid Frame Register!");
28783 SDValue FrameAddr = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, VT);
28784 while (Depth--)
28785 FrameAddr = DAG.getLoad(VT, dl, DAG.getEntryNode(), FrameAddr,
28786 MachinePointerInfo());
28787 return FrameAddr;
28788}
28789
28790// FIXME? Maybe this could be a TableGen attribute on some registers and
28791// this table could be generated automatically from RegInfo.
28792Register X86TargetLowering::getRegisterByName(const char* RegName, LLT VT,
28793 const MachineFunction &MF) const {
28794 const TargetFrameLowering &TFI = *Subtarget.getFrameLowering();
28795
28796 Register Reg = StringSwitch<unsigned>(RegName)
28797 .Case("esp", X86::ESP)
28798 .Case("rsp", X86::RSP)
28799 .Case("ebp", X86::EBP)
28800 .Case("rbp", X86::RBP)
28801 .Default(0);
28802
28803 if (Reg == X86::EBP || Reg == X86::RBP) {
28804 if (!TFI.hasFP(MF))
28805 report_fatal_error("register " + StringRef(RegName) +
28806 " is allocatable: function has no frame pointer");
28807#ifndef NDEBUG
28808 else {
28809 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
28810 Register FrameReg = RegInfo->getPtrSizedFrameRegister(MF);
28811       assert((FrameReg == X86::EBP || FrameReg == X86::RBP) &&
28812              "Invalid Frame Register!");
28813 }
28814#endif
28815 }
28816
28817 if (Reg)
28818 return Reg;
28819
28820 report_fatal_error("Invalid register name global variable");
28821}
28822
28823SDValue X86TargetLowering::LowerFRAME_TO_ARGS_OFFSET(SDValue Op,
28824 SelectionDAG &DAG) const {
28825 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
28826 return DAG.getIntPtrConstant(2 * RegInfo->getSlotSize(), SDLoc(Op));
28827}
28828
28829Register X86TargetLowering::getExceptionPointerRegister(
28830 const Constant *PersonalityFn) const {
28831 if (classifyEHPersonality(PersonalityFn) == EHPersonality::CoreCLR)
28832 return Subtarget.isTarget64BitLP64() ? X86::RDX : X86::EDX;
28833
28834 return Subtarget.isTarget64BitLP64() ? X86::RAX : X86::EAX;
28835}
28836
28837Register X86TargetLowering::getExceptionSelectorRegister(
28838 const Constant *PersonalityFn) const {
28839 // Funclet personalities don't use selectors (the runtime does the selection).
28840 if (isFuncletEHPersonality(classifyEHPersonality(PersonalityFn)))
28841 return X86::NoRegister;
28842 return Subtarget.isTarget64BitLP64() ? X86::RDX : X86::EDX;
28843}
28844
28845bool X86TargetLowering::needsFixedCatchObjects() const {
28846 return Subtarget.isTargetWin64();
28847}
28848
28849SDValue X86TargetLowering::LowerEH_RETURN(SDValue Op, SelectionDAG &DAG) const {
28850 SDValue Chain = Op.getOperand(0);
28851 SDValue Offset = Op.getOperand(1);
28852 SDValue Handler = Op.getOperand(2);
28853 SDLoc dl (Op);
28854
28855 EVT PtrVT = getPointerTy(DAG.getDataLayout());
28856 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
28857 Register FrameReg = RegInfo->getFrameRegister(DAG.getMachineFunction());
28858   assert(((FrameReg == X86::RBP && PtrVT == MVT::i64) ||
28859           (FrameReg == X86::EBP && PtrVT == MVT::i32)) &&
28860          "Invalid Frame Register!");
28861 SDValue Frame = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, PtrVT);
28862 Register StoreAddrReg = (PtrVT == MVT::i64) ? X86::RCX : X86::ECX;
28863
28864 SDValue StoreAddr = DAG.getNode(ISD::ADD, dl, PtrVT, Frame,
28865 DAG.getIntPtrConstant(RegInfo->getSlotSize(),
28866 dl));
28867 StoreAddr = DAG.getNode(ISD::ADD, dl, PtrVT, StoreAddr, Offset);
28868 Chain = DAG.getStore(Chain, dl, Handler, StoreAddr, MachinePointerInfo());
28869 Chain = DAG.getCopyToReg(Chain, dl, StoreAddrReg, StoreAddr);
28870
28871 return DAG.getNode(X86ISD::EH_RETURN, dl, MVT::Other, Chain,
28872 DAG.getRegister(StoreAddrReg, PtrVT));
28873}
28874
28875SDValue X86TargetLowering::lowerEH_SJLJ_SETJMP(SDValue Op,
28876 SelectionDAG &DAG) const {
28877 SDLoc DL(Op);
28878 // If the subtarget is not 64bit, we may need the global base reg
28879 // after isel expand pseudo, i.e., after CGBR pass ran.
28880 // Therefore, ask for the GlobalBaseReg now, so that the pass
28881 // inserts the code for us in case we need it.
28882 // Otherwise, we will end up in a situation where we will
28883 // reference a virtual register that is not defined!
28884 if (!Subtarget.is64Bit()) {
28885 const X86InstrInfo *TII = Subtarget.getInstrInfo();
28886 (void)TII->getGlobalBaseReg(&DAG.getMachineFunction());
28887 }
28888 return DAG.getNode(X86ISD::EH_SJLJ_SETJMP, DL,
28889 DAG.getVTList(MVT::i32, MVT::Other),
28890 Op.getOperand(0), Op.getOperand(1));
28891}
28892
28893SDValue X86TargetLowering::lowerEH_SJLJ_LONGJMP(SDValue Op,
28894 SelectionDAG &DAG) const {
28895 SDLoc DL(Op);
28896 return DAG.getNode(X86ISD::EH_SJLJ_LONGJMP, DL, MVT::Other,
28897 Op.getOperand(0), Op.getOperand(1));
28898}
28899
28900SDValue X86TargetLowering::lowerEH_SJLJ_SETUP_DISPATCH(SDValue Op,
28901 SelectionDAG &DAG) const {
28902 SDLoc DL(Op);
28903 return DAG.getNode(X86ISD::EH_SJLJ_SETUP_DISPATCH, DL, MVT::Other,
28904 Op.getOperand(0));
28905}
28906
28907static SDValue LowerADJUST_TRAMPOLINE(SDValue Op, SelectionDAG &DAG) {
28908 return Op.getOperand(0);
28909}
28910
28911SDValue X86TargetLowering::LowerINIT_TRAMPOLINE(SDValue Op,
28912 SelectionDAG &DAG) const {
28913 SDValue Root = Op.getOperand(0);
28914 SDValue Trmp = Op.getOperand(1); // trampoline
28915 SDValue FPtr = Op.getOperand(2); // nested function
28916 SDValue Nest = Op.getOperand(3); // 'nest' parameter value
28917 SDLoc dl (Op);
28918
28919 const Value *TrmpAddr = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
28920 const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
28921
28922 if (Subtarget.is64Bit()) {
28923 SDValue OutChains[6];
28924
28925 // Large code-model.
28926 const unsigned char JMP64r = 0xFF; // 64-bit jmp through register opcode.
28927 const unsigned char MOV64ri = 0xB8; // X86::MOV64ri opcode.
28928
28929 const unsigned char N86R10 = TRI->getEncodingValue(X86::R10) & 0x7;
28930 const unsigned char N86R11 = TRI->getEncodingValue(X86::R11) & 0x7;
28931
28932 const unsigned char REX_WB = 0x40 | 0x08 | 0x01; // REX prefix
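        // Roughly, the stores below assemble this 23-byte stub in the
        // trampoline buffer:
        //   49 BB <FPtr:8>    movabsq $FPtr, %r11
        //   49 BA <Nest:8>    movabsq $Nest, %r10
        //   49 FF E3          jmpq    *%r11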
28933
28934 // Load the pointer to the nested function into R11.
28935 unsigned OpCode = ((MOV64ri | N86R11) << 8) | REX_WB; // movabsq r11
28936 SDValue Addr = Trmp;
28937 OutChains[0] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, dl, MVT::i16),
28938 Addr, MachinePointerInfo(TrmpAddr));
28939
28940 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
28941 DAG.getConstant(2, dl, MVT::i64));
28942 OutChains[1] = DAG.getStore(Root, dl, FPtr, Addr,
28943 MachinePointerInfo(TrmpAddr, 2), Align(2));
28944
28945 // Load the 'nest' parameter value into R10.
28946 // R10 is specified in X86CallingConv.td
28947 OpCode = ((MOV64ri | N86R10) << 8) | REX_WB; // movabsq r10
28948 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
28949 DAG.getConstant(10, dl, MVT::i64));
28950 OutChains[2] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, dl, MVT::i16),
28951 Addr, MachinePointerInfo(TrmpAddr, 10));
28952
28953 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
28954 DAG.getConstant(12, dl, MVT::i64));
28955 OutChains[3] = DAG.getStore(Root, dl, Nest, Addr,
28956 MachinePointerInfo(TrmpAddr, 12), Align(2));
28957
28958 // Jump to the nested function.
28959 OpCode = (JMP64r << 8) | REX_WB; // jmpq *...
28960 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
28961 DAG.getConstant(20, dl, MVT::i64));
28962 OutChains[4] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, dl, MVT::i16),
28963 Addr, MachinePointerInfo(TrmpAddr, 20));
28964
28965 unsigned char ModRM = N86R11 | (4 << 3) | (3 << 6); // ...r11
28966 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
28967 DAG.getConstant(22, dl, MVT::i64));
28968 OutChains[5] = DAG.getStore(Root, dl, DAG.getConstant(ModRM, dl, MVT::i8),
28969 Addr, MachinePointerInfo(TrmpAddr, 22));
28970
28971 return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains);
28972 } else {
28973 const Function *Func =
28974 cast<Function>(cast<SrcValueSDNode>(Op.getOperand(5))->getValue());
28975 CallingConv::ID CC = Func->getCallingConv();
28976 unsigned NestReg;
28977
28978 switch (CC) {
28979 default:
28980       llvm_unreachable("Unsupported calling convention");
28981 case CallingConv::C:
28982 case CallingConv::X86_StdCall: {
28983 // Pass 'nest' parameter in ECX.
28984 // Must be kept in sync with X86CallingConv.td
28985 NestReg = X86::ECX;
28986
28987 // Check that ECX wasn't needed by an 'inreg' parameter.
28988 FunctionType *FTy = Func->getFunctionType();
28989 const AttributeList &Attrs = Func->getAttributes();
28990
28991 if (!Attrs.isEmpty() && !Func->isVarArg()) {
28992 unsigned InRegCount = 0;
28993 unsigned Idx = 0;
28994
28995 for (FunctionType::param_iterator I = FTy->param_begin(),
28996 E = FTy->param_end(); I != E; ++I, ++Idx)
28997 if (Attrs.hasParamAttr(Idx, Attribute::InReg)) {
28998 const DataLayout &DL = DAG.getDataLayout();
28999 // FIXME: should only count parameters that are lowered to integers.
29000 InRegCount += (DL.getTypeSizeInBits(*I) + 31) / 32;
29001 }
29002
29003 if (InRegCount > 2) {
29004 report_fatal_error("Nest register in use - reduce number of inreg"
29005 " parameters!");
29006 }
29007 }
29008 break;
29009 }
29010 case CallingConv::X86_FastCall:
29011 case CallingConv::X86_ThisCall:
29012 case CallingConv::Fast:
29013 case CallingConv::Tail:
29014 case CallingConv::SwiftTail:
29015 // Pass 'nest' parameter in EAX.
29016 // Must be kept in sync with X86CallingConv.td
29017 NestReg = X86::EAX;
29018 break;
29019 }
29020
29021 SDValue OutChains[4];
29022 SDValue Addr, Disp;
29023
29024 Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
29025 DAG.getConstant(10, dl, MVT::i32));
29026 Disp = DAG.getNode(ISD::SUB, dl, MVT::i32, FPtr, Addr);
29027
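        // Roughly, the stores below assemble this 10-byte stub:
        //   B8+r <Nest:4>     movl $Nest, %ecx (or %eax)
        //   E9   <Disp:4>     jmp  FPtr        ; Disp is relative to Trmp + 10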
29028 // This is storing the opcode for MOV32ri.
29029 const unsigned char MOV32ri = 0xB8; // X86::MOV32ri's opcode byte.
29030 const unsigned char N86Reg = TRI->getEncodingValue(NestReg) & 0x7;
29031 OutChains[0] =
29032 DAG.getStore(Root, dl, DAG.getConstant(MOV32ri | N86Reg, dl, MVT::i8),
29033 Trmp, MachinePointerInfo(TrmpAddr));
29034
29035 Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
29036 DAG.getConstant(1, dl, MVT::i32));
29037 OutChains[1] = DAG.getStore(Root, dl, Nest, Addr,
29038 MachinePointerInfo(TrmpAddr, 1), Align(1));
29039
29040 const unsigned char JMP = 0xE9; // jmp <32bit dst> opcode.
29041 Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
29042 DAG.getConstant(5, dl, MVT::i32));
29043 OutChains[2] =
29044 DAG.getStore(Root, dl, DAG.getConstant(JMP, dl, MVT::i8), Addr,
29045 MachinePointerInfo(TrmpAddr, 5), Align(1));
29046
29047 Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
29048 DAG.getConstant(6, dl, MVT::i32));
29049 OutChains[3] = DAG.getStore(Root, dl, Disp, Addr,
29050 MachinePointerInfo(TrmpAddr, 6), Align(1));
29051
29052 return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains);
29053 }
29054}
29055
29056SDValue X86TargetLowering::LowerGET_ROUNDING(SDValue Op,
29057 SelectionDAG &DAG) const {
29058 /*
29059     The rounding mode is in bits 11:10 of the FP Control Word (FPCW), and has the following
29060 settings:
29061 00 Round to nearest
29062 01 Round to -inf
29063 10 Round to +inf
29064 11 Round to 0
29065
29066 GET_ROUNDING, on the other hand, expects the following:
29067 -1 Undefined
29068 0 Round to 0
29069 1 Round to nearest
29070 2 Round to +inf
29071 3 Round to -inf
29072
29073     To perform the conversion, we use a packed lookup table of the four 2-bit
29074     values that we can index by FPCW[11:10]:
29075     0x2d --> (0b00,10,11,01) --> (0,2,3,1) >> FPCW[11:10]
29076
29077     (0x2d >> ((FPCW & 0xc00) >> 9)) & 3
29078 */
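  // For example, FPCW[11:10] == 01 (round toward -inf) gives
  // (FPCW & 0xc00) >> 9 == 2, and (0x2d >> 2) & 3 == 3, which is the
  // GET_ROUNDING encoding for "round to -inf".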
29079
29080 MachineFunction &MF = DAG.getMachineFunction();
29081 MVT VT = Op.getSimpleValueType();
29082 SDLoc DL(Op);
29083
29084 // Save FP Control Word to stack slot
29085 int SSFI = MF.getFrameInfo().CreateStackObject(2, Align(2), false);
29086 SDValue StackSlot =
29087 DAG.getFrameIndex(SSFI, getPointerTy(DAG.getDataLayout()));
29088
29089 MachinePointerInfo MPI = MachinePointerInfo::getFixedStack(MF, SSFI);
29090
29091 SDValue Chain = Op.getOperand(0);
29092 SDValue Ops[] = {Chain, StackSlot};
29093 Chain = DAG.getMemIntrinsicNode(X86ISD::FNSTCW16m, DL,
29094 DAG.getVTList(MVT::Other), Ops, MVT::i16, MPI,
29095 Align(2), MachineMemOperand::MOStore);
29096
29097 // Load FP Control Word from stack slot
29098 SDValue CWD = DAG.getLoad(MVT::i16, DL, Chain, StackSlot, MPI, Align(2));
29099 Chain = CWD.getValue(1);
29100
29101 // Mask and turn the control bits into a shift for the lookup table.
29102 SDValue Shift =
29103 DAG.getNode(ISD::SRL, DL, MVT::i16,
29104 DAG.getNode(ISD::AND, DL, MVT::i16,
29105 CWD, DAG.getConstant(0xc00, DL, MVT::i16)),
29106 DAG.getConstant(9, DL, MVT::i8));
29107 Shift = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, Shift);
29108
29109 SDValue LUT = DAG.getConstant(0x2d, DL, MVT::i32);
29110 SDValue RetVal =
29111 DAG.getNode(ISD::AND, DL, MVT::i32,
29112 DAG.getNode(ISD::SRL, DL, MVT::i32, LUT, Shift),
29113 DAG.getConstant(3, DL, MVT::i32));
29114
29115 RetVal = DAG.getZExtOrTrunc(RetVal, DL, VT);
29116
29117 return DAG.getMergeValues({RetVal, Chain}, DL);
29118}
29119
29120SDValue X86TargetLowering::LowerSET_ROUNDING(SDValue Op,
29121 SelectionDAG &DAG) const {
29122 MachineFunction &MF = DAG.getMachineFunction();
29123 SDLoc DL(Op);
29124 SDValue Chain = Op.getNode()->getOperand(0);
29125
29126 // FP control word may be set only from data in memory. So we need to allocate
29127 // stack space to save/load FP control word.
29128 int OldCWFrameIdx = MF.getFrameInfo().CreateStackObject(4, Align(4), false);
29129 SDValue StackSlot =
29130 DAG.getFrameIndex(OldCWFrameIdx, getPointerTy(DAG.getDataLayout()));
29131 MachinePointerInfo MPI = MachinePointerInfo::getFixedStack(MF, OldCWFrameIdx);
29132 MachineMemOperand *MMO =
29133 MF.getMachineMemOperand(MPI, MachineMemOperand::MOStore, 2, Align(2));
29134
29135 // Store FP control word into memory.
29136 SDValue Ops[] = {Chain, StackSlot};
29137 Chain = DAG.getMemIntrinsicNode(
29138 X86ISD::FNSTCW16m, DL, DAG.getVTList(MVT::Other), Ops, MVT::i16, MMO);
29139
29140 // Load FP Control Word from stack slot and clear RM field (bits 11:10).
29141 SDValue CWD = DAG.getLoad(MVT::i16, DL, Chain, StackSlot, MPI);
29142 Chain = CWD.getValue(1);
29143 CWD = DAG.getNode(ISD::AND, DL, MVT::i16, CWD.getValue(0),
29144 DAG.getConstant(0xf3ff, DL, MVT::i16));
29145
29146 // Calculate new rounding mode.
29147 SDValue NewRM = Op.getNode()->getOperand(1);
29148 SDValue RMBits;
29149 if (auto *CVal = dyn_cast<ConstantSDNode>(NewRM)) {
29150 uint64_t RM = CVal->getZExtValue();
29151 int FieldVal;
29152 switch (static_cast<RoundingMode>(RM)) {
29153 case RoundingMode::NearestTiesToEven: FieldVal = X86::rmToNearest; break;
29154 case RoundingMode::TowardNegative: FieldVal = X86::rmDownward; break;
29155 case RoundingMode::TowardPositive: FieldVal = X86::rmUpward; break;
29156 case RoundingMode::TowardZero: FieldVal = X86::rmTowardZero; break;
29157 default:
29158       llvm_unreachable("rounding mode is not supported by X86 hardware");
29159 }
29160 RMBits = DAG.getConstant(FieldVal, DL, MVT::i16);
29161 } else {
29162 // Need to convert argument into bits of control word:
29163 // 0 Round to 0 -> 11
29164 // 1 Round to nearest -> 00
29165 // 2 Round to +inf -> 10
29166 // 3 Round to -inf -> 01
29167 // The 2-bit value needs then to be shifted so that it occupies bits 11:10.
29168 // To make the conversion, put all these values into a value 0xc9 and shift
29169 // it left depending on the rounding mode:
29170 // (0xc9 << 4) & 0xc00 = X86::rmTowardZero
29171 // (0xc9 << 6) & 0xc00 = X86::rmToNearest
29172 // ...
29173 // (0xc9 << (2 * NewRM + 4)) & 0xc00
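        //
        // For example, NewRM == 2 (round to +inf) gives a shift of 8, and
        // (0xc9 << 8) & 0xc00 == 0x800, i.e. RC bits 11:10 == 0b10 (upward).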
29174 SDValue ShiftValue =
29175 DAG.getNode(ISD::TRUNCATE, DL, MVT::i8,
29176 DAG.getNode(ISD::ADD, DL, MVT::i32,
29177 DAG.getNode(ISD::SHL, DL, MVT::i32, NewRM,
29178 DAG.getConstant(1, DL, MVT::i8)),
29179 DAG.getConstant(4, DL, MVT::i32)));
29180 SDValue Shifted =
29181 DAG.getNode(ISD::SHL, DL, MVT::i16, DAG.getConstant(0xc9, DL, MVT::i16),
29182 ShiftValue);
29183 RMBits = DAG.getNode(ISD::AND, DL, MVT::i16, Shifted,
29184 DAG.getConstant(0xc00, DL, MVT::i16));
29185 }
29186
29187 // Update rounding mode bits and store the new FP Control Word into stack.
29188 CWD = DAG.getNode(ISD::OR, DL, MVT::i16, CWD, RMBits);
29189 Chain = DAG.getStore(Chain, DL, CWD, StackSlot, MPI, Align(2));
29190
29191 // Load FP control word from the slot.
29192 SDValue OpsLD[] = {Chain, StackSlot};
29193 MachineMemOperand *MMOL =
29194 MF.getMachineMemOperand(MPI, MachineMemOperand::MOLoad, 2, Align(2));
29195 Chain = DAG.getMemIntrinsicNode(
29196 X86ISD::FLDCW16m, DL, DAG.getVTList(MVT::Other), OpsLD, MVT::i16, MMOL);
29197
29198 // If target supports SSE, set MXCSR as well. Rounding mode is encoded in the
29199 // same way but in bits 14:13.
29200 if (Subtarget.hasSSE1()) {
29201 // Store MXCSR into memory.
29202 Chain = DAG.getNode(
29203 ISD::INTRINSIC_VOID, DL, DAG.getVTList(MVT::Other), Chain,
29204 DAG.getTargetConstant(Intrinsic::x86_sse_stmxcsr, DL, MVT::i32),
29205 StackSlot);
29206
29207 // Load MXCSR from stack slot and clear RM field (bits 14:13).
29208 SDValue CWD = DAG.getLoad(MVT::i32, DL, Chain, StackSlot, MPI);
29209 Chain = CWD.getValue(1);
29210 CWD = DAG.getNode(ISD::AND, DL, MVT::i32, CWD.getValue(0),
29211 DAG.getConstant(0xffff9fff, DL, MVT::i32));
29212
29213 // Shift X87 RM bits from 11:10 to 14:13.
29214 RMBits = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, RMBits);
29215 RMBits = DAG.getNode(ISD::SHL, DL, MVT::i32, RMBits,
29216 DAG.getConstant(3, DL, MVT::i8));
29217
29218 // Update rounding mode bits and store the new FP Control Word into stack.
29219 CWD = DAG.getNode(ISD::OR, DL, MVT::i32, CWD, RMBits);
29220 Chain = DAG.getStore(Chain, DL, CWD, StackSlot, MPI, Align(4));
29221
29222 // Load MXCSR from the slot.
29223 Chain = DAG.getNode(
29224 ISD::INTRINSIC_VOID, DL, DAG.getVTList(MVT::Other), Chain,
29225 DAG.getTargetConstant(Intrinsic::x86_sse_ldmxcsr, DL, MVT::i32),
29226 StackSlot);
29227 }
29228
29229 return Chain;
29230}
29231
29232/// Lower a vector CTLZ using a natively supported vector CTLZ instruction.
29233//
29234// i8/i16 vector implemented using dword LZCNT vector instruction
29235// ( sub(trunc(lzcnt(zext32(x)))) ). In case zext32(x) is illegal,
29236// split the vector, perform operation on it's Lo a Hi part and
29237// concatenate the results.
29238static SDValue LowerVectorCTLZ_AVX512CDI(SDValue Op, SelectionDAG &DAG,
29239 const X86Subtarget &Subtarget) {
29240   assert(Op.getOpcode() == ISD::CTLZ);
29241 SDLoc dl(Op);
29242 MVT VT = Op.getSimpleValueType();
29243 MVT EltVT = VT.getVectorElementType();
29244 unsigned NumElems = VT.getVectorNumElements();
29245
29246   assert((EltVT == MVT::i8 || EltVT == MVT::i16) &&
29247          "Unsupported element type");
29248
29249   // Split the vector; its Lo and Hi parts will be handled in the next iteration.
29250 if (NumElems > 16 ||
29251 (NumElems == 16 && !Subtarget.canExtendTo512DQ()))
29252 return splitVectorIntUnary(Op, DAG);
29253
29254 MVT NewVT = MVT::getVectorVT(MVT::i32, NumElems);
29255   assert((NewVT.is256BitVector() || NewVT.is512BitVector()) &&
29256          "Unsupported value type for operation");
29257
29258   // Use the natively supported vector instruction vplzcntd.
29259 Op = DAG.getNode(ISD::ZERO_EXTEND, dl, NewVT, Op.getOperand(0));
29260 SDValue CtlzNode = DAG.getNode(ISD::CTLZ, dl, NewVT, Op);
29261 SDValue TruncNode = DAG.getNode(ISD::TRUNCATE, dl, VT, CtlzNode);
29262 SDValue Delta = DAG.getConstant(32 - EltVT.getSizeInBits(), dl, VT);
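  // For example, for a vXi8 element of 0x10: ctlz of the zext'd i32 is 27 and
  // 27 - (32 - 8) == 3 == ctlz8(0x10).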
29263
29264 return DAG.getNode(ISD::SUB, dl, VT, TruncNode, Delta);
29265}
29266
29267// Lower CTLZ using a PSHUFB lookup table implementation.
29268static SDValue LowerVectorCTLZInRegLUT(SDValue Op, const SDLoc &DL,
29269 const X86Subtarget &Subtarget,
29270 SelectionDAG &DAG) {
29271 MVT VT = Op.getSimpleValueType();
29272 int NumElts = VT.getVectorNumElements();
29273 int NumBytes = NumElts * (VT.getScalarSizeInBits() / 8);
29274 MVT CurrVT = MVT::getVectorVT(MVT::i8, NumBytes);
29275
29276 // Per-nibble leading zero PSHUFB lookup table.
29277 const int LUT[16] = {/* 0 */ 4, /* 1 */ 3, /* 2 */ 2, /* 3 */ 2,
29278 /* 4 */ 1, /* 5 */ 1, /* 6 */ 1, /* 7 */ 1,
29279 /* 8 */ 0, /* 9 */ 0, /* a */ 0, /* b */ 0,
29280 /* c */ 0, /* d */ 0, /* e */ 0, /* f */ 0};
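  // For example, the byte 0x1A has hi nibble 0x1 (LUT -> 3) and lo nibble 0xA
  // (LUT -> 0); the hi nibble is non-zero, so only the hi result is kept and
  // ctlz8(0x1A) == 3.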
29281
29282 SmallVector<SDValue, 64> LUTVec;
29283 for (int i = 0; i < NumBytes; ++i)
29284 LUTVec.push_back(DAG.getConstant(LUT[i % 16], DL, MVT::i8));
29285 SDValue InRegLUT = DAG.getBuildVector(CurrVT, DL, LUTVec);
29286
29287   // Begin by bitcasting the input to a byte vector, then split those bytes
29288   // into lo/hi nibbles and use the PSHUFB LUT to perform CTLZ on each of them.
29289 // If the hi input nibble is zero then we add both results together, otherwise
29290 // we just take the hi result (by masking the lo result to zero before the
29291 // add).
29292 SDValue Op0 = DAG.getBitcast(CurrVT, Op.getOperand(0));
29293 SDValue Zero = DAG.getConstant(0, DL, CurrVT);
29294
29295 SDValue NibbleShift = DAG.getConstant(0x4, DL, CurrVT);
29296 SDValue Lo = Op0;
29297 SDValue Hi = DAG.getNode(ISD::SRL, DL, CurrVT, Op0, NibbleShift);
29298 SDValue HiZ;
29299 if (CurrVT.is512BitVector()) {
29300 MVT MaskVT = MVT::getVectorVT(MVT::i1, CurrVT.getVectorNumElements());
29301 HiZ = DAG.getSetCC(DL, MaskVT, Hi, Zero, ISD::SETEQ);
29302 HiZ = DAG.getNode(ISD::SIGN_EXTEND, DL, CurrVT, HiZ);
29303 } else {
29304 HiZ = DAG.getSetCC(DL, CurrVT, Hi, Zero, ISD::SETEQ);
29305 }
29306
29307 Lo = DAG.getNode(X86ISD::PSHUFB, DL, CurrVT, InRegLUT, Lo);
29308 Hi = DAG.getNode(X86ISD::PSHUFB, DL, CurrVT, InRegLUT, Hi);
29309 Lo = DAG.getNode(ISD::AND, DL, CurrVT, Lo, HiZ);
29310 SDValue Res = DAG.getNode(ISD::ADD, DL, CurrVT, Lo, Hi);
29311
29312 // Merge result back from vXi8 back to VT, working on the lo/hi halves
29313 // of the current vector width in the same way we did for the nibbles.
29314 // If the upper half of the input element is zero then add the halves'
29315 // leading zero counts together, otherwise just use the upper half's.
29316 // Double the width of the result until we are at target width.
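  // For example, when widening v16i8 counts to v8i16, an i16 element 0x001A
  // has per-byte counts of 8 (high byte 0x00) and 3 (low byte 0x1A); the high
  // byte is zero, so 8 + 3 == 11 == ctlz16(0x001A) is produced.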
29317 while (CurrVT != VT) {
29318 int CurrScalarSizeInBits = CurrVT.getScalarSizeInBits();
29319 int CurrNumElts = CurrVT.getVectorNumElements();
29320 MVT NextSVT = MVT::getIntegerVT(CurrScalarSizeInBits * 2);
29321 MVT NextVT = MVT::getVectorVT(NextSVT, CurrNumElts / 2);
29322 SDValue Shift = DAG.getConstant(CurrScalarSizeInBits, DL, NextVT);
29323
29324 // Check if the upper half of the input element is zero.
29325 if (CurrVT.is512BitVector()) {
29326 MVT MaskVT = MVT::getVectorVT(MVT::i1, CurrVT.getVectorNumElements());
29327 HiZ = DAG.getSetCC(DL, MaskVT, DAG.getBitcast(CurrVT, Op0),
29328 DAG.getBitcast(CurrVT, Zero), ISD::SETEQ);
29329 HiZ = DAG.getNode(ISD::SIGN_EXTEND, DL, CurrVT, HiZ);
29330 } else {
29331 HiZ = DAG.getSetCC(DL, CurrVT, DAG.getBitcast(CurrVT, Op0),
29332 DAG.getBitcast(CurrVT, Zero), ISD::SETEQ);
29333 }
29334 HiZ = DAG.getBitcast(NextVT, HiZ);
29335
29336 // Move the upper/lower halves to the lower bits as we'll be extending to
29337 // NextVT. Mask the lower result to zero if HiZ is true and add the results
29338 // together.
29339 SDValue ResNext = Res = DAG.getBitcast(NextVT, Res);
29340 SDValue R0 = DAG.getNode(ISD::SRL, DL, NextVT, ResNext, Shift);
29341 SDValue R1 = DAG.getNode(ISD::SRL, DL, NextVT, HiZ, Shift);
29342 R1 = DAG.getNode(ISD::AND, DL, NextVT, ResNext, R1);
29343 Res = DAG.getNode(ISD::ADD, DL, NextVT, R0, R1);
29344 CurrVT = NextVT;
29345 }
29346
29347 return Res;
29348}
29349
29350static SDValue LowerVectorCTLZ(SDValue Op, const SDLoc &DL,
29351 const X86Subtarget &Subtarget,
29352 SelectionDAG &DAG) {
29353 MVT VT = Op.getSimpleValueType();
29354
29355 if (Subtarget.hasCDI() &&
29356 // vXi8 vectors need to be promoted to 512-bits for vXi32.
29357 (Subtarget.canExtendTo512DQ() || VT.getVectorElementType() != MVT::i8))
29358 return LowerVectorCTLZ_AVX512CDI(Op, DAG, Subtarget);
29359
29360 // Decompose 256-bit ops into smaller 128-bit ops.
29361 if (VT.is256BitVector() && !Subtarget.hasInt256())
29362 return splitVectorIntUnary(Op, DAG);
29363
29364 // Decompose 512-bit ops into smaller 256-bit ops.
29365 if (VT.is512BitVector() && !Subtarget.hasBWI())
29366 return splitVectorIntUnary(Op, DAG);
29367
29368 assert(Subtarget.hasSSSE3() && "Expected SSSE3 support for PSHUFB");
29369 return LowerVectorCTLZInRegLUT(Op, DL, Subtarget, DAG);
29370}
29371
29372static SDValue LowerCTLZ(SDValue Op, const X86Subtarget &Subtarget,
29373 SelectionDAG &DAG) {
29374 MVT VT = Op.getSimpleValueType();
29375 MVT OpVT = VT;
29376 unsigned NumBits = VT.getSizeInBits();
29377 SDLoc dl(Op);
29378 unsigned Opc = Op.getOpcode();
29379
29380 if (VT.isVector())
29381 return LowerVectorCTLZ(Op, dl, Subtarget, DAG);
29382
29383 Op = Op.getOperand(0);
29384 if (VT == MVT::i8) {
29385 // Zero extend to i32 since there is no i8 bsr.
29386 OpVT = MVT::i32;
29387 Op = DAG.getNode(ISD::ZERO_EXTEND, dl, OpVT, Op);
29388 }
29389
29390 // Issue a bsr (scan bits in reverse) which also sets EFLAGS.
29391 SDVTList VTs = DAG.getVTList(OpVT, MVT::i32);
29392 Op = DAG.getNode(X86ISD::BSR, dl, VTs, Op);
29393
29394 if (Opc == ISD::CTLZ) {
29395 // If src is zero (i.e. bsr sets ZF), returns NumBits.
29396 SDValue Ops[] = {Op, DAG.getConstant(NumBits + NumBits - 1, dl, OpVT),
29397 DAG.getTargetConstant(X86::COND_E, dl, MVT::i8),
29398 Op.getValue(1)};
29399 Op = DAG.getNode(X86ISD::CMOV, dl, OpVT, Ops);
29400 }
29401
29402 // Finally xor with NumBits-1.
29403 Op = DAG.getNode(ISD::XOR, dl, OpVT, Op,
29404 DAG.getConstant(NumBits - 1, dl, OpVT));
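// BSR returns the index of the highest set bit, so for a non-zero i32 input
// ctlz(x) == 31 - bsr(x) == bsr(x) ^ 31 (e.g. x = 0x00010000: bsr = 16,
// 16 ^ 31 = 15). The CMOV above substitutes 2*NumBits-1 for the zero-input
// case so that the final xor yields NumBits (e.g. 63 ^ 31 = 32 for i32).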
29405
29406 if (VT == MVT::i8)
29407 Op = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op);
29408 return Op;
29409}
29410
29411static SDValue LowerCTTZ(SDValue Op, const X86Subtarget &Subtarget,
29412 SelectionDAG &DAG) {
29413 MVT VT = Op.getSimpleValueType();
29414 unsigned NumBits = VT.getScalarSizeInBits();
29415 SDValue N0 = Op.getOperand(0);
29416 SDLoc dl(Op);
29417
29418 assert(!VT.isVector() && Op.getOpcode() == ISD::CTTZ &&
29419 "Only scalar CTTZ requires custom lowering");
29420
29421 // Issue a bsf (scan bits forward) which also sets EFLAGS.
29422 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
29423 Op = DAG.getNode(X86ISD::BSF, dl, VTs, N0);
29424
29425 // If src is known never zero we can skip the CMOV.
29426 if (DAG.isKnownNeverZero(N0))
29427 return Op;
29428
29429 // If src is zero (i.e. bsf sets ZF), returns NumBits.
29430 SDValue Ops[] = {Op, DAG.getConstant(NumBits, dl, VT),
29431 DAG.getTargetConstant(X86::COND_E, dl, MVT::i8),
29432 Op.getValue(1)};
29433 return DAG.getNode(X86ISD::CMOV, dl, VT, Ops);
29434}
29435
29436static SDValue lowerAddSub(SDValue Op, SelectionDAG &DAG,
29437 const X86Subtarget &Subtarget) {
29438 MVT VT = Op.getSimpleValueType();
29439 if (VT == MVT::i16 || VT == MVT::i32)
29440 return lowerAddSubToHorizontalOp(Op, DAG, Subtarget);
29441
29442 if (VT == MVT::v32i16 || VT == MVT::v64i8)
29443 return splitVectorIntBinary(Op, DAG);
29444
29445 assert(Op.getSimpleValueType().is256BitVector() &&
29446 Op.getSimpleValueType().isInteger() &&
29447 "Only handle AVX 256-bit vector integer operation");
29448 return splitVectorIntBinary(Op, DAG);
29449}
29450
29451static SDValue LowerADDSAT_SUBSAT(SDValue Op, SelectionDAG &DAG,
29452 const X86Subtarget &Subtarget) {
29453 MVT VT = Op.getSimpleValueType();
29454 SDValue X = Op.getOperand(0), Y = Op.getOperand(1);
29455 unsigned Opcode = Op.getOpcode();
29456 SDLoc DL(Op);
29457
29458 if (VT == MVT::v32i16 || VT == MVT::v64i8 ||
29459 (VT.is256BitVector() && !Subtarget.hasInt256())) {
29460 assert(Op.getSimpleValueType().isInteger() &&
29461 "Only handle AVX vector integer operation");
29462 return splitVectorIntBinary(Op, DAG);
29463 }
29464
29465 // Avoid the generic expansion with min/max if we don't have pminu*/pmaxu*.
29466 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
29467 EVT SetCCResultType =
29468 TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
29469
29470 unsigned BitWidth = VT.getScalarSizeInBits();
29471 if (Opcode == ISD::USUBSAT) {
29472 if (!TLI.isOperationLegal(ISD::UMAX, VT) || useVPTERNLOG(Subtarget, VT)) {
29473 // Handle a special-case with a bit-hack instead of cmp+select:
29474 // usubsat X, SMIN --> (X ^ SMIN) & (X s>> BW-1)
29475 // If the target can use VPTERNLOG, DAGToDAG will match this as
29476 // "vpsra + vpternlog" which is better than "vpmax + vpsub" with a
29477 // "broadcast" constant load.
29478 ConstantSDNode *C = isConstOrConstSplat(Y, true);
29479 if (C && C->getAPIntValue().isSignMask()) {
29480 SDValue SignMask = DAG.getConstant(C->getAPIntValue(), DL, VT);
29481 SDValue ShiftAmt = DAG.getConstant(BitWidth - 1, DL, VT);
29482 SDValue Xor = DAG.getNode(ISD::XOR, DL, VT, X, SignMask);
29483 SDValue Sra = DAG.getNode(ISD::SRA, DL, VT, X, ShiftAmt);
29484 return DAG.getNode(ISD::AND, DL, VT, Xor, Sra);
29485 }
29486 }
29487 if (!TLI.isOperationLegal(ISD::UMAX, VT)) {
29488 // usubsat X, Y --> (X >u Y) ? X - Y : 0
29489 SDValue Sub = DAG.getNode(ISD::SUB, DL, VT, X, Y);
29490 SDValue Cmp = DAG.getSetCC(DL, SetCCResultType, X, Y, ISD::SETUGT);
29491 // TODO: Move this to DAGCombiner?
29492 if (SetCCResultType == VT &&
29493 DAG.ComputeNumSignBits(Cmp) == VT.getScalarSizeInBits())
29494 return DAG.getNode(ISD::AND, DL, VT, Cmp, Sub);
29495 return DAG.getSelect(DL, VT, Cmp, Sub, DAG.getConstant(0, DL, VT));
29496 }
29497 }
29498
29499 if ((Opcode == ISD::SADDSAT || Opcode == ISD::SSUBSAT) &&
29500 (!VT.isVector() || VT == MVT::v2i64)) {
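// Compute the result together with the overflow flag, then pick the
// saturation constant from the sign of the wrapped value: signed overflow
// flips the sign, so a negative wrapped sum means the true result exceeded
// SatMax (e.g. i8: 100 + 100 wraps to -56, which selects 127).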
29501 APInt MinVal = APInt::getSignedMinValue(BitWidth);
29502 APInt MaxVal = APInt::getSignedMaxValue(BitWidth);
29503 SDValue Zero = DAG.getConstant(0, DL, VT);
29504 SDValue Result =
29505 DAG.getNode(Opcode == ISD::SADDSAT ? ISD::SADDO : ISD::SSUBO, DL,
29506 DAG.getVTList(VT, SetCCResultType), X, Y);
29507 SDValue SumDiff = Result.getValue(0);
29508 SDValue Overflow = Result.getValue(1);
29509 SDValue SatMin = DAG.getConstant(MinVal, DL, VT);
29510 SDValue SatMax = DAG.getConstant(MaxVal, DL, VT);
29511 SDValue SumNeg =
29512 DAG.getSetCC(DL, SetCCResultType, SumDiff, Zero, ISD::SETLT);
29513 Result = DAG.getSelect(DL, VT, SumNeg, SatMax, SatMin);
29514 return DAG.getSelect(DL, VT, Overflow, Result, SumDiff);
29515 }
29516
29517 // Use default expansion.
29518 return SDValue();
29519}
29520
29521static SDValue LowerABS(SDValue Op, const X86Subtarget &Subtarget,
29522 SelectionDAG &DAG) {
29523 MVT VT = Op.getSimpleValueType();
29524 if (VT == MVT::i16 || VT == MVT::i32 || VT == MVT::i64) {
29525 // Since X86 does not have CMOV for 8-bit integer, we don't convert
29526 // 8-bit integer abs to NEG and CMOV.
29527 SDLoc DL(Op);
29528 SDValue N0 = Op.getOperand(0);
29529 SDValue Neg = DAG.getNode(X86ISD::SUB, DL, DAG.getVTList(VT, MVT::i32),
29530 DAG.getConstant(0, DL, VT), N0);
29531 SDValue Ops[] = {N0, Neg, DAG.getTargetConstant(X86::COND_NS, DL, MVT::i8),
29532 SDValue(Neg.getNode(), 1)};
29533 return DAG.getNode(X86ISD::CMOV, DL, VT, Ops);
29534 }
29535
29536 // ABS(vXi64 X) --> VPBLENDVPD(X, 0-X, X).
29537 if ((VT == MVT::v2i64 || VT == MVT::v4i64) && Subtarget.hasSSE41()) {
29538 SDLoc DL(Op);
29539 SDValue Src = Op.getOperand(0);
29540 SDValue Sub =
29541 DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), Src);
29542 return DAG.getNode(X86ISD::BLENDV, DL, VT, Src, Sub, Src);
29543 }
29544
29545 if (VT.is256BitVector() && !Subtarget.hasInt256()) {
29546 assert(VT.isInteger() &&
29547 "Only handle AVX 256-bit vector integer operation");
29548 return splitVectorIntUnary(Op, DAG);
29549 }
29550
29551 if ((VT == MVT::v32i16 || VT == MVT::v64i8) && !Subtarget.hasBWI())
29552 return splitVectorIntUnary(Op, DAG);
29553
29554 // Default to expand.
29555 return SDValue();
29556}
29557
29558static SDValue LowerAVG(SDValue Op, const X86Subtarget &Subtarget,
29559 SelectionDAG &DAG) {
29560 MVT VT = Op.getSimpleValueType();
29561
29562 // For AVX1 cases, split to use legal ops.
29563 if (VT.is256BitVector() && !Subtarget.hasInt256())
29564 return splitVectorIntBinary(Op, DAG);
29565
29566 if (VT == MVT::v32i16 || VT == MVT::v64i8)
29567 return splitVectorIntBinary(Op, DAG);
29568
29569 // Default to expand.
29570 return SDValue();
29571}
29572
29573static SDValue LowerMINMAX(SDValue Op, const X86Subtarget &Subtarget,
29574 SelectionDAG &DAG) {
29575 MVT VT = Op.getSimpleValueType();
29576
29577 // For AVX1 cases, split to use legal ops.
29578 if (VT.is256BitVector() && !Subtarget.hasInt256())
29579 return splitVectorIntBinary(Op, DAG);
29580
29581 if (VT == MVT::v32i16 || VT == MVT::v64i8)
29582 return splitVectorIntBinary(Op, DAG);
29583
29584 // Default to expand.
29585 return SDValue();
29586}
29587
29588static SDValue LowerMUL(SDValue Op, const X86Subtarget &Subtarget,
29589 SelectionDAG &DAG) {
29590 SDLoc dl(Op);
29591 MVT VT = Op.getSimpleValueType();
29592
29593 // Decompose 256-bit ops into 128-bit ops.
29594 if (VT.is256BitVector() && !Subtarget.hasInt256())
29595 return splitVectorIntBinary(Op, DAG);
29596
29597 if ((VT == MVT::v32i16 || VT == MVT::v64i8) && !Subtarget.hasBWI())
29598 return splitVectorIntBinary(Op, DAG);
29599
29600 SDValue A = Op.getOperand(0);
29601 SDValue B = Op.getOperand(1);
29602
29603 // Lower v16i8/v32i8/v64i8 mul as sign-extension to v8i16/v16i16/v32i16
29604 // vector pairs, multiply and truncate.
29605 if (VT == MVT::v16i8 || VT == MVT::v32i8 || VT == MVT::v64i8) {
29606 unsigned NumElts = VT.getVectorNumElements();
29607
29608 if ((VT == MVT::v16i8 && Subtarget.hasInt256()) ||
29609 (VT == MVT::v32i8 && Subtarget.canExtendTo512BW())) {
29610 MVT ExVT = MVT::getVectorVT(MVT::i16, VT.getVectorNumElements());
29611 return DAG.getNode(
29612 ISD::TRUNCATE, dl, VT,
29613 DAG.getNode(ISD::MUL, dl, ExVT,
29614 DAG.getNode(ISD::ANY_EXTEND, dl, ExVT, A),
29615 DAG.getNode(ISD::ANY_EXTEND, dl, ExVT, B)));
29616 }
29617
29618 MVT ExVT = MVT::getVectorVT(MVT::i16, NumElts / 2);
29619
29620 // Extract the lo/hi parts to any extend to i16.
29621 // We're going to mask off the low byte of each result element of the
29622 // pmullw, so it doesn't matter what's in the high byte of each 16-bit
29623 // element.
29624 SDValue Undef = DAG.getUNDEF(VT);
29625 SDValue ALo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, A, Undef));
29626 SDValue AHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, A, Undef));
29627
29628 SDValue BLo, BHi;
29629 if (ISD::isBuildVectorOfConstantSDNodes(B.getNode())) {
29630 // If the RHS is a constant, manually unpackl/unpackh.
29631 SmallVector<SDValue, 16> LoOps, HiOps;
29632 for (unsigned i = 0; i != NumElts; i += 16) {
29633 for (unsigned j = 0; j != 8; ++j) {
29634 LoOps.push_back(DAG.getAnyExtOrTrunc(B.getOperand(i + j), dl,
29635 MVT::i16));
29636 HiOps.push_back(DAG.getAnyExtOrTrunc(B.getOperand(i + j + 8), dl,
29637 MVT::i16));
29638 }
29639 }
29640
29641 BLo = DAG.getBuildVector(ExVT, dl, LoOps);
29642 BHi = DAG.getBuildVector(ExVT, dl, HiOps);
29643 } else {
29644 BLo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, B, Undef));
29645 BHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, B, Undef));
29646 }
29647
29648 // Multiply, mask the lower 8bits of the lo/hi results and pack.
29649 SDValue RLo = DAG.getNode(ISD::MUL, dl, ExVT, ALo, BLo);
29650 SDValue RHi = DAG.getNode(ISD::MUL, dl, ExVT, AHi, BHi);
29651 return getPack(DAG, Subtarget, dl, VT, RLo, RHi);
29652 }
29653
29654 // Lower v4i32 mul as 2x shuffle, 2x pmuludq, 2x shuffle.
29655 if (VT == MVT::v4i32) {
29656 assert(Subtarget.hasSSE2() && !Subtarget.hasSSE41() &&
29657 "Should not custom lower when pmulld is available!");
29658
29659 // Extract the odd parts.
29660 static const int UnpackMask[] = { 1, -1, 3, -1 };
29661 SDValue Aodds = DAG.getVectorShuffle(VT, dl, A, A, UnpackMask);
29662 SDValue Bodds = DAG.getVectorShuffle(VT, dl, B, B, UnpackMask);
29663
29664 // Multiply the even parts.
29665 SDValue Evens = DAG.getNode(X86ISD::PMULUDQ, dl, MVT::v2i64,
29666 DAG.getBitcast(MVT::v2i64, A),
29667 DAG.getBitcast(MVT::v2i64, B));
29668 // Now multiply odd parts.
29669 SDValue Odds = DAG.getNode(X86ISD::PMULUDQ, dl, MVT::v2i64,
29670 DAG.getBitcast(MVT::v2i64, Aodds),
29671 DAG.getBitcast(MVT::v2i64, Bodds));
29672
29673 Evens = DAG.getBitcast(VT, Evens);
29674 Odds = DAG.getBitcast(VT, Odds);
29675
29676 // Merge the two vectors back together with a shuffle. This expands into 2
29677 // shuffles.
29678 static const int ShufMask[] = { 0, 4, 2, 6 };
29679 return DAG.getVectorShuffle(VT, dl, Evens, Odds, ShufMask);
29680 }
29681
29682 assert((VT == MVT::v2i64 || VT == MVT::v4i64 || VT == MVT::v8i64) &&
29683 "Only know how to lower V2I64/V4I64/V8I64 multiply");
29684 assert(!Subtarget.hasDQI() && "DQI should use MULLQ");
29685
29686 // Ahi = psrlqi(a, 32);
29687 // Bhi = psrlqi(b, 32);
29688 //
29689 // AloBlo = pmuludq(a, b);
29690 // AloBhi = pmuludq(a, Bhi);
29691 // AhiBlo = pmuludq(Ahi, b);
29692 //
29693 // Hi = psllqi(AloBhi + AhiBlo, 32);
29694 // return AloBlo + Hi;
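// This follows from a = 2^32*Ahi + Alo and b = 2^32*Bhi + Blo:
// a*b = 2^64*Ahi*Bhi + 2^32*(Alo*Bhi + Ahi*Blo) + Alo*Blo, and the first
// term vanishes modulo 2^64, so three 32x32->64 pmuludq products suffice.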
29695 KnownBits AKnown = DAG.computeKnownBits(A);
29696 KnownBits BKnown = DAG.computeKnownBits(B);
29697
29698 APInt LowerBitsMask = APInt::getLowBitsSet(64, 32);
29699 bool ALoIsZero = LowerBitsMask.isSubsetOf(AKnown.Zero);
29700 bool BLoIsZero = LowerBitsMask.isSubsetOf(BKnown.Zero);
29701
29702 APInt UpperBitsMask = APInt::getHighBitsSet(64, 32);
29703 bool AHiIsZero = UpperBitsMask.isSubsetOf(AKnown.Zero);
29704 bool BHiIsZero = UpperBitsMask.isSubsetOf(BKnown.Zero);
29705
29706 SDValue Zero = DAG.getConstant(0, dl, VT);
29707
29708 // Only multiply lo/hi halves that aren't known to be zero.
29709 SDValue AloBlo = Zero;
29710 if (!ALoIsZero && !BLoIsZero)
29711 AloBlo = DAG.getNode(X86ISD::PMULUDQ, dl, VT, A, B);
29712
29713 SDValue AloBhi = Zero;
29714 if (!ALoIsZero && !BHiIsZero) {
29715 SDValue Bhi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, B, 32, DAG);
29716 AloBhi = DAG.getNode(X86ISD::PMULUDQ, dl, VT, A, Bhi);
29717 }
29718
29719 SDValue AhiBlo = Zero;
29720 if (!AHiIsZero && !BLoIsZero) {
29721 SDValue Ahi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, A, 32, DAG);
29722 AhiBlo = DAG.getNode(X86ISD::PMULUDQ, dl, VT, Ahi, B);
29723 }
29724
29725 SDValue Hi = DAG.getNode(ISD::ADD, dl, VT, AloBhi, AhiBlo);
29726 Hi = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, Hi, 32, DAG);
29727
29728 return DAG.getNode(ISD::ADD, dl, VT, AloBlo, Hi);
29729}
29730
29731static SDValue LowervXi8MulWithUNPCK(SDValue A, SDValue B, const SDLoc &dl,
29732 MVT VT, bool IsSigned,
29733 const X86Subtarget &Subtarget,
29734 SelectionDAG &DAG,
29735 SDValue *Low = nullptr) {
29736 unsigned NumElts = VT.getVectorNumElements();
29737
29738 // For vXi8 we will unpack the low and high half of each 128 bit lane to widen
29739 // to a vXi16 type. Do the multiplies, shift the results and pack the half
29740 // lane results back together.
29741
29742 // We'll take different approaches for signed and unsigned.
29743 // For unsigned we'll use punpcklbw/punpckhbw to zero extend the bytes
29744 // and use pmullw to calculate the full 16-bit product.
29745 // For signed we'll use punpcklbw/punpckhbw to extend the bytes to words and
29746 // shift them left into the upper byte of each word. This allows us to use
29747 // pmulhw to calculate the full 16-bit product. This trick means we don't
29748 // need to sign extend the bytes to use pmullw.
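// The signed trick works because (a << 8) * (b << 8) == (a * b) << 16, so the
// high 16 bits returned by pmulhw are exactly the signed product a*b
// (e.g. a = -3, b = 5: 0xFD00 * 0x0500 = 0xFFF10000, high half 0xFFF1 == -15).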
29749
29750 MVT ExVT = MVT::getVectorVT(MVT::i16, NumElts / 2);
29751 SDValue Zero = DAG.getConstant(0, dl, VT);
29752
29753 SDValue ALo, AHi;
29754 if (IsSigned) {
29755 ALo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, Zero, A));
29756 AHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, Zero, A));
29757 } else {
29758 ALo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, A, Zero));
29759 AHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, A, Zero));
29760 }
29761
29762 SDValue BLo, BHi;
29763 if (ISD::isBuildVectorOfConstantSDNodes(B.getNode())) {
29764 // If the RHS is a constant, manually unpackl/unpackh and extend.
29765 SmallVector<SDValue, 16> LoOps, HiOps;
29766 for (unsigned i = 0; i != NumElts; i += 16) {
29767 for (unsigned j = 0; j != 8; ++j) {
29768 SDValue LoOp = B.getOperand(i + j);
29769 SDValue HiOp = B.getOperand(i + j + 8);
29770
29771 if (IsSigned) {
29772 LoOp = DAG.getAnyExtOrTrunc(LoOp, dl, MVT::i16);
29773 HiOp = DAG.getAnyExtOrTrunc(HiOp, dl, MVT::i16);
29774 LoOp = DAG.getNode(ISD::SHL, dl, MVT::i16, LoOp,
29775 DAG.getConstant(8, dl, MVT::i16));
29776 HiOp = DAG.getNode(ISD::SHL, dl, MVT::i16, HiOp,
29777 DAG.getConstant(8, dl, MVT::i16));
29778 } else {
29779 LoOp = DAG.getZExtOrTrunc(LoOp, dl, MVT::i16);
29780 HiOp = DAG.getZExtOrTrunc(HiOp, dl, MVT::i16);
29781 }
29782
29783 LoOps.push_back(LoOp);
29784 HiOps.push_back(HiOp);
29785 }
29786 }
29787
29788 BLo = DAG.getBuildVector(ExVT, dl, LoOps);
29789 BHi = DAG.getBuildVector(ExVT, dl, HiOps);
29790 } else if (IsSigned) {
29791 BLo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, Zero, B));
29792 BHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, Zero, B));
29793 } else {
29794 BLo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, B, Zero));
29795 BHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, B, Zero));
29796 }
29797
29798 // Multiply, lshr the upper 8bits to the lower 8bits of the lo/hi results and
29799 // pack back to vXi8.
29800 unsigned MulOpc = IsSigned ? ISD::MULHS : ISD::MUL;
29801 SDValue RLo = DAG.getNode(MulOpc, dl, ExVT, ALo, BLo);
29802 SDValue RHi = DAG.getNode(MulOpc, dl, ExVT, AHi, BHi);
29803
29804 if (Low)
29805 *Low = getPack(DAG, Subtarget, dl, VT, RLo, RHi);
29806
29807 return getPack(DAG, Subtarget, dl, VT, RLo, RHi, /*PackHiHalf*/ true);
29808}
29809
29810static SDValue LowerMULH(SDValue Op, const X86Subtarget &Subtarget,
29811 SelectionDAG &DAG) {
29812 SDLoc dl(Op);
29813 MVT VT = Op.getSimpleValueType();
29814 bool IsSigned = Op->getOpcode() == ISD::MULHS;
29815 unsigned NumElts = VT.getVectorNumElements();
29816 SDValue A = Op.getOperand(0);
29817 SDValue B = Op.getOperand(1);
29818
29819 // Decompose 256-bit ops into 128-bit ops.
29820 if (VT.is256BitVector() && !Subtarget.hasInt256())
29821 return splitVectorIntBinary(Op, DAG);
29822
29823 if ((VT == MVT::v32i16 || VT == MVT::v64i8) && !Subtarget.hasBWI())
29824 return splitVectorIntBinary(Op, DAG);
29825
29826 if (VT == MVT::v4i32 || VT == MVT::v8i32 || VT == MVT::v16i32) {
29827 assert((VT == MVT::v4i32 && Subtarget.hasSSE2()) ||
29828 (VT == MVT::v8i32 && Subtarget.hasInt256()) ||
29829 (VT == MVT::v16i32 && Subtarget.hasAVX512()));
29830
29831 // PMULxD operations multiply each even value (starting at 0) of LHS with
29832 // the related value of RHS and produce a widened result.
29833 // E.g., PMULUDQ <4 x i32> <a|b|c|d>, <4 x i32> <e|f|g|h>
29834 // => <2 x i64> <ae|cg>
29835 //
29836 // In other words, to have all the results, we need to perform two PMULxD:
29837 // 1. one with the even values.
29838 // 2. one with the odd values.
29839 // To achieve #2, we need to place the odd values at an even position.
29840 //
29841 // Place the odd value at an even position (basically, shift all values 1
29842 // step to the left):
29843 const int Mask[] = {1, -1, 3, -1, 5, -1, 7, -1,
29844 9, -1, 11, -1, 13, -1, 15, -1};
29845 // <a|b|c|d> => <b|undef|d|undef>
29846 SDValue Odd0 =
29847 DAG.getVectorShuffle(VT, dl, A, A, ArrayRef(&Mask[0], NumElts));
29848 // <e|f|g|h> => <f|undef|h|undef>
29849 SDValue Odd1 =
29850 DAG.getVectorShuffle(VT, dl, B, B, ArrayRef(&Mask[0], NumElts));
29851
29852 // Emit two multiplies, one for the lower 2 ints and one for the higher 2
29853 // ints.
29854 MVT MulVT = MVT::getVectorVT(MVT::i64, NumElts / 2);
29855 unsigned Opcode =
29856 (IsSigned && Subtarget.hasSSE41()) ? X86ISD::PMULDQ : X86ISD::PMULUDQ;
29857 // PMULUDQ <4 x i32> <a|b|c|d>, <4 x i32> <e|f|g|h>
29858 // => <2 x i64> <ae|cg>
29859 SDValue Mul1 = DAG.getBitcast(VT, DAG.getNode(Opcode, dl, MulVT,
29860 DAG.getBitcast(MulVT, A),
29861 DAG.getBitcast(MulVT, B)));
29862 // PMULUDQ <4 x i32> <b|undef|d|undef>, <4 x i32> <f|undef|h|undef>
29863 // => <2 x i64> <bf|dh>
29864 SDValue Mul2 = DAG.getBitcast(VT, DAG.getNode(Opcode, dl, MulVT,
29865 DAG.getBitcast(MulVT, Odd0),
29866 DAG.getBitcast(MulVT, Odd1)));
29867
29868 // Shuffle it back into the right order.
29869 SmallVector<int, 16> ShufMask(NumElts);
29870 for (int i = 0; i != (int)NumElts; ++i)
29871 ShufMask[i] = (i / 2) * 2 + ((i % 2) * NumElts) + 1;
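// E.g. for v4i32 (NumElts == 4) this builds the mask {1, 5, 3, 7}, selecting
// the high 32-bit halves of <ae|cg> and <bf|dh> to give <ae_hi|bf_hi|cg_hi|dh_hi>.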
29872
29873 SDValue Res = DAG.getVectorShuffle(VT, dl, Mul1, Mul2, ShufMask);
29874
29875 // If we have a signed multiply but no PMULDQ fix up the result of an
29876 // unsigned multiply.
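// Using the identity mulhs(a, b) == mulhu(a, b) - (a < 0 ? b : 0) - (b < 0 ? a : 0),
// subtract B where A is negative and A where B is negative.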
29877 if (IsSigned && !Subtarget.hasSSE41()) {
29878 SDValue Zero = DAG.getConstant(0, dl, VT);
29879 SDValue T1 = DAG.getNode(ISD::AND, dl, VT,
29880 DAG.getSetCC(dl, VT, Zero, A, ISD::SETGT), B);
29881 SDValue T2 = DAG.getNode(ISD::AND, dl, VT,
29882 DAG.getSetCC(dl, VT, Zero, B, ISD::SETGT), A);
29883
29884 SDValue Fixup = DAG.getNode(ISD::ADD, dl, VT, T1, T2);
29885 Res = DAG.getNode(ISD::SUB, dl, VT, Res, Fixup);
29886 }
29887
29888 return Res;
29889 }
29890
29891 // Only i8 vectors should need custom lowering after this.
29892 assert((VT == MVT::v16i8 || (VT == MVT::v32i8 && Subtarget.hasInt256()) ||
29893 (VT == MVT::v64i8 && Subtarget.hasBWI())) &&
29894 "Unsupported vector type");
29895
29896 // Lower v16i8/v32i8 as extension to v8i16/v16i16 vector pairs, multiply,
29897 // logical shift down the upper half and pack back to i8.
29898
29899 // With SSE41 we can use sign/zero extend, but for pre-SSE41 we unpack
29900 // and then ashr/lshr the upper bits down to the lower bits before multiply.
29901
29902 if ((VT == MVT::v16i8 && Subtarget.hasInt256()) ||
29903 (VT == MVT::v32i8 && Subtarget.canExtendTo512BW())) {
29904 MVT ExVT = MVT::getVectorVT(MVT::i16, NumElts);
29905 unsigned ExAVX = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
29906 SDValue ExA = DAG.getNode(ExAVX, dl, ExVT, A);
29907 SDValue ExB = DAG.getNode(ExAVX, dl, ExVT, B);
29908 SDValue Mul = DAG.getNode(ISD::MUL, dl, ExVT, ExA, ExB);
29909 Mul = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExVT, Mul, 8, DAG);
29910 return DAG.getNode(ISD::TRUNCATE, dl, VT, Mul);
29911 }
29912
29913 return LowervXi8MulWithUNPCK(A, B, dl, VT, IsSigned, Subtarget, DAG);
29914}
29915
29916// Custom lowering for SMULO/UMULO.
29917static SDValue LowerMULO(SDValue Op, const X86Subtarget &Subtarget,
29918 SelectionDAG &DAG) {
29919 MVT VT = Op.getSimpleValueType();
29920
29921 // Scalars defer to LowerXALUO.
29922 if (!VT.isVector())
29923 return LowerXALUO(Op, DAG);
29924
29925 SDLoc dl(Op);
29926 bool IsSigned = Op->getOpcode() == ISD::SMULO;
29927 SDValue A = Op.getOperand(0);
29928 SDValue B = Op.getOperand(1);
29929 EVT OvfVT = Op->getValueType(1);
29930
29931 if ((VT == MVT::v32i8 && !Subtarget.hasInt256()) ||
29932 (VT == MVT::v64i8 && !Subtarget.hasBWI())) {
29933 // Extract the LHS Lo/Hi vectors
29934 SDValue LHSLo, LHSHi;
29935 std::tie(LHSLo, LHSHi) = splitVector(A, DAG, dl);
29936
29937 // Extract the RHS Lo/Hi vectors
29938 SDValue RHSLo, RHSHi;
29939 std::tie(RHSLo, RHSHi) = splitVector(B, DAG, dl);
29940
29941 EVT LoOvfVT, HiOvfVT;
29942 std::tie(LoOvfVT, HiOvfVT) = DAG.GetSplitDestVTs(OvfVT);
29943 SDVTList LoVTs = DAG.getVTList(LHSLo.getValueType(), LoOvfVT);
29944 SDVTList HiVTs = DAG.getVTList(LHSHi.getValueType(), HiOvfVT);
29945
29946 // Issue the split operations.
29947 SDValue Lo = DAG.getNode(Op.getOpcode(), dl, LoVTs, LHSLo, RHSLo);
29948 SDValue Hi = DAG.getNode(Op.getOpcode(), dl, HiVTs, LHSHi, RHSHi);
29949
29950 // Join the separate data results and the overflow results.
29951 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
29952 SDValue Ovf = DAG.getNode(ISD::CONCAT_VECTORS, dl, OvfVT, Lo.getValue(1),
29953 Hi.getValue(1));
29954
29955 return DAG.getMergeValues({Res, Ovf}, dl);
29956 }
29957
29958 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
29959 EVT SetccVT =
29960 TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
29961
29962 if ((VT == MVT::v16i8 && Subtarget.hasInt256()) ||
29963 (VT == MVT::v32i8 && Subtarget.canExtendTo512BW())) {
29964 unsigned NumElts = VT.getVectorNumElements();
29965 MVT ExVT = MVT::getVectorVT(MVT::i16, NumElts);
29966 unsigned ExAVX = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
29967 SDValue ExA = DAG.getNode(ExAVX, dl, ExVT, A);
29968 SDValue ExB = DAG.getNode(ExAVX, dl, ExVT, B);
29969 SDValue Mul = DAG.getNode(ISD::MUL, dl, ExVT, ExA, ExB);
29970
29971 SDValue Low = DAG.getNode(ISD::TRUNCATE, dl, VT, Mul);
29972
29973 SDValue Ovf;
29974 if (IsSigned) {
29975 SDValue High, LowSign;
29976 if (OvfVT.getVectorElementType() == MVT::i1 &&
29977 (Subtarget.hasBWI() || Subtarget.canExtendTo512DQ())) {
29978 // Rather than truncating, try to do the compare on vXi16 or vXi32.
29979 // Shift the high down filling with sign bits.
29980 High = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, Mul, 8, DAG);
29981 // Fill all 16 bits with the sign bit from the low.
29982 LowSign =
29983 getTargetVShiftByConstNode(X86ISD::VSHLI, dl, ExVT, Mul, 8, DAG);
29984 LowSign = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, LowSign,
29985 15, DAG);
29986 SetccVT = OvfVT;
29987 if (!Subtarget.hasBWI()) {
29988 // We can't do a vXi16 compare so sign extend to v16i32.
29989 High = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v16i32, High);
29990 LowSign = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v16i32, LowSign);
29991 }
29992 } else {
29993 // Otherwise do the compare at vXi8.
29994 High = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExVT, Mul, 8, DAG);
29995 High = DAG.getNode(ISD::TRUNCATE, dl, VT, High);
29996 LowSign =
29997 DAG.getNode(ISD::SRA, dl, VT, Low, DAG.getConstant(7, dl, VT));
29998 }
29999
30000 Ovf = DAG.getSetCC(dl, SetccVT, LowSign, High, ISD::SETNE);
30001 } else {
30002 SDValue High =
30003 getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExVT, Mul, 8, DAG);
30004 if (OvfVT.getVectorElementType() == MVT::i1 &&
30005 (Subtarget.hasBWI() || Subtarget.canExtendTo512DQ())) {
30006 // Rather than truncating, try to do the compare on vXi16 or vXi32.
30007 SetccVT = OvfVT;
30008 if (!Subtarget.hasBWI()) {
30009 // We can't do a vXi16 compare so sign extend to v16i32.
30010 High = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v16i32, High);
30011 }
30012 } else {
30013 // Otherwise do the compare at vXi8.
30014 High = DAG.getNode(ISD::TRUNCATE, dl, VT, High);
30015 }
30016
30017 Ovf =
30018 DAG.getSetCC(dl, SetccVT, High,
30019 DAG.getConstant(0, dl, High.getValueType()), ISD::SETNE);
30020 }
30021
30022 Ovf = DAG.getSExtOrTrunc(Ovf, dl, OvfVT);
30023
30024 return DAG.getMergeValues({Low, Ovf}, dl);
30025 }
30026
30027 SDValue Low;
30028 SDValue High =
30029 LowervXi8MulWithUNPCK(A, B, dl, VT, IsSigned, Subtarget, DAG, &Low);
30030
30031 SDValue Ovf;
30032 if (IsSigned) {
30033 // SMULO overflows if the high bits don't match the sign of the low.
30034 SDValue LowSign =
30035 DAG.getNode(ISD::SRA, dl, VT, Low, DAG.getConstant(7, dl, VT));
30036 Ovf = DAG.getSetCC(dl, SetccVT, LowSign, High, ISD::SETNE);
30037 } else {
30038 // UMULO overflows if the high bits are non-zero.
30039 Ovf =
30040 DAG.getSetCC(dl, SetccVT, High, DAG.getConstant(0, dl, VT), ISD::SETNE);
30041 }
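// E.g. for i8, 16 * 16 = 256: the low byte is 0x00 and the high byte is 0x01,
// so the signed check sees 0x00 != 0x01 and the unsigned check sees a non-zero
// high byte; both correctly report overflow. For 5 * -3 = 0xFFF1 the high byte
// 0xFF matches the sign-splat of the low byte 0xF1, so no signed overflow.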
30042
30043 Ovf = DAG.getSExtOrTrunc(Ovf, dl, OvfVT);
30044
30045 return DAG.getMergeValues({Low, Ovf}, dl);
30046}
30047
30048SDValue X86TargetLowering::LowerWin64_i128OP(SDValue Op, SelectionDAG &DAG) const {
30049 assert(Subtarget.isTargetWin64() && "Unexpected target");
30050 EVT VT = Op.getValueType();
30051 assert(VT.isInteger() && VT.getSizeInBits() == 128 &&
30052 "Unexpected return type for lowering");
30053
30054 if (isa<ConstantSDNode>(Op->getOperand(1))) {
30055 SmallVector<SDValue> Result;
30056 if (expandDIVREMByConstant(Op.getNode(), Result, MVT::i64, DAG))
30057 return DAG.getNode(ISD::BUILD_PAIR, SDLoc(Op), VT, Result[0], Result[1]);
30058 }
30059
30060 RTLIB::Libcall LC;
30061 bool isSigned;
30062 switch (Op->getOpcode()) {
30063 default: llvm_unreachable("Unexpected request for libcall!");
30064 case ISD::SDIV: isSigned = true; LC = RTLIB::SDIV_I128; break;
30065 case ISD::UDIV: isSigned = false; LC = RTLIB::UDIV_I128; break;
30066 case ISD::SREM: isSigned = true; LC = RTLIB::SREM_I128; break;
30067 case ISD::UREM: isSigned = false; LC = RTLIB::UREM_I128; break;
30068 }
30069
30070 SDLoc dl(Op);
30071 SDValue InChain = DAG.getEntryNode();
30072
30073 TargetLowering::ArgListTy Args;
30074 TargetLowering::ArgListEntry Entry;
30075 for (unsigned i = 0, e = Op->getNumOperands(); i != e; ++i) {
30076 EVT ArgVT = Op->getOperand(i).getValueType();
30077 assert(ArgVT.isInteger() && ArgVT.getSizeInBits() == 128 &&
30078 "Unexpected argument type for lowering");
30079 SDValue StackPtr = DAG.CreateStackTemporary(ArgVT, 16);
30080 int SPFI = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
30081 MachinePointerInfo MPI =
30082 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SPFI);
30083 Entry.Node = StackPtr;
30084 InChain =
30085 DAG.getStore(InChain, dl, Op->getOperand(i), StackPtr, MPI, Align(16));
30086 Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
30087 Entry.Ty = PointerType::get(ArgTy,0);
30088 Entry.IsSExt = false;
30089 Entry.IsZExt = false;
30090 Args.push_back(Entry);
30091 }
30092
30093 SDValue Callee = DAG.getExternalSymbol(getLibcallName(LC),
30094 getPointerTy(DAG.getDataLayout()));
30095
30096 TargetLowering::CallLoweringInfo CLI(DAG);
30097 CLI.setDebugLoc(dl)
30098 .setChain(InChain)
30099 .setLibCallee(
30100 getLibcallCallingConv(LC),
30101 static_cast<EVT>(MVT::v2i64).getTypeForEVT(*DAG.getContext()), Callee,
30102 std::move(Args))
30103 .setInRegister()
30104 .setSExtResult(isSigned)
30105 .setZExtResult(!isSigned);
30106
30107 std::pair<SDValue, SDValue> CallInfo = LowerCallTo(CLI);
30108 return DAG.getBitcast(VT, CallInfo.first);
30109}
30110
30111SDValue X86TargetLowering::LowerWin64_FP_TO_INT128(SDValue Op,
30112 SelectionDAG &DAG,
30113 SDValue &Chain) const {
30114 assert(Subtarget.isTargetWin64() && "Unexpected target");
30115 EVT VT = Op.getValueType();
30116 bool IsStrict = Op->isStrictFPOpcode();
30117
30118 SDValue Arg = Op.getOperand(IsStrict ? 1 : 0);
30119 EVT ArgVT = Arg.getValueType();
30120
30121 assert(VT.isInteger() && VT.getSizeInBits() == 128 &&
30122 "Unexpected return type for lowering");
30123
30124 RTLIB::Libcall LC;
30125 if (Op->getOpcode() == ISD::FP_TO_SINT ||
30126 Op->getOpcode() == ISD::STRICT_FP_TO_SINT)
30127 LC = RTLIB::getFPTOSINT(ArgVT, VT);
30128 else
30129 LC = RTLIB::getFPTOUINT(ArgVT, VT);
30130 assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unexpected request for libcall!");
30131
30132 SDLoc dl(Op);
30133 MakeLibCallOptions CallOptions;
30134 Chain = IsStrict ? Op.getOperand(0) : DAG.getEntryNode();
30135
30136 SDValue Result;
30137 // Expect the i128 result to be returned as a v2i64 in xmm0; cast back to the
30138 // expected VT (i128).
30139 std::tie(Result, Chain) =
30140 makeLibCall(DAG, LC, MVT::v2i64, Arg, CallOptions, dl, Chain);
30141 Result = DAG.getBitcast(VT, Result);
30142 return Result;
30143}
30144
30145SDValue X86TargetLowering::LowerWin64_INT128_TO_FP(SDValue Op,
30146 SelectionDAG &DAG) const {
30147 assert(Subtarget.isTargetWin64() && "Unexpected target");
30148 EVT VT = Op.getValueType();
30149 bool IsStrict = Op->isStrictFPOpcode();
30150
30151 SDValue Arg = Op.getOperand(IsStrict ? 1 : 0);
30152 EVT ArgVT = Arg.getValueType();
30153
30154 assert(ArgVT.isInteger() && ArgVT.getSizeInBits() == 128 &&
30155 "Unexpected argument type for lowering");
30156
30157 RTLIB::Libcall LC;
30158 if (Op->getOpcode() == ISD::SINT_TO_FP ||
30159 Op->getOpcode() == ISD::STRICT_SINT_TO_FP)
30160 LC = RTLIB::getSINTTOFP(ArgVT, VT);
30161 else
30162 LC = RTLIB::getUINTTOFP(ArgVT, VT);
30163 assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unexpected request for libcall!");
30164
30165 SDLoc dl(Op);
30166 MakeLibCallOptions CallOptions;
30167 SDValue Chain = IsStrict ? Op.getOperand(0) : DAG.getEntryNode();
30168
30169 // Pass the i128 argument as an indirect argument on the stack.
30170 SDValue StackPtr = DAG.CreateStackTemporary(ArgVT, 16);
30171 int SPFI = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
30172 MachinePointerInfo MPI =
30173 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SPFI);
30174 Chain = DAG.getStore(Chain, dl, Arg, StackPtr, MPI, Align(16));
30175
30176 SDValue Result;
30177 std::tie(Result, Chain) =
30178 makeLibCall(DAG, LC, VT, StackPtr, CallOptions, dl, Chain);
30179 return IsStrict ? DAG.getMergeValues({Result, Chain}, dl) : Result;
30180}
30181
30182// Return true if the required (according to Opcode) shift-imm form is natively
30183// supported by the Subtarget
30184static bool supportedVectorShiftWithImm(MVT VT, const X86Subtarget &Subtarget,
30185 unsigned Opcode) {
30186 if (!(VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()))
30187 return false;
30188
30189 if (VT.getScalarSizeInBits() < 16)
30190 return false;
30191
30192 if (VT.is512BitVector() && Subtarget.useAVX512Regs() &&
30193 (VT.getScalarSizeInBits() > 16 || Subtarget.hasBWI()))
30194 return true;
30195
30196 bool LShift = (VT.is128BitVector() && Subtarget.hasSSE2()) ||
30197 (VT.is256BitVector() && Subtarget.hasInt256());
30198
30199 bool AShift = LShift && (Subtarget.hasAVX512() ||
30200 (VT != MVT::v2i64 && VT != MVT::v4i64));
30201 return (Opcode == ISD::SRA) ? AShift : LShift;
30202}
30203
30204// The shift amount is a variable, but it is the same for all vector lanes.
30205// These instructions are defined together with shift-immediate.
30206static
30207bool supportedVectorShiftWithBaseAmnt(MVT VT, const X86Subtarget &Subtarget,
30208 unsigned Opcode) {
30209 return supportedVectorShiftWithImm(VT, Subtarget, Opcode);
30210}
30211
30212// Return true if the required (according to Opcode) variable-shift form is
30213// natively supported by the Subtarget
30214static bool supportedVectorVarShift(MVT VT, const X86Subtarget &Subtarget,
30215 unsigned Opcode) {
30216 if (!(VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()))
30217 return false;
30218
30219 if (!Subtarget.hasInt256() || VT.getScalarSizeInBits() < 16)
30220 return false;
30221
30222 // vXi16 supported only on AVX-512, BWI
30223 if (VT.getScalarSizeInBits() == 16 && !Subtarget.hasBWI())
30224 return false;
30225
30226 if (Subtarget.hasAVX512() &&
30227 (Subtarget.useAVX512Regs() || !VT.is512BitVector()))
30228 return true;
30229
30230 bool LShift = VT.is128BitVector() || VT.is256BitVector();
30231 bool AShift = LShift && VT != MVT::v2i64 && VT != MVT::v4i64;
30232 return (Opcode == ISD::SRA) ? AShift : LShift;
30233}
30234
30235static SDValue LowerShiftByScalarImmediate(SDValue Op, SelectionDAG &DAG,
30236 const X86Subtarget &Subtarget) {
30237 MVT VT = Op.getSimpleValueType();
30238 SDLoc dl(Op);
30239 SDValue R = Op.getOperand(0);
30240 SDValue Amt = Op.getOperand(1);
30241 unsigned X86Opc = getTargetVShiftUniformOpcode(Op.getOpcode(), false);
30242
30243 auto ArithmeticShiftRight64 = [&](uint64_t ShiftAmt) {
30244 assert((VT == MVT::v2i64 || VT == MVT::v4i64) && "Unexpected SRA type");
30245 MVT ExVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() * 2);
30246 SDValue Ex = DAG.getBitcast(ExVT, R);
30247
30248 // ashr(R, 63) === cmp_slt(R, 0)
30249 if (ShiftAmt == 63 && Subtarget.hasSSE42()) {
30250 assert((VT != MVT::v4i64 || Subtarget.hasInt256()) &&
30251 "Unsupported PCMPGT op");
30252 return DAG.getNode(X86ISD::PCMPGT, dl, VT, DAG.getConstant(0, dl, VT), R);
30253 }
30254
30255 if (ShiftAmt >= 32) {
30256 // Splat sign to upper i32 dst, and SRA upper i32 src to lower i32.
30257 SDValue Upper =
30258 getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, Ex, 31, DAG);
30259 SDValue Lower = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, Ex,
30260 ShiftAmt - 32, DAG);
30261 if (VT == MVT::v2i64)
30262 Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower, {5, 1, 7, 3});
30263 if (VT == MVT::v4i64)
30264 Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower,
30265 {9, 1, 11, 3, 13, 5, 15, 7});
30266 } else {
30267 // SRA upper i32, SRL whole i64 and select lower i32.
30268 SDValue Upper = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, Ex,
30269 ShiftAmt, DAG);
30270 SDValue Lower =
30271 getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, R, ShiftAmt, DAG);
30272 Lower = DAG.getBitcast(ExVT, Lower);
30273 if (VT == MVT::v2i64)
30274 Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower, {4, 1, 6, 3});
30275 if (VT == MVT::v4i64)
30276 Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower,
30277 {8, 1, 10, 3, 12, 5, 14, 7});
30278 }
30279 return DAG.getBitcast(VT, Ex);
30280 };
30281
30282 // Optimize shl/srl/sra with constant shift amount.
30283 APInt APIntShiftAmt;
30284 if (!X86::isConstantSplat(Amt, APIntShiftAmt))
30285 return SDValue();
30286
30287 // If the shift amount is out of range, return undef.
30288 if (APIntShiftAmt.uge(VT.getScalarSizeInBits()))
30289 return DAG.getUNDEF(VT);
30290
30291 uint64_t ShiftAmt = APIntShiftAmt.getZExtValue();
30292
30293 if (supportedVectorShiftWithImm(VT, Subtarget, Op.getOpcode())) {
30294 // Hardware support for vector shifts is sparse, which makes us scalarize the
30295 // vector operations in many cases. Also, on Sandy Bridge ADD is faster than
30296 // shl: (shl V, 1) -> (add (freeze V), (freeze V))
30297 if (Op.getOpcode() == ISD::SHL && ShiftAmt == 1) {
30298 // R may be undef at run-time, but (shl R, 1) must be an even number (LSB
30299 // must be 0). (add undef, undef) however can be any value. To make this
30300 // safe, we must freeze R to ensure that register allocation uses the same
30301 // register for an undefined value. This ensures that the result will
30302 // still be even and preserves the original semantics.
30303 R = DAG.getFreeze(R);
30304 return DAG.getNode(ISD::ADD, dl, VT, R, R);
30305 }
30306
30307 return getTargetVShiftByConstNode(X86Opc, dl, VT, R, ShiftAmt, DAG);
30308 }
30309
30310 // i64 SRA needs to be performed as partial shifts.
30311 if (((!Subtarget.hasXOP() && VT == MVT::v2i64) ||
30312 (Subtarget.hasInt256() && VT == MVT::v4i64)) &&
30313 Op.getOpcode() == ISD::SRA)
30314 return ArithmeticShiftRight64(ShiftAmt);
30315
30316 if (VT == MVT::v16i8 || (Subtarget.hasInt256() && VT == MVT::v32i8) ||
30317 (Subtarget.hasBWI() && VT == MVT::v64i8)) {
30318 unsigned NumElts = VT.getVectorNumElements();
30319 MVT ShiftVT = MVT::getVectorVT(MVT::i16, NumElts / 2);
30320
30321 // Simple i8 add case
30322 if (Op.getOpcode() == ISD::SHL && ShiftAmt == 1) {
30323 // R may be undef at run-time, but (shl R, 1) must be an even number (LSB
30324 // must be 0). (add undef, undef) however can be any value. To make this
30325 // safe, we must freeze R to ensure that register allocation uses the same
30326 // register for an undefined value. This ensures that the result will
30327 // still be even and preserves the original semantics.
30328 R = DAG.getFreeze(R);
30329 return DAG.getNode(ISD::ADD, dl, VT, R, R);
30330 }
30331
30332 // ashr(R, 7) === cmp_slt(R, 0)
30333 if (Op.getOpcode() == ISD::SRA && ShiftAmt == 7) {
30334 SDValue Zeros = DAG.getConstant(0, dl, VT);
30335 if (VT.is512BitVector()) {
30336 assert(VT == MVT::v64i8 && "Unexpected element type!");
30337 SDValue CMP = DAG.getSetCC(dl, MVT::v64i1, Zeros, R, ISD::SETGT);
30338 return DAG.getNode(ISD::SIGN_EXTEND, dl, VT, CMP);
30339 }
30340 return DAG.getNode(X86ISD::PCMPGT, dl, VT, Zeros, R);
30341 }
30342
30343 // XOP can shift v16i8 directly instead of as shift v8i16 + mask.
30344 if (VT == MVT::v16i8 && Subtarget.hasXOP())
30345 return SDValue();
30346
30347 if (Op.getOpcode() == ISD::SHL) {
30348 // Make a large shift.
30349 SDValue SHL = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, ShiftVT, R,
30350 ShiftAmt, DAG);
30351 SHL = DAG.getBitcast(VT, SHL);
30352 // Zero out the rightmost bits.
30353 APInt Mask = APInt::getHighBitsSet(8, 8 - ShiftAmt);
30354 return DAG.getNode(ISD::AND, dl, VT, SHL, DAG.getConstant(Mask, dl, VT));
30355 }
30356 if (Op.getOpcode() == ISD::SRL) {
30357 // Make a large shift.
30358 SDValue SRL = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ShiftVT, R,
30359 ShiftAmt, DAG);
30360 SRL = DAG.getBitcast(VT, SRL);
30361 // Zero out the leftmost bits.
30362 APInt Mask = APInt::getLowBitsSet(8, 8 - ShiftAmt);
30363 return DAG.getNode(ISD::AND, dl, VT, SRL, DAG.getConstant(Mask, dl, VT));
30364 }
30365 if (Op.getOpcode() == ISD::SRA) {
30366 // ashr(R, Amt) === sub(xor(lshr(R, Amt), Mask), Mask)
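// With Mask == 0x80 >> Amt (the shifted-down sign bit), xor-then-sub
// sign-extends the logical-shift result: e.g. R = 0x80, Amt = 1 gives
// lshr = 0x40, 0x40 ^ 0x40 = 0, 0 - 0x40 = 0xC0 == ashr(0x80, 1).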
30367 SDValue Res = DAG.getNode(ISD::SRL, dl, VT, R, Amt);
30368
30369 SDValue Mask = DAG.getConstant(128 >> ShiftAmt, dl, VT);
30370 Res = DAG.getNode(ISD::XOR, dl, VT, Res, Mask);
30371 Res = DAG.getNode(ISD::SUB, dl, VT, Res, Mask);
30372 return Res;
30373 }
30374 llvm_unreachable("Unknown shift opcode.");
30375 }
30376
30377 return SDValue();
30378}
30379
30380static SDValue LowerShiftByScalarVariable(SDValue Op, SelectionDAG &DAG,
30381 const X86Subtarget &Subtarget) {
30382 MVT VT = Op.getSimpleValueType();
30383 SDLoc dl(Op);
30384 SDValue R = Op.getOperand(0);
30385 SDValue Amt = Op.getOperand(1);
30386 unsigned Opcode = Op.getOpcode();
30387 unsigned X86OpcI = getTargetVShiftUniformOpcode(Opcode, false);
30388
30389 int BaseShAmtIdx = -1;
30390 if (SDValue BaseShAmt = DAG.getSplatSourceVector(Amt, BaseShAmtIdx)) {
30391 if (supportedVectorShiftWithBaseAmnt(VT, Subtarget, Opcode))
30392 return getTargetVShiftNode(X86OpcI, dl, VT, R, BaseShAmt, BaseShAmtIdx,
30393 Subtarget, DAG);
30394
30395 // vXi8 shifts - shift as v8i16 + mask result.
30396 if (((VT == MVT::v16i8 && !Subtarget.canExtendTo512DQ()) ||
30397 (VT == MVT::v32i8 && !Subtarget.canExtendTo512BW()) ||
30398 VT == MVT::v64i8) &&
30399 !Subtarget.hasXOP()) {
30400 unsigned NumElts = VT.getVectorNumElements();
30401 MVT ExtVT = MVT::getVectorVT(MVT::i16, NumElts / 2);
30402 if (supportedVectorShiftWithBaseAmnt(ExtVT, Subtarget, Opcode)) {
30403 unsigned LogicalOp = (Opcode == ISD::SHL ? ISD::SHL : ISD::SRL);
30404 unsigned LogicalX86Op = getTargetVShiftUniformOpcode(LogicalOp, false);
30405
30406 // Create the mask using vXi16 shifts. For shift-rights we need to move
30407 // the upper byte down before splatting the vXi8 mask.
30408 SDValue BitMask = DAG.getConstant(-1, dl, ExtVT);
30409 BitMask = getTargetVShiftNode(LogicalX86Op, dl, ExtVT, BitMask,
30410 BaseShAmt, BaseShAmtIdx, Subtarget, DAG);
30411 if (Opcode != ISD::SHL)
30412 BitMask = getTargetVShiftByConstNode(LogicalX86Op, dl, ExtVT, BitMask,
30413 8, DAG);
30414 BitMask = DAG.getBitcast(VT, BitMask);
30415 BitMask = DAG.getVectorShuffle(VT, dl, BitMask, BitMask,
30416 SmallVector<int, 64>(NumElts, 0));
30417
30418 SDValue Res = getTargetVShiftNode(LogicalX86Op, dl, ExtVT,
30419 DAG.getBitcast(ExtVT, R), BaseShAmt,
30420 BaseShAmtIdx, Subtarget, DAG);
30421 Res = DAG.getBitcast(VT, Res);
30422 Res = DAG.getNode(ISD::AND, dl, VT, Res, BitMask);
30423
30424 if (Opcode == ISD::SRA) {
30425 // ashr(R, Amt) === sub(xor(lshr(R, Amt), SignMask), SignMask)
30426 // SignMask = lshr(SignBit, Amt) - safe to do this with PSRLW.
30427 SDValue SignMask = DAG.getConstant(0x8080, dl, ExtVT);
30428 SignMask =
30429 getTargetVShiftNode(LogicalX86Op, dl, ExtVT, SignMask, BaseShAmt,
30430 BaseShAmtIdx, Subtarget, DAG);
30431 SignMask = DAG.getBitcast(VT, SignMask);
30432 Res = DAG.getNode(ISD::XOR, dl, VT, Res, SignMask);
30433 Res = DAG.getNode(ISD::SUB, dl, VT, Res, SignMask);
30434 }
30435 return Res;
30436 }
30437 }
30438 }
30439
30440 return SDValue();
30441}
30442
30443// Convert a shift/rotate left amount to a multiplication scale factor.
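// (Shifting left by N is the same as multiplying by (1 << N); e.g. a per-lane
// amount of 3 becomes a scale of 8 because x << 3 == x * 8. The scale vector
// built below lets a single vector multiply stand in for the per-lane shifts.)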
30444static SDValue convertShiftLeftToScale(SDValue Amt, const SDLoc &dl,
30445 const X86Subtarget &Subtarget,
30446 SelectionDAG &DAG) {
30447 MVT VT = Amt.getSimpleValueType();
30448 if (!(VT == MVT::v8i16 || VT == MVT::v4i32 ||
30449 (Subtarget.hasInt256() && VT == MVT::v16i16) ||
30450 (Subtarget.hasAVX512() && VT == MVT::v32i16) ||
30451 (!Subtarget.hasAVX512() && VT == MVT::v16i8) ||
30452 (Subtarget.hasInt256() && VT == MVT::v32i8) ||
30453 (Subtarget.hasBWI() && VT == MVT::v64i8)))
30454 return SDValue();
30455
30456 MVT SVT = VT.getVectorElementType();
30457 unsigned SVTBits = SVT.getSizeInBits();
30458 unsigned NumElems = VT.getVectorNumElements();
30459
30460 APInt UndefElts;
30461 SmallVector<APInt> EltBits;
30462 if (getTargetConstantBitsFromNode(Amt, SVTBits, UndefElts, EltBits)) {
30463 APInt One(SVTBits, 1);
30464 SmallVector<SDValue> Elts(NumElems, DAG.getUNDEF(SVT));
30465 for (unsigned I = 0; I != NumElems; ++I) {
30466 if (UndefElts[I] || EltBits[I].uge(SVTBits))
30467 continue;
30468 uint64_t ShAmt = EltBits[I].getZExtValue();
30469 Elts[I] = DAG.getConstant(One.shl(ShAmt), dl, SVT);
30470 }
30471 return DAG.getBuildVector(VT, dl, Elts);
30472 }
30473
30474 // If the target doesn't support variable shifts, use either FP conversion
30475 // or integer multiplication to avoid shifting each element individually.
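  // The v4i32 path below builds (1 << Amt) as a float: adding (Amt << 23) to
  // the bit pattern of 1.0f (0x3f800000) bumps the IEEE-754 exponent by Amt,
  // giving 2^Amt, which FP_TO_SINT converts back to an integer.
  // e.g. Amt = 3: (3 << 23) + 0x3f800000 = 0x41000000 = 8.0f -> 8 == 1 << 3.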
30476 if (VT == MVT::v4i32) {
30477 Amt = DAG.getNode(ISD::SHL, dl, VT, Amt, DAG.getConstant(23, dl, VT));
30478 Amt = DAG.getNode(ISD::ADD, dl, VT, Amt,
30479 DAG.getConstant(0x3f800000U, dl, VT));
30480 Amt = DAG.getBitcast(MVT::v4f32, Amt);
30481 return DAG.getNode(ISD::FP_TO_SINT, dl, VT, Amt);
30482 }
30483
30484 // AVX2 can more effectively perform this as a zext/trunc to/from v8i32.
30485 if (VT == MVT::v8i16 && !Subtarget.hasAVX2()) {
30486 SDValue Z = DAG.getConstant(0, dl, VT);
30487 SDValue Lo = DAG.getBitcast(MVT::v4i32, getUnpackl(DAG, dl, VT, Amt, Z));
30488 SDValue Hi = DAG.getBitcast(MVT::v4i32, getUnpackh(DAG, dl, VT, Amt, Z));
30489 Lo = convertShiftLeftToScale(Lo, dl, Subtarget, DAG);
30490 Hi = convertShiftLeftToScale(Hi, dl, Subtarget, DAG);
30491 if (Subtarget.hasSSE41())
30492 return DAG.getNode(X86ISD::PACKUS, dl, VT, Lo, Hi);
30493 return getPack(DAG, Subtarget, dl, VT, Lo, Hi);
30494 }
30495
30496 return SDValue();
30497}
30498
30499static SDValue LowerShift(SDValue Op, const X86Subtarget &Subtarget,
30500 SelectionDAG &DAG) {
30501 MVT VT = Op.getSimpleValueType();
30502 SDLoc dl(Op);
30503 SDValue R = Op.getOperand(0);
30504 SDValue Amt = Op.getOperand(1);
30505 unsigned EltSizeInBits = VT.getScalarSizeInBits();
30506 bool ConstantAmt = ISD::isBuildVectorOfConstantSDNodes(Amt.getNode());
30507
30508 unsigned Opc = Op.getOpcode();
30509 unsigned X86OpcV = getTargetVShiftUniformOpcode(Opc, true);
30510 unsigned X86OpcI = getTargetVShiftUniformOpcode(Opc, false);
30511
30512 assert(VT.isVector() && "Custom lowering only for vector shifts!");
30513 assert(Subtarget.hasSSE2() && "Only custom lower when we have SSE2!");
30514
30515 if (SDValue V = LowerShiftByScalarImmediate(Op, DAG, Subtarget))
30516 return V;
30517
30518 if (SDValue V = LowerShiftByScalarVariable(Op, DAG, Subtarget))
30519 return V;
30520
30521 if (supportedVectorVarShift(VT, Subtarget, Opc))
30522 return Op;
30523
30524 // i64 vector arithmetic shift can be emulated with the transform:
30525 // M = lshr(SIGN_MASK, Amt)
30526 // ashr(R, Amt) === sub(xor(lshr(R, Amt), M), M)
30527 if (((VT == MVT::v2i64 && !Subtarget.hasXOP()) ||
30528 (VT == MVT::v4i64 && Subtarget.hasInt256())) &&
30529 Opc == ISD::SRA) {
30530 SDValue S = DAG.getConstant(APInt::getSignMask(64), dl, VT);
30531 SDValue M = DAG.getNode(ISD::SRL, dl, VT, S, Amt);
30532 R = DAG.getNode(ISD::SRL, dl, VT, R, Amt);
30533 R = DAG.getNode(ISD::XOR, dl, VT, R, M);
30534 R = DAG.getNode(ISD::SUB, dl, VT, R, M);
30535 return R;
30536 }
30537
30538 // XOP has 128-bit variable logical/arithmetic shifts.
30539 // +ve/-ve Amt = shift left/right.
30540 if (Subtarget.hasXOP() && (VT == MVT::v2i64 || VT == MVT::v4i32 ||
30541 VT == MVT::v8i16 || VT == MVT::v16i8)) {
30542 if (Opc == ISD::SRL || Opc == ISD::SRA) {
30543 SDValue Zero = DAG.getConstant(0, dl, VT);
30544 Amt = DAG.getNode(ISD::SUB, dl, VT, Zero, Amt);
30545 }
30546 if (Opc == ISD::SHL || Opc == ISD::SRL)
30547 return DAG.getNode(X86ISD::VPSHL, dl, VT, R, Amt);
30548 if (Opc == ISD::SRA)
30549 return DAG.getNode(X86ISD::VPSHA, dl, VT, R, Amt);
30550 }
30551
30552 // v2i64 vector logical shifts can efficiently avoid scalarization - do the
30553 // shifts per-lane and then shuffle the partial results back together.
30554 if (VT == MVT::v2i64 && Opc != ISD::SRA) {
30555 // Splat the shift amounts so the scalar shifts above will catch it.
30556 SDValue Amt0 = DAG.getVectorShuffle(VT, dl, Amt, Amt, {0, 0});
30557 SDValue Amt1 = DAG.getVectorShuffle(VT, dl, Amt, Amt, {1, 1});
30558 SDValue R0 = DAG.getNode(Opc, dl, VT, R, Amt0);
30559 SDValue R1 = DAG.getNode(Opc, dl, VT, R, Amt1);
30560 return DAG.getVectorShuffle(VT, dl, R0, R1, {0, 3});
30561 }
30562
30563 // If possible, lower this shift as a sequence of two shifts by
30564 // constant plus a BLENDing shuffle instead of scalarizing it.
30565 // Example:
30566 // (v4i32 (srl A, (build_vector < X, Y, Y, Y>)))
30567 //
30568 // Could be rewritten as:
30569 // (v4i32 (MOVSS (srl A, <Y,Y,Y,Y>), (srl A, <X,X,X,X>)))
30570 //
30571 // The advantage is that the two shifts from the example would be
30572 // lowered as X86ISD::VSRLI nodes in parallel before blending.
30573 if (ConstantAmt && (VT == MVT::v8i16 || VT == MVT::v4i32 ||
30574 (VT == MVT::v16i16 && Subtarget.hasInt256()))) {
30575 SDValue Amt1, Amt2;
30576 unsigned NumElts = VT.getVectorNumElements();
30577 SmallVector<int, 8> ShuffleMask;
30578 for (unsigned i = 0; i != NumElts; ++i) {
30579 SDValue A = Amt->getOperand(i);
30580 if (A.isUndef()) {
30581 ShuffleMask.push_back(SM_SentinelUndef);
30582 continue;
30583 }
30584 if (!Amt1 || Amt1 == A) {
30585 ShuffleMask.push_back(i);
30586 Amt1 = A;
30587 continue;
30588 }
30589 if (!Amt2 || Amt2 == A) {
30590 ShuffleMask.push_back(i + NumElts);
30591 Amt2 = A;
30592 continue;
30593 }
30594 break;
30595 }
30596
30597 // Only perform this blend if we can perform it without loading a mask.
30598 if (ShuffleMask.size() == NumElts && Amt1 && Amt2 &&
30599 (VT != MVT::v16i16 ||
30600 is128BitLaneRepeatedShuffleMask(VT, ShuffleMask)) &&
30601 (VT == MVT::v4i32 || Subtarget.hasSSE41() || Opc != ISD::SHL ||
30602 canWidenShuffleElements(ShuffleMask))) {
30603 auto *Cst1 = dyn_cast<ConstantSDNode>(Amt1);
30604 auto *Cst2 = dyn_cast<ConstantSDNode>(Amt2);
30605 if (Cst1 && Cst2 && Cst1->getAPIntValue().ult(EltSizeInBits) &&
30606 Cst2->getAPIntValue().ult(EltSizeInBits)) {
30607 SDValue Shift1 = getTargetVShiftByConstNode(X86OpcI, dl, VT, R,
30608 Cst1->getZExtValue(), DAG);
30609 SDValue Shift2 = getTargetVShiftByConstNode(X86OpcI, dl, VT, R,
30610 Cst2->getZExtValue(), DAG);
30611 return DAG.getVectorShuffle(VT, dl, Shift1, Shift2, ShuffleMask);
30612 }
30613 }
30614 }
30615
30616 // If possible, lower this packed shift into a vector multiply instead of
30617 // expanding it into a sequence of scalar shifts.
30618 // For v32i8 cases, it might be quicker to split/extend to vXi16 shifts.
30619 if (Opc == ISD::SHL && !(VT == MVT::v32i8 && (Subtarget.hasXOP() ||
30620 Subtarget.canExtendTo512BW())))
30621 if (SDValue Scale = convertShiftLeftToScale(Amt, dl, Subtarget, DAG))
30622 return DAG.getNode(ISD::MUL, dl, VT, R, Scale);
30623
30624 // Constant ISD::SRL can be performed efficiently on vXi16 vectors as we
30625 // can replace with ISD::MULHU, creating scale factor from (NumEltBits - Amt).
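  // i.e. for 0 < Amt < 16: lshr(x, Amt) == mulhu(x, 1 << (16 - Amt)).
  // e.g. x = 0xABCD, Amt = 4: 0xABCD * 0x1000 = 0x0ABCD000, whose high 16 bits
  // are 0x0ABC == 0xABCD >> 4. The Amt == 0 lanes are patched with the select
  // below, since a scale of 1 << 16 doesn't fit in an i16 element.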
30626 if (Opc == ISD::SRL && ConstantAmt &&
30627 (VT == MVT::v8i16 || (VT == MVT::v16i16 && Subtarget.hasInt256()))) {
30628 SDValue EltBits = DAG.getConstant(EltSizeInBits, dl, VT);
30629 SDValue RAmt = DAG.getNode(ISD::SUB, dl, VT, EltBits, Amt);
30630 if (SDValue Scale = convertShiftLeftToScale(RAmt, dl, Subtarget, DAG)) {
30631 SDValue Zero = DAG.getConstant(0, dl, VT);
30632 SDValue ZAmt = DAG.getSetCC(dl, VT, Amt, Zero, ISD::SETEQ);
30633 SDValue Res = DAG.getNode(ISD::MULHU, dl, VT, R, Scale);
30634 return DAG.getSelect(dl, VT, ZAmt, R, Res);
30635 }
30636 }
30637
30638 // Constant ISD::SRA can be performed efficiently on vXi16 vectors as we
30639 // can replace with ISD::MULHS, creating scale factor from (NumEltBits - Amt).
30640 // TODO: Special case handling for shift by 0/1, really we can afford either
30641 // of these cases in pre-SSE41/XOP/AVX512 but not both.
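  // (Amounts of 0 and 1 need the selects below: a scale of 1 << 16 doesn't fit
  // in an i16 lane, and a scale of 1 << 15 is treated as a negative multiplier
  // by MULHS, so those lanes fall back to R itself and to a plain VSRAI by 1.)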
30642 if (Opc == ISD::SRA && ConstantAmt &&
30643 (VT == MVT::v8i16 || (VT == MVT::v16i16 && Subtarget.hasInt256())) &&
30644 ((Subtarget.hasSSE41() && !Subtarget.hasXOP() &&
30645 !Subtarget.hasAVX512()) ||
30646 DAG.isKnownNeverZero(Amt))) {
30647 SDValue EltBits = DAG.getConstant(EltSizeInBits, dl, VT);
30648 SDValue RAmt = DAG.getNode(ISD::SUB, dl, VT, EltBits, Amt);
30649 if (SDValue Scale = convertShiftLeftToScale(RAmt, dl, Subtarget, DAG)) {
30650 SDValue Amt0 =
30651 DAG.getSetCC(dl, VT, Amt, DAG.getConstant(0, dl, VT), ISD::SETEQ);
30652 SDValue Amt1 =
30653 DAG.getSetCC(dl, VT, Amt, DAG.getConstant(1, dl, VT), ISD::SETEQ);
30654 SDValue Sra1 =
30655 getTargetVShiftByConstNode(X86ISD::VSRAI, dl, VT, R, 1, DAG);
30656 SDValue Res = DAG.getNode(ISD::MULHS, dl, VT, R, Scale);
30657 Res = DAG.getSelect(dl, VT, Amt0, R, Res);
30658 return DAG.getSelect(dl, VT, Amt1, Sra1, Res);
30659 }
30660 }
30661
30662 // v4i32 Non Uniform Shifts.
30663 // If the shift amount is constant we can shift each lane using the SSE2
30664 // immediate shifts, else we need to zero-extend each lane to the lower i64
30665 // and shift using the SSE2 variable shifts.
30666 // The separate results can then be blended together.
30667 if (VT == MVT::v4i32) {
30668 SDValue Amt0, Amt1, Amt2, Amt3;
30669 if (ConstantAmt) {
30670 Amt0 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {0, 0, 0, 0});
30671 Amt1 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {1, 1, 1, 1});
30672 Amt2 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {2, 2, 2, 2});
30673 Amt3 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {3, 3, 3, 3});
30674 } else {
30675 // The SSE2 shifts use the lower i64 as the same shift amount for
30676 // all lanes and the upper i64 is ignored. On AVX we're better off
30677 // just zero-extending, but for SSE just duplicating the top 16-bits is
30678 // cheaper and has the same effect for out of range values.
30679 if (Subtarget.hasAVX()) {
30680 SDValue Z = DAG.getConstant(0, dl, VT);
30681 Amt0 = DAG.getVectorShuffle(VT, dl, Amt, Z, {0, 4, -1, -1});
30682 Amt1 = DAG.getVectorShuffle(VT, dl, Amt, Z, {1, 5, -1, -1});
30683 Amt2 = DAG.getVectorShuffle(VT, dl, Amt, Z, {2, 6, -1, -1});
30684 Amt3 = DAG.getVectorShuffle(VT, dl, Amt, Z, {3, 7, -1, -1});
30685 } else {
30686 SDValue Amt01 = DAG.getBitcast(MVT::v8i16, Amt);
30687 SDValue Amt23 = DAG.getVectorShuffle(MVT::v8i16, dl, Amt01, Amt01,
30688 {4, 5, 6, 7, -1, -1, -1, -1});
30689 SDValue Msk02 = getV4X86ShuffleImm8ForMask({0, 1, 1, 1}, dl, DAG);
30690 SDValue Msk13 = getV4X86ShuffleImm8ForMask({2, 3, 3, 3}, dl, DAG);
30691 Amt0 = DAG.getNode(X86ISD::PSHUFLW, dl, MVT::v8i16, Amt01, Msk02);
30692 Amt1 = DAG.getNode(X86ISD::PSHUFLW, dl, MVT::v8i16, Amt01, Msk13);
30693 Amt2 = DAG.getNode(X86ISD::PSHUFLW, dl, MVT::v8i16, Amt23, Msk02);
30694 Amt3 = DAG.getNode(X86ISD::PSHUFLW, dl, MVT::v8i16, Amt23, Msk13);
30695 }
30696 }
30697
30698 unsigned ShOpc = ConstantAmt ? Opc : X86OpcV;
30699 SDValue R0 = DAG.getNode(ShOpc, dl, VT, R, DAG.getBitcast(VT, Amt0));
30700 SDValue R1 = DAG.getNode(ShOpc, dl, VT, R, DAG.getBitcast(VT, Amt1));
30701 SDValue R2 = DAG.getNode(ShOpc, dl, VT, R, DAG.getBitcast(VT, Amt2));
30702 SDValue R3 = DAG.getNode(ShOpc, dl, VT, R, DAG.getBitcast(VT, Amt3));
30703
30704 // Merge the shifted lane results optimally with/without PBLENDW.
30705 // TODO - ideally shuffle combining would handle this.
30706 if (Subtarget.hasSSE41()) {
30707 SDValue R02 = DAG.getVectorShuffle(VT, dl, R0, R2, {0, -1, 6, -1});
30708 SDValue R13 = DAG.getVectorShuffle(VT, dl, R1, R3, {-1, 1, -1, 7});
30709 return DAG.getVectorShuffle(VT, dl, R02, R13, {0, 5, 2, 7});
30710 }
30711 SDValue R01 = DAG.getVectorShuffle(VT, dl, R0, R1, {0, -1, -1, 5});
30712 SDValue R23 = DAG.getVectorShuffle(VT, dl, R2, R3, {2, -1, -1, 7});
30713 return DAG.getVectorShuffle(VT, dl, R01, R23, {0, 3, 4, 7});
30714 }
30715
30716 // It's worth extending once and using the vXi16/vXi32 shifts for smaller
30717 // types, but without AVX512 the extra overheads to get from vXi8 to vXi32
30718 // make the existing SSE solution better.
30719 // NOTE: We honor the preferred vector width before promoting to 512-bits.
30720 if ((Subtarget.hasInt256() && VT == MVT::v8i16) ||
30721 (Subtarget.canExtendTo512DQ() && VT == MVT::v16i16) ||
30722 (Subtarget.canExtendTo512DQ() && VT == MVT::v16i8) ||
30723 (Subtarget.canExtendTo512BW() && VT == MVT::v32i8) ||
30724 (Subtarget.hasBWI() && Subtarget.hasVLX() && VT == MVT::v16i8)) {
30725     assert((!Subtarget.hasBWI() || VT == MVT::v32i8 || VT == MVT::v16i8) &&
30726            "Unexpected vector type");
30727 MVT EvtSVT = Subtarget.hasBWI() ? MVT::i16 : MVT::i32;
30728 MVT ExtVT = MVT::getVectorVT(EvtSVT, VT.getVectorNumElements());
30729 unsigned ExtOpc = Opc == ISD::SRA ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
30730 R = DAG.getNode(ExtOpc, dl, ExtVT, R);
30731 Amt = DAG.getNode(ISD::ZERO_EXTEND, dl, ExtVT, Amt);
30732 return DAG.getNode(ISD::TRUNCATE, dl, VT,
30733 DAG.getNode(Opc, dl, ExtVT, R, Amt));
30734 }
30735
30736 // Constant ISD::SRA/SRL can be performed efficiently on vXi8 vectors as we
30737 // extend to vXi16 to perform a MUL scale effectively as a MUL_LOHI.
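  // Per byte this computes (ext(x) * (1 << (8 - Amt))) >> 8, which equals
  // x >> Amt (logically or arithmetically, depending on zero- vs
  // sign-extension). e.g. srl: x = 0xF0, Amt = 4: 0xF0 * 0x10 = 0x0F00,
  // and 0x0F00 >> 8 == 0x0F == 0xF0 >> 4.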
30738 if (ConstantAmt && (Opc == ISD::SRA || Opc == ISD::SRL) &&
30739 (VT == MVT::v16i8 || (VT == MVT::v32i8 && Subtarget.hasInt256()) ||
30740 (VT == MVT::v64i8 && Subtarget.hasBWI())) &&
30741 !Subtarget.hasXOP()) {
30742 int NumElts = VT.getVectorNumElements();
30743 SDValue Cst8 = DAG.getTargetConstant(8, dl, MVT::i8);
30744
30745 // Extend constant shift amount to vXi16 (it doesn't matter if the type
30746 // isn't legal).
30747 MVT ExVT = MVT::getVectorVT(MVT::i16, NumElts);
30748 Amt = DAG.getZExtOrTrunc(Amt, dl, ExVT);
30749 Amt = DAG.getNode(ISD::SUB, dl, ExVT, DAG.getConstant(8, dl, ExVT), Amt);
30750 Amt = DAG.getNode(ISD::SHL, dl, ExVT, DAG.getConstant(1, dl, ExVT), Amt);
30751     assert(ISD::isBuildVectorOfConstantSDNodes(Amt.getNode()) &&
30752            "Constant build vector expected");
30753
30754 if (VT == MVT::v16i8 && Subtarget.hasInt256()) {
30755 R = Opc == ISD::SRA ? DAG.getSExtOrTrunc(R, dl, ExVT)
30756 : DAG.getZExtOrTrunc(R, dl, ExVT);
30757 R = DAG.getNode(ISD::MUL, dl, ExVT, R, Amt);
30758 R = DAG.getNode(X86ISD::VSRLI, dl, ExVT, R, Cst8);
30759 return DAG.getZExtOrTrunc(R, dl, VT);
30760 }
30761
30762 SmallVector<SDValue, 16> LoAmt, HiAmt;
30763 for (int i = 0; i != NumElts; i += 16) {
30764 for (int j = 0; j != 8; ++j) {
30765 LoAmt.push_back(Amt.getOperand(i + j));
30766 HiAmt.push_back(Amt.getOperand(i + j + 8));
30767 }
30768 }
30769
30770 MVT VT16 = MVT::getVectorVT(MVT::i16, NumElts / 2);
30771 SDValue LoA = DAG.getBuildVector(VT16, dl, LoAmt);
30772 SDValue HiA = DAG.getBuildVector(VT16, dl, HiAmt);
30773
30774 SDValue LoR = DAG.getBitcast(VT16, getUnpackl(DAG, dl, VT, R, R));
30775 SDValue HiR = DAG.getBitcast(VT16, getUnpackh(DAG, dl, VT, R, R));
30776 LoR = DAG.getNode(X86OpcI, dl, VT16, LoR, Cst8);
30777 HiR = DAG.getNode(X86OpcI, dl, VT16, HiR, Cst8);
30778 LoR = DAG.getNode(ISD::MUL, dl, VT16, LoR, LoA);
30779 HiR = DAG.getNode(ISD::MUL, dl, VT16, HiR, HiA);
30780 LoR = DAG.getNode(X86ISD::VSRLI, dl, VT16, LoR, Cst8);
30781 HiR = DAG.getNode(X86ISD::VSRLI, dl, VT16, HiR, Cst8);
30782 return DAG.getNode(X86ISD::PACKUS, dl, VT, LoR, HiR);
30783 }
30784
30785 if (VT == MVT::v16i8 ||
30786 (VT == MVT::v32i8 && Subtarget.hasInt256() && !Subtarget.hasXOP()) ||
30787 (VT == MVT::v64i8 && Subtarget.hasBWI())) {
30788 MVT ExtVT = MVT::getVectorVT(MVT::i16, VT.getVectorNumElements() / 2);
30789
30790 auto SignBitSelect = [&](MVT SelVT, SDValue Sel, SDValue V0, SDValue V1) {
30791 if (VT.is512BitVector()) {
30792 // On AVX512BW targets we make use of the fact that VSELECT lowers
30793 // to a masked blend which selects bytes based just on the sign bit
30794 // extracted to a mask.
30795 MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
30796 V0 = DAG.getBitcast(VT, V0);
30797 V1 = DAG.getBitcast(VT, V1);
30798 Sel = DAG.getBitcast(VT, Sel);
30799 Sel = DAG.getSetCC(dl, MaskVT, DAG.getConstant(0, dl, VT), Sel,
30800 ISD::SETGT);
30801 return DAG.getBitcast(SelVT, DAG.getSelect(dl, VT, Sel, V0, V1));
30802 } else if (Subtarget.hasSSE41()) {
30803 // On SSE41 targets we can use PBLENDVB which selects bytes based just
30804 // on the sign bit.
30805 V0 = DAG.getBitcast(VT, V0);
30806 V1 = DAG.getBitcast(VT, V1);
30807 Sel = DAG.getBitcast(VT, Sel);
30808 return DAG.getBitcast(SelVT,
30809 DAG.getNode(X86ISD::BLENDV, dl, VT, Sel, V0, V1));
30810 }
30811 // On pre-SSE41 targets we test for the sign bit by comparing to
30812 // zero - a negative value will set all bits of the lanes to true
30813 // and VSELECT uses that in its OR(AND(V0,C),AND(V1,~C)) lowering.
30814 SDValue Z = DAG.getConstant(0, dl, SelVT);
30815 SDValue C = DAG.getNode(X86ISD::PCMPGT, dl, SelVT, Z, Sel);
30816 return DAG.getSelect(dl, SelVT, C, V0, V1);
30817 };
30818
30819 // Turn 'a' into a mask suitable for VSELECT: a = a << 5;
30820 // We can safely do this using i16 shifts as we're only interested in
30821 // the 3 lower bits of each byte.
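    // The blend ladder below applies the shift one amount bit at a time:
    // after a << 5, bit 2 of the amount sits in each byte's sign bit and
    // selects the shift-by-4 step; each a += a exposes the next lower bit for
    // the shift-by-2 and shift-by-1 steps. e.g. Amt = 5 (0b101) takes the
    // by-4 and by-1 steps and skips the by-2 step.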
30822 Amt = DAG.getBitcast(ExtVT, Amt);
30823 Amt = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, ExtVT, Amt, 5, DAG);
30824 Amt = DAG.getBitcast(VT, Amt);
30825
30826 if (Opc == ISD::SHL || Opc == ISD::SRL) {
30827 // r = VSELECT(r, shift(r, 4), a);
30828 SDValue M = DAG.getNode(Opc, dl, VT, R, DAG.getConstant(4, dl, VT));
30829 R = SignBitSelect(VT, Amt, M, R);
30830
30831 // a += a
30832 Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
30833
30834 // r = VSELECT(r, shift(r, 2), a);
30835 M = DAG.getNode(Opc, dl, VT, R, DAG.getConstant(2, dl, VT));
30836 R = SignBitSelect(VT, Amt, M, R);
30837
30838 // a += a
30839 Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
30840
30841 // return VSELECT(r, shift(r, 1), a);
30842 M = DAG.getNode(Opc, dl, VT, R, DAG.getConstant(1, dl, VT));
30843 R = SignBitSelect(VT, Amt, M, R);
30844 return R;
30845 }
30846
30847 if (Opc == ISD::SRA) {
30848 // For SRA we need to unpack each byte to the higher byte of a i16 vector
30849 // so we can correctly sign extend. We don't care what happens to the
30850 // lower byte.
30851 SDValue ALo = getUnpackl(DAG, dl, VT, DAG.getUNDEF(VT), Amt);
30852 SDValue AHi = getUnpackh(DAG, dl, VT, DAG.getUNDEF(VT), Amt);
30853 SDValue RLo = getUnpackl(DAG, dl, VT, DAG.getUNDEF(VT), R);
30854 SDValue RHi = getUnpackh(DAG, dl, VT, DAG.getUNDEF(VT), R);
30855 ALo = DAG.getBitcast(ExtVT, ALo);
30856 AHi = DAG.getBitcast(ExtVT, AHi);
30857 RLo = DAG.getBitcast(ExtVT, RLo);
30858 RHi = DAG.getBitcast(ExtVT, RHi);
30859
30860 // r = VSELECT(r, shift(r, 4), a);
30861 SDValue MLo = getTargetVShiftByConstNode(X86OpcI, dl, ExtVT, RLo, 4, DAG);
30862 SDValue MHi = getTargetVShiftByConstNode(X86OpcI, dl, ExtVT, RHi, 4, DAG);
30863 RLo = SignBitSelect(ExtVT, ALo, MLo, RLo);
30864 RHi = SignBitSelect(ExtVT, AHi, MHi, RHi);
30865
30866 // a += a
30867 ALo = DAG.getNode(ISD::ADD, dl, ExtVT, ALo, ALo);
30868 AHi = DAG.getNode(ISD::ADD, dl, ExtVT, AHi, AHi);
30869
30870 // r = VSELECT(r, shift(r, 2), a);
30871 MLo = getTargetVShiftByConstNode(X86OpcI, dl, ExtVT, RLo, 2, DAG);
30872 MHi = getTargetVShiftByConstNode(X86OpcI, dl, ExtVT, RHi, 2, DAG);
30873 RLo = SignBitSelect(ExtVT, ALo, MLo, RLo);
30874 RHi = SignBitSelect(ExtVT, AHi, MHi, RHi);
30875
30876 // a += a
30877 ALo = DAG.getNode(ISD::ADD, dl, ExtVT, ALo, ALo);
30878 AHi = DAG.getNode(ISD::ADD, dl, ExtVT, AHi, AHi);
30879
30880 // r = VSELECT(r, shift(r, 1), a);
30881 MLo = getTargetVShiftByConstNode(X86OpcI, dl, ExtVT, RLo, 1, DAG);
30882 MHi = getTargetVShiftByConstNode(X86OpcI, dl, ExtVT, RHi, 1, DAG);
30883 RLo = SignBitSelect(ExtVT, ALo, MLo, RLo);
30884 RHi = SignBitSelect(ExtVT, AHi, MHi, RHi);
30885
30886 // Logical shift the result back to the lower byte, leaving a zero upper
30887 // byte meaning that we can safely pack with PACKUSWB.
30888 RLo = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExtVT, RLo, 8, DAG);
30889 RHi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExtVT, RHi, 8, DAG);
30890 return DAG.getNode(X86ISD::PACKUS, dl, VT, RLo, RHi);
30891 }
30892 }
30893
30894 if (Subtarget.hasInt256() && !Subtarget.hasXOP() && VT == MVT::v16i16) {
30895 MVT ExtVT = MVT::v8i32;
30896 SDValue Z = DAG.getConstant(0, dl, VT);
30897 SDValue ALo = getUnpackl(DAG, dl, VT, Amt, Z);
30898 SDValue AHi = getUnpackh(DAG, dl, VT, Amt, Z);
30899 SDValue RLo = getUnpackl(DAG, dl, VT, Z, R);
30900 SDValue RHi = getUnpackh(DAG, dl, VT, Z, R);
30901 ALo = DAG.getBitcast(ExtVT, ALo);
30902 AHi = DAG.getBitcast(ExtVT, AHi);
30903 RLo = DAG.getBitcast(ExtVT, RLo);
30904 RHi = DAG.getBitcast(ExtVT, RHi);
30905 SDValue Lo = DAG.getNode(Opc, dl, ExtVT, RLo, ALo);
30906 SDValue Hi = DAG.getNode(Opc, dl, ExtVT, RHi, AHi);
30907 Lo = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExtVT, Lo, 16, DAG);
30908 Hi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExtVT, Hi, 16, DAG);
30909 return DAG.getNode(X86ISD::PACKUS, dl, VT, Lo, Hi);
30910 }
30911
30912 if (VT == MVT::v8i16) {
30913 // If we have a constant shift amount, the non-SSE41 path is best as
30914     // avoiding bitcasts makes it easier to constant fold and reduce to PBLENDW.
30915 bool UseSSE41 = Subtarget.hasSSE41() &&
30916 !ISD::isBuildVectorOfConstantSDNodes(Amt.getNode());
30917
30918 auto SignBitSelect = [&](SDValue Sel, SDValue V0, SDValue V1) {
30919 // On SSE41 targets we can use PBLENDVB which selects bytes based just on
30920 // the sign bit.
30921 if (UseSSE41) {
30922 MVT ExtVT = MVT::getVectorVT(MVT::i8, VT.getVectorNumElements() * 2);
30923 V0 = DAG.getBitcast(ExtVT, V0);
30924 V1 = DAG.getBitcast(ExtVT, V1);
30925 Sel = DAG.getBitcast(ExtVT, Sel);
30926 return DAG.getBitcast(
30927 VT, DAG.getNode(X86ISD::BLENDV, dl, ExtVT, Sel, V0, V1));
30928 }
30929 // On pre-SSE41 targets we splat the sign bit - a negative value will
30930 // set all bits of the lanes to true and VSELECT uses that in
30931 // its OR(AND(V0,C),AND(V1,~C)) lowering.
30932 SDValue C =
30933 getTargetVShiftByConstNode(X86ISD::VSRAI, dl, VT, Sel, 15, DAG);
30934 return DAG.getSelect(dl, VT, C, V0, V1);
30935 };
30936
30937 // Turn 'a' into a mask suitable for VSELECT: a = a << 12;
30938 if (UseSSE41) {
30939 // On SSE41 targets we need to replicate the shift mask in both
30940 // bytes for PBLENDVB.
30941 Amt = DAG.getNode(
30942 ISD::OR, dl, VT,
30943 getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, Amt, 4, DAG),
30944 getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, Amt, 12, DAG));
30945 } else {
30946 Amt = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, Amt, 12, DAG);
30947 }
30948
30949 // r = VSELECT(r, shift(r, 8), a);
30950 SDValue M = getTargetVShiftByConstNode(X86OpcI, dl, VT, R, 8, DAG);
30951 R = SignBitSelect(Amt, M, R);
30952
30953 // a += a
30954 Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
30955
30956 // r = VSELECT(r, shift(r, 4), a);
30957 M = getTargetVShiftByConstNode(X86OpcI, dl, VT, R, 4, DAG);
30958 R = SignBitSelect(Amt, M, R);
30959
30960 // a += a
30961 Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
30962
30963 // r = VSELECT(r, shift(r, 2), a);
30964 M = getTargetVShiftByConstNode(X86OpcI, dl, VT, R, 2, DAG);
30965 R = SignBitSelect(Amt, M, R);
30966
30967 // a += a
30968 Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
30969
30970 // return VSELECT(r, shift(r, 1), a);
30971 M = getTargetVShiftByConstNode(X86OpcI, dl, VT, R, 1, DAG);
30972 R = SignBitSelect(Amt, M, R);
30973 return R;
30974 }
30975
30976 // Decompose 256-bit shifts into 128-bit shifts.
30977 if (VT.is256BitVector())
30978 return splitVectorIntBinary(Op, DAG);
30979
30980 if (VT == MVT::v32i16 || VT == MVT::v64i8)
30981 return splitVectorIntBinary(Op, DAG);
30982
30983 return SDValue();
30984}
30985
30986static SDValue LowerFunnelShift(SDValue Op, const X86Subtarget &Subtarget,
30987 SelectionDAG &DAG) {
30988 MVT VT = Op.getSimpleValueType();
30989   assert((Op.getOpcode() == ISD::FSHL || Op.getOpcode() == ISD::FSHR) &&
30990          "Unexpected funnel shift opcode!");
30991
30992 SDLoc DL(Op);
30993 SDValue Op0 = Op.getOperand(0);
30994 SDValue Op1 = Op.getOperand(1);
30995 SDValue Amt = Op.getOperand(2);
30996 unsigned EltSizeInBits = VT.getScalarSizeInBits();
30997 bool IsFSHR = Op.getOpcode() == ISD::FSHR;
30998
30999 if (VT.isVector()) {
31000 APInt APIntShiftAmt;
31001 bool IsCstSplat = X86::isConstantSplat(Amt, APIntShiftAmt);
31002
31003 if (Subtarget.hasVBMI2() && EltSizeInBits > 8) {
31004 if (IsFSHR)
31005 std::swap(Op0, Op1);
31006
31007 if (IsCstSplat) {
31008 uint64_t ShiftAmt = APIntShiftAmt.urem(EltSizeInBits);
31009 SDValue Imm = DAG.getTargetConstant(ShiftAmt, DL, MVT::i8);
31010 return getAVX512Node(IsFSHR ? X86ISD::VSHRD : X86ISD::VSHLD, DL, VT,
31011 {Op0, Op1, Imm}, DAG, Subtarget);
31012 }
31013 return getAVX512Node(IsFSHR ? X86ISD::VSHRDV : X86ISD::VSHLDV, DL, VT,
31014 {Op0, Op1, Amt}, DAG, Subtarget);
31015 }
31016     assert((VT == MVT::v16i8 || VT == MVT::v32i8 || VT == MVT::v64i8 ||
31017             VT == MVT::v8i16 || VT == MVT::v16i16 || VT == MVT::v32i16 ||
31018             VT == MVT::v4i32 || VT == MVT::v8i32 || VT == MVT::v16i32) &&
31019            "Unexpected funnel shift type!");
31020
31021     // fshl(x,y,z) -> (unpack(y,x) << (z & (bw-1))) >> bw.
31022     // fshr(x,y,z) -> unpack(y,x) >> (z & (bw-1)).
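    // i.e. each lane pair is concatenated as (x:y) at double width, shifted by
    // the masked amount, and the relevant half is kept. e.g. for i8 lanes with
    // x = 0xAB, y = 0xCD, z = 4: (0xABCD << 4) >> 8 = 0xABC, whose low byte
    // 0xBC == fshl(0xAB, 0xCD, 4).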
31023 if (IsCstSplat)
31024 return SDValue();
31025
31026 SDValue AmtMask = DAG.getConstant(EltSizeInBits - 1, DL, VT);
31027 SDValue AmtMod = DAG.getNode(ISD::AND, DL, VT, Amt, AmtMask);
31028 bool IsCst = ISD::isBuildVectorOfConstantSDNodes(AmtMod.getNode());
31029
31030 // Constant vXi16 funnel shifts can be efficiently handled by default.
31031 if (IsCst && EltSizeInBits == 16)
31032 return SDValue();
31033
31034 unsigned ShiftOpc = IsFSHR ? ISD::SRL : ISD::SHL;
31035 unsigned NumElts = VT.getVectorNumElements();
31036 MVT ExtSVT = MVT::getIntegerVT(2 * EltSizeInBits);
31037 MVT ExtVT = MVT::getVectorVT(ExtSVT, NumElts / 2);
31038
31039 // Split 256-bit integers on XOP/pre-AVX2 targets.
31040 // Split 512-bit integers on non 512-bit BWI targets.
31041 if ((VT.is256BitVector() && ((Subtarget.hasXOP() && EltSizeInBits < 16) ||
31042 !Subtarget.hasAVX2())) ||
31043 (VT.is512BitVector() && !Subtarget.useBWIRegs() &&
31044 EltSizeInBits < 32)) {
31045 // Pre-mask the amount modulo using the wider vector.
31046 Op = DAG.getNode(Op.getOpcode(), DL, VT, Op0, Op1, AmtMod);
31047 return splitVectorOp(Op, DAG);
31048 }
31049
31050 // Attempt to fold scalar shift as unpack(y,x) << zext(splat(z))
31051 if (supportedVectorShiftWithBaseAmnt(ExtVT, Subtarget, ShiftOpc)) {
31052 int ScalarAmtIdx = -1;
31053 if (SDValue ScalarAmt = DAG.getSplatSourceVector(AmtMod, ScalarAmtIdx)) {
31054 // Uniform vXi16 funnel shifts can be efficiently handled by default.
31055 if (EltSizeInBits == 16)
31056 return SDValue();
31057
31058 SDValue Lo = DAG.getBitcast(ExtVT, getUnpackl(DAG, DL, VT, Op1, Op0));
31059 SDValue Hi = DAG.getBitcast(ExtVT, getUnpackh(DAG, DL, VT, Op1, Op0));
31060 Lo = getTargetVShiftNode(ShiftOpc, DL, ExtVT, Lo, ScalarAmt,
31061 ScalarAmtIdx, Subtarget, DAG);
31062 Hi = getTargetVShiftNode(ShiftOpc, DL, ExtVT, Hi, ScalarAmt,
31063 ScalarAmtIdx, Subtarget, DAG);
31064 return getPack(DAG, Subtarget, DL, VT, Lo, Hi, !IsFSHR);
31065 }
31066 }
31067
31068 MVT WideSVT = MVT::getIntegerVT(
31069 std::min<unsigned>(EltSizeInBits * 2, Subtarget.hasBWI() ? 16 : 32));
31070 MVT WideVT = MVT::getVectorVT(WideSVT, NumElts);
31071
31072 // If per-element shifts are legal, fallback to generic expansion.
31073 if (supportedVectorVarShift(VT, Subtarget, ShiftOpc) || Subtarget.hasXOP())
31074 return SDValue();
31075
31076 // Attempt to fold as:
31077 // fshl(x,y,z) -> (((aext(x) << bw) | zext(y)) << (z & (bw-1))) >> bw.
31078 // fshr(x,y,z) -> (((aext(x) << bw) | zext(y)) >> (z & (bw-1))).
31079 if (supportedVectorVarShift(WideVT, Subtarget, ShiftOpc) &&
31080 supportedVectorShiftWithImm(WideVT, Subtarget, ShiftOpc)) {
31081 Op0 = DAG.getNode(ISD::ANY_EXTEND, DL, WideVT, Op0);
31082 Op1 = DAG.getNode(ISD::ZERO_EXTEND, DL, WideVT, Op1);
31083 AmtMod = DAG.getNode(ISD::ZERO_EXTEND, DL, WideVT, AmtMod);
31084 Op0 = getTargetVShiftByConstNode(X86ISD::VSHLI, DL, WideVT, Op0,
31085 EltSizeInBits, DAG);
31086 SDValue Res = DAG.getNode(ISD::OR, DL, WideVT, Op0, Op1);
31087 Res = DAG.getNode(ShiftOpc, DL, WideVT, Res, AmtMod);
31088 if (!IsFSHR)
31089 Res = getTargetVShiftByConstNode(X86ISD::VSRLI, DL, WideVT, Res,
31090 EltSizeInBits, DAG);
31091 return DAG.getNode(ISD::TRUNCATE, DL, VT, Res);
31092 }
31093
31094 // Attempt to fold per-element (ExtVT) shift as unpack(y,x) << zext(z)
31095 if (((IsCst || !Subtarget.hasAVX512()) && !IsFSHR && EltSizeInBits <= 16) ||
31096 supportedVectorVarShift(ExtVT, Subtarget, ShiftOpc)) {
31097 SDValue Z = DAG.getConstant(0, DL, VT);
31098 SDValue RLo = DAG.getBitcast(ExtVT, getUnpackl(DAG, DL, VT, Op1, Op0));
31099 SDValue RHi = DAG.getBitcast(ExtVT, getUnpackh(DAG, DL, VT, Op1, Op0));
31100 SDValue ALo = DAG.getBitcast(ExtVT, getUnpackl(DAG, DL, VT, AmtMod, Z));
31101 SDValue AHi = DAG.getBitcast(ExtVT, getUnpackh(DAG, DL, VT, AmtMod, Z));
31102 SDValue Lo = DAG.getNode(ShiftOpc, DL, ExtVT, RLo, ALo);
31103 SDValue Hi = DAG.getNode(ShiftOpc, DL, ExtVT, RHi, AHi);
31104 return getPack(DAG, Subtarget, DL, VT, Lo, Hi, !IsFSHR);
31105 }
31106
31107 // Fallback to generic expansion.
31108 return SDValue();
31109 }
31110   assert(
31111       (VT == MVT::i8 || VT == MVT::i16 || VT == MVT::i32 || VT == MVT::i64) &&
31112       "Unexpected funnel shift type!");
31113
31114 // Expand slow SHLD/SHRD cases if we are not optimizing for size.
31115 bool OptForSize = DAG.shouldOptForSize();
31116 bool ExpandFunnel = !OptForSize && Subtarget.isSHLDSlow();
31117
31118 // fshl(x,y,z) -> (((aext(x) << bw) | zext(y)) << (z & (bw-1))) >> bw.
31119 // fshr(x,y,z) -> (((aext(x) << bw) | zext(y)) >> (z & (bw-1))).
31120 if ((VT == MVT::i8 || (ExpandFunnel && VT == MVT::i16)) &&
31121 !isa<ConstantSDNode>(Amt)) {
31122 SDValue Mask = DAG.getConstant(EltSizeInBits - 1, DL, Amt.getValueType());
31123 SDValue HiShift = DAG.getConstant(EltSizeInBits, DL, Amt.getValueType());
31124 Op0 = DAG.getAnyExtOrTrunc(Op0, DL, MVT::i32);
31125 Op1 = DAG.getZExtOrTrunc(Op1, DL, MVT::i32);
31126 Amt = DAG.getNode(ISD::AND, DL, Amt.getValueType(), Amt, Mask);
31127 SDValue Res = DAG.getNode(ISD::SHL, DL, MVT::i32, Op0, HiShift);
31128 Res = DAG.getNode(ISD::OR, DL, MVT::i32, Res, Op1);
31129 if (IsFSHR) {
31130 Res = DAG.getNode(ISD::SRL, DL, MVT::i32, Res, Amt);
31131 } else {
31132 Res = DAG.getNode(ISD::SHL, DL, MVT::i32, Res, Amt);
31133 Res = DAG.getNode(ISD::SRL, DL, MVT::i32, Res, HiShift);
31134 }
31135 return DAG.getZExtOrTrunc(Res, DL, VT);
31136 }
31137
31138 if (VT == MVT::i8 || ExpandFunnel)
31139 return SDValue();
31140
31141 // i16 needs to modulo the shift amount, but i32/i64 have implicit modulo.
31142 if (VT == MVT::i16) {
31143 Amt = DAG.getNode(ISD::AND, DL, Amt.getValueType(), Amt,
31144 DAG.getConstant(15, DL, Amt.getValueType()));
31145 unsigned FSHOp = (IsFSHR ? X86ISD::FSHR : X86ISD::FSHL);
31146 return DAG.getNode(FSHOp, DL, VT, Op0, Op1, Amt);
31147 }
31148
31149 return Op;
31150}
31151
31152static SDValue LowerRotate(SDValue Op, const X86Subtarget &Subtarget,
31153 SelectionDAG &DAG) {
31154 MVT VT = Op.getSimpleValueType();
31155   assert(VT.isVector() && "Custom lowering only for vector rotates!");
31156
31157 SDLoc DL(Op);
31158 SDValue R = Op.getOperand(0);
31159 SDValue Amt = Op.getOperand(1);
31160 unsigned Opcode = Op.getOpcode();
31161 unsigned EltSizeInBits = VT.getScalarSizeInBits();
31162 int NumElts = VT.getVectorNumElements();
31163 bool IsROTL = Opcode == ISD::ROTL;
31164
31165 // Check for constant splat rotation amount.
31166 APInt CstSplatValue;
31167 bool IsCstSplat = X86::isConstantSplat(Amt, CstSplatValue);
31168
31169 // Check for splat rotate by zero.
31170 if (IsCstSplat && CstSplatValue.urem(EltSizeInBits) == 0)
31171 return R;
31172
31173 // AVX512 implicitly uses modulo rotation amounts.
31174 if (Subtarget.hasAVX512() && 32 <= EltSizeInBits) {
31175 // Attempt to rotate by immediate.
31176 if (IsCstSplat) {
31177 unsigned RotOpc = IsROTL ? X86ISD::VROTLI : X86ISD::VROTRI;
31178 uint64_t RotAmt = CstSplatValue.urem(EltSizeInBits);
31179 return DAG.getNode(RotOpc, DL, VT, R,
31180 DAG.getTargetConstant(RotAmt, DL, MVT::i8));
31181 }
31182
31183 // Else, fall-back on VPROLV/VPRORV.
31184 return Op;
31185 }
31186
31187 // AVX512 VBMI2 vXi16 - lower to funnel shifts.
31188 if (Subtarget.hasVBMI2() && 16 == EltSizeInBits) {
31189 unsigned FunnelOpc = IsROTL ? ISD::FSHL : ISD::FSHR;
31190 return DAG.getNode(FunnelOpc, DL, VT, R, R, Amt);
31191 }
31192
31193 SDValue Z = DAG.getConstant(0, DL, VT);
31194
31195 if (!IsROTL) {
31196     // If the ISD::ROTR amount is constant, we're always better off converting to
31197 // ISD::ROTL.
31198 if (SDValue NegAmt = DAG.FoldConstantArithmetic(ISD::SUB, DL, VT, {Z, Amt}))
31199 return DAG.getNode(ISD::ROTL, DL, VT, R, NegAmt);
31200
31201     // XOP targets always prefer ISD::ROTL.
31202 if (Subtarget.hasXOP())
31203 return DAG.getNode(ISD::ROTL, DL, VT, R,
31204 DAG.getNode(ISD::SUB, DL, VT, Z, Amt));
31205 }
31206
31207 // Split 256-bit integers on XOP/pre-AVX2 targets.
31208 if (VT.is256BitVector() && (Subtarget.hasXOP() || !Subtarget.hasAVX2()))
31209 return splitVectorIntBinary(Op, DAG);
31210
31211 // XOP has 128-bit vector variable + immediate rotates.
31212 // +ve/-ve Amt = rotate left/right - just need to handle ISD::ROTL.
31213 // XOP implicitly uses modulo rotation amounts.
31214 if (Subtarget.hasXOP()) {
31215     assert(IsROTL && "Only ROTL expected");
31216     assert(VT.is128BitVector() && "Only rotate 128-bit vectors!");
31217
31218 // Attempt to rotate by immediate.
31219 if (IsCstSplat) {
31220 uint64_t RotAmt = CstSplatValue.urem(EltSizeInBits);
31221 return DAG.getNode(X86ISD::VROTLI, DL, VT, R,
31222 DAG.getTargetConstant(RotAmt, DL, MVT::i8));
31223 }
31224
31225 // Use general rotate by variable (per-element).
31226 return Op;
31227 }
31228
31229   // Rotate by a uniform constant - expand back to shifts.
31230 if (IsCstSplat)
31231 return SDValue();
31232
31233 // Split 512-bit integers on non 512-bit BWI targets.
31234 if (VT.is512BitVector() && !Subtarget.useBWIRegs())
31235 return splitVectorIntBinary(Op, DAG);
31236
31237   assert(
31238       (VT == MVT::v4i32 || VT == MVT::v8i16 || VT == MVT::v16i8 ||
31239        ((VT == MVT::v8i32 || VT == MVT::v16i16 || VT == MVT::v32i8) &&
31240         Subtarget.hasAVX2()) ||
31241        ((VT == MVT::v32i16 || VT == MVT::v64i8) && Subtarget.useBWIRegs())) &&
31242       "Only vXi32/vXi16/vXi8 vector rotates supported");
31243
31244 MVT ExtSVT = MVT::getIntegerVT(2 * EltSizeInBits);
31245 MVT ExtVT = MVT::getVectorVT(ExtSVT, NumElts / 2);
31246
31247 SDValue AmtMask = DAG.getConstant(EltSizeInBits - 1, DL, VT);
31248 SDValue AmtMod = DAG.getNode(ISD::AND, DL, VT, Amt, AmtMask);
31249
31250 // Attempt to fold as unpack(x,x) << zext(splat(y)):
31251 // rotl(x,y) -> (unpack(x,x) << (y & (bw-1))) >> bw.
31252 // rotr(x,y) -> (unpack(x,x) >> (y & (bw-1))).
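  // (A rotate is a funnel shift of a value with itself: each lane is doubled
  // up as (x:x), shifted by the masked amount, and the relevant half is kept.
  // e.g. i8 rotl(0xAB, 4): (0xABAB << 4) >> 8 has low byte 0xBA, the rotl result.)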
31253 if (EltSizeInBits == 8 || EltSizeInBits == 16 || EltSizeInBits == 32) {
31254 int BaseRotAmtIdx = -1;
31255 if (SDValue BaseRotAmt = DAG.getSplatSourceVector(AmtMod, BaseRotAmtIdx)) {
31256 if (EltSizeInBits == 16 && Subtarget.hasSSE41()) {
31257 unsigned FunnelOpc = IsROTL ? ISD::FSHL : ISD::FSHR;
31258 return DAG.getNode(FunnelOpc, DL, VT, R, R, Amt);
31259 }
31260 unsigned ShiftX86Opc = IsROTL ? X86ISD::VSHLI : X86ISD::VSRLI;
31261 SDValue Lo = DAG.getBitcast(ExtVT, getUnpackl(DAG, DL, VT, R, R));
31262 SDValue Hi = DAG.getBitcast(ExtVT, getUnpackh(DAG, DL, VT, R, R));
31263 Lo = getTargetVShiftNode(ShiftX86Opc, DL, ExtVT, Lo, BaseRotAmt,
31264 BaseRotAmtIdx, Subtarget, DAG);
31265 Hi = getTargetVShiftNode(ShiftX86Opc, DL, ExtVT, Hi, BaseRotAmt,
31266 BaseRotAmtIdx, Subtarget, DAG);
31267 return getPack(DAG, Subtarget, DL, VT, Lo, Hi, IsROTL);
31268 }
31269 }
31270
31271 // v16i8/v32i8/v64i8: Split rotation into rot4/rot2/rot1 stages and select by
31272 // the amount bit.
31273 // TODO: We're doing nothing here that we couldn't do for funnel shifts.
31274 if (EltSizeInBits == 8) {
31275 bool IsConstAmt = ISD::isBuildVectorOfConstantSDNodes(Amt.getNode());
31276 MVT WideVT =
31277 MVT::getVectorVT(Subtarget.hasBWI() ? MVT::i16 : MVT::i32, NumElts);
31278 unsigned ShiftOpc = IsROTL ? ISD::SHL : ISD::SRL;
31279
31280 // Attempt to fold as:
31281 // rotl(x,y) -> (((aext(x) << bw) | zext(x)) << (y & (bw-1))) >> bw.
31282 // rotr(x,y) -> (((aext(x) << bw) | zext(x)) >> (y & (bw-1))).
31283 if (supportedVectorVarShift(WideVT, Subtarget, ShiftOpc) &&
31284 supportedVectorShiftWithImm(WideVT, Subtarget, ShiftOpc)) {
31285 // If we're rotating by constant, just use default promotion.
31286 if (IsConstAmt)
31287 return SDValue();
31288 // See if we can perform this by widening to vXi16 or vXi32.
31289 R = DAG.getNode(ISD::ZERO_EXTEND, DL, WideVT, R);
31290 R = DAG.getNode(
31291 ISD::OR, DL, WideVT, R,
31292 getTargetVShiftByConstNode(X86ISD::VSHLI, DL, WideVT, R, 8, DAG));
31293 Amt = DAG.getNode(ISD::ZERO_EXTEND, DL, WideVT, AmtMod);
31294 R = DAG.getNode(ShiftOpc, DL, WideVT, R, Amt);
31295 if (IsROTL)
31296 R = getTargetVShiftByConstNode(X86ISD::VSRLI, DL, WideVT, R, 8, DAG);
31297 return DAG.getNode(ISD::TRUNCATE, DL, VT, R);
31298 }
31299
31300 // Attempt to fold as unpack(x,x) << zext(y):
31301 // rotl(x,y) -> (unpack(x,x) << (y & (bw-1))) >> bw.
31302 // rotr(x,y) -> (unpack(x,x) >> (y & (bw-1))).
31303 if (IsConstAmt || supportedVectorVarShift(ExtVT, Subtarget, ShiftOpc)) {
31304 // See if we can perform this by unpacking to lo/hi vXi16.
31305 SDValue RLo = DAG.getBitcast(ExtVT, getUnpackl(DAG, DL, VT, R, R));
31306 SDValue RHi = DAG.getBitcast(ExtVT, getUnpackh(DAG, DL, VT, R, R));
31307 SDValue ALo = DAG.getBitcast(ExtVT, getUnpackl(DAG, DL, VT, AmtMod, Z));
31308 SDValue AHi = DAG.getBitcast(ExtVT, getUnpackh(DAG, DL, VT, AmtMod, Z));
31309 SDValue Lo = DAG.getNode(ShiftOpc, DL, ExtVT, RLo, ALo);
31310 SDValue Hi = DAG.getNode(ShiftOpc, DL, ExtVT, RHi, AHi);
31311 return getPack(DAG, Subtarget, DL, VT, Lo, Hi, IsROTL);
31312 }
31313     assert((VT == MVT::v16i8 || VT == MVT::v32i8) && "Unsupported vXi8 type");
31314
31315 // We don't need ModuloAmt here as we just peek at individual bits.
31316 auto SignBitSelect = [&](MVT SelVT, SDValue Sel, SDValue V0, SDValue V1) {
31317 if (Subtarget.hasSSE41()) {
31318 // On SSE41 targets we can use PBLENDVB which selects bytes based just
31319 // on the sign bit.
31320 V0 = DAG.getBitcast(VT, V0);
31321 V1 = DAG.getBitcast(VT, V1);
31322 Sel = DAG.getBitcast(VT, Sel);
31323 return DAG.getBitcast(SelVT,
31324 DAG.getNode(X86ISD::BLENDV, DL, VT, Sel, V0, V1));
31325 }
31326 // On pre-SSE41 targets we test for the sign bit by comparing to
31327 // zero - a negative value will set all bits of the lanes to true
31328 // and VSELECT uses that in its OR(AND(V0,C),AND(V1,~C)) lowering.
31329 SDValue Z = DAG.getConstant(0, DL, SelVT);
31330 SDValue C = DAG.getNode(X86ISD::PCMPGT, DL, SelVT, Z, Sel);
31331 return DAG.getSelect(DL, SelVT, C, V0, V1);
31332 };
31333
31334 // ISD::ROTR is currently only profitable on AVX512 targets with VPTERNLOG.
31335 if (!IsROTL && !useVPTERNLOG(Subtarget, VT)) {
31336 Amt = DAG.getNode(ISD::SUB, DL, VT, Z, Amt);
31337 IsROTL = true;
31338 }
31339
31340 unsigned ShiftLHS = IsROTL ? ISD::SHL : ISD::SRL;
31341 unsigned ShiftRHS = IsROTL ? ISD::SRL : ISD::SHL;
31342
31343 // Turn 'a' into a mask suitable for VSELECT: a = a << 5;
31344 // We can safely do this using i16 shifts as we're only interested in
31345 // the 3 lower bits of each byte.
31346 Amt = DAG.getBitcast(ExtVT, Amt);
31347 Amt = DAG.getNode(ISD::SHL, DL, ExtVT, Amt, DAG.getConstant(5, DL, ExtVT));
31348 Amt = DAG.getBitcast(VT, Amt);
31349
31350 // r = VSELECT(r, rot(r, 4), a);
31351 SDValue M;
31352 M = DAG.getNode(
31353 ISD::OR, DL, VT,
31354 DAG.getNode(ShiftLHS, DL, VT, R, DAG.getConstant(4, DL, VT)),
31355 DAG.getNode(ShiftRHS, DL, VT, R, DAG.getConstant(4, DL, VT)));
31356 R = SignBitSelect(VT, Amt, M, R);
31357
31358 // a += a
31359 Amt = DAG.getNode(ISD::ADD, DL, VT, Amt, Amt);
31360
31361 // r = VSELECT(r, rot(r, 2), a);
31362 M = DAG.getNode(
31363 ISD::OR, DL, VT,
31364 DAG.getNode(ShiftLHS, DL, VT, R, DAG.getConstant(2, DL, VT)),
31365 DAG.getNode(ShiftRHS, DL, VT, R, DAG.getConstant(6, DL, VT)));
31366 R = SignBitSelect(VT, Amt, M, R);
31367
31368 // a += a
31369 Amt = DAG.getNode(ISD::ADD, DL, VT, Amt, Amt);
31370
31371 // return VSELECT(r, rot(r, 1), a);
31372 M = DAG.getNode(
31373 ISD::OR, DL, VT,
31374 DAG.getNode(ShiftLHS, DL, VT, R, DAG.getConstant(1, DL, VT)),
31375 DAG.getNode(ShiftRHS, DL, VT, R, DAG.getConstant(7, DL, VT)));
31376 return SignBitSelect(VT, Amt, M, R);
31377 }
31378
31379 bool IsSplatAmt = DAG.isSplatValue(Amt);
31380 bool ConstantAmt = ISD::isBuildVectorOfConstantSDNodes(Amt.getNode());
31381 bool LegalVarShifts = supportedVectorVarShift(VT, Subtarget, ISD::SHL) &&
31382 supportedVectorVarShift(VT, Subtarget, ISD::SRL);
31383
31384 // Fallback for splats + all supported variable shifts.
31385   // Fallback for non-constant AVX2 vXi16 as well.
31386 if (IsSplatAmt || LegalVarShifts || (Subtarget.hasAVX2() && !ConstantAmt)) {
31387 Amt = DAG.getNode(ISD::AND, DL, VT, Amt, AmtMask);
31388 SDValue AmtR = DAG.getConstant(EltSizeInBits, DL, VT);
31389 AmtR = DAG.getNode(ISD::SUB, DL, VT, AmtR, Amt);
31390 SDValue SHL = DAG.getNode(IsROTL ? ISD::SHL : ISD::SRL, DL, VT, R, Amt);
31391 SDValue SRL = DAG.getNode(IsROTL ? ISD::SRL : ISD::SHL, DL, VT, R, AmtR);
31392 return DAG.getNode(ISD::OR, DL, VT, SHL, SRL);
31393 }
31394
31395 // Everything below assumes ISD::ROTL.
31396 if (!IsROTL) {
31397 Amt = DAG.getNode(ISD::SUB, DL, VT, Z, Amt);
31398 IsROTL = true;
31399 }
31400
31401 // ISD::ROT* uses modulo rotate amounts.
31402 Amt = DAG.getNode(ISD::AND, DL, VT, Amt, AmtMask);
31403
31404   assert(IsROTL && "Only ROTL supported");
31405
31406 // As with shifts, attempt to convert the rotation amount to a multiplication
31407 // factor, fallback to general expansion.
31408 SDValue Scale = convertShiftLeftToScale(Amt, DL, Subtarget, DAG);
31409 if (!Scale)
31410 return SDValue();
31411
31412 // v8i16/v16i16: perform unsigned multiply hi/lo and OR the results.
31413 if (EltSizeInBits == 16) {
31414 SDValue Lo = DAG.getNode(ISD::MUL, DL, VT, R, Scale);
31415 SDValue Hi = DAG.getNode(ISD::MULHU, DL, VT, R, Scale);
31416 return DAG.getNode(ISD::OR, DL, VT, Lo, Hi);
31417 }
31418
31419 // v4i32: make use of the PMULUDQ instruction to multiply 2 lanes of v4i32
31420 // to v2i64 results at a time. The upper 32-bits contain the wrapped bits
31421 // that can then be OR'd with the lower 32-bits.
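  // i.e. rotl(x, y) == lo32(x * 2^y) | hi32(x * 2^y) for 0 <= y < 32.
  // e.g. x = 0x80000001, y = 1: the 64-bit product is 0x100000002, so the
  // result is 0x00000002 | 0x00000001 == 0x00000003 == rotl(x, 1).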
31422   assert(VT == MVT::v4i32 && "Only v4i32 vector rotate expected");
31423 static const int OddMask[] = {1, -1, 3, -1};
31424 SDValue R13 = DAG.getVectorShuffle(VT, DL, R, R, OddMask);
31425 SDValue Scale13 = DAG.getVectorShuffle(VT, DL, Scale, Scale, OddMask);
31426
31427 SDValue Res02 = DAG.getNode(X86ISD::PMULUDQ, DL, MVT::v2i64,
31428 DAG.getBitcast(MVT::v2i64, R),
31429 DAG.getBitcast(MVT::v2i64, Scale));
31430 SDValue Res13 = DAG.getNode(X86ISD::PMULUDQ, DL, MVT::v2i64,
31431 DAG.getBitcast(MVT::v2i64, R13),
31432 DAG.getBitcast(MVT::v2i64, Scale13));
31433 Res02 = DAG.getBitcast(VT, Res02);
31434 Res13 = DAG.getBitcast(VT, Res13);
31435
31436 return DAG.getNode(ISD::OR, DL, VT,
31437 DAG.getVectorShuffle(VT, DL, Res02, Res13, {0, 4, 2, 6}),
31438 DAG.getVectorShuffle(VT, DL, Res02, Res13, {1, 5, 3, 7}));
31439}
31440
31441/// Returns true if the operand type is exactly twice the native width, and
31442/// the corresponding cmpxchg8b or cmpxchg16b instruction is available.
31443/// Used to know whether to use cmpxchg8/16b when expanding atomic operations
31444/// (otherwise we leave them alone to become __sync_fetch_and_... calls).
31445bool X86TargetLowering::needsCmpXchgNb(Type *MemType) const {
31446 unsigned OpWidth = MemType->getPrimitiveSizeInBits();
31447
31448 if (OpWidth == 64)
31449 return Subtarget.canUseCMPXCHG8B() && !Subtarget.is64Bit();
31450 if (OpWidth == 128)
31451 return Subtarget.canUseCMPXCHG16B();
31452
31453 return false;
31454}
31455
31456TargetLoweringBase::AtomicExpansionKind
31457X86TargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI) const {
31458 Type *MemType = SI->getValueOperand()->getType();
31459
31460 bool NoImplicitFloatOps =
31461 SI->getFunction()->hasFnAttribute(Attribute::NoImplicitFloat);
31462 if (MemType->getPrimitiveSizeInBits() == 64 && !Subtarget.is64Bit() &&
31463 !Subtarget.useSoftFloat() && !NoImplicitFloatOps &&
31464 (Subtarget.hasSSE1() || Subtarget.hasX87()))
31465 return AtomicExpansionKind::None;
31466
31467 return needsCmpXchgNb(MemType) ? AtomicExpansionKind::Expand
31468 : AtomicExpansionKind::None;
31469}
31470
31471// Note: this turns large loads into lock cmpxchg8b/16b.
31472// TODO: In 32-bit mode, use MOVLPS when SSE1 is available?
31473TargetLowering::AtomicExpansionKind
31474X86TargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const {
31475 Type *MemType = LI->getType();
31476
31477 // If this is a 64-bit atomic load on a 32-bit target and SSE2 is enabled, we
31478 // can use movq to do the load. If we have X87 we can load into an 80-bit
31479 // X87 register and store it to a stack temporary.
31480 bool NoImplicitFloatOps =
31481 LI->getFunction()->hasFnAttribute(Attribute::NoImplicitFloat);
31482 if (MemType->getPrimitiveSizeInBits() == 64 && !Subtarget.is64Bit() &&
31483 !Subtarget.useSoftFloat() && !NoImplicitFloatOps &&
31484 (Subtarget.hasSSE1() || Subtarget.hasX87()))
31485 return AtomicExpansionKind::None;
31486
31487 return needsCmpXchgNb(MemType) ? AtomicExpansionKind::CmpXChg
31488 : AtomicExpansionKind::None;
31489}
31490
31491enum BitTestKind : unsigned {
31492 UndefBit,
31493 ConstantBit,
31494 NotConstantBit,
31495 ShiftBit,
31496 NotShiftBit
31497};
31498
31499static std::pair<Value *, BitTestKind> FindSingleBitChange(Value *V) {
31500 using namespace llvm::PatternMatch;
31501 BitTestKind BTK = UndefBit;
31502 auto *C = dyn_cast<ConstantInt>(V);
31503 if (C) {
31504 // Check if V is a power of 2 or the NOT of a power of 2.
31505 if (isPowerOf2_64(C->getZExtValue()))
31506 BTK = ConstantBit;
31507 else if (isPowerOf2_64((~C->getValue()).getZExtValue()))
31508 BTK = NotConstantBit;
31509 return {V, BTK};
31510 }
31511
31512 // Check if V is some power of 2 pattern known to be non-zero
31513 auto *I = dyn_cast<Instruction>(V);
31514 if (I) {
31515 bool Not = false;
31516 // Check if we have a NOT
31517 Value *PeekI;
31518 if (match(I, m_c_Xor(m_Value(PeekI), m_AllOnes())) ||
31519 match(I, m_Sub(m_AllOnes(), m_Value(PeekI)))) {
31520 Not = true;
31521 I = dyn_cast<Instruction>(PeekI);
31522
31523 // If I is constant, it will fold and we can evaluate later. If it's an
31524 // argument or something of that nature, we can't analyze.
31525 if (I == nullptr)
31526 return {nullptr, UndefBit};
31527 }
31528 // We can only use 1 << X without more sophisticated analysis. C << X where
31529 // C is a power of 2 but not 1 can result in zero which cannot be translated
31530 // to bittest. Likewise any C >> X (either arith or logical) can be zero.
31531 if (I->getOpcode() == Instruction::Shl) {
31532 // Todo(1): The cmpxchg case is pretty costly so matching `BLSI(X)`, `X &
31533 // -X` and some other provable power of 2 patterns that we can use CTZ on
31534 // may be profitable.
31535 // Todo(2): It may be possible in some cases to prove that Shl(C, X) is
31536 // non-zero even where C != 1. Likewise LShr(C, X) and AShr(C, X) may also
31537 // be provably a non-zero power of 2.
31538 // Todo(3): ROTL and ROTR patterns on a power of 2 C should also be
31539 // transformable to bittest.
31540 auto *ShiftVal = dyn_cast<ConstantInt>(I->getOperand(0));
31541 if (!ShiftVal)
31542 return {nullptr, UndefBit};
31543 if (ShiftVal->equalsInt(1))
31544 BTK = Not ? NotShiftBit : ShiftBit;
31545
31546 if (BTK == UndefBit)
31547 return {nullptr, UndefBit};
31548
31549 Value *BitV = I->getOperand(1);
31550
31551 Value *AndOp;
31552 const APInt *AndC;
31553 if (match(BitV, m_c_And(m_Value(AndOp), m_APInt(AndC)))) {
31554 // Read past a shiftmask instruction to find the count.
31555 if (*AndC == (I->getType()->getPrimitiveSizeInBits() - 1))
31556 BitV = AndOp;
31557 }
31558 return {BitV, BTK};
31559 }
31560 }
31561 return {nullptr, UndefBit};
31562}
31563
31564TargetLowering::AtomicExpansionKind
31565X86TargetLowering::shouldExpandLogicAtomicRMWInIR(AtomicRMWInst *AI) const {
31566 // If the atomicrmw's result isn't actually used, we can just add a "lock"
31567 // prefix to a normal instruction for these operations.
31568 if (AI->use_empty())
31569 return AtomicExpansionKind::None;
31570
31571 // If the atomicrmw's result is used by a single bit AND, we may use
31572 // bts/btr/btc instruction for these operations.
31573 // Note: InstCombinePass can cause a de-optimization here. It replaces the
31574 // SETCC(And(AtomicRMW(P, power_of_2), power_of_2)) with LShr and Xor
31575 // (depending on CC). This pattern can only use bts/btr/btc but we don't
31576 // detect it.
31577 Instruction *I = AI->user_back();
31578 auto BitChange = FindSingleBitChange(AI->getValOperand());
31579 if (BitChange.second == UndefBit || !AI->hasOneUse() ||
31580 I->getOpcode() != Instruction::And ||
31581 AI->getType()->getPrimitiveSizeInBits() == 8 ||
31582 AI->getParent() != I->getParent())
31583 return AtomicExpansionKind::CmpXChg;
31584
31585 unsigned OtherIdx = I->getOperand(0) == AI ? 1 : 0;
31586
31587 // This is a redundant AND; it should get cleaned up elsewhere.
31588 if (AI == I->getOperand(OtherIdx))
31589 return AtomicExpansionKind::CmpXChg;
31590
31591 // The following instruction must be an AND of a single bit.
31592 if (BitChange.second == ConstantBit || BitChange.second == NotConstantBit) {
31593 auto *C1 = cast<ConstantInt>(AI->getValOperand());
31594 auto *C2 = dyn_cast<ConstantInt>(I->getOperand(OtherIdx));
31595 if (!C2 || !isPowerOf2_64(C2->getZExtValue())) {
31596 return AtomicExpansionKind::CmpXChg;
31597 }
31598 if (AI->getOperation() == AtomicRMWInst::And) {
31599 return ~C1->getValue() == C2->getValue()
31600 ? AtomicExpansionKind::BitTestIntrinsic
31601 : AtomicExpansionKind::CmpXChg;
31602 }
31603 return C1 == C2 ? AtomicExpansionKind::BitTestIntrinsic
31604 : AtomicExpansionKind::CmpXChg;
31605 }
31606
31607 assert(BitChange.second == ShiftBit || BitChange.second == NotShiftBit);
31608
31609 auto BitTested = FindSingleBitChange(I->getOperand(OtherIdx));
31610 if (BitTested.second != ShiftBit && BitTested.second != NotShiftBit)
31611 return AtomicExpansionKind::CmpXChg;
31612
31613 assert(BitChange.first != nullptr && BitTested.first != nullptr);
31614
31615 // If shift amounts are not the same we can't use BitTestIntrinsic.
31616 if (BitChange.first != BitTested.first)
31617 return AtomicExpansionKind::CmpXChg;
31618
31619 // An atomic AND must mask all but one bit and test the one bit that is
31620 // unset in the mask.
31621 if (AI->getOperation() == AtomicRMWInst::And)
31622 return (BitChange.second == NotShiftBit && BitTested.second == ShiftBit)
31623 ? AtomicExpansionKind::BitTestIntrinsic
31624 : AtomicExpansionKind::CmpXChg;
31625
31626 // An atomic XOR/OR must set and test the same bit.
31627 return (BitChange.second == ShiftBit && BitTested.second == ShiftBit)
31628 ? AtomicExpansionKind::BitTestIntrinsic
31629 : AtomicExpansionKind::CmpXChg;
31630}
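
For orientation, this is roughly the source-level shape the checks above look for; a hedged sketch, not taken from this file, and whether it really becomes a bit-test intrinsic still depends on every condition checked above (single use, matching shift amounts, non-i8 width, same basic block):

#include <atomic>

// An atomicrmw OR of a single (possibly variable) bit whose only use is an
// AND against that same bit: a candidate for BitTestIntrinsic (lock bts)
// rather than a cmpxchg loop.
bool TestAndSetBit(std::atomic<unsigned> &Flags, unsigned Bit) {
  unsigned Mask = 1u << Bit;            // ShiftBit pattern: 1 << X
  unsigned Old = Flags.fetch_or(Mask);  // AtomicRMWInst::Or
  return (Old & Mask) != 0;             // single-bit AND of the result
}
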
31631
31632void X86TargetLowering::emitBitTestAtomicRMWIntrinsic(AtomicRMWInst *AI) const {
31633 IRBuilder<> Builder(AI);
31634 Intrinsic::ID IID_C = Intrinsic::not_intrinsic;
31635 Intrinsic::ID IID_I = Intrinsic::not_intrinsic;
31636 switch (AI->getOperation()) {
31637 default:
31638 llvm_unreachable("Unknown atomic operation");
31639 case AtomicRMWInst::Or:
31640 IID_C = Intrinsic::x86_atomic_bts;
31641 IID_I = Intrinsic::x86_atomic_bts_rm;
31642 break;
31643 case AtomicRMWInst::Xor:
31644 IID_C = Intrinsic::x86_atomic_btc;
31645 IID_I = Intrinsic::x86_atomic_btc_rm;
31646 break;
31647 case AtomicRMWInst::And:
31648 IID_C = Intrinsic::x86_atomic_btr;
31649 IID_I = Intrinsic::x86_atomic_btr_rm;
31650 break;
31651 }
31652 Instruction *I = AI->user_back();
31653 LLVMContext &Ctx = AI->getContext();
31654 Value *Addr = Builder.CreatePointerCast(AI->getPointerOperand(),
31655 Type::getInt8PtrTy(Ctx));
31656 Function *BitTest = nullptr;
31657 Value *Result = nullptr;
31658 auto BitTested = FindSingleBitChange(AI->getValOperand());
31659 assert(BitTested.first != nullptr);
31660
31661 if (BitTested.second == ConstantBit || BitTested.second == NotConstantBit) {
31662 auto *C = cast<ConstantInt>(I->getOperand(I->getOperand(0) == AI ? 1 : 0));
31663
31664 BitTest = Intrinsic::getDeclaration(AI->getModule(), IID_C, AI->getType());
31665
31666 unsigned Imm = countTrailingZeros(C->getZExtValue());
31667 Result = Builder.CreateCall(BitTest, {Addr, Builder.getInt8(Imm)});
31668 } else {
31669 BitTest = Intrinsic::getDeclaration(AI->getModule(), IID_I, AI->getType());
31670
31671 assert(BitTested.second == ShiftBit || BitTested.second == NotShiftBit);
31672
31673 Value *SI = BitTested.first;
31674 assert(SI != nullptr);
31675
31676 // BT{S|R|C} on a memory operand doesn't modulo the bit position, so we need to
31677 // mask it.
31678 unsigned ShiftBits = SI->getType()->getPrimitiveSizeInBits();
31679 Value *BitPos =
31680 Builder.CreateAnd(SI, Builder.getIntN(ShiftBits, ShiftBits - 1));
31681 // Todo(1): In many cases it may be provable that SI is less than
31682 // ShiftBits in which case this mask is unnecessary
31683 // Todo(2): In the fairly idiomatic case of P[X / sizeof_bits(X)] OP 1
31684 // << (X % sizeof_bits(X)) we can drop the shift mask and AGEN in
31685 // favor of just a raw BT{S|R|C}.
31686
31687 Result = Builder.CreateCall(BitTest, {Addr, BitPos});
31688 Result = Builder.CreateZExtOrTrunc(Result, AI->getType());
31689
31690 // If the result is only used for zero/non-zero status then we don't need to
31691 // shift the value back. Otherwise do so.
31692 for (auto It = I->user_begin(); It != I->user_end(); ++It) {
31693 if (auto *ICmp = dyn_cast<ICmpInst>(*It)) {
31694 if (ICmp->isEquality()) {
31695 auto *C0 = dyn_cast<ConstantInt>(ICmp->getOperand(0));
31696 auto *C1 = dyn_cast<ConstantInt>(ICmp->getOperand(1));
31697 if (C0 || C1) {
31698 assert(C0 == nullptr || C1 == nullptr);
31699 if ((C0 ? C0 : C1)->isZero())
31700 continue;
31701 }
31702 }
31703 }
31704 Result = Builder.CreateShl(Result, BitPos);
31705 break;
31706 }
31707 }
31708
31709 I->replaceAllUsesWith(Result);
31710 I->eraseFromParent();
31711 AI->eraseFromParent();
31712}
31713
31714static bool shouldExpandCmpArithRMWInIR(AtomicRMWInst *AI) {
31715 using namespace llvm::PatternMatch;
31716 if (!AI->hasOneUse())
31717 return false;
31718
31719 Value *Op = AI->getOperand(1);
31720 ICmpInst::Predicate Pred;
31721 Instruction *I = AI->user_back();
31722 AtomicRMWInst::BinOp Opc = AI->getOperation();
31723 if (Opc == AtomicRMWInst::Add) {
31724 if (match(I, m_c_ICmp(Pred, m_Sub(m_ZeroInt(), m_Specific(Op)), m_Value())))
31725 return Pred == CmpInst::ICMP_EQ || Pred == CmpInst::ICMP_NE;
31726 if (match(I, m_OneUse(m_c_Add(m_Specific(Op), m_Value())))) {
31727 if (match(I->user_back(), m_ICmp(Pred, m_Value(), m_ZeroInt())))
31728 return Pred == CmpInst::ICMP_SLT;
31729 if (match(I->user_back(), m_ICmp(Pred, m_Value(), m_AllOnes())))
31730 return Pred == CmpInst::ICMP_SGT;
31731 }
31732 return false;
31733 }
31734 if (Opc == AtomicRMWInst::Sub) {
31735 if (match(I, m_c_ICmp(Pred, m_Specific(Op), m_Value())))
31736 return Pred == CmpInst::ICMP_EQ || Pred == CmpInst::ICMP_NE;
31737 if (match(I, m_OneUse(m_Sub(m_Value(), m_Specific(Op))))) {
31738 if (match(I->user_back(), m_ICmp(Pred, m_Value(), m_ZeroInt())))
31739 return Pred == CmpInst::ICMP_SLT;
31740 if (match(I->user_back(), m_ICmp(Pred, m_Value(), m_AllOnes())))
31741 return Pred == CmpInst::ICMP_SGT;
31742 }
31743 return false;
31744 }
31745 if ((Opc == AtomicRMWInst::Or &&
31746 match(I, m_OneUse(m_c_Or(m_Specific(Op), m_Value())))) ||
31747 (Opc == AtomicRMWInst::And &&
31748 match(I, m_OneUse(m_c_And(m_Specific(Op), m_Value()))))) {
31749 if (match(I->user_back(), m_ICmp(Pred, m_Value(), m_ZeroInt())))
31750 return Pred == CmpInst::ICMP_EQ || Pred == CmpInst::ICMP_NE ||
31751 Pred == CmpInst::ICMP_SLT;
31752 if (match(I->user_back(), m_ICmp(Pred, m_Value(), m_AllOnes())))
31753 return Pred == CmpInst::ICMP_SGT;
31754 return false;
31755 }
31756 if (Opc == AtomicRMWInst::Xor) {
31757 if (match(I, m_c_ICmp(Pred, m_Specific(Op), m_Value())))
31758 return Pred == CmpInst::ICMP_EQ || Pred == CmpInst::ICMP_NE;
31759 if (match(I, m_OneUse(m_c_Xor(m_Specific(Op), m_Value())))) {
31760 if (match(I->user_back(), m_ICmp(Pred, m_Value(), m_ZeroInt())))
31761 return Pred == CmpInst::ICMP_SLT;
31762 if (match(I->user_back(), m_ICmp(Pred, m_Value(), m_AllOnes())))
31763 return Pred == CmpInst::ICMP_SGT;
31764 }
31765 return false;
31766 }
31767
31768 return false;
31769}
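
One concrete shape accepted by the Sub case above, shown as an illustrative sketch: comparing the value returned by fetch_sub against the amount subtracted asks whether the new value is zero, which the ZF of a `lock sub` already reports, so no cmpxchg loop or separate compare is needed.

#include <atomic>

// Matches the AtomicRMWInst::Sub + ICMP_EQ pattern: old == Dec holds
// exactly when the decremented value is zero.
bool ReleaseRef(std::atomic<int> &RefCount, int Dec) {
  return RefCount.fetch_sub(Dec, std::memory_order_acq_rel) == Dec;
}
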
31770
31771void X86TargetLowering::emitCmpArithAtomicRMWIntrinsic(
31772 AtomicRMWInst *AI) const {
31773 IRBuilder<> Builder(AI);
31774 Instruction *TempI = nullptr;
31775 LLVMContext &Ctx = AI->getContext();
31776 ICmpInst *ICI = dyn_cast<ICmpInst>(AI->user_back());
31777 if (!ICI) {
31778 TempI = AI->user_back();
31779 assert(TempI->hasOneUse() && "Must have one use");
31780 ICI = cast<ICmpInst>(TempI->user_back());
31781 }
31782 X86::CondCode CC = X86::COND_INVALID;
31783 ICmpInst::Predicate Pred = ICI->getPredicate();
31784 switch (Pred) {
31785 default:
31786 llvm_unreachable("Not supported Pred");
31787 case CmpInst::ICMP_EQ:
31788 CC = X86::COND_E;
31789 break;
31790 case CmpInst::ICMP_NE:
31791 CC = X86::COND_NE;
31792 break;
31793 case CmpInst::ICMP_SLT:
31794 CC = X86::COND_S;
31795 break;
31796 case CmpInst::ICMP_SGT:
31797 CC = X86::COND_NS;
31798 break;
31799 }
31800 Intrinsic::ID IID = Intrinsic::not_intrinsic;
31801 switch (AI->getOperation()) {
31802 default:
31803 llvm_unreachable("Unknown atomic operation");
31804 case AtomicRMWInst::Add:
31805 IID = Intrinsic::x86_atomic_add_cc;
31806 break;
31807 case AtomicRMWInst::Sub:
31808 IID = Intrinsic::x86_atomic_sub_cc;
31809 break;
31810 case AtomicRMWInst::Or:
31811 IID = Intrinsic::x86_atomic_or_cc;
31812 break;
31813 case AtomicRMWInst::And:
31814 IID = Intrinsic::x86_atomic_and_cc;
31815 break;
31816 case AtomicRMWInst::Xor:
31817 IID = Intrinsic::x86_atomic_xor_cc;
31818 break;
31819 }
31820 Function *CmpArith =
31821 Intrinsic::getDeclaration(AI->getModule(), IID, AI->getType());
31822 Value *Addr = Builder.CreatePointerCast(AI->getPointerOperand(),
31823 Type::getInt8PtrTy(Ctx));
31824 Value *Call = Builder.CreateCall(
31825 CmpArith, {Addr, AI->getValOperand(), Builder.getInt32((unsigned)CC)});
31826 Value *Result = Builder.CreateTrunc(Call, Type::getInt1Ty(Ctx));
31827 ICI->replaceAllUsesWith(Result);
31828 ICI->eraseFromParent();
31829 if (TempI)
31830 TempI->eraseFromParent();
31831 AI->eraseFromParent();
31832}
31833
31834TargetLowering::AtomicExpansionKind
31835X86TargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const {
31836 unsigned NativeWidth = Subtarget.is64Bit() ? 64 : 32;
31837 Type *MemType = AI->getType();
31838
31839 // If the operand is too big, we must see if cmpxchg8/16b is available
31840 // and default to library calls otherwise.
31841 if (MemType->getPrimitiveSizeInBits() > NativeWidth) {
31842 return needsCmpXchgNb(MemType) ? AtomicExpansionKind::CmpXChg
31843 : AtomicExpansionKind::None;
31844 }
31845
31846 AtomicRMWInst::BinOp Op = AI->getOperation();
31847 switch (Op) {
31848 case AtomicRMWInst::Xchg:
31849 return AtomicExpansionKind::None;
31850 case AtomicRMWInst::Add:
31851 case AtomicRMWInst::Sub:
31852 if (shouldExpandCmpArithRMWInIR(AI))
31853 return AtomicExpansionKind::CmpArithIntrinsic;
31854 // It's better to use xadd, xsub or xchg for these in other cases.
31855 return AtomicExpansionKind::None;
31856 case AtomicRMWInst::Or:
31857 case AtomicRMWInst::And:
31858 case AtomicRMWInst::Xor:
31859 if (shouldExpandCmpArithRMWInIR(AI))
31860 return AtomicExpansionKind::CmpArithIntrinsic;
31861 return shouldExpandLogicAtomicRMWInIR(AI);
31862 case AtomicRMWInst::Nand:
31863 case AtomicRMWInst::Max:
31864 case AtomicRMWInst::Min:
31865 case AtomicRMWInst::UMax:
31866 case AtomicRMWInst::UMin:
31867 case AtomicRMWInst::FAdd:
31868 case AtomicRMWInst::FSub:
31869 case AtomicRMWInst::FMax:
31870 case AtomicRMWInst::FMin:
31871 case AtomicRMWInst::UIncWrap:
31872 case AtomicRMWInst::UDecWrap:
31873 default:
31874 // These always require a non-trivial set of data operations on x86. We must
31875 // use a cmpxchg loop.
31876 return AtomicExpansionKind::CmpXChg;
31877 }
31878}
31879
31880LoadInst *
31881X86TargetLowering::lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const {
31882 unsigned NativeWidth = Subtarget.is64Bit() ? 64 : 32;
31883 Type *MemType = AI->getType();
31884 // Accesses larger than the native width are turned into cmpxchg/libcalls, so
31885 // there is no benefit in turning such RMWs into loads, and it is actually
31886 // harmful as it introduces an mfence.
31887 if (MemType->getPrimitiveSizeInBits() > NativeWidth)
31888 return nullptr;
31889
31890 // If this is a canonical idempotent atomicrmw w/no uses, we have a better
31891 // lowering available in lowerAtomicArith.
31892 // TODO: push more cases through this path.
31893 if (auto *C = dyn_cast<ConstantInt>(AI->getValOperand()))
31894 if (AI->getOperation() == AtomicRMWInst::Or && C->isZero() &&
31895 AI->use_empty())
31896 return nullptr;
31897
31898 IRBuilder<> Builder(AI);
31899 Module *M = Builder.GetInsertBlock()->getParent()->getParent();
31900 auto SSID = AI->getSyncScopeID();
31901 // We must restrict the ordering to avoid generating loads with Release or
31902 // ReleaseAcquire orderings.
31903 auto Order = AtomicCmpXchgInst::getStrongestFailureOrdering(AI->getOrdering());
31904
31905 // Before the load we need a fence. Here is an example lifted from
31906 // http://www.hpl.hp.com/techreports/2012/HPL-2012-68.pdf showing why a fence
31907 // is required:
31908 // Thread 0:
31909 // x.store(1, relaxed);
31910 // r1 = y.fetch_add(0, release);
31911 // Thread 1:
31912 // y.fetch_add(42, acquire);
31913 // r2 = x.load(relaxed);
31914 // r1 = r2 = 0 is impossible, but becomes possible if the idempotent rmw is
31915 // lowered to just a load without a fence. A mfence flushes the store buffer,
31916 // making the optimization clearly correct.
31917 // FIXME: it is required if isReleaseOrStronger(Order), but it is not clear
31918 // otherwise; we might be able to be more aggressive on relaxed idempotent
31919 // rmw. In practice, they do not look useful, so we don't try to be
31920 // especially clever.
31921 if (SSID == SyncScope::SingleThread)
31922 // FIXME: we could just insert an ISD::MEMBARRIER here, except we are at
31923 // the IR level, so we must wrap it in an intrinsic.
31924 return nullptr;
31925
31926 if (!Subtarget.hasMFence())
31927 // FIXME: it might make sense to use a locked operation here but on a
31928 // different cache-line to prevent cache-line bouncing. In practice it
31929 // is probably a small win, and x86 processors without mfence are rare
31930 // enough that we do not bother.
31931 return nullptr;
31932
31933 Function *MFence =
31934 llvm::Intrinsic::getDeclaration(M, Intrinsic::x86_sse2_mfence);
31935 Builder.CreateCall(MFence, {});
31936
31937 // Finally we can emit the atomic load.
31938 LoadInst *Loaded = Builder.CreateAlignedLoad(
31939 AI->getType(), AI->getPointerOperand(), AI->getAlign());
31940 Loaded->setAtomic(Order, SSID);
31941 AI->replaceAllUsesWith(Loaded);
31942 AI->eraseFromParent();
31943 return Loaded;
31944}
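
As a hedged illustration of the inputs this targets (not from this file): an idempotent RMW whose value is used, such as a fetch_add of zero, may be rewritten by the code above into an mfence followed by a plain atomic load instead of a lock xadd or a cmpxchg loop.

#include <atomic>

// Idempotent atomicrmw whose result is used; a candidate for the
// fence-plus-load lowering built above on subtargets with MFENCE.
int ReadWithFence(std::atomic<int> &X) {
  return X.fetch_add(0, std::memory_order_seq_cst);
}
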
31945
31946bool X86TargetLowering::lowerAtomicStoreAsStoreSDNode(const StoreInst &SI) const {
31947 if (!SI.isUnordered())
31948 return false;
31949 return ExperimentalUnorderedISEL;
31950}
31951bool X86TargetLowering::lowerAtomicLoadAsLoadSDNode(const LoadInst &LI) const {
31952 if (!LI.isUnordered())
31953 return false;
31954 return ExperimentalUnorderedISEL;
31955}
31956
31957
31958/// Emit a locked operation on a stack location which does not change any
31959/// memory location, but does involve a lock prefix. Location is chosen to be
31960/// a) very likely accessed only by a single thread to minimize cache traffic,
31961/// and b) definitely dereferenceable. Returns the new Chain result.
31962static SDValue emitLockedStackOp(SelectionDAG &DAG,
31963 const X86Subtarget &Subtarget, SDValue Chain,
31964 const SDLoc &DL) {
31965 // Implementation notes:
31966 // 1) LOCK prefix creates a full read/write reordering barrier for memory
31967 // operations issued by the current processor. As such, the location
31968 // referenced is not relevant for the ordering properties of the instruction.
31969 // See: Intel® 64 and IA-32 Architectures Software Developer’s Manual,
31970 // 8.2.3.9 Loads and Stores Are Not Reordered with Locked Instructions
31971 // 2) Using an immediate operand appears to be the best encoding choice
31972 // here since it doesn't require an extra register.
31973 // 3) OR appears to be very slightly faster than ADD. (Though, the difference
31974 // is small enough it might just be measurement noise.)
31975 // 4) When choosing offsets, there are several contributing factors:
31976 // a) If there's no redzone, we default to TOS. (We could allocate a cache
31977 // line aligned stack object to improve this case.)
31978 // b) To minimize our chances of introducing a false dependence, we prefer
31979 // to offset the stack usage from TOS slightly.
31980 // c) To minimize concerns about cross thread stack usage - in particular,
31981 // the idiomatic MyThreadPool.run([&StackVars]() {...}) pattern which
31982 // captures state in the TOS frame and accesses it from many threads -
31983 // we want to use an offset such that the offset is in a distinct cache
31984 // line from the TOS frame.
31985 //
31986 // For a general discussion of the tradeoffs and benchmark results, see:
31987 // https://shipilev.net/blog/2014/on-the-fence-with-dependencies/
31988
31989 auto &MF = DAG.getMachineFunction();
31990 auto &TFL = *Subtarget.getFrameLowering();
31991 const unsigned SPOffset = TFL.has128ByteRedZone(MF) ? -64 : 0;
31992
31993 if (Subtarget.is64Bit()) {
31994 SDValue Zero = DAG.getTargetConstant(0, DL, MVT::i32);
31995 SDValue Ops[] = {
31996 DAG.getRegister(X86::RSP, MVT::i64), // Base
31997 DAG.getTargetConstant(1, DL, MVT::i8), // Scale
31998 DAG.getRegister(0, MVT::i64), // Index
31999 DAG.getTargetConstant(SPOffset, DL, MVT::i32), // Disp
32000 DAG.getRegister(0, MVT::i16), // Segment.
32001 Zero,
32002 Chain};
32003 SDNode *Res = DAG.getMachineNode(X86::OR32mi8Locked, DL, MVT::i32,
32004 MVT::Other, Ops);
32005 return SDValue(Res, 1);
32006 }
32007
32008 SDValue Zero = DAG.getTargetConstant(0, DL, MVT::i32);
32009 SDValue Ops[] = {
32010 DAG.getRegister(X86::ESP, MVT::i32), // Base
32011 DAG.getTargetConstant(1, DL, MVT::i8), // Scale
32012 DAG.getRegister(0, MVT::i32), // Index
32013 DAG.getTargetConstant(SPOffset, DL, MVT::i32), // Disp
32014 DAG.getRegister(0, MVT::i16), // Segment.
32015 Zero,
32016 Chain
32017 };
32018 SDNode *Res = DAG.getMachineNode(X86::OR32mi8Locked, DL, MVT::i32,
32019 MVT::Other, Ops);
32020 return SDValue(Res, 1);
32021}
32022
32023static SDValue LowerATOMIC_FENCE(SDValue Op, const X86Subtarget &Subtarget,
32024 SelectionDAG &DAG) {
32025 SDLoc dl(Op);
32026 AtomicOrdering FenceOrdering =
32027 static_cast<AtomicOrdering>(Op.getConstantOperandVal(1));
32028 SyncScope::ID FenceSSID =
32029 static_cast<SyncScope::ID>(Op.getConstantOperandVal(2));
32030
32031 // The only fence that needs an instruction is a sequentially-consistent
32032 // cross-thread fence.
32033 if (FenceOrdering == AtomicOrdering::SequentiallyConsistent &&
32034 FenceSSID == SyncScope::System) {
32035 if (Subtarget.hasMFence())
32036 return DAG.getNode(X86ISD::MFENCE, dl, MVT::Other, Op.getOperand(0));
32037
32038 SDValue Chain = Op.getOperand(0);
32039 return emitLockedStackOp(DAG, Subtarget, Chain, dl);
32040 }
32041
32042 // MEMBARRIER is a compiler barrier; it codegens to a no-op.
32043 return DAG.getNode(ISD::MEMBARRIER, dl, MVT::Other, Op.getOperand(0));
32044}
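
In source terms, a hedged illustration: only a sequentially consistent, cross-thread fence produces an instruction here, either MFENCE or the locked stack OR from emitLockedStackOp; weaker fences become a compiler-only MEMBARRIER.

#include <atomic>

// The one fence shape that needs real code on x86.
void FullFence() {
  std::atomic_thread_fence(std::memory_order_seq_cst);
}
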
32045
32046static SDValue LowerCMP_SWAP(SDValue Op, const X86Subtarget &Subtarget,
32047 SelectionDAG &DAG) {
32048 MVT T = Op.getSimpleValueType();
32049 SDLoc DL(Op);
32050 unsigned Reg = 0;
32051 unsigned size = 0;
32052 switch(T.SimpleTy) {
32053 default: llvm_unreachable("Invalid value type!");
32054 case MVT::i8: Reg = X86::AL; size = 1; break;
32055 case MVT::i16: Reg = X86::AX; size = 2; break;
32056 case MVT::i32: Reg = X86::EAX; size = 4; break;
32057 case MVT::i64:
32058 assert(Subtarget.is64Bit() && "Node not type legal!");
32059 Reg = X86::RAX; size = 8;
32060 break;
32061 }
32062 SDValue cpIn = DAG.getCopyToReg(Op.getOperand(0), DL, Reg,
32063 Op.getOperand(2), SDValue());
32064 SDValue Ops[] = { cpIn.getValue(0),
32065 Op.getOperand(1),
32066 Op.getOperand(3),
32067 DAG.getTargetConstant(size, DL, MVT::i8),
32068 cpIn.getValue(1) };
32069 SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
32070 MachineMemOperand *MMO = cast<AtomicSDNode>(Op)->getMemOperand();
32071 SDValue Result = DAG.getMemIntrinsicNode(X86ISD::LCMPXCHG_DAG, DL, Tys,
32072 Ops, T, MMO);
32073
32074 SDValue cpOut =
32075 DAG.getCopyFromReg(Result.getValue(0), DL, Reg, T, Result.getValue(1));
32076 SDValue EFLAGS = DAG.getCopyFromReg(cpOut.getValue(1), DL, X86::EFLAGS,
32077 MVT::i32, cpOut.getValue(2));
32078 SDValue Success = getSETCC(X86::COND_E, EFLAGS, DL, DAG);
32079
32080 return DAG.getNode(ISD::MERGE_VALUES, DL, Op->getVTList(),
32081 cpOut, Success, EFLAGS.getValue(1));
32082}
32083
32084// Create MOVMSKB, taking into account whether we need to split for AVX1.
32085static SDValue getPMOVMSKB(const SDLoc &DL, SDValue V, SelectionDAG &DAG,
32086 const X86Subtarget &Subtarget) {
32087 MVT InVT = V.getSimpleValueType();
32088
32089 if (InVT == MVT::v64i8) {
32090 SDValue Lo, Hi;
32091 std::tie(Lo, Hi) = DAG.SplitVector(V, DL);
32092 Lo = getPMOVMSKB(DL, Lo, DAG, Subtarget);
32093 Hi = getPMOVMSKB(DL, Hi, DAG, Subtarget);
32094 Lo = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, Lo);
32095 Hi = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i64, Hi);
32096 Hi = DAG.getNode(ISD::SHL, DL, MVT::i64, Hi,
32097 DAG.getConstant(32, DL, MVT::i8));
32098 return DAG.getNode(ISD::OR, DL, MVT::i64, Lo, Hi);
32099 }
32100 if (InVT == MVT::v32i8 && !Subtarget.hasInt256()) {
32101 SDValue Lo, Hi;
32102 std::tie(Lo, Hi) = DAG.SplitVector(V, DL);
32103 Lo = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Lo);
32104 Hi = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Hi);
32105 Hi = DAG.getNode(ISD::SHL, DL, MVT::i32, Hi,
32106 DAG.getConstant(16, DL, MVT::i8));
32107 return DAG.getNode(ISD::OR, DL, MVT::i32, Lo, Hi);
32108 }
32109
32110 return DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, V);
32111}
32112
32113static SDValue LowerBITCAST(SDValue Op, const X86Subtarget &Subtarget,
32114 SelectionDAG &DAG) {
32115 SDValue Src = Op.getOperand(0);
32116 MVT SrcVT = Src.getSimpleValueType();
32117 MVT DstVT = Op.getSimpleValueType();
32118
32119 // Legalize (v64i1 (bitcast i64 (X))) by splitting the i64, bitcasting each
32120 // half to v32i1 and concatenating the result.
32121 if (SrcVT == MVT::i64 && DstVT == MVT::v64i1) {
32122 assert(!Subtarget.is64Bit() && "Expected 32-bit mode");
32123 assert(Subtarget.hasBWI() && "Expected BWI target");
32124 SDLoc dl(Op);
32125 SDValue Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Src,
32126 DAG.getIntPtrConstant(0, dl));
32127 Lo = DAG.getBitcast(MVT::v32i1, Lo);
32128 SDValue Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Src,
32129 DAG.getIntPtrConstant(1, dl));
32130 Hi = DAG.getBitcast(MVT::v32i1, Hi);
32131 return DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v64i1, Lo, Hi);
32132 }
32133
32134 // Use MOVMSK for vector to scalar conversion to prevent scalarization.
32135 if ((SrcVT == MVT::v16i1 || SrcVT == MVT::v32i1) && DstVT.isScalarInteger()) {
32136 assert(!Subtarget.hasAVX512() && "Should use K-registers with AVX512");
32137 MVT SExtVT = SrcVT == MVT::v16i1 ? MVT::v16i8 : MVT::v32i8;
32138 SDLoc DL(Op);
32139 SDValue V = DAG.getSExtOrTrunc(Src, DL, SExtVT);
32140 V = getPMOVMSKB(DL, V, DAG, Subtarget);
32141 return DAG.getZExtOrTrunc(V, DL, DstVT);
32142 }
32143
32144 assert((SrcVT == MVT::v2i32 || SrcVT == MVT::v4i16 || SrcVT == MVT::v8i8 ||
32145 SrcVT == MVT::i64) && "Unexpected VT!");
32146
32147 assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
32148 if (!(DstVT == MVT::f64 && SrcVT == MVT::i64) &&
32149 !(DstVT == MVT::x86mmx && SrcVT.isVector()))
32150 // This conversion needs to be expanded.
32151 return SDValue();
32152
32153 SDLoc dl(Op);
32154 if (SrcVT.isVector()) {
32155 // Widen the input vector in the case of MVT::v2i32.
32156 // Example: from MVT::v2i32 to MVT::v4i32.
32157 MVT NewVT = MVT::getVectorVT(SrcVT.getVectorElementType(),
32158 SrcVT.getVectorNumElements() * 2);
32159 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, NewVT, Src,
32160 DAG.getUNDEF(SrcVT));
32161 } else {
32162 assert(SrcVT == MVT::i64 && !Subtarget.is64Bit() &&
32163 "Unexpected source type in LowerBITCAST");
32164 Src = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Src);
32165 }
32166
32167 MVT V2X64VT = DstVT == MVT::f64 ? MVT::v2f64 : MVT::v2i64;
32168 Src = DAG.getNode(ISD::BITCAST, dl, V2X64VT, Src);
32169
32170 if (DstVT == MVT::x86mmx)
32171 return DAG.getNode(X86ISD::MOVDQ2Q, dl, DstVT, Src);
32172
32173 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, DstVT, Src,
32174 DAG.getIntPtrConstant(0, dl));
32175}
32176
32177/// Compute the horizontal sum of bytes in V for the elements of VT.
32178///
32179/// Requires V to be a byte vector and VT to be an integer vector type with
32180/// wider elements than V's type. The width of the elements of VT determines
32181/// how many bytes of V are summed horizontally to produce each element of the
32182/// result.
32183static SDValue LowerHorizontalByteSum(SDValue V, MVT VT,
32184 const X86Subtarget &Subtarget,
32185 SelectionDAG &DAG) {
32186 SDLoc DL(V);
32187 MVT ByteVecVT = V.getSimpleValueType();
32188 MVT EltVT = VT.getVectorElementType();
32189 assert(ByteVecVT.getVectorElementType() == MVT::i8 &&
32190 "Expected value to have byte element type.");
32191 assert(EltVT != MVT::i8 &&
32192 "Horizontal byte sum only makes sense for wider elements!");
32193 unsigned VecSize = VT.getSizeInBits();
32194 assert(ByteVecVT.getSizeInBits() == VecSize && "Cannot change vector size!");
32195
32196 // The PSADBW instruction horizontally adds all bytes and leaves the result in
32197 // i64 chunks, thus directly computing the pop count for v2i64 and v4i64.
32198 if (EltVT == MVT::i64) {
32199 SDValue Zeros = DAG.getConstant(0, DL, ByteVecVT);
32200 MVT SadVecVT = MVT::getVectorVT(MVT::i64, VecSize / 64);
32201 V = DAG.getNode(X86ISD::PSADBW, DL, SadVecVT, V, Zeros);
32202 return DAG.getBitcast(VT, V);
32203 }
32204
32205 if (EltVT == MVT::i32) {
32206 // We unpack the low half and high half into i32s interleaved with zeros so
32207 // that we can use PSADBW to horizontally sum them. The most useful part of
32208 // this is that it lines up the results of two PSADBW instructions to be
32209 // two v2i64 vectors which concatenated are the 4 population counts. We can
32210 // then use PACKUSWB to shrink and concatenate them into a v4i32 again.
32211 SDValue Zeros = DAG.getConstant(0, DL, VT);
32212 SDValue V32 = DAG.getBitcast(VT, V);
32213 SDValue Low = getUnpackl(DAG, DL, VT, V32, Zeros);
32214 SDValue High = getUnpackh(DAG, DL, VT, V32, Zeros);
32215
32216 // Do the horizontal sums into two v2i64s.
32217 Zeros = DAG.getConstant(0, DL, ByteVecVT);
32218 MVT SadVecVT = MVT::getVectorVT(MVT::i64, VecSize / 64);
32219 Low = DAG.getNode(X86ISD::PSADBW, DL, SadVecVT,
32220 DAG.getBitcast(ByteVecVT, Low), Zeros);
32221 High = DAG.getNode(X86ISD::PSADBW, DL, SadVecVT,
32222 DAG.getBitcast(ByteVecVT, High), Zeros);
32223
32224 // Merge them together.
32225 MVT ShortVecVT = MVT::getVectorVT(MVT::i16, VecSize / 16);
32226 V = DAG.getNode(X86ISD::PACKUS, DL, ByteVecVT,
32227 DAG.getBitcast(ShortVecVT, Low),
32228 DAG.getBitcast(ShortVecVT, High));
32229
32230 return DAG.getBitcast(VT, V);
32231 }
32232
32233 // The only element type left is i16.
32234 assert(EltVT == MVT::i16 && "Unknown how to handle type");
32235
32236 // To obtain pop count for each i16 element starting from the pop count for
32237 // i8 elements, shift the i16s left by 8, sum as i8s, and then shift as i16s
32238 // right by 8. It is important to shift as i16s as i8 vector shift isn't
32239 // directly supported.
32240 SDValue ShifterV = DAG.getConstant(8, DL, VT);
32241 SDValue Shl = DAG.getNode(ISD::SHL, DL, VT, DAG.getBitcast(VT, V), ShifterV);
32242 V = DAG.getNode(ISD::ADD, DL, ByteVecVT, DAG.getBitcast(ByteVecVT, Shl),
32243 DAG.getBitcast(ByteVecVT, V));
32244 return DAG.getNode(ISD::SRL, DL, VT, DAG.getBitcast(VT, V), ShifterV);
32245}
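
The i16 tail above is the least obvious step. A scalar model of one 16-bit lane, assuming the lane already holds two independent per-byte popcounts (names are illustrative):

#include <cstdint>

// Model of the SHL-by-8 / bytewise-ADD / SRL-by-8 sequence: the high byte
// of the bytewise sum ends up holding loCnt + hiCnt, and the final shift
// moves it down as the per-i16 popcount.
static uint16_t ByteSumToI16(uint16_t V) {           // V = (hiCnt << 8) | loCnt
  uint16_t Shl = uint16_t(V << 8);                    // ISD::SHL within the lane
  uint8_t LoB = uint8_t(Shl) + uint8_t(V);            // bytewise ADD, low byte
  uint8_t HiB = uint8_t(Shl >> 8) + uint8_t(V >> 8);  // bytewise ADD, high byte
  return uint16_t(((unsigned(HiB) << 8) | LoB) >> 8); // ISD::SRL by 8
}
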
32246
32247static SDValue LowerVectorCTPOPInRegLUT(SDValue Op, const SDLoc &DL,
32248 const X86Subtarget &Subtarget,
32249 SelectionDAG &DAG) {
32250 MVT VT = Op.getSimpleValueType();
32251 MVT EltVT = VT.getVectorElementType();
32252 int NumElts = VT.getVectorNumElements();
32253 (void)EltVT;
32254 assert(EltVT == MVT::i8 && "Only vXi8 vector CTPOP lowering supported.");
32255
32256 // Implement a lookup table in register by using an algorithm based on:
32257 // http://wm.ite.pl/articles/sse-popcount.html
32258 //
32259 // The general idea is that every lower byte nibble in the input vector is an
32260 // index into an in-register pre-computed pop count table. We then split up the
32261 // input vector into two new ones: (1) a vector with only the shifted-right
32262 // higher nibbles for each byte and (2) a vector with the lower nibbles (and
32263 // masked out higher ones) for each byte. PSHUFB is used separately with both
32264 // to index the in-register table. Next, both are added and the result is an
32265 // i8 vector where each element contains the pop count for its input byte.
32266 const int LUT[16] = {/* 0 */ 0, /* 1 */ 1, /* 2 */ 1, /* 3 */ 2,
32267 /* 4 */ 1, /* 5 */ 2, /* 6 */ 2, /* 7 */ 3,
32268 /* 8 */ 1, /* 9 */ 2, /* a */ 2, /* b */ 3,
32269 /* c */ 2, /* d */ 3, /* e */ 3, /* f */ 4};
32270
32271 SmallVector<SDValue, 64> LUTVec;
32272 for (int i = 0; i < NumElts; ++i)
32273 LUTVec.push_back(DAG.getConstant(LUT[i % 16], DL, MVT::i8));
32274 SDValue InRegLUT = DAG.getBuildVector(VT, DL, LUTVec);
32275 SDValue M0F = DAG.getConstant(0x0F, DL, VT);
32276
32277 // High nibbles
32278 SDValue FourV = DAG.getConstant(4, DL, VT);
32279 SDValue HiNibbles = DAG.getNode(ISD::SRL, DL, VT, Op, FourV);
32280
32281 // Low nibbles
32282 SDValue LoNibbles = DAG.getNode(ISD::AND, DL, VT, Op, M0F);
32283
32284 // The input vector is used as the shuffle mask that indexes elements into the
32285 // LUT. After counting low and high nibbles, add the results to obtain the
32286 // final pop count per i8 element.
32287 SDValue HiPopCnt = DAG.getNode(X86ISD::PSHUFB, DL, VT, InRegLUT, HiNibbles);
32288 SDValue LoPopCnt = DAG.getNode(X86ISD::PSHUFB, DL, VT, InRegLUT, LoNibbles);
32289 return DAG.getNode(ISD::ADD, DL, VT, HiPopCnt, LoPopCnt);
32290}
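
Per byte, the algorithm above reduces to two nibble lookups in a 16-entry table. An illustrative scalar sketch of one byte:

#include <cstdint>

// PSHUFB plays the role of the table lookup: the low nibble and the
// shifted-down high nibble are each looked up and the counts are added.
static uint8_t PopCountByteViaLUT(uint8_t B) {
  static const uint8_t LUT[16] = {0, 1, 1, 2, 1, 2, 2, 3,
                                  1, 2, 2, 3, 2, 3, 3, 4};
  return LUT[B & 0x0F] + LUT[B >> 4];
}
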
32291
32292// Please ensure that any codegen change from LowerVectorCTPOP is reflected in
32293// updated cost models in X86TTIImpl::getIntrinsicInstrCost.
32294static SDValue LowerVectorCTPOP(SDValue Op, const X86Subtarget &Subtarget,
32295 SelectionDAG &DAG) {
32296 MVT VT = Op.getSimpleValueType();
32297 assert((VT.is512BitVector() || VT.is256BitVector() || VT.is128BitVector()) &&
32298 "Unknown CTPOP type to handle");
32299 SDLoc DL(Op.getNode());
32300 SDValue Op0 = Op.getOperand(0);
32301
32302 // TRUNC(CTPOP(ZEXT(X))) to make use of vXi32/vXi64 VPOPCNT instructions.
32303 if (Subtarget.hasVPOPCNTDQ()) {
32304 unsigned NumElems = VT.getVectorNumElements();
32305 assert((VT.getVectorElementType() == MVT::i8 ||
32306 VT.getVectorElementType() == MVT::i16) && "Unexpected type");
32307 if (NumElems < 16 || (NumElems == 16 && Subtarget.canExtendTo512DQ())) {
32308 MVT NewVT = MVT::getVectorVT(MVT::i32, NumElems);
32309 Op = DAG.getNode(ISD::ZERO_EXTEND, DL, NewVT, Op0);
32310 Op = DAG.getNode(ISD::CTPOP, DL, NewVT, Op);
32311 return DAG.getNode(ISD::TRUNCATE, DL, VT, Op);
32312 }
32313 }
32314
32315 // Decompose 256-bit ops into smaller 128-bit ops.
32316 if (VT.is256BitVector() && !Subtarget.hasInt256())
32317 return splitVectorIntUnary(Op, DAG);
32318
32319 // Decompose 512-bit ops into smaller 256-bit ops.
32320 if (VT.is512BitVector() && !Subtarget.hasBWI())
32321 return splitVectorIntUnary(Op, DAG);
32322
32323 // For element types greater than i8, do vXi8 pop counts and a bytesum.
32324 if (VT.getScalarType() != MVT::i8) {
32325 MVT ByteVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8);
32326 SDValue ByteOp = DAG.getBitcast(ByteVT, Op0);
32327 SDValue PopCnt8 = DAG.getNode(ISD::CTPOP, DL, ByteVT, ByteOp);
32328 return LowerHorizontalByteSum(PopCnt8, VT, Subtarget, DAG);
32329 }
32330
32331 // We can't use the fast LUT approach, so fall back on LegalizeDAG.
32332 if (!Subtarget.hasSSSE3())
32333 return SDValue();
32334
32335 return LowerVectorCTPOPInRegLUT(Op0, DL, Subtarget, DAG);
32336}
32337
32338static SDValue LowerCTPOP(SDValue Op, const X86Subtarget &Subtarget,
32339 SelectionDAG &DAG) {
32340 assert(Op.getSimpleValueType().isVector() &&
32341 "We only do custom lowering for vector population count.");
32342 return LowerVectorCTPOP(Op, Subtarget, DAG);
32343}
32344
32345static SDValue LowerBITREVERSE_XOP(SDValue Op, SelectionDAG &DAG) {
32346 MVT VT = Op.getSimpleValueType();
32347 SDValue In = Op.getOperand(0);
32348 SDLoc DL(Op);
32349
32350 // For scalars, it's still beneficial to transfer to/from the SIMD unit to
32351 // perform the BITREVERSE.
32352 if (!VT.isVector()) {
32353 MVT VecVT = MVT::getVectorVT(VT, 128 / VT.getSizeInBits());
32354 SDValue Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, In);
32355 Res = DAG.getNode(ISD::BITREVERSE, DL, VecVT, Res);
32356 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Res,
32357 DAG.getIntPtrConstant(0, DL));
32358 }
32359
32360 int NumElts = VT.getVectorNumElements();
32361 int ScalarSizeInBytes = VT.getScalarSizeInBits() / 8;
32362
32363 // Decompose 256-bit ops into smaller 128-bit ops.
32364 if (VT.is256BitVector())
32365 return splitVectorIntUnary(Op, DAG);
32366
32367 assert(VT.is128BitVector() &&
32368 "Only 128-bit vector bitreverse lowering supported.");
32369
32370 // VPPERM reverses the bits of a byte with the permute Op (2 << 5), and we
32371 // perform the BSWAP in the shuffle.
32372 // It's best to shuffle using the second operand, as this will implicitly allow
32373 // memory folding for multiple vectors.
32374 SmallVector<SDValue, 16> MaskElts;
32375 for (int i = 0; i != NumElts; ++i) {
32376 for (int j = ScalarSizeInBytes - 1; j >= 0; --j) {
32377 int SourceByte = 16 + (i * ScalarSizeInBytes) + j;
32378 int PermuteByte = SourceByte | (2 << 5);
32379 MaskElts.push_back(DAG.getConstant(PermuteByte, DL, MVT::i8));
32380 }
32381 }
32382
32383 SDValue Mask = DAG.getBuildVector(MVT::v16i8, DL, MaskElts);
32384 SDValue Res = DAG.getBitcast(MVT::v16i8, In);
32385 Res = DAG.getNode(X86ISD::VPPERM, DL, MVT::v16i8, DAG.getUNDEF(MVT::v16i8),
32386 Res, Mask);
32387 return DAG.getBitcast(VT, Res);
32388}
32389
32390static SDValue LowerBITREVERSE(SDValue Op, const X86Subtarget &Subtarget,
32391 SelectionDAG &DAG) {
32392 MVT VT = Op.getSimpleValueType();
32393
32394 if (Subtarget.hasXOP() && !VT.is512BitVector())
32395 return LowerBITREVERSE_XOP(Op, DAG);
32396
32397 assert(Subtarget.hasSSSE3() && "SSSE3 required for BITREVERSE");
32398
32399 SDValue In = Op.getOperand(0);
32400 SDLoc DL(Op);
32401
32402 assert(VT.getScalarType() == MVT::i8 &&
32403 "Only byte vector BITREVERSE supported");
32404
32405 // Split v64i8 without BWI so that we can still use the PSHUFB lowering.
32406 if (VT == MVT::v64i8 && !Subtarget.hasBWI())
32407 return splitVectorIntUnary(Op, DAG);
32408
32409 // Decompose 256-bit ops into smaller 128-bit ops on pre-AVX2.
32410 if (VT == MVT::v32i8 && !Subtarget.hasInt256())
32411 return splitVectorIntUnary(Op, DAG);
32412
32413 unsigned NumElts = VT.getVectorNumElements();
32414
32415 // If we have GFNI, we can use GF2P8AFFINEQB to reverse the bits.
32416 if (Subtarget.hasGFNI()) {
32417 MVT MatrixVT = MVT::getVectorVT(MVT::i64, NumElts / 8);
32418 SDValue Matrix = DAG.getConstant(0x8040201008040201ULL, DL, MatrixVT);
32419 Matrix = DAG.getBitcast(VT, Matrix);
32420 return DAG.getNode(X86ISD::GF2P8AFFINEQB, DL, VT, In, Matrix,
32421 DAG.getTargetConstant(0, DL, MVT::i8));
32422 }
32423
32424 // Perform BITREVERSE using PSHUFB lookups. Each byte is split into
32425 // two nibbles, and a PSHUFB lookup finds the bitreverse of each
32426 // 0-15 value (moved to the other nibble).
32427 SDValue NibbleMask = DAG.getConstant(0xF, DL, VT);
32428 SDValue Lo = DAG.getNode(ISD::AND, DL, VT, In, NibbleMask);
32429 SDValue Hi = DAG.getNode(ISD::SRL, DL, VT, In, DAG.getConstant(4, DL, VT));
32430
32431 const int LoLUT[16] = {
32432 /* 0 */ 0x00, /* 1 */ 0x80, /* 2 */ 0x40, /* 3 */ 0xC0,
32433 /* 4 */ 0x20, /* 5 */ 0xA0, /* 6 */ 0x60, /* 7 */ 0xE0,
32434 /* 8 */ 0x10, /* 9 */ 0x90, /* a */ 0x50, /* b */ 0xD0,
32435 /* c */ 0x30, /* d */ 0xB0, /* e */ 0x70, /* f */ 0xF0};
32436 const int HiLUT[16] = {
32437 /* 0 */ 0x00, /* 1 */ 0x08, /* 2 */ 0x04, /* 3 */ 0x0C,
32438 /* 4 */ 0x02, /* 5 */ 0x0A, /* 6 */ 0x06, /* 7 */ 0x0E,
32439 /* 8 */ 0x01, /* 9 */ 0x09, /* a */ 0x05, /* b */ 0x0D,
32440 /* c */ 0x03, /* d */ 0x0B, /* e */ 0x07, /* f */ 0x0F};
32441
32442 SmallVector<SDValue, 16> LoMaskElts, HiMaskElts;
32443 for (unsigned i = 0; i < NumElts; ++i) {
32444 LoMaskElts.push_back(DAG.getConstant(LoLUT[i % 16], DL, MVT::i8));
32445 HiMaskElts.push_back(DAG.getConstant(HiLUT[i % 16], DL, MVT::i8));
32446 }
32447
32448 SDValue LoMask = DAG.getBuildVector(VT, DL, LoMaskElts);
32449 SDValue HiMask = DAG.getBuildVector(VT, DL, HiMaskElts);
32450 Lo = DAG.getNode(X86ISD::PSHUFB, DL, VT, LoMask, Lo);
32451 Hi = DAG.getNode(X86ISD::PSHUFB, DL, VT, HiMask, Hi);
32452 return DAG.getNode(ISD::OR, DL, VT, Lo, Hi);
32453}
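
The PSHUFB path reverses each byte with the two nibble tables above. An illustrative scalar model of a single byte, using the same LoLUT/HiLUT contents:

#include <cstdint>

// The low nibble indexes a table of values bit-reversed into the high
// nibble, the high nibble a table reversed into the low nibble, and the
// two lookups are OR'd to give the bit-reversed byte.
static uint8_t BitReverseByte(uint8_t B) {
  static const uint8_t LoLUT[16] = {0x00, 0x80, 0x40, 0xC0, 0x20, 0xA0, 0x60, 0xE0,
                                    0x10, 0x90, 0x50, 0xD0, 0x30, 0xB0, 0x70, 0xF0};
  static const uint8_t HiLUT[16] = {0x00, 0x08, 0x04, 0x0C, 0x02, 0x0A, 0x06, 0x0E,
                                    0x01, 0x09, 0x05, 0x0D, 0x03, 0x0B, 0x07, 0x0F};
  return LoLUT[B & 0x0F] | HiLUT[B >> 4];
}
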
32454
32455static SDValue LowerPARITY(SDValue Op, const X86Subtarget &Subtarget,
32456 SelectionDAG &DAG) {
32457 SDLoc DL(Op);
32458 SDValue X = Op.getOperand(0);
32459 MVT VT = Op.getSimpleValueType();
32460
32461 // Special case. If the input fits in 8-bits we can use a single 8-bit TEST.
32462 if (VT == MVT::i8 ||
32463 DAG.MaskedValueIsZero(X, APInt::getBitsSetFrom(VT.getSizeInBits(), 8))) {
32464 X = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, X);
32465 SDValue Flags = DAG.getNode(X86ISD::CMP, DL, MVT::i32, X,
32466 DAG.getConstant(0, DL, MVT::i8));
32467 // Copy the inverse of the parity flag into a register with setcc.
32468 SDValue Setnp = getSETCC(X86::COND_NP, Flags, DL, DAG);
32469 // Extend to the original type.
32470 return DAG.getNode(ISD::ZERO_EXTEND, DL, VT, Setnp);
32471 }
32472
32473 // If we have POPCNT, use the default expansion.
32474 if (Subtarget.hasPOPCNT())
32475 return SDValue();
32476
32477 if (VT == MVT::i64) {
32478 // Xor the high and low 32-bit halves together using a 32-bit operation.
32479 SDValue Hi = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32,
32480 DAG.getNode(ISD::SRL, DL, MVT::i64, X,
32481 DAG.getConstant(32, DL, MVT::i8)));
32482 SDValue Lo = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, X);
32483 X = DAG.getNode(ISD::XOR, DL, MVT::i32, Lo, Hi);
32484 }
32485
32486 if (VT != MVT::i16) {
32487 // Xor the high and low 16-bits together using a 32-bit operation.
32488 SDValue Hi16 = DAG.getNode(ISD::SRL, DL, MVT::i32, X,
32489 DAG.getConstant(16, DL, MVT::i8));
32490 X = DAG.getNode(ISD::XOR, DL, MVT::i32, X, Hi16);
32491 } else {
32492 // If the input is 16-bits, we need to extend to use an i32 shift below.
32493 X = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, X);
32494 }
32495
32496 // Finally xor the low 2 bytes together and use an 8-bit flag-setting xor.
32497 // This should allow an h-reg to be used to save a shift.
32498 SDValue Hi = DAG.getNode(
32499 ISD::TRUNCATE, DL, MVT::i8,
32500 DAG.getNode(ISD::SRL, DL, MVT::i32, X, DAG.getConstant(8, DL, MVT::i8)));
32501 SDValue Lo = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, X);
32502 SDVTList VTs = DAG.getVTList(MVT::i8, MVT::i32);
32503 SDValue Flags = DAG.getNode(X86ISD::XOR, DL, VTs, Lo, Hi).getValue(1);
32504
32505 // Copy the inverse of the parity flag into a register with setcc.
32506 SDValue Setnp = getSETCC(X86::COND_NP, Flags, DL, DAG);
32507 // Extend to the original type.
32508 return DAG.getNode(ISD::ZERO_EXTEND, DL, VT, Setnp);
32509}
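
The folding above works because XOR preserves parity, so repeatedly folding the value in half leaves the answer unchanged. An illustrative scalar sketch; __builtin_parity stands in for the SETNP flag read at the end:

#include <cstdint>

static unsigned Parity32(uint32_t X) {
  X ^= X >> 16;                        // fold the high half into the low half
  X ^= X >> 8;                         // fold again down to a single byte
  return __builtin_parity(X & 0xFFu);  // equals the inverse of PF, i.e. SETNP
}
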
32510
32511static SDValue lowerAtomicArithWithLOCK(SDValue N, SelectionDAG &DAG,
32512 const X86Subtarget &Subtarget) {
32513 unsigned NewOpc = 0;
32514 switch (N->getOpcode()) {
32515 case ISD::ATOMIC_LOAD_ADD:
32516 NewOpc = X86ISD::LADD;
32517 break;
32518 case ISD::ATOMIC_LOAD_SUB:
32519 NewOpc = X86ISD::LSUB;
32520 break;
32521 case ISD::ATOMIC_LOAD_OR:
32522 NewOpc = X86ISD::LOR;
32523 break;
32524 case ISD::ATOMIC_LOAD_XOR:
32525 NewOpc = X86ISD::LXOR;
32526 break;
32527 case ISD::ATOMIC_LOAD_AND:
32528 NewOpc = X86ISD::LAND;
32529 break;
32530 default:
32531 llvm_unreachable("Unknown ATOMIC_LOAD_ opcode");
32532 }
32533
32534 MachineMemOperand *MMO = cast<MemSDNode>(N)->getMemOperand();
32535
32536 return DAG.getMemIntrinsicNode(
32537 NewOpc, SDLoc(N), DAG.getVTList(MVT::i32, MVT::Other),
32538 {N->getOperand(0), N->getOperand(1), N->getOperand(2)},
32539 /*MemVT=*/N->getSimpleValueType(0), MMO);
32540}
32541
32542/// Lower atomic_load_ops into LOCK-prefixed operations.
32543static SDValue lowerAtomicArith(SDValue N, SelectionDAG &DAG,
32544 const X86Subtarget &Subtarget) {
32545 AtomicSDNode *AN = cast<AtomicSDNode>(N.getNode());
32546 SDValue Chain = N->getOperand(0);
32547 SDValue LHS = N->getOperand(1);
32548 SDValue RHS = N->getOperand(2);
32549 unsigned Opc = N->getOpcode();
32550 MVT VT = N->getSimpleValueType(0);
32551 SDLoc DL(N);
32552
32553 // We can lower atomic_load_add into LXADD. However, any other atomicrmw op
32554 // can only be lowered when the result is unused. They should have already
32555 // been transformed into a cmpxchg loop in AtomicExpand.
32556 if (N->hasAnyUseOfValue(0)) {
32557 // Handle (atomic_load_sub p, v) as (atomic_load_add p, -v), to be able to
32558 // select LXADD if LOCK_SUB can't be selected.
32559 if (Opc == ISD::ATOMIC_LOAD_SUB) {
32560 RHS = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), RHS);
32561 return DAG.getAtomic(ISD::ATOMIC_LOAD_ADD, DL, VT, Chain, LHS,
32562 RHS, AN->getMemOperand());
32563 }
32564 assert(Opc == ISD::ATOMIC_LOAD_ADD &&
32565 "Used AtomicRMW ops other than Add should have been expanded!");
32566 return N;
32567 }
32568
32569   // Specialized lowering for the canonical form of an idempotent atomicrmw.
32570 // The core idea here is that since the memory location isn't actually
32571 // changing, all we need is a lowering for the *ordering* impacts of the
32572   // atomicrmw. As such, we can choose a different operation and memory
32573 // location to minimize impact on other code.
32574 if (Opc == ISD::ATOMIC_LOAD_OR && isNullConstant(RHS)) {
32575 // On X86, the only ordering which actually requires an instruction is
32576     // seq_cst which isn't SingleThread; everything else just needs to be
32577     // preserved during codegen and then dropped. Note that we expect (but don't assume)
32578 // that orderings other than seq_cst and acq_rel have been canonicalized to
32579 // a store or load.
32580 if (AN->getSuccessOrdering() == AtomicOrdering::SequentiallyConsistent &&
32581 AN->getSyncScopeID() == SyncScope::System) {
32582 // Prefer a locked operation against a stack location to minimize cache
32583 // traffic. This assumes that stack locations are very likely to be
32584 // accessed only by the owning thread.
32585 SDValue NewChain = emitLockedStackOp(DAG, Subtarget, Chain, DL);
32586       assert(!N->hasAnyUseOfValue(0));
32587 // NOTE: The getUNDEF is needed to give something for the unused result 0.
32588 return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(),
32589 DAG.getUNDEF(VT), NewChain);
32590 }
32591 // MEMBARRIER is a compiler barrier; it codegens to a no-op.
32592 SDValue NewChain = DAG.getNode(ISD::MEMBARRIER, DL, MVT::Other, Chain);
32593     assert(!N->hasAnyUseOfValue(0));
32594 // NOTE: The getUNDEF is needed to give something for the unused result 0.
32595 return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(),
32596 DAG.getUNDEF(VT), NewChain);
32597 }
32598
32599 SDValue LockOp = lowerAtomicArithWithLOCK(N, DAG, Subtarget);
32600 // RAUW the chain, but don't worry about the result, as it's unused.
32601   assert(!N->hasAnyUseOfValue(0));
32602 // NOTE: The getUNDEF is needed to give something for the unused result 0.
32603 return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(),
32604 DAG.getUNDEF(VT), LockOp.getValue(1));
32605}
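// Editor's note: a hedged sketch of the "idempotent atomicrmw" special case
// handled above. An atomicrmw or with 0 whose result is unused changes no
// memory, so only its ordering matters; for seq_cst the lowering prefers a
// locked RMW of a stack slot (emitLockedStackOp) over a full fence, and for
// weaker orderings it degenerates to a compiler-only MEMBARRIER.
#include <atomic>
static void idempotentRmwSketch(std::atomic<unsigned> &Word) {
  // Result ignored and RHS == 0: behaves purely as an ordering constraint.
  Word.fetch_or(0u, std::memory_order_seq_cst);
}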
32606
32607static SDValue LowerATOMIC_STORE(SDValue Op, SelectionDAG &DAG,
32608 const X86Subtarget &Subtarget) {
32609 auto *Node = cast<AtomicSDNode>(Op.getNode());
32610 SDLoc dl(Node);
32611 EVT VT = Node->getMemoryVT();
32612
32613 bool IsSeqCst =
32614 Node->getSuccessOrdering() == AtomicOrdering::SequentiallyConsistent;
32615 bool IsTypeLegal = DAG.getTargetLoweringInfo().isTypeLegal(VT);
32616
32617 // If this store is not sequentially consistent and the type is legal
32618 // we can just keep it.
32619 if (!IsSeqCst && IsTypeLegal)
32620 return Op;
32621
32622 if (VT == MVT::i64 && !IsTypeLegal) {
32623 // For illegal i64 atomic_stores, we can try to use MOVQ or MOVLPS if SSE
32624 // is enabled.
32625 bool NoImplicitFloatOps =
32626 DAG.getMachineFunction().getFunction().hasFnAttribute(
32627 Attribute::NoImplicitFloat);
32628 if (!Subtarget.useSoftFloat() && !NoImplicitFloatOps) {
32629 SDValue Chain;
32630 if (Subtarget.hasSSE1()) {
32631 SDValue SclToVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64,
32632 Node->getOperand(2));
32633 MVT StVT = Subtarget.hasSSE2() ? MVT::v2i64 : MVT::v4f32;
32634 SclToVec = DAG.getBitcast(StVT, SclToVec);
32635 SDVTList Tys = DAG.getVTList(MVT::Other);
32636 SDValue Ops[] = {Node->getChain(), SclToVec, Node->getBasePtr()};
32637 Chain = DAG.getMemIntrinsicNode(X86ISD::VEXTRACT_STORE, dl, Tys, Ops,
32638 MVT::i64, Node->getMemOperand());
32639 } else if (Subtarget.hasX87()) {
32640 // First load this into an 80-bit X87 register using a stack temporary.
32641 // This will put the whole integer into the significand.
32642 SDValue StackPtr = DAG.CreateStackTemporary(MVT::i64);
32643 int SPFI = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
32644 MachinePointerInfo MPI =
32645 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SPFI);
32646 Chain =
32647 DAG.getStore(Node->getChain(), dl, Node->getOperand(2), StackPtr,
32648 MPI, MaybeAlign(), MachineMemOperand::MOStore);
32649 SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other);
32650 SDValue LdOps[] = {Chain, StackPtr};
32651 SDValue Value = DAG.getMemIntrinsicNode(
32652 X86ISD::FILD, dl, Tys, LdOps, MVT::i64, MPI,
32653 /*Align*/ std::nullopt, MachineMemOperand::MOLoad);
32654 Chain = Value.getValue(1);
32655
32656 // Now use an FIST to do the atomic store.
32657 SDValue StoreOps[] = {Chain, Value, Node->getBasePtr()};
32658 Chain =
32659 DAG.getMemIntrinsicNode(X86ISD::FIST, dl, DAG.getVTList(MVT::Other),
32660 StoreOps, MVT::i64, Node->getMemOperand());
32661 }
32662
32663 if (Chain) {
32664 // If this is a sequentially consistent store, also emit an appropriate
32665 // barrier.
32666 if (IsSeqCst)
32667 Chain = emitLockedStackOp(DAG, Subtarget, Chain, dl);
32668
32669 return Chain;
32670 }
32671 }
32672 }
32673
32674 // Convert seq_cst store -> xchg
32675 // Convert wide store -> swap (-> cmpxchg8b/cmpxchg16b)
32676 // FIXME: 16-byte ATOMIC_SWAP isn't actually hooked up at the moment.
32677 SDValue Swap = DAG.getAtomic(ISD::ATOMIC_SWAP, dl,
32678 Node->getMemoryVT(),
32679 Node->getOperand(0),
32680 Node->getOperand(1), Node->getOperand(2),
32681 Node->getMemOperand());
32682 return Swap.getValue(1);
32683}
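// Editor's note: illustrative only (a source-level analogue, not taken from
// this file). A sequentially consistent atomic store is rewritten above into
// an ATOMIC_SWAP whose loaded result is dropped, so it is typically selected
// as a single implicitly locked XCHG instead of a MOV followed by a fence;
// weaker stores of legal types are left untouched.
#include <atomic>
static void atomicStoreSketch(std::atomic<long> &Slot, long V) {
  Slot.store(V, std::memory_order_seq_cst);  // commonly becomes an XCHG
  Slot.store(V, std::memory_order_release);  // stays a plain MOV store
}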
32684
32685static SDValue LowerADDSUBCARRY(SDValue Op, SelectionDAG &DAG) {
32686 SDNode *N = Op.getNode();
32687 MVT VT = N->getSimpleValueType(0);
32688 unsigned Opc = Op.getOpcode();
32689
32690 // Let legalize expand this if it isn't a legal type yet.
32691 if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
32692 return SDValue();
32693
32694 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
32695 SDLoc DL(N);
32696
32697 // Set the carry flag.
32698 SDValue Carry = Op.getOperand(2);
32699 EVT CarryVT = Carry.getValueType();
32700 Carry = DAG.getNode(X86ISD::ADD, DL, DAG.getVTList(CarryVT, MVT::i32),
32701 Carry, DAG.getAllOnesConstant(DL, CarryVT));
32702
32703 bool IsAdd = Opc == ISD::ADDCARRY || Opc == ISD::SADDO_CARRY;
32704 SDValue Sum = DAG.getNode(IsAdd ? X86ISD::ADC : X86ISD::SBB, DL, VTs,
32705 Op.getOperand(0), Op.getOperand(1),
32706 Carry.getValue(1));
32707
32708 bool IsSigned = Opc == ISD::SADDO_CARRY || Opc == ISD::SSUBO_CARRY;
32709 SDValue SetCC = getSETCC(IsSigned ? X86::COND_O : X86::COND_B,
32710 Sum.getValue(1), DL, DAG);
32711 if (N->getValueType(1) == MVT::i1)
32712 SetCC = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, SetCC);
32713
32714 return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Sum, SetCC);
32715}
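// Editor's note: a scalar sketch (type and helper names are mine) of the
// carry chain that the ADDCARRY/SUBCARRY lowering enables -- the low halves
// add with ADD, and the high halves consume the carry flag with ADC (or the
// borrow with SBB for subtraction).
#include <cstdint>
struct U128Sketch { uint64_t Lo, Hi; };
static U128Sketch add128Sketch(U128Sketch A, U128Sketch B) {
  U128Sketch R;
  R.Lo = A.Lo + B.Lo;
  uint64_t CarryOut = R.Lo < A.Lo;  // carry out of the low 64-bit add
  R.Hi = A.Hi + B.Hi + CarryOut;    // the ADC-shaped add of the high halves
  return R;
}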
32716
32717static SDValue LowerFSINCOS(SDValue Op, const X86Subtarget &Subtarget,
32718 SelectionDAG &DAG) {
32719   assert(Subtarget.isTargetDarwin() && Subtarget.is64Bit());
32720
32721 // For MacOSX, we want to call an alternative entry point: __sincos_stret,
32722 // which returns the values as { float, float } (in XMM0) or
32723 // { double, double } (which is returned in XMM0, XMM1).
32724 SDLoc dl(Op);
32725 SDValue Arg = Op.getOperand(0);
32726 EVT ArgVT = Arg.getValueType();
32727 Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
32728
32729 TargetLowering::ArgListTy Args;
32730 TargetLowering::ArgListEntry Entry;
32731
32732 Entry.Node = Arg;
32733 Entry.Ty = ArgTy;
32734 Entry.IsSExt = false;
32735 Entry.IsZExt = false;
32736 Args.push_back(Entry);
32737
32738 bool isF64 = ArgVT == MVT::f64;
32739 // Only optimize x86_64 for now. i386 is a bit messy. For f32,
32740 // the small struct {f32, f32} is returned in (eax, edx). For f64,
32741 // the results are returned via SRet in memory.
32742 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
32743 RTLIB::Libcall LC = isF64 ? RTLIB::SINCOS_STRET_F64 : RTLIB::SINCOS_STRET_F32;
32744 const char *LibcallName = TLI.getLibcallName(LC);
32745 SDValue Callee =
32746 DAG.getExternalSymbol(LibcallName, TLI.getPointerTy(DAG.getDataLayout()));
32747
32748 Type *RetTy = isF64 ? (Type *)StructType::get(ArgTy, ArgTy)
32749 : (Type *)FixedVectorType::get(ArgTy, 4);
32750
32751 TargetLowering::CallLoweringInfo CLI(DAG);
32752 CLI.setDebugLoc(dl)
32753 .setChain(DAG.getEntryNode())
32754 .setLibCallee(CallingConv::C, RetTy, Callee, std::move(Args));
32755
32756 std::pair<SDValue, SDValue> CallResult = TLI.LowerCallTo(CLI);
32757
32758 if (isF64)
32759 // Returned in xmm0 and xmm1.
32760 return CallResult.first;
32761
32762   // Returned in bits 0:31 and 32:63 of xmm0.
32763 SDValue SinVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ArgVT,
32764 CallResult.first, DAG.getIntPtrConstant(0, dl));
32765 SDValue CosVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ArgVT,
32766 CallResult.first, DAG.getIntPtrConstant(1, dl));
32767 SDVTList Tys = DAG.getVTList(ArgVT, ArgVT);
32768 return DAG.getNode(ISD::MERGE_VALUES, dl, Tys, SinVal, CosVal);
32769}
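// Editor's note: a hedged source-level sketch of what reaches this lowering.
// When both the sine and cosine of the same argument are needed, the pair of
// libcalls can be merged into the single __sincos_stret entry point described
// in the comments above, which hands back both results in vector registers.
#include <cmath>
static void sinCosSketch(float X, float &S, float &C) {
  S = std::sin(X);  // sin and cos of the same X ...
  C = std::cos(X);  // ... are candidates for one combined __sincos_stret call
}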
32770
32771/// Widen a vector input to a vector of NVT. The
32772/// input vector must have the same element type as NVT.
32773static SDValue ExtendToType(SDValue InOp, MVT NVT, SelectionDAG &DAG,
32774 bool FillWithZeroes = false) {
32775 // Check if InOp already has the right width.
32776 MVT InVT = InOp.getSimpleValueType();
32777 if (InVT == NVT)
32778 return InOp;
32779
32780 if (InOp.isUndef())
32781 return DAG.getUNDEF(NVT);
32782
32783   assert(InVT.getVectorElementType() == NVT.getVectorElementType() &&
32784          "input and widen element type must match");
32785
32786 unsigned InNumElts = InVT.getVectorNumElements();
32787 unsigned WidenNumElts = NVT.getVectorNumElements();
32788   assert(WidenNumElts > InNumElts && WidenNumElts % InNumElts == 0 &&
32789          "Unexpected request for vector widening");
32790
32791 SDLoc dl(InOp);
32792 if (InOp.getOpcode() == ISD::CONCAT_VECTORS &&
32793 InOp.getNumOperands() == 2) {
32794 SDValue N1 = InOp.getOperand(1);
32795 if ((ISD::isBuildVectorAllZeros(N1.getNode()) && FillWithZeroes) ||
32796 N1.isUndef()) {
32797 InOp = InOp.getOperand(0);
32798 InVT = InOp.getSimpleValueType();
32799 InNumElts = InVT.getVectorNumElements();
32800 }
32801 }
32802 if (ISD::isBuildVectorOfConstantSDNodes(InOp.getNode()) ||
32803 ISD::isBuildVectorOfConstantFPSDNodes(InOp.getNode())) {
32804 SmallVector<SDValue, 16> Ops;
32805 for (unsigned i = 0; i < InNumElts; ++i)
32806 Ops.push_back(InOp.getOperand(i));
32807
32808 EVT EltVT = InOp.getOperand(0).getValueType();
32809
32810 SDValue FillVal = FillWithZeroes ? DAG.getConstant(0, dl, EltVT) :
32811 DAG.getUNDEF(EltVT);
32812 for (unsigned i = 0; i < WidenNumElts - InNumElts; ++i)
32813 Ops.push_back(FillVal);
32814 return DAG.getBuildVector(NVT, dl, Ops);
32815 }
32816 SDValue FillVal = FillWithZeroes ? DAG.getConstant(0, dl, NVT) :
32817 DAG.getUNDEF(NVT);
32818 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, NVT, FillVal,
32819 InOp, DAG.getIntPtrConstant(0, dl));
32820}
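// Editor's note: a scalar-flavoured sketch (an array stands in for a vector)
// of what ExtendToType produces for a constant build_vector -- keep the
// original lanes and pad up to the wider element count, with zeroes when
// FillWithZeroes is set and with don't-care values otherwise (zero is just
// one legal choice for "undef" here).
#include <array>
#include <cstdint>
static std::array<uint32_t, 16> widenV4ToV16Sketch(const std::array<uint32_t, 4> &In) {
  std::array<uint32_t, 16> Out{};       // padding lanes value-initialized to 0
  for (unsigned I = 0; I != In.size(); ++I)
    Out[I] = In[I];                     // original lanes stay at the same index
  return Out;
}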
32821
32822static SDValue LowerMSCATTER(SDValue Op, const X86Subtarget &Subtarget,
32823 SelectionDAG &DAG) {
32824   assert(Subtarget.hasAVX512() &&
32825          "MGATHER/MSCATTER are supported on AVX-512 arch only");
32826
32827 MaskedScatterSDNode *N = cast<MaskedScatterSDNode>(Op.getNode());
32828 SDValue Src = N->getValue();
32829 MVT VT = Src.getSimpleValueType();
32830   assert(VT.getScalarSizeInBits() >= 32 && "Unsupported scatter op");
32831 SDLoc dl(Op);
32832
32833 SDValue Scale = N->getScale();
32834 SDValue Index = N->getIndex();
32835 SDValue Mask = N->getMask();
32836 SDValue Chain = N->getChain();
32837 SDValue BasePtr = N->getBasePtr();
32838
32839 if (VT == MVT::v2f32 || VT == MVT::v2i32) {
32840     assert(Mask.getValueType() == MVT::v2i1 && "Unexpected mask type");
32841 // If the index is v2i64 and we have VLX we can use xmm for data and index.
32842 if (Index.getValueType() == MVT::v2i64 && Subtarget.hasVLX()) {
32843 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
32844 EVT WideVT = TLI.getTypeToTransformTo(*DAG.getContext(), VT);
32845 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, WideVT, Src, DAG.getUNDEF(VT));
32846 SDVTList VTs = DAG.getVTList(MVT::Other);
32847 SDValue Ops[] = {Chain, Src, Mask, BasePtr, Index, Scale};
32848 return DAG.getMemIntrinsicNode(X86ISD::MSCATTER, dl, VTs, Ops,
32849 N->getMemoryVT(), N->getMemOperand());
32850 }
32851 return SDValue();
32852 }
32853
32854 MVT IndexVT = Index.getSimpleValueType();
32855
32856 // If the index is v2i32, we're being called by type legalization and we
32857 // should just let the default handling take care of it.
32858 if (IndexVT == MVT::v2i32)
32859 return SDValue();
32860
32861   // If we don't have VLX and neither the passthru nor the index is 512 bits, we
32862 // need to widen until one is.
32863 if (!Subtarget.hasVLX() && !VT.is512BitVector() &&
32864 !Index.getSimpleValueType().is512BitVector()) {
32865 // Determine how much we need to widen by to get a 512-bit type.
32866 unsigned Factor = std::min(512/VT.getSizeInBits(),
32867 512/IndexVT.getSizeInBits());
32868 unsigned NumElts = VT.getVectorNumElements() * Factor;
32869
32870 VT = MVT::getVectorVT(VT.getVectorElementType(), NumElts);
32871 IndexVT = MVT::getVectorVT(IndexVT.getVectorElementType(), NumElts);
32872 MVT MaskVT = MVT::getVectorVT(MVT::i1, NumElts);
32873
32874 Src = ExtendToType(Src, VT, DAG);
32875 Index = ExtendToType(Index, IndexVT, DAG);
32876 Mask = ExtendToType(Mask, MaskVT, DAG, true);
32877 }
32878
32879 SDVTList VTs = DAG.getVTList(MVT::Other);
32880 SDValue Ops[] = {Chain, Src, Mask, BasePtr, Index, Scale};
32881 return DAG.getMemIntrinsicNode(X86ISD::MSCATTER, dl, VTs, Ops,
32882 N->getMemoryVT(), N->getMemOperand());
32883}
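// Editor's note: a worked example (the concrete types are my own picks) of
// the widening factor used above for scatters without VLX. With v4f32 data
// (128 bits) and v4i64 indices (256 bits), Factor = min(512/128, 512/256) = 2,
// so both widen to 8 elements: v8f32 data, 512-bit v8i64 indices, and a v8i1
// mask whose extra lanes are zeroed so no extra stores happen.
#include <algorithm>
static unsigned scatterWidenFactorSketch(unsigned DataBits, unsigned IndexBits) {
  return std::min(512u / DataBits, 512u / IndexBits);
}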
32884
32885static SDValue LowerMLOAD(SDValue Op, const X86Subtarget &Subtarget,
32886 SelectionDAG &DAG) {
32887
32888 MaskedLoadSDNode *N = cast<MaskedLoadSDNode>(Op.getNode());
32889 MVT VT = Op.getSimpleValueType();
32890 MVT ScalarVT = VT.getScalarType();
32891 SDValue Mask = N->getMask();
32892 MVT MaskVT = Mask.getSimpleValueType();
32893 SDValue PassThru = N->getPassThru();
32894 SDLoc dl(Op);
32895
32896 // Handle AVX masked loads which don't support passthru other than 0.
32897 if (MaskVT.getVectorElementType() != MVT::i1) {
32898 // We also allow undef in the isel pattern.
32899 if (PassThru.isUndef() || ISD::isBuildVectorAllZeros(PassThru.getNode()))
32900 return Op;
32901
32902 SDValue NewLoad = DAG.getMaskedLoad(
32903 VT, dl, N->getChain(), N->getBasePtr(), N->getOffset(), Mask,
32904 getZeroVector(VT, Subtarget, DAG, dl), N->getMemoryVT(),
32905 N->getMemOperand(), N->getAddressingMode(), N->getExtensionType(),
32906 N->isExpandingLoad());
32907 // Emit a blend.
32908 SDValue Select = DAG.getNode(ISD::VSELECT, dl, VT, Mask, NewLoad, PassThru);
32909 return DAG.getMergeValues({ Select, NewLoad.getValue(1) }, dl);
32910 }
32911
32912   assert((!N->isExpandingLoad() || Subtarget.hasAVX512()) &&
32913          "Expanding masked load is supported on AVX-512 target only!");
32914
32915   assert((!N->isExpandingLoad() || ScalarVT.getSizeInBits() >= 32) &&
32916          "Expanding masked load is supported for 32 and 64-bit types only!");
32917
32918   assert(Subtarget.hasAVX512() && !Subtarget.hasVLX() && !VT.is512BitVector() &&
32919          "Cannot lower masked load op.");
32920
32921   assert((ScalarVT.getSizeInBits() >= 32 ||
32922           (Subtarget.hasBWI() &&
32923            (ScalarVT == MVT::i8 || ScalarVT == MVT::i16))) &&
32924          "Unsupported masked load op.");
32925
32926   // This operation is legal for targets with VLX, but without
32927   // VLX the vector should be widened to 512 bits.
32928 unsigned NumEltsInWideVec = 512 / VT.getScalarSizeInBits();
32929 MVT WideDataVT = MVT::getVectorVT(ScalarVT, NumEltsInWideVec);
32930 PassThru = ExtendToType(PassThru, WideDataVT, DAG);
32931
32932 // Mask element has to be i1.
32933   assert(Mask.getSimpleValueType().getScalarType() == MVT::i1 &&
32934          "Unexpected mask type");
32935
32936 MVT WideMaskVT = MVT::getVectorVT(MVT::i1, NumEltsInWideVec);
32937
32938 Mask = ExtendToType(Mask, WideMaskVT, DAG, true);
32939 SDValue NewLoad = DAG.getMaskedLoad(
32940 WideDataVT, dl, N->getChain(), N->getBasePtr(), N->getOffset(), Mask,
32941 PassThru, N->getMemoryVT(), N->getMemOperand(), N->getAddressingMode(),
32942 N->getExtensionType(), N->isExpandingLoad());
32943
32944 SDValue Extract =
32945 DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, NewLoad.getValue(0),
32946 DAG.getIntPtrConstant(0, dl));
32947 SDValue RetOps[] = {Extract, NewLoad.getValue(1)};
32948 return DAG.getMergeValues(RetOps, dl);
32949}
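// Editor's note: an intrinsic-level sketch (my assumption about equivalent
// user code, not taken from this file) of the AVX1/AVX2 path above: the
// hardware masked load writes zero into disabled lanes, so a non-zero
// passthru has to be blended back in with the same mask afterwards, which is
// what the VSELECT emitted above does.
#include <immintrin.h>
static __m256 maskedLoadWithPassThruSketch(const float *Ptr, __m256i Mask,
                                           __m256 PassThru) {
  __m256 Loaded = _mm256_maskload_ps(Ptr, Mask);  // disabled lanes come back 0
  return _mm256_blendv_ps(PassThru, Loaded, _mm256_castsi256_ps(Mask));
}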
32950
32951static SDValue LowerMSTORE(SDValue Op, const X86Subtarget &Subtarget,
32952 SelectionDAG &DAG) {
32953 MaskedStoreSDNode *N = cast<MaskedStoreSDNode>(Op.getNode());
32954 SDValue DataToStore = N->getValue();
32955 MVT VT = DataToStore.getSimpleValueType();
32956 MVT ScalarVT = VT.getScalarType();
32957 SDValue Mask = N->getMask();
32958 SDLoc dl(Op);
32959
32960   assert((!N->isCompressingStore() || Subtarget.hasAVX512()) &&
32961          "Expanding masked load is supported on AVX-512 target only!");
32962
32963   assert((!N->isCompressingStore() || ScalarVT.getSizeInBits() >= 32) &&
32964          "Expanding masked load is supported for 32 and 64-bit types only!");
32965
32966   assert(Subtarget.hasAVX512() && !Subtarget.hasVLX() && !VT.is512BitVector() &&
32967          "Cannot lower masked store op.");
32968
32969   assert((ScalarVT.getSizeInBits() >= 32 ||
32970           (Subtarget.hasBWI() &&
32971            (ScalarVT == MVT::i8 || ScalarVT == MVT::i16))) &&
32972          "Unsupported masked store op.");
32973
32974   // This operation is legal for targets with VLX, but without
32975   // VLX the vector should be widened to 512 bits.
32976 unsigned NumEltsInWideVec = 512/VT.getScalarSizeInBits();
32977 MVT WideDataVT = MVT::getVectorVT(ScalarVT, NumEltsInWideVec);
32978
32979 // Mask element has to be i1.
32980   assert(Mask.getSimpleValueType().getScalarType() == MVT::i1 &&
32981          "Unexpected mask type");
32982
32983 MVT WideMaskVT = MVT::getVectorVT(MVT::i1, NumEltsInWideVec);
32984
32985 DataToStore = ExtendToType(DataToStore, WideDataVT, DAG);
32986 Mask = ExtendToType(Mask, WideMaskVT, DAG, true);
32987 return DAG.getMaskedStore(N->getChain(), dl, DataToStore, N->getBasePtr(),
32988 N->getOffset(), Mask, N->getMemoryVT(),
32989 N->getMemOperand(), N->getAddressingMode(),
32990 N->isTruncatingStore(), N->isCompressingStore());
32991}
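// Editor's note: a tiny worked example (assumed element types) of the
// widening arithmetic above for masked stores without VLX; the mask is
// widened with zeroes so the padding lanes are never written to memory.
static constexpr unsigned wideStoreEltsSketch(unsigned ScalarBits) {
  return 512 / ScalarBits;
}
static_assert(wideStoreEltsSketch(32) == 16, "v8i32 data widens to v16i32");
static_assert(wideStoreEltsSketch(64) == 8, "v4i64 data widens to v8i64");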
32992
32993static SDValue LowerMGATHER(SDValue Op, const X86Subtarget &Subtarget,
32994 SelectionDAG &DAG) {
32995   assert(Subtarget.hasAVX2() &&
32996          "MGATHER/MSCATTER are supported on AVX-512/AVX-2 arch only");
32997
32998 MaskedGatherSDNode *N = cast<MaskedGatherSDNode>(Op.getNode());
32999 SDLoc dl(Op);
33000 MVT VT = Op.getSimpleValueType();
33001 SDValue Index = N->getIndex();
33002 SDValue Mask = N->getMask();
33003 SDValue PassThru = N->getPassThru();
33004 MVT IndexVT = Index.getSimpleValueType();
33005
33006   assert(VT.getScalarSizeInBits() >= 32 && "Unsupported gather op");
33007
33008 // If the index is v2i32, we're being called by type legalization.
33009 if (IndexVT == MVT::v2i32)
33010 return SDValue();
33011
33012   // If we don't have VLX and neither the passthru nor the index is 512 bits, we
33013 // need to widen until one is.
33014 MVT OrigVT = VT;
33015 if (Subtarget.hasAVX512() && !Subtarget.hasVLX() && !VT.is512BitVector() &&
33016 !IndexVT.is512BitVector()) {
33017 // Determine how much we need to widen by to get a 512-bit type.
33018 unsigned Factor = std::min(512/VT.getSizeInBits(),
33019 512/IndexVT.getSizeInBits());
33020
33021 unsigned NumElts = VT.getVectorNumElements() * Factor;
33022
33023 VT = MVT::getVectorVT(VT.getVectorElementType(), NumElts);
33024 IndexVT = MVT::getVectorVT(IndexVT.getVectorElementType(), NumElts);
33025 MVT MaskVT = MVT::getVectorVT(MVT::i1, NumElts);
33026
33027 PassThru = ExtendToType(PassThru, VT, DAG);
33028 Index = ExtendToType(Index, IndexVT, DAG);
33029 Mask = ExtendToType(Mask, MaskVT, DAG, true);
33030 }
33031
33032 // Break dependency on the data register.
33033 if (PassThru.isUndef())
33034 PassThru = getZeroVector(VT, Subtarget, DAG, dl);
33035
33036 SDValue Ops[] = { N->getChain(), PassThru, Mask, N->getBasePtr(), Index,
33037 N->getScale() };
33038 SDValue NewGather = DAG.getMemIntrinsicNode(
33039 X86ISD::MGATHER, dl, DAG.getVTList(VT, MVT::Other), Ops, N->getMemoryVT(),
33040 N->getMemOperand());
33041 SDValue Extract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OrigVT,
33042 NewGather, DAG.getIntPtrConstant(0, dl));
33043 return DAG.getMergeValues({Extract, NewGather.getValue(1)}, dl);
33044}
33045
33046static SDValue LowerADDRSPACECAST(SDValue Op, SelectionDAG &DAG) {
33047 SDLoc dl(Op);
33048 SDValue Src = Op.getOperand(0);
33049 MVT DstVT = Op.getSimpleValueType();
33050
33051 AddrSpaceCastSDNode *N = cast<AddrSpaceCastSDNode>(Op.getNode());
33052 unsigned SrcAS = N->getSrcAddressSpace();
33053
33054   assert(SrcAS != N->getDestAddressSpace() &&
33055          "addrspacecast must be between different address spaces");
33056
33057 if (SrcAS == X86AS::PTR32_UPTR && DstVT == MVT::i64) {
33058 Op = DAG.getNode(ISD::ZERO_EXTEND, dl, DstVT, Src);
33059 } else if (DstVT == MVT::i64) {
33060 Op = DAG.getNode(ISD::SIGN_EXTEND, dl, DstVT, Src);
33061 } else if (DstVT == MVT::i32) {
33062 Op = DAG.getNode(ISD::TRUNCATE, dl, DstVT, Src);
33063 } else {
33064 report_fatal_error("Bad address space in addrspacecast");
33065 }
33066 return Op;
33067}
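// Editor's note: a hedged scalar sketch (helper names are mine) of the three
// cases above for casts between 32-bit and 64-bit pointer address spaces: an
// unsigned 32-bit pointer widens with a zero extension, a signed one with a
// sign extension, and narrowing to 32 bits is a plain truncation.
#include <cstdint>
static uint64_t widenUnsignedPtrSketch(uint32_t P) { return P; }        // zext
static uint64_t widenSignedPtrSketch(uint32_t P) {
  return (uint64_t)(int64_t)(int32_t)P;                                 // sext
}
static uint32_t narrowPtrSketch(uint64_t P) { return (uint32_t)P; }     // trunc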
33068
33069SDValue X86TargetLowering::LowerGC_TRANSITION(SDValue Op,
33070 SelectionDAG &DAG) const {
33071 // TODO: Eventually, the lowering of these nodes should be informed by or
33072 // deferred to the GC strategy for the function in which they appear. For
33073 // now, however, they must be lowered to something. Since they are logically
33074 // no-ops in the case of a null GC strategy (or a GC strategy which does not
33075 // require special handling for these nodes), lower them as literal NOOPs for
33076 // the time being.
33077 SmallVector<SDValue, 2> Ops;
33078 Ops.push_back(Op.getOperand(0));
33079 if (Op->getGluedNode())
33080 Ops.push_back(Op->getOperand(Op->getNumOperands() - 1));
33081
33082 SDVTList VTs = DAG.getVTList(MVT::Other, MVT::Glue);
33083 return SDValue(DAG.getMachineNode(X86::NOOP, SDLoc(Op), VTs, Ops), 0);
33084}
33085
33086// Custom split CVTPS2PH with wide types.
33087static SDValue LowerCVTPS2PH(SDValue Op, SelectionDAG &DAG) {
33088 SDLoc dl(Op);
33089 EVT VT = Op.getValueType();
33090 SDValue Lo, Hi;
33091 std::tie(Lo, Hi) = DAG.SplitVectorOperand(Op.getNode(), 0);
33092 EVT LoVT, HiVT;
33093 std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
33094 SDValue RC = Op.getOperand(1);
33095 Lo = DAG.getNode(X86ISD::CVTPS2PH, dl, LoVT, Lo, RC);
33096 Hi = DAG.getNode(X86ISD::CVTPS2PH, dl, HiVT, Hi, RC);
33097 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
33098}
33099
33100static StringRef getInstrStrFromOpNo(const SmallVectorImpl<StringRef> &AsmStrs,
33101 unsigned OpNo) {
33102 const APInt Operand(32, OpNo);
33103 std::string OpNoStr = llvm::toString(Operand, 10, false);
33104 std::string Str(" $");
33105
33106 std::string OpNoStr1(Str + OpNoStr); // e.g. " $1" (OpNo=1)
33107 std::string OpNoStr2(Str + "{" + OpNoStr + ":"); // With modifier, e.g. ${1:P}
33108
33109 auto I = StringRef::npos;
33110 for (auto &AsmStr : AsmStrs) {
33111     // Match the OpNo string exactly, so that we don't match a sub-string;
33112     // e.g. "$12" contains "$1".
33113 if (AsmStr.endswith(OpNoStr1))
33114 I = AsmStr.size() - OpNoStr1.size();
33115
33116 // Get the index of operand in AsmStr.
33117 if (I == StringRef::npos)
33118 I = AsmStr.find(OpNoStr1 + ",");
33119 if (I == StringRef::npos)
33120 I = AsmStr.find(OpNoStr2);
33121
33122 if (I == StringRef::npos)
33123 continue;
33124
33125     assert(I > 0 && "Unexpected inline asm string!");
33126     // Remove the operand string and label (if it exists).
33127 // For example:
33128 // ".L__MSASMLABEL_.${:uid}__l:call dword ptr ${0:P}"
33129 // ==>
33130 // ".L__MSASMLABEL_.${:uid}__l:call dword ptr "
33131 // ==>
33132 // "call dword ptr "
33133 auto TmpStr = AsmStr.substr(0, I);
33134 I = TmpStr.rfind(':');
33135 if (I == StringRef::npos)
33136 return TmpStr;
33137
33138     assert(I < TmpStr.size() && "Unexpected inline asm string!");
33139 auto Asm = TmpStr.drop_front(I + 1);
33140 return Asm;
33141 }
33142
33143 return StringRef();
33144}
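// Editor's note: a simplified walk-through (std::string stands in for
// StringRef, and the boundary handling is approximate) of what the function
// above extracts. For OpNo == 0 and the asm string
//   ".L__MSASMLABEL_.${:uid}__l:call dword ptr ${0:P}"
// it locates the " ${0:" operand reference, drops it and everything after it,
// then drops the MS-asm label up to the last ':' and returns roughly
// "call dword ptr", which isInlineAsmTargetBranch below checks for "call".
#include <string>
static std::string instrPrefixSketch(const std::string &AsmStr,
                                     const std::string &OpTok) {  // e.g. " ${0:"
  std::string::size_type I = AsmStr.find(OpTok);
  if (I == std::string::npos)
    return std::string();
  std::string Tmp = AsmStr.substr(0, I);          // strip the operand reference
  std::string::size_type Colon = Tmp.rfind(':');  // strip any leading label
  return Colon == std::string::npos ? Tmp : Tmp.substr(Colon + 1);
}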
33145
33146bool X86TargetLowering::isInlineAsmTargetBranch(
33147 const SmallVectorImpl<StringRef> &AsmStrs, unsigned OpNo) const {
33148 StringRef InstrStr = getInstrStrFromOpNo(AsmStrs, OpNo);
33149
33150 if (InstrStr.contains("call"))
33151 return true;
33152
33153 return false;
33154}
33155
33156/// Provide custom lowering hooks for some operations.
33157SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
33158 switch (Op.getOpcode()) {
33159   default: llvm_unreachable("Should not custom lower this!");
33160 case ISD::ATOMIC_FENCE: return LowerATOMIC_FENCE(Op, Subtarget, DAG);
33161 case ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS:
33162 return LowerCMP_SWAP(Op, Subtarget, DAG);
33163 case ISD::CTPOP: return LowerCTPOP(Op, Subtarget, DAG);
33164 case ISD::ATOMIC_LOAD_ADD:
33165 case ISD::ATOMIC_LOAD_SUB:
33166 case ISD::ATOMIC_LOAD_OR:
33167 case ISD::ATOMIC_LOAD_XOR:
33168 case ISD::ATOMIC_LOAD_AND: return lowerAtomicArith(Op, DAG, Subtarget);
33169 case ISD::ATOMIC_STORE: return LowerATOMIC_STORE(Op, DAG, Subtarget);
33170 case ISD::BITREVERSE: return LowerBITREVERSE(Op, Subtarget, DAG);
33171 case ISD::PARITY: return LowerPARITY(Op, Subtarget, DAG);
33172 case ISD::BUILD_VECTOR: return LowerBUILD_VECTOR(Op, DAG);
33173 case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, Subtarget, DAG);
33174 case ISD::VECTOR_SHUFFLE: return lowerVECTOR_SHUFFLE(Op, Subtarget, DAG);
33175 case ISD::VSELECT: return LowerVSELECT(Op, DAG);
33176 case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG);
33177 case ISD::INSERT_VECTOR_ELT: return LowerINSERT_VECTOR_ELT(Op, DAG);
33178 case ISD::INSERT_SUBVECTOR: return LowerINSERT_SUBVECTOR(Op, Subtarget,DAG);
33179 case ISD::EXTRACT_SUBVECTOR: return LowerEXTRACT_SUBVECTOR(Op,Subtarget,DAG);
33180 case ISD::SCALAR_TO_VECTOR: return LowerSCALAR_TO_VECTOR(Op, Subtarget,DAG);
33181 case ISD::ConstantPool: return LowerConstantPool(Op, DAG);
33182 case ISD::GlobalAddress: return LowerGlobalAddress(Op, DAG);
33183 case ISD::GlobalTLSAddress: return LowerGlobalTLSAddress(Op, DAG);
33184 case ISD::ExternalSymbol: return LowerExternalSymbol(Op, DAG);
33185 case ISD::BlockAddress: return LowerBlockAddress(Op, DAG);
33186 case ISD::SHL_PARTS:
33187 case ISD::SRA_PARTS:
33188 case ISD::SRL_PARTS: return LowerShiftParts(Op, DAG);
33189 case ISD::FSHL:
33190 case ISD::FSHR: return LowerFunnelShift(Op, Subtarget, DAG);
33191 case ISD::STRICT_SINT_TO_FP:
33192 case ISD::SINT_TO_FP: return LowerSINT_TO_FP(Op, DAG);
33193 case ISD::STRICT_UINT_TO_FP:
33194 case ISD::UINT_TO_FP: return LowerUINT_TO_FP(Op, DAG);
33195 case ISD::TRUNCATE: return LowerTRUNCATE(Op, DAG);
33196 case ISD::ZERO_EXTEND: return LowerZERO_EXTEND(Op, Subtarget, DAG);
33197 case ISD::SIGN_EXTEND: return LowerSIGN_EXTEND(Op, Subtarget, DAG);
33198 case ISD::ANY_EXTEND: return LowerANY_EXTEND(Op, Subtarget, DAG);
33199 case ISD::ZERO_EXTEND_VECTOR_INREG:
33200 case ISD::SIGN_EXTEND_VECTOR_INREG:
33201 return LowerEXTEND_VECTOR_INREG(Op, Subtarget, DAG);
33202 case ISD::FP_TO_SINT:
33203 case ISD::STRICT_FP_TO_SINT:
33204 case ISD::FP_TO_UINT:
33205 case ISD::STRICT_FP_TO_UINT: return LowerFP_TO_INT(Op, DAG);
33206 case ISD::FP_TO_SINT_SAT:
33207 case ISD::FP_TO_UINT_SAT: return LowerFP_TO_INT_SAT(Op, DAG);
33208 case ISD::FP_EXTEND:
33209 case ISD::STRICT_FP_EXTEND: return LowerFP_EXTEND(Op, DAG);
33210 case ISD::FP_ROUND:
33211 case ISD::STRICT_FP_ROUND: return LowerFP_ROUND(Op, DAG);
33212 case ISD::FP16_TO_FP:
33213 case ISD::STRICT_FP16_TO_FP: return LowerFP16_TO_FP(Op, DAG);
33214 case ISD::FP_TO_FP16:
33215 case ISD::STRICT_FP_TO_FP16: return LowerFP_TO_FP16(Op, DAG);
33216 case ISD::FP_TO_BF16: return LowerFP_TO_BF16(Op, DAG);
33217 case ISD::LOAD: return LowerLoad(Op, Subtarget, DAG);
33218 case ISD::STORE: return LowerStore(Op, Subtarget, DAG);
33219 case ISD::FADD:
33220 case ISD::FSUB: return lowerFaddFsub(Op, DAG);
33221 case ISD::FROUND: return LowerFROUND(Op, DAG);
33222 case ISD::FABS:
33223 case ISD::FNEG: return LowerFABSorFNEG(Op, DAG);
33224 case ISD::FCOPYSIGN: return LowerFCOPYSIGN(Op, DAG);
33225 case ISD::FGETSIGN: return LowerFGETSIGN(Op, DAG);
33226 case ISD::LRINT:
33227 case ISD::LLRINT: return LowerLRINT_LLRINT(Op, DAG);
33228 case ISD::SETCC:
33229 case ISD::STRICT_FSETCC:
33230 case ISD::STRICT_FSETCCS: return LowerSETCC(Op, DAG);
33231 case ISD::SETCCCARRY: return LowerSETCCCARRY(Op, DAG);
33232 case ISD::SELECT: return LowerSELECT(Op, DAG);
33233 case ISD::BRCOND: return LowerBRCOND(Op, DAG);
33234 case ISD::JumpTable: return LowerJumpTable(Op, DAG);
33235 case ISD::VASTART: return LowerVASTART(Op, DAG);
33236 case ISD::VAARG: return LowerVAARG(Op, DAG);
33237 case ISD::VACOPY: return LowerVACOPY(Op, Subtarget, DAG);
33238 case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG);
33239 case ISD::INTRINSIC_VOID:
33240 case ISD::INTRINSIC_W_CHAIN: return LowerINTRINSIC_W_CHAIN(Op, Subtarget, DAG);
33241 case ISD::RETURNADDR: return LowerRETURNADDR(Op, DAG);
33242 case ISD::ADDROFRETURNADDR: return LowerADDROFRETURNADDR(Op, DAG);
33243 case ISD::FRAMEADDR: return LowerFRAMEADDR(Op, DAG);
33244 case ISD::FRAME_TO_ARGS_OFFSET:
33245 return LowerFRAME_TO_ARGS_OFFSET(Op, DAG);
33246 case ISD::DYNAMIC_STACKALLOC: return LowerDYNAMIC_STACKALLOC(Op, DAG);
33247 case ISD::EH_RETURN: return LowerEH_RETURN(Op, DAG);
33248 case ISD::EH_SJLJ_SETJMP: return lowerEH_SJLJ_SETJMP(Op, DAG);
33249 case ISD::EH_SJLJ_LONGJMP: return lowerEH_SJLJ_LONGJMP(Op, DAG);
33250 case ISD::EH_SJLJ_SETUP_DISPATCH:
33251 return lowerEH_SJLJ_SETUP_DISPATCH(Op, DAG);
33252 case ISD::INIT_TRAMPOLINE: return LowerINIT_TRAMPOLINE(Op, DAG);
33253 case ISD::ADJUST_TRAMPOLINE: return LowerADJUST_TRAMPOLINE(Op, DAG);
33254 case ISD::GET_ROUNDING: return LowerGET_ROUNDING(Op, DAG);
33255 case ISD::SET_ROUNDING: return LowerSET_ROUNDING(Op, DAG);
33256 case ISD::CTLZ:
33257 case ISD::CTLZ_ZERO_UNDEF: return LowerCTLZ(Op, Subtarget, DAG);
33258 case ISD::CTTZ:
33259 case ISD::CTTZ_ZERO_UNDEF: return LowerCTTZ(Op, Subtarget, DAG);
33260 case ISD::MUL: return LowerMUL(Op, Subtarget, DAG);
33261 case ISD::MULHS:
33262 case ISD::MULHU: return LowerMULH(Op, Subtarget, DAG);
33263 case ISD::ROTL:
33264 case ISD::ROTR: return LowerRotate(Op, Subtarget, DAG);
33265 case ISD::SRA:
33266 case ISD::SRL:
33267 case ISD::SHL: return LowerShift(Op, Subtarget, DAG);
33268 case ISD::SADDO:
33269 case ISD::UADDO:
33270 case ISD::SSUBO:
33271 case ISD::USUBO: return LowerXALUO(Op, DAG);
33272 case ISD::SMULO:
33273 case ISD::UMULO: return LowerMULO(Op, Subtarget, DAG);
33274 case ISD::READCYCLECOUNTER: return LowerREADCYCLECOUNTER(Op, Subtarget,DAG);
33275 case ISD::BITCAST: return LowerBITCAST(Op, Subtarget, DAG);
33276 case ISD::SADDO_CARRY:
33277 case ISD::SSUBO_CARRY:
33278 case ISD::ADDCARRY:
33279 case ISD::SUBCARRY: return LowerADDSUBCARRY(Op, DAG);
33280 case ISD::ADD:
33281 case ISD::SUB: return lowerAddSub(Op, DAG, Subtarget);
33282 case ISD::UADDSAT:
33283 case ISD::SADDSAT:
33284 case ISD::USUBSAT:
33285 case ISD::SSUBSAT: return LowerADDSAT_SUBSAT(Op, DAG, Subtarget);
33286 case ISD::SMAX:
33287 case ISD::SMIN:
33288 case ISD::UMAX:
33289 case ISD::UMIN: return LowerMINMAX(Op, Subtarget, DAG);
33290 case ISD::ABS: return LowerABS(Op, Subtarget, DAG);
33291 case ISD::AVGCEILU: return LowerAVG(Op, Subtarget, DAG);
33292 case ISD::FSINCOS: return LowerFSINCOS(Op, Subtarget, DAG);
33293 case ISD::MLOAD: return LowerMLOAD(Op, Subtarget, DAG);
33294 case ISD::MSTORE: return LowerMSTORE(Op, Subtarget, DAG);
33295 case ISD::MGATHER: return LowerMGATHER(Op, Subtarget, DAG);
33296 case ISD::MSCATTER: return LowerMSCATTER(Op, Subtarget, DAG);
33297 case ISD::GC_TRANSITION_START:
33298 case ISD::GC_TRANSITION_END: return LowerGC_TRANSITION(Op, DAG);
33299 case ISD::ADDRSPACECAST: return LowerADDRSPACECAST(Op, DAG);
33300 case X86ISD::CVTPS2PH: return LowerCVTPS2PH(Op, DAG);
33301 }
33302}
33303
33304/// Replace a node with an illegal result type with a new node built out of
33305/// custom code.
33306void X86TargetLowering::ReplaceNodeResults(SDNode *N,
33307 SmallVectorImpl<SDValue>&Results,
33308 SelectionDAG &DAG) const {
33309 SDLoc dl(N);
33310 switch (N->getOpcode()) {
33311 default:
33312#ifndef NDEBUG
33313 dbgs() << "ReplaceNodeResults: ";
33314 N->dump(&DAG);
33315#endif
33316     llvm_unreachable("Do not know how to custom type legalize this operation!");
33317 case X86ISD::CVTPH2PS: {
33318 EVT VT = N->getValueType(0);
33319 SDValue Lo, Hi;
33320 std::tie(Lo, Hi) = DAG.SplitVectorOperand(N, 0);
33321 EVT LoVT, HiVT;
33322 std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
33323 Lo = DAG.getNode(X86ISD::CVTPH2PS, dl, LoVT, Lo);
33324 Hi = DAG.getNode(X86ISD::CVTPH2PS, dl, HiVT, Hi);
33325 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
33326 Results.push_back(Res);
33327 return;
33328 }
33329 case X86ISD::STRICT_CVTPH2PS: {
33330 EVT VT = N->getValueType(0);
33331 SDValue Lo, Hi;
33332 std::tie(Lo, Hi) = DAG.SplitVectorOperand(N, 1);
33333 EVT LoVT, HiVT;
33334 std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
33335 Lo = DAG.getNode(X86ISD::STRICT_CVTPH2PS, dl, {LoVT, MVT::Other},
33336 {N->getOperand(0), Lo});
33337 Hi = DAG.getNode(X86ISD::STRICT_CVTPH2PS, dl, {HiVT, MVT::Other},
33338 {N->getOperand(0), Hi});
33339 SDValue Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
33340 Lo.getValue(1), Hi.getValue(1));
33341 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
33342 Results.push_back(Res);
33343 Results.push_back(Chain);
33344 return;
33345 }
33346 case X86ISD::CVTPS2PH:
33347 Results.push_back(LowerCVTPS2PH(SDValue(N, 0), DAG));
33348 return;
33349 case ISD::CTPOP: {
33350     assert(N->getValueType(0) == MVT::i64 && "Unexpected VT!");
33351 // Use a v2i64 if possible.
33352 bool NoImplicitFloatOps =
33353 DAG.getMachineFunction().getFunction().hasFnAttribute(
33354 Attribute::NoImplicitFloat);
33355 if (isTypeLegal(MVT::v2i64) && !NoImplicitFloatOps) {
33356 SDValue Wide =
33357 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, N->getOperand(0));
33358 Wide = DAG.getNode(ISD::CTPOP, dl, MVT::v2i64, Wide);
33359       // Bit count should fit in 32 bits; extract it as that and then zero
33360 // extend to i64. Otherwise we end up extracting bits 63:32 separately.
33361 Wide = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, Wide);
33362 Wide = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, Wide,
33363 DAG.getIntPtrConstant(0, dl));
33364 Wide = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, Wide);
33365 Results.push_back(Wide);
33366 }
33367 return;
33368 }
33369 case ISD::MUL: {
33370 EVT VT = N->getValueType(0);
33371     assert(getTypeAction(*DAG.getContext(), VT) == TypeWidenVector &&
33372            VT.getVectorElementType() == MVT::i8 && "Unexpected VT!");
33373 // Pre-promote these to vXi16 to avoid op legalization thinking all 16
33374 // elements are needed.
33375 MVT MulVT = MVT::getVectorVT(MVT::i16, VT.getVectorNumElements());
33376 SDValue Op0 = DAG.getNode(ISD::ANY_EXTEND, dl, MulVT, N->getOperand(0));
33377 SDValue Op1 = DAG.getNode(ISD::ANY_EXTEND, dl, MulVT, N->getOperand(1));
33378 SDValue Res = DAG.getNode(ISD::MUL, dl, MulVT, Op0, Op1);
33379 Res = DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
33380 unsigned NumConcats = 16 / VT.getVectorNumElements();
33381 SmallVector<SDValue, 8> ConcatOps(NumConcats, DAG.getUNDEF(VT));
33382 ConcatOps[0] = Res;
33383 Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v16i8, ConcatOps);
33384 Results.push_back(Res);
33385 return;
33386 }
33387 case ISD::SMULO:
33388 case ISD::UMULO: {
33389 EVT VT = N->getValueType(0);
33390     assert(getTypeAction(*DAG.getContext(), VT) == TypeWidenVector &&
33391            VT == MVT::v2i32 && "Unexpected VT!");
33392 bool IsSigned = N->getOpcode() == ISD::SMULO;
33393 unsigned ExtOpc = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
33394 SDValue Op0 = DAG.getNode(ExtOpc, dl, MVT::v2i64, N->getOperand(0));
33395 SDValue Op1 = DAG.getNode(ExtOpc, dl, MVT::v2i64, N->getOperand(1));
33396 SDValue Res = DAG.getNode(ISD::MUL, dl, MVT::v2i64, Op0, Op1);
33397 // Extract the high 32 bits from each result using PSHUFD.
33398 // TODO: Could use SRL+TRUNCATE but that doesn't become a PSHUFD.
33399 SDValue Hi = DAG.getBitcast(MVT::v4i32, Res);
33400 Hi = DAG.getVectorShuffle(MVT::v4i32, dl, Hi, Hi, {1, 3, -1, -1});
33401 Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, Hi,
33402 DAG.getIntPtrConstant(0, dl));
33403
33404 // Truncate the low bits of the result. This will become PSHUFD.
33405 Res = DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
33406
33407 SDValue HiCmp;
33408 if (IsSigned) {
33409 // SMULO overflows if the high bits don't match the sign of the low.
33410 HiCmp = DAG.getNode(ISD::SRA, dl, VT, Res, DAG.getConstant(31, dl, VT));
33411 } else {
33412 // UMULO overflows if the high bits are non-zero.
33413 HiCmp = DAG.getConstant(0, dl, VT);
33414 }
33415 SDValue Ovf = DAG.getSetCC(dl, N->getValueType(1), Hi, HiCmp, ISD::SETNE);
33416
33417     // Widen the result by padding with undef.
33418 Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, Res,
33419 DAG.getUNDEF(VT));
33420 Results.push_back(Res);
33421 Results.push_back(Ovf);
33422 return;
33423 }
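// Editor's note: a scalar sketch (helper names are mine) of the overflow
// tests used above after the widened 64-bit multiplies: unsigned
// multiplication overflows when the high 32 bits of the product are non-zero,
// and signed multiplication overflows when the high 32 bits are not the sign
// extension of the low 32 bits.
#include <cstdint>
static bool umulo32Sketch(uint32_t A, uint32_t B, uint32_t &Lo) {
  uint64_t P = (uint64_t)A * B;
  Lo = (uint32_t)P;
  return (uint32_t)(P >> 32) != 0;          // HiCmp == 0 in the DAG code above
}
static bool smulo32Sketch(int32_t A, int32_t B, int32_t &Lo) {
  int64_t P = (int64_t)A * B;
  Lo = (int32_t)P;
  return (int32_t)(P >> 32) != (Lo >> 31);  // high half vs. sign of the low half
}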
33424 case X86ISD::VPMADDWD: {
33425 // Legalize types for X86ISD::VPMADDWD by widening.
33426     assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
33427
33428 EVT VT = N->getValueType(0);
33429 EVT InVT = N->getOperand(0).getValueType();
33430     assert(VT.getSizeInBits() < 128 && 128 % VT.getSizeInBits() == 0 &&
33431            "Expected a VT that divides into 128 bits.");
33432     assert(getTypeAction(*DAG.getContext(), VT) == TypeWidenVector &&
33433            "Unexpected type action!");
33434 unsigned NumConcat = 128 / InVT.getSizeInBits();
33435
33436 EVT InWideVT = EVT::getVectorVT(*DAG.getContext(),
33437 InVT.getVectorElementType(),
33438 NumConcat * InVT.getVectorNumElements());
33439 EVT WideVT = EVT::getVectorVT(*DAG.getContext(),
33440 VT.getVectorElementType(),
33441 NumConcat * VT.getVectorNumElements());
33442
33443 SmallVector<SDValue, 16> Ops(NumConcat, DAG.getUNDEF(InVT));
33444 Ops[0] = N->getOperand(0);
33445 SDValue InVec0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, InWideVT, Ops);
33446 Ops[0] = N->getOperand(1);
33447 SDValue InVec1 = DAG.getNode(ISD::CONCAT_VECTORS, dl, InWideVT, Ops);
33448
33449 SDValue Res = DAG.getNode(N->getOpcode(), dl, WideVT, InVec0, InVec1);
33450 Results.push_back(Res);
33451 return;
33452 }
33453 // We might have generated v2f32 FMIN/FMAX operations. Widen them to v4f32.
33454 case X86ISD::FMINC:
33455 case X86ISD::FMIN:
33456 case X86ISD::FMAXC:
33457 case X86ISD::FMAX: {
33458 EVT VT = N->getValueType(0);
33459 assert(VT == MVT::v2f32 && "Unexpected type (!= v2f32) on FMIN/FMAX.");
33460 SDValue UNDEF = DAG.getUNDEF(VT);
33461 SDValue LHS = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32,
33462 N->getOperand(0), UNDEF);
33463 SDValue RHS = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32,
33464 N->getOperand(1), UNDEF);
33465 Results.push_back(DAG.getNode(N->getOpcode(), dl, MVT::v4f32, LHS, RHS));
33466 return;
33467 }
33468 case ISD::SDIV:
33469 case ISD::UDIV:
33470 case ISD::SREM:
33471 case ISD::UREM: {
33472 EVT VT = N->getValueType(0);
33473 if (VT.isVector()) {
33474 assert(getTypeAction(*DAG.getContext(), VT) == TypeWidenVector &&
33475 "Unexpected type action!");
33476 // If this RHS is a constant splat vector we can widen this and let
33477 // division/remainder by constant optimize it.
33478 // TODO: Can we do something for non-splat?
33479 APInt SplatVal;
33480 if (ISD::isConstantSplatVector(N->getOperand(1).getNode(), SplatVal)) {
33481 unsigned NumConcats = 128 / VT.getSizeInBits();
33482 SmallVector<SDValue, 8> Ops0(NumConcats, DAG.getUNDEF(VT));
33483 Ops0[0] = N->getOperand(0);
33484 EVT ResVT = getTypeToTransformTo(*DAG.getContext(), VT);
33485 SDValue N0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, ResVT, Ops0);
33486 SDValue N1 = DAG.getConstant(SplatVal, dl, ResVT);
33487 SDValue Res = DAG.getNode(N->getOpcode(), dl, ResVT, N0, N1);
33488 Results.push_back(Res);
33489 }
33490 return;
33491 }
33492
33493 SDValue V = LowerWin64_i128OP(SDValue(N,0), DAG);
33494 Results.push_back(V);
33495 return;
33496 }
33497 case ISD::TRUNCATE: {
33498 MVT VT = N->getSimpleValueType(0);
33499 if (getTypeAction(*DAG.getContext(), VT) != TypeWidenVector)
33500 return;
33501
33502 // The generic legalizer will try to widen the input type to the same
33503 // number of elements as the widened result type. But this isn't always
33504 // the best thing so do some custom legalization to avoid some cases.
33505 MVT WidenVT = getTypeToTransformTo(*DAG.getContext(), VT).getSimpleVT();
33506 SDValue In = N->getOperand(0);
33507 EVT InVT = In.getValueType();
33508
33509 unsigned InBits = InVT.getSizeInBits();
33510 if (128 % InBits == 0) {
33511 // 128 bit and smaller inputs should avoid truncate altogether and
33512 // just use a build_vector that will become a shuffle.
33513 // TODO: Widen and use a shuffle directly?
33514 MVT InEltVT = InVT.getSimpleVT().getVectorElementType();
33515 EVT EltVT = VT.getVectorElementType();
33516 unsigned WidenNumElts = WidenVT.getVectorNumElements();
33517 SmallVector<SDValue, 16> Ops(WidenNumElts, DAG.getUNDEF(EltVT));
33518 // Use the original element count so we don't do more scalar opts than
33519 // necessary.
33520 unsigned MinElts = VT.getVectorNumElements();
33521 for (unsigned i=0; i < MinElts; ++i) {
33522 SDValue Val = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, InEltVT, In,
33523 DAG.getIntPtrConstant(i, dl));
33524 Ops[i] = DAG.getNode(ISD::TRUNCATE, dl, EltVT, Val);
33525 }
33526 Results.push_back(DAG.getBuildVector(WidenVT, dl, Ops));
33527 return;
33528 }
33529 // With AVX512 there are some cases that can use a target specific
33530 // truncate node to go from 256/512 to less than 128 with zeros in the
33531 // upper elements of the 128 bit result.
33532 if (Subtarget.hasAVX512() && isTypeLegal(InVT)) {
33533 // We can use VTRUNC directly for 256 bits with VLX or for any 512.
33534 if ((InBits == 256 && Subtarget.hasVLX()) || InBits == 512) {
33535 Results.push_back(DAG.getNode(X86ISD::VTRUNC, dl, WidenVT, In));
33536 return;
33537 }
33538 // There's one case we can widen to 512 bits and use VTRUNC.
33539 if (InVT == MVT::v4i64 && VT == MVT::v4i8 && isTypeLegal(MVT::v8i64)) {
33540 In = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i64, In,
33541 DAG.getUNDEF(MVT::v4i64));
33542 Results.push_back(DAG.getNode(X86ISD::VTRUNC, dl, WidenVT, In));
33543 return;
33544 }
33545 }
33546 if (Subtarget.hasVLX() && InVT == MVT::v8i64 && VT == MVT::v8i8 &&
33547 getTypeAction(*DAG.getContext(), InVT) == TypeSplitVector &&
33548 isTypeLegal(MVT::v4i64)) {
33549 // Input needs to be split and output needs to be widened. Let's use two
33550 // VTRUNCs, and shuffle their results together into the wider type.
33551 SDValue Lo, Hi;
33552 std::tie(Lo, Hi) = DAG.SplitVector(In, dl);
33553
33554 Lo = DAG.getNode(X86ISD::VTRUNC, dl, MVT::v16i8, Lo);
33555 Hi = DAG.getNode(X86ISD::VTRUNC, dl, MVT::v16i8, Hi);
33556 SDValue Res = DAG.getVectorShuffle(MVT::v16i8, dl, Lo, Hi,
33557 { 0, 1, 2, 3, 16, 17, 18, 19,
33558 -1, -1, -1, -1, -1, -1, -1, -1 });
33559 Results.push_back(Res);
33560 return;
33561 }
33562
33563 return;
33564 }
33565 case ISD::ANY_EXTEND:
33566 // Right now, only MVT::v8i8 has Custom action for an illegal type.
33567 // It's intended to custom handle the input type.
33568 assert(N->getValueType(0) == MVT::v8i8 &&
33569 "Do not know how to legalize this Node");
33570 return;
33571 case ISD::SIGN_EXTEND:
33572 case ISD::ZERO_EXTEND: {
33573 EVT VT = N->getValueType(0);
33574 SDValue In = N->getOperand(0);
33575 EVT InVT = In.getValueType();
33576 if (!Subtarget.hasSSE41() && VT == MVT::v4i64 &&
33577 (InVT == MVT::v4i16 || InVT == MVT::v4i8)){
33578 assert(getTypeAction(*DAG.getContext(), InVT) == TypeWidenVector &&
33579 "Unexpected type action!");
33580 assert(N->getOpcode() == ISD::SIGN_EXTEND && "Unexpected opcode");
33581 // Custom split this so we can extend i8/i16->i32 invec. This is better
33582 // since sign_extend_inreg i8/i16->i64 requires an extend to i32 using
33583 // sra. Then extending from i32 to i64 using pcmpgt. By custom splitting
33584 // we allow the sra from the extend to i32 to be shared by the split.
33585 In = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, In);
33586
33587 // Fill a vector with sign bits for each element.
33588 SDValue Zero = DAG.getConstant(0, dl, MVT::v4i32);
33589 SDValue SignBits = DAG.getSetCC(dl, MVT::v4i32, Zero, In, ISD::SETGT);
33590
33591 // Create an unpackl and unpackh to interleave the sign bits then bitcast
33592 // to v2i64.
33593 SDValue Lo = DAG.getVectorShuffle(MVT::v4i32, dl, In, SignBits,
33594 {0, 4, 1, 5});
33595 Lo = DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, Lo);
33596 SDValue Hi = DAG.getVectorShuffle(MVT::v4i32, dl, In, SignBits,
33597 {2, 6, 3, 7});
33598 Hi = DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, Hi);
33599
33600 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
33601 Results.push_back(Res);
33602 return;
33603 }
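// Illustrative aside (not part of the LLVM sources): the unpackl/unpackh of the value with its
// SETGT sign mask above simply pairs each 32-bit lane with a copy of its sign word to form the
// sign-extended 64-bit lanes. A scalar sketch of that idea, with a hypothetical helper name and
// assuming arithmetic right shift of signed values:

#include <cstdint>

static int64_t SExt32To64ViaSignWord(int32_t V) {
  // 0 when V >= 0, 0xFFFFFFFF when V < 0 -- the analogue of the SETGT(0, V) mask.
  uint32_t SignWord = (uint32_t)(V >> 31);
  // Interleave: low word is the value, high word is the sign mask.
  return (int64_t)(((uint64_t)SignWord << 32) | (uint32_t)V);
}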
33604
33605 if (VT == MVT::v16i32 || VT == MVT::v8i64) {
33606 if (!InVT.is128BitVector()) {
33607 // Not a 128 bit vector, but maybe type legalization will promote
33608 // it to 128 bits.
33609 if (getTypeAction(*DAG.getContext(), InVT) != TypePromoteInteger)
33610 return;
33611 InVT = getTypeToTransformTo(*DAG.getContext(), InVT);
33612 if (!InVT.is128BitVector())
33613 return;
33614
33615 // Promote the input to 128 bits. Type legalization will turn this into
33616 // zext_inreg/sext_inreg.
33617 In = DAG.getNode(N->getOpcode(), dl, InVT, In);
33618 }
33619
33620 // Perform custom splitting instead of the two stage extend we would get
33621 // by default.
33622 EVT LoVT, HiVT;
33623 std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(N->getValueType(0));
33624 assert(isTypeLegal(LoVT) && "Split VT not legal?");
33625
33626 SDValue Lo = getEXTEND_VECTOR_INREG(N->getOpcode(), dl, LoVT, In, DAG);
33627
33628 // We need to shift the input over by half the number of elements.
33629 unsigned NumElts = InVT.getVectorNumElements();
33630 unsigned HalfNumElts = NumElts / 2;
33631 SmallVector<int, 16> ShufMask(NumElts, SM_SentinelUndef);
33632 for (unsigned i = 0; i != HalfNumElts; ++i)
33633 ShufMask[i] = i + HalfNumElts;
33634
33635 SDValue Hi = DAG.getVectorShuffle(InVT, dl, In, In, ShufMask);
33636 Hi = getEXTEND_VECTOR_INREG(N->getOpcode(), dl, HiVT, Hi, DAG);
33637
33638 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
33639 Results.push_back(Res);
33640 }
33641 return;
33642 }
33643 case ISD::FP_TO_SINT:
33644 case ISD::STRICT_FP_TO_SINT:
33645 case ISD::FP_TO_UINT:
33646 case ISD::STRICT_FP_TO_UINT: {
33647 bool IsStrict = N->isStrictFPOpcode();
33648 bool IsSigned = N->getOpcode() == ISD::FP_TO_SINT ||
33649 N->getOpcode() == ISD::STRICT_FP_TO_SINT;
33650 EVT VT = N->getValueType(0);
33651 SDValue Src = N->getOperand(IsStrict ? 1 : 0);
33652 SDValue Chain = IsStrict ? N->getOperand(0) : SDValue();
33653 EVT SrcVT = Src.getValueType();
33654
33655 SDValue Res;
33656 if (isSoftFP16(SrcVT)) {
33657 EVT NVT = VT.isVector() ? VT.changeVectorElementType(MVT::f32) : MVT::f32;
33658 if (IsStrict) {
33659 Res =
33660 DAG.getNode(N->getOpcode(), dl, {VT, MVT::Other},
33661 {Chain, DAG.getNode(ISD::STRICT_FP_EXTEND, dl,
33662 {NVT, MVT::Other}, {Chain, Src})});
33663 Chain = Res.getValue(1);
33664 } else {
33665 Res = DAG.getNode(N->getOpcode(), dl, VT,
33666 DAG.getNode(ISD::FP_EXTEND, dl, NVT, Src));
33667 }
33668 Results.push_back(Res);
33669 if (IsStrict)
33670 Results.push_back(Chain);
33671
33672 return;
33673 }
33674
33675 if (VT.isVector() && Subtarget.hasFP16() &&
33676 SrcVT.getVectorElementType() == MVT::f16) {
33677 EVT EleVT = VT.getVectorElementType();
33678 EVT ResVT = EleVT == MVT::i32 ? MVT::v4i32 : MVT::v8i16;
33679
33680 if (SrcVT != MVT::v8f16) {
33681 SDValue Tmp =
33682 IsStrict ? DAG.getConstantFP(0.0, dl, SrcVT) : DAG.getUNDEF(SrcVT);
33683 SmallVector<SDValue, 4> Ops(SrcVT == MVT::v2f16 ? 4 : 2, Tmp);
33684 Ops[0] = Src;
33685 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8f16, Ops);
33686 }
33687
33688 if (IsStrict) {
33689 unsigned Opc =
33690 IsSigned ? X86ISD::STRICT_CVTTP2SI : X86ISD::STRICT_CVTTP2UI;
33691 Res =
33692 DAG.getNode(Opc, dl, {ResVT, MVT::Other}, {N->getOperand(0), Src});
33693 Chain = Res.getValue(1);
33694 } else {
33695 unsigned Opc = IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI;
33696 Res = DAG.getNode(Opc, dl, ResVT, Src);
33697 }
33698
33699 // TODO: Need to add exception check code for strict FP.
33700 if (EleVT.getSizeInBits() < 16) {
33701 MVT TmpVT = MVT::getVectorVT(EleVT.getSimpleVT(), 8);
33702 Res = DAG.getNode(ISD::TRUNCATE, dl, TmpVT, Res);
33703
33704 // Now widen to 128 bits.
33705 unsigned NumConcats = 128 / TmpVT.getSizeInBits();
33706 MVT ConcatVT = MVT::getVectorVT(EleVT.getSimpleVT(), 8 * NumConcats);
33707 SmallVector<SDValue, 8> ConcatOps(NumConcats, DAG.getUNDEF(TmpVT));
33708 ConcatOps[0] = Res;
33709 Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, ConcatVT, ConcatOps);
33710 }
33711
33712 Results.push_back(Res);
33713 if (IsStrict)
33714 Results.push_back(Chain);
33715
33716 return;
33717 }
33718
33719 if (VT.isVector() && VT.getScalarSizeInBits() < 32) {
33720 assert(getTypeAction(*DAG.getContext(), VT) == TypeWidenVector &&
33721 "Unexpected type action!");
33722
33723 // Try to create a 128 bit vector, but don't exceed a 32 bit element.
33724 unsigned NewEltWidth = std::min(128 / VT.getVectorNumElements(), 32U);
33725 MVT PromoteVT = MVT::getVectorVT(MVT::getIntegerVT(NewEltWidth),
33726 VT.getVectorNumElements());
33727 SDValue Res;
33728 SDValue Chain;
33729 if (IsStrict) {
33730 Res = DAG.getNode(ISD::STRICT_FP_TO_SINT, dl, {PromoteVT, MVT::Other},
33731 {N->getOperand(0), Src});
33732 Chain = Res.getValue(1);
33733 } else
33734 Res = DAG.getNode(ISD::FP_TO_SINT, dl, PromoteVT, Src);
33735
33736 // Preserve what we know about the size of the original result. If the
33737 // result is v2i32, we have to manually widen the assert.
33738 if (PromoteVT == MVT::v2i32)
33739 Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, Res,
33740 DAG.getUNDEF(MVT::v2i32));
33741
33742 Res = DAG.getNode(!IsSigned ? ISD::AssertZext : ISD::AssertSext, dl,
33743 Res.getValueType(), Res,
33744 DAG.getValueType(VT.getVectorElementType()));
33745
33746 if (PromoteVT == MVT::v2i32)
33747 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i32, Res,
33748 DAG.getIntPtrConstant(0, dl));
33749
33750 // Truncate back to the original width.
33751 Res = DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
33752
33753 // Now widen to 128 bits.
33754 unsigned NumConcats = 128 / VT.getSizeInBits();
33755 MVT ConcatVT = MVT::getVectorVT(VT.getSimpleVT().getVectorElementType(),
33756 VT.getVectorNumElements() * NumConcats);
33757 SmallVector<SDValue, 8> ConcatOps(NumConcats, DAG.getUNDEF(VT));
33758 ConcatOps[0] = Res;
33759 Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, ConcatVT, ConcatOps);
33760 Results.push_back(Res);
33761 if (IsStrict)
33762 Results.push_back(Chain);
33763 return;
33764 }
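// Illustrative aside (not part of the LLVM sources): the promote-convert-truncate shape above
// mirrors what a scalar lowering of fp->i8/i16 does when the hardware only converts to 32-bit
// integers. A minimal sketch with a hypothetical helper name:

#include <cstdint>

static int8_t FpToI8ViaI32(float F) {
  int32_t Wide = (int32_t)F; // hardware-friendly fp -> i32 convert (the PromoteVT step above)
  return (int8_t)Wide;       // truncate back down to the requested narrow result type
}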
33765
33766
33767 if (VT == MVT::v2i32) {
33768 assert((!IsStrict || IsSigned || Subtarget.hasAVX512()) &&
33769 "Strict unsigned conversion requires AVX512");
33770 assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
33771 assert(getTypeAction(*DAG.getContext(), VT) == TypeWidenVector &&
33772 "Unexpected type action!");
33773 if (Src.getValueType() == MVT::v2f64) {
33774 if (!IsSigned && !Subtarget.hasAVX512()) {
33775 SDValue Res =
33776 expandFP_TO_UINT_SSE(MVT::v4i32, Src, dl, DAG, Subtarget);
33777 Results.push_back(Res);
33778 return;
33779 }
33780
33781 unsigned Opc;
33782 if (IsStrict)
33783 Opc = IsSigned ? X86ISD::STRICT_CVTTP2SI : X86ISD::STRICT_CVTTP2UI;
33784 else
33785 Opc = IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI;
33786
33787 // If we have VLX we can emit a target specific FP_TO_UINT node.
33788 if (!IsSigned && !Subtarget.hasVLX()) {
33789 // Otherwise we can defer to the generic legalizer which will widen
33790 // the input as well. This will be further widened during op
33791 // legalization to v8i32<-v8f64.
33792 // For strict nodes we'll need to widen ourselves.
33793 // FIXME: Fix the type legalizer to safely widen strict nodes?
33794 if (!IsStrict)
33795 return;
33796 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f64, Src,
33797 DAG.getConstantFP(0.0, dl, MVT::v2f64));
33798 Opc = N->getOpcode();
33799 }
33800 SDValue Res;
33801 SDValue Chain;
33802 if (IsStrict) {
33803 Res = DAG.getNode(Opc, dl, {MVT::v4i32, MVT::Other},
33804 {N->getOperand(0), Src});
33805 Chain = Res.getValue(1);
33806 } else {
33807 Res = DAG.getNode(Opc, dl, MVT::v4i32, Src);
33808 }
33809 Results.push_back(Res);
33810 if (IsStrict)
33811 Results.push_back(Chain);
33812 return;
33813 }
33814
33815 // Custom widen strict v2f32->v2i32 by padding with zeros.
33816 // FIXME: Should generic type legalizer do this?
33817 if (Src.getValueType() == MVT::v2f32 && IsStrict) {
33818 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, Src,
33819 DAG.getConstantFP(0.0, dl, MVT::v2f32));
33820 SDValue Res = DAG.getNode(N->getOpcode(), dl, {MVT::v4i32, MVT::Other},
33821 {N->getOperand(0), Src});
33822 Results.push_back(Res);
33823 Results.push_back(Res.getValue(1));
33824 return;
33825 }
33826
33827 // The FP_TO_INTHelper below only handles f32/f64/f80 scalar inputs,
33828 // so early out here.
33829 return;
33830 }
33831
33832 assert(!VT.isVector() && "Vectors should have been handled above!");
33833
33834 if ((Subtarget.hasDQI() && VT == MVT::i64 &&
33835 (SrcVT == MVT::f32 || SrcVT == MVT::f64)) ||
33836 (Subtarget.hasFP16() && SrcVT == MVT::f16)) {
33837 assert(!Subtarget.is64Bit() && "i64 should be legal");
33838 unsigned NumElts = Subtarget.hasVLX() ? 2 : 8;
33839 // If we use a 128-bit result we might need to use a target specific node.
33840 unsigned SrcElts =
33841 std::max(NumElts, 128U / (unsigned)SrcVT.getSizeInBits());
33842 MVT VecVT = MVT::getVectorVT(MVT::i64, NumElts);
33843 MVT VecInVT = MVT::getVectorVT(SrcVT.getSimpleVT(), SrcElts);
33844 unsigned Opc = N->getOpcode();
33845 if (NumElts != SrcElts) {
33846 if (IsStrict)
33847 Opc = IsSigned ? X86ISD::STRICT_CVTTP2SI : X86ISD::STRICT_CVTTP2UI;
33848 else
33849 Opc = IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI;
33850 }
33851
33852 SDValue ZeroIdx = DAG.getIntPtrConstant(0, dl);
33853 SDValue Res = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VecInVT,
33854 DAG.getConstantFP(0.0, dl, VecInVT), Src,
33855 ZeroIdx);
33856 SDValue Chain;
33857 if (IsStrict) {
33858 SDVTList Tys = DAG.getVTList(VecVT, MVT::Other);
33859 Res = DAG.getNode(Opc, SDLoc(N), Tys, N->getOperand(0), Res);
33860 Chain = Res.getValue(1);
33861 } else
33862 Res = DAG.getNode(Opc, SDLoc(N), VecVT, Res);
33863 Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Res, ZeroIdx);
33864 Results.push_back(Res);
33865 if (IsStrict)
33866 Results.push_back(Chain);
33867 return;
33868 }
33869
33870 if (VT == MVT::i128 && Subtarget.isTargetWin64()) {
33871 SDValue Chain;
33872 SDValue V = LowerWin64_FP_TO_INT128(SDValue(N, 0), DAG, Chain);
33873 Results.push_back(V);
33874 if (IsStrict)
33875 Results.push_back(Chain);
33876 return;
33877 }
33878
33879 if (SDValue V = FP_TO_INTHelper(SDValue(N, 0), DAG, IsSigned, Chain)) {
33880 Results.push_back(V);
33881 if (IsStrict)
33882 Results.push_back(Chain);
33883 }
33884 return;
33885 }
33886 case ISD::LRINT:
33887 case ISD::LLRINT: {
33888 if (SDValue V = LRINT_LLRINTHelper(N, DAG))
33889 Results.push_back(V);
33890 return;
33891 }
33892
33893 case ISD::SINT_TO_FP:
33894 case ISD::STRICT_SINT_TO_FP:
33895 case ISD::UINT_TO_FP:
33896 case ISD::STRICT_UINT_TO_FP: {
33897 bool IsStrict = N->isStrictFPOpcode();
33898 bool IsSigned = N->getOpcode() == ISD::SINT_TO_FP ||
33899 N->getOpcode() == ISD::STRICT_SINT_TO_FP;
33900 EVT VT = N->getValueType(0);
33901 SDValue Src = N->getOperand(IsStrict ? 1 : 0);
33902 if (VT.getVectorElementType() == MVT::f16 && Subtarget.hasFP16() &&
33903 Subtarget.hasVLX()) {
33904 if (Src.getValueType().getVectorElementType() == MVT::i16)
33905 return;
33906
33907 if (VT == MVT::v2f16 && Src.getValueType() == MVT::v2i32)
33908 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, Src,
33909 IsStrict ? DAG.getConstant(0, dl, MVT::v2i32)
33910 : DAG.getUNDEF(MVT::v2i32));
33911 if (IsStrict) {
33912 unsigned Opc =
33913 IsSigned ? X86ISD::STRICT_CVTSI2P : X86ISD::STRICT_CVTUI2P;
33914 SDValue Res = DAG.getNode(Opc, dl, {MVT::v8f16, MVT::Other},
33915 {N->getOperand(0), Src});
33916 Results.push_back(Res);
33917 Results.push_back(Res.getValue(1));
33918 } else {
33919 unsigned Opc = IsSigned ? X86ISD::CVTSI2P : X86ISD::CVTUI2P;
33920 Results.push_back(DAG.getNode(Opc, dl, MVT::v8f16, Src));
33921 }
33922 return;
33923 }
33924 if (VT != MVT::v2f32)
33925 return;
33926 EVT SrcVT = Src.getValueType();
33927 if (Subtarget.hasDQI() && Subtarget.hasVLX() && SrcVT == MVT::v2i64) {
33928 if (IsStrict) {
33929 unsigned Opc = IsSigned ? X86ISD::STRICT_CVTSI2P
33930 : X86ISD::STRICT_CVTUI2P;
33931 SDValue Res = DAG.getNode(Opc, dl, {MVT::v4f32, MVT::Other},
33932 {N->getOperand(0), Src});
33933 Results.push_back(Res);
33934 Results.push_back(Res.getValue(1));
33935 } else {
33936 unsigned Opc = IsSigned ? X86ISD::CVTSI2P : X86ISD::CVTUI2P;
33937 Results.push_back(DAG.getNode(Opc, dl, MVT::v4f32, Src));
33938 }
33939 return;
33940 }
33941 if (SrcVT == MVT::v2i64 && !IsSigned && Subtarget.is64Bit() &&
33942 Subtarget.hasSSE41() && !Subtarget.hasAVX512()) {
33943 SDValue Zero = DAG.getConstant(0, dl, SrcVT);
33944 SDValue One = DAG.getConstant(1, dl, SrcVT);
33945 SDValue Sign = DAG.getNode(ISD::OR, dl, SrcVT,
33946 DAG.getNode(ISD::SRL, dl, SrcVT, Src, One),
33947 DAG.getNode(ISD::AND, dl, SrcVT, Src, One));
33948 SDValue IsNeg = DAG.getSetCC(dl, MVT::v2i64, Src, Zero, ISD::SETLT);
33949 SDValue SignSrc = DAG.getSelect(dl, SrcVT, IsNeg, Sign, Src);
33950 SmallVector<SDValue, 4> SignCvts(4, DAG.getConstantFP(0.0, dl, MVT::f32));
33951 for (int i = 0; i != 2; ++i) {
33952 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64,
33953 SignSrc, DAG.getIntPtrConstant(i, dl));
33954 if (IsStrict)
33955 SignCvts[i] =
33956 DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {MVT::f32, MVT::Other},
33957 {N->getOperand(0), Elt});
33958 else
33959 SignCvts[i] = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::f32, Elt);
33960 };
33961 SDValue SignCvt = DAG.getBuildVector(MVT::v4f32, dl, SignCvts);
33962 SDValue Slow, Chain;
33963 if (IsStrict) {
33964 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
33965 SignCvts[0].getValue(1), SignCvts[1].getValue(1));
33966 Slow = DAG.getNode(ISD::STRICT_FADD, dl, {MVT::v4f32, MVT::Other},
33967 {Chain, SignCvt, SignCvt});
33968 Chain = Slow.getValue(1);
33969 } else {
33970 Slow = DAG.getNode(ISD::FADD, dl, MVT::v4f32, SignCvt, SignCvt);
33971 }
33972 IsNeg = DAG.getBitcast(MVT::v4i32, IsNeg);
33973 IsNeg =
33974 DAG.getVectorShuffle(MVT::v4i32, dl, IsNeg, IsNeg, {1, 3, -1, -1});
33975 SDValue Cvt = DAG.getSelect(dl, MVT::v4f32, IsNeg, Slow, SignCvt);
33976 Results.push_back(Cvt);
33977 if (IsStrict)
33978 Results.push_back(Chain);
33979 return;
33980 }
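// Illustrative aside (not part of the LLVM sources): the Sign/IsNeg/Slow dance above is the
// classic halve-and-double trick for converting an unsigned 64-bit value to float when only
// signed conversions are available. A scalar sketch of the per-element logic, with a
// hypothetical helper name, assuming two's-complement arithmetic:

#include <cstdint>

static float U64ToF32ViaSignedConvert(uint64_t V) {
  if ((int64_t)V >= 0)
    return (float)(int64_t)V;           // fits in a signed convert directly
  uint64_t Halved = (V >> 1) | (V & 1); // shift right, keep the low bit "sticky" so rounding stays correct
  float F = (float)(int64_t)Halved;     // now non-negative, so the signed convert is safe
  return F + F;                         // doubling restores the magnitude (the FADD of SignCvt above)
}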
33981
33982 if (SrcVT != MVT::v2i32)
33983 return;
33984
33985 if (IsSigned || Subtarget.hasAVX512()) {
33986 if (!IsStrict)
33987 return;
33988
33989 // Custom widen strict v2i32->v2f32 to avoid scalarization.
33990 // FIXME: Should generic type legalizer do this?
33991 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, Src,
33992 DAG.getConstant(0, dl, MVT::v2i32));
33993 SDValue Res = DAG.getNode(N->getOpcode(), dl, {MVT::v4f32, MVT::Other},
33994 {N->getOperand(0), Src});
33995 Results.push_back(Res);
33996 Results.push_back(Res.getValue(1));
33997 return;
33998 }
33999
34000 assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
34001 SDValue ZExtIn = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v2i64, Src);
34002 SDValue VBias =
34003 DAG.getConstantFP(BitsToDouble(0x4330000000000000ULL), dl, MVT::v2f64);
34004 SDValue Or = DAG.getNode(ISD::OR, dl, MVT::v2i64, ZExtIn,
34005 DAG.getBitcast(MVT::v2i64, VBias));
34006 Or = DAG.getBitcast(MVT::v2f64, Or);
34007 if (IsStrict) {
34008 SDValue Sub = DAG.getNode(ISD::STRICT_FSUB, dl, {MVT::v2f64, MVT::Other},
34009 {N->getOperand(0), Or, VBias});
34010 SDValue Res = DAG.getNode(X86ISD::STRICT_VFPROUND, dl,
34011 {MVT::v4f32, MVT::Other},
34012 {Sub.getValue(1), Sub});
34013 Results.push_back(Res);
34014 Results.push_back(Res.getValue(1));
34015 } else {
34016 // TODO: Are there any fast-math-flags to propagate here?
34017 SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, Or, VBias);
34018 Results.push_back(DAG.getNode(X86ISD::VFPROUND, dl, MVT::v4f32, Sub));
34019 }
34020 return;
34021 }
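// Illustrative aside (not part of the LLVM sources): the VBias constant just above is the bit
// pattern of 2^52; OR-ing a 32-bit unsigned value into the low mantissa bits and subtracting
// 2^52 yields that value exactly as a double. A standalone scalar sketch of the same trick,
// with a hypothetical helper name:

#include <cstdint>
#include <cstring>

static double Uint32ToDoubleViaBias(uint32_t X) {
  uint64_t Bits = 0x4330000000000000ULL | X; // 2^52 with X placed in the low 32 mantissa bits
  double D;
  std::memcpy(&D, &Bits, sizeof(D));         // reinterpret the bit pattern as a double
  return D - 4503599627370496.0;             // subtract 2^52 to recover (double)X exactly
}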
34022 case ISD::STRICT_FP_ROUND:
34023 case ISD::FP_ROUND: {
34024 bool IsStrict = N->isStrictFPOpcode();
34025 SDValue Chain = IsStrict ? N->getOperand(0) : SDValue();
34026 SDValue Src = N->getOperand(IsStrict ? 1 : 0);
34027 SDValue Rnd = N->getOperand(IsStrict ? 2 : 1);
34028 EVT SrcVT = Src.getValueType();
34029 EVT VT = N->getValueType(0);
34030 SDValue V;
34031 if (VT == MVT::v2f16 && Src.getValueType() == MVT::v2f32) {
34032 SDValue Ext = IsStrict ? DAG.getConstantFP(0.0, dl, MVT::v2f32)
34033 : DAG.getUNDEF(MVT::v2f32);
34034 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, Src, Ext);
34035 }
34036 if (!Subtarget.hasFP16() && VT.getVectorElementType() == MVT::f16) {
34037 assert(Subtarget.hasF16C() && "Cannot widen f16 without F16C");
34038 if (SrcVT.getVectorElementType() != MVT::f32)
34039 return;
34040
34041 if (IsStrict)
34042 V = DAG.getNode(X86ISD::STRICT_CVTPS2PH, dl, {MVT::v8i16, MVT::Other},
34043 {Chain, Src, Rnd});
34044 else
34045 V = DAG.getNode(X86ISD::CVTPS2PH, dl, MVT::v8i16, Src, Rnd);
34046
34047 Results.push_back(DAG.getBitcast(MVT::v8f16, V));
34048 if (IsStrict)
34049 Results.push_back(V.getValue(1));
34050 return;
34051 }
34052 if (!isTypeLegal(Src.getValueType()))
34053 return;
34054 EVT NewVT = VT.getVectorElementType() == MVT::f16 ? MVT::v8f16 : MVT::v4f32;
34055 if (IsStrict)
34056 V = DAG.getNode(X86ISD::STRICT_VFPROUND, dl, {NewVT, MVT::Other},
34057 {Chain, Src});
34058 else
34059 V = DAG.getNode(X86ISD::VFPROUND, dl, NewVT, Src);
34060 Results.push_back(V);
34061 if (IsStrict)
34062 Results.push_back(V.getValue(1));
34063 return;
34064 }
34065 case ISD::FP_EXTEND:
34066 case ISD::STRICT_FP_EXTEND: {
34067 // Right now, only MVT::v2f32 has OperationAction for FP_EXTEND.
34068 // No other ValueType for FP_EXTEND should reach this point.
34069 assert(N->getValueType(0) == MVT::v2f32 &&
34070 "Do not know how to legalize this Node");
34071 if (!Subtarget.hasFP16() || !Subtarget.hasVLX())
34072 return;
34073 bool IsStrict = N->isStrictFPOpcode();
34074 SDValue Src = N->getOperand(IsStrict ? 1 : 0);
34075 SDValue Ext = IsStrict ? DAG.getConstantFP(0.0, dl, MVT::v2f16)
34076 : DAG.getUNDEF(MVT::v2f16);
34077 SDValue V = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f16, Src, Ext);
34078 if (IsStrict)
34079 V = DAG.getNode(ISD::STRICT_FP_EXTEND, dl, {MVT::v4f32, MVT::Other},
34080 {N->getOperand(0), V});
34081 else
34082 V = DAG.getNode(ISD::FP_EXTEND, dl, MVT::v4f32, V);
34083 Results.push_back(V);
34084 if (IsStrict)
34085 Results.push_back(V.getValue(1));
34086 return;
34087 }
34088 case ISD::INTRINSIC_W_CHAIN: {
34089 unsigned IntNo = N->getConstantOperandVal(1);
34090 switch (IntNo) {
34091 default : llvm_unreachable("Do not know how to custom type "
34092 "legalize this intrinsic operation!");
34093 case Intrinsic::x86_rdtsc:
34094 return getReadTimeStampCounter(N, dl, X86::RDTSC, DAG, Subtarget,
34095 Results);
34096 case Intrinsic::x86_rdtscp:
34097 return getReadTimeStampCounter(N, dl, X86::RDTSCP, DAG, Subtarget,
34098 Results);
34099 case Intrinsic::x86_rdpmc:
34100 expandIntrinsicWChainHelper(N, dl, DAG, X86::RDPMC, X86::ECX, Subtarget,
34101 Results);
34102 return;
34103 case Intrinsic::x86_rdpru:
34104 expandIntrinsicWChainHelper(N, dl, DAG, X86::RDPRU, X86::ECX, Subtarget,
34105 Results);
34106 return;
34107 case Intrinsic::x86_xgetbv:
34108 expandIntrinsicWChainHelper(N, dl, DAG, X86::XGETBV, X86::ECX, Subtarget,
34109 Results);
34110 return;
34111 }
34112 }
34113 case ISD::READCYCLECOUNTER: {
34114 return getReadTimeStampCounter(N, dl, X86::RDTSC, DAG, Subtarget, Results);
34115 }
34116 case ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS: {
34117 EVT T = N->getValueType(0);
34118 assert((T == MVT::i64 || T == MVT::i128) && "can only expand cmpxchg pair");
34119 bool Regs64bit = T == MVT::i128;
34120 assert((!Regs64bit || Subtarget.canUseCMPXCHG16B()) &&
34121 "64-bit ATOMIC_CMP_SWAP_WITH_SUCCESS requires CMPXCHG16B");
34122 MVT HalfT = Regs64bit ? MVT::i64 : MVT::i32;
34123 SDValue cpInL, cpInH;
34124 cpInL = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(2),
34125 DAG.getConstant(0, dl, HalfT));
34126 cpInH = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(2),
34127 DAG.getConstant(1, dl, HalfT));
34128 cpInL = DAG.getCopyToReg(N->getOperand(0), dl,
34129 Regs64bit ? X86::RAX : X86::EAX,
34130 cpInL, SDValue());
34131 cpInH = DAG.getCopyToReg(cpInL.getValue(0), dl,
34132 Regs64bit ? X86::RDX : X86::EDX,
34133 cpInH, cpInL.getValue(1));
34134 SDValue swapInL, swapInH;
34135 swapInL = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(3),
34136 DAG.getConstant(0, dl, HalfT));
34137 swapInH = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(3),
34138 DAG.getConstant(1, dl, HalfT));
34139 swapInH =
34140 DAG.getCopyToReg(cpInH.getValue(0), dl, Regs64bit ? X86::RCX : X86::ECX,
34141 swapInH, cpInH.getValue(1));
34142
34143 // In 64-bit mode we might need the base pointer in RBX, but we can't know
34144 // until later. So we keep the RBX input in a vreg and use a custom
34145 // inserter.
34146 // Since RBX will be a reserved register the register allocator will not
34147 // make sure its value will be properly saved and restored around this
34148 // live-range.
34149 SDValue Result;
34150 SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
34151 MachineMemOperand *MMO = cast<AtomicSDNode>(N)->getMemOperand();
34152 if (Regs64bit) {
34153 SDValue Ops[] = {swapInH.getValue(0), N->getOperand(1), swapInL,
34154 swapInH.getValue(1)};
34155 Result =
34156 DAG.getMemIntrinsicNode(X86ISD::LCMPXCHG16_DAG, dl, Tys, Ops, T, MMO);
34157 } else {
34158 swapInL = DAG.getCopyToReg(swapInH.getValue(0), dl, X86::EBX, swapInL,
34159 swapInH.getValue(1));
34160 SDValue Ops[] = {swapInL.getValue(0), N->getOperand(1),
34161 swapInL.getValue(1)};
34162 Result =
34163 DAG.getMemIntrinsicNode(X86ISD::LCMPXCHG8_DAG, dl, Tys, Ops, T, MMO);
34164 }
34165
34166 SDValue cpOutL = DAG.getCopyFromReg(Result.getValue(0), dl,
34167 Regs64bit ? X86::RAX : X86::EAX,
34168 HalfT, Result.getValue(1));
34169 SDValue cpOutH = DAG.getCopyFromReg(cpOutL.getValue(1), dl,
34170 Regs64bit ? X86::RDX : X86::EDX,
34171 HalfT, cpOutL.getValue(2));
34172 SDValue OpsF[] = { cpOutL.getValue(0), cpOutH.getValue(0)};
34173
34174 SDValue EFLAGS = DAG.getCopyFromReg(cpOutH.getValue(1), dl, X86::EFLAGS,
34175 MVT::i32, cpOutH.getValue(2));
34176 SDValue Success = getSETCC(X86::COND_E, EFLAGS, dl, DAG);
34177 Success = DAG.getZExtOrTrunc(Success, dl, N->getValueType(1));
34178
34179 Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, T, OpsF));
34180 Results.push_back(Success);
34181 Results.push_back(EFLAGS.getValue(1));
34182 return;
34183 }
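// Illustrative aside (not part of the LLVM sources): the EXTRACT_ELEMENT / BUILD_PAIR plumbing
// above only splits the wide compare-exchange operands into register-sized halves (EDX:EAX /
// ECX:EBX for CMPXCHG8B, RDX:RAX / RCX:RBX for CMPXCHG16B) and reassembles the result. A scalar
// sketch of that split and rebuild, with hypothetical helper names:

#include <cstdint>

static void SplitU64(uint64_t V, uint32_t &Lo, uint32_t &Hi) {
  Lo = (uint32_t)V;         // EXTRACT_ELEMENT ..., 0 -> the low-half register
  Hi = (uint32_t)(V >> 32); // EXTRACT_ELEMENT ..., 1 -> the high-half register
}

static uint64_t BuildPairU64(uint32_t Lo, uint32_t Hi) {
  return ((uint64_t)Hi << 32) | Lo; // BUILD_PAIR of the two copied-out halves
}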
34184 case ISD::ATOMIC_LOAD: {
34185 assert(N->getValueType(0) == MVT::i64 && "Unexpected VT!");
34186 bool NoImplicitFloatOps =
34187 DAG.getMachineFunction().getFunction().hasFnAttribute(
34188 Attribute::NoImplicitFloat);
34189 if (!Subtarget.useSoftFloat() && !NoImplicitFloatOps) {
34190 auto *Node = cast<AtomicSDNode>(N);
34191 if (Subtarget.hasSSE1()) {
34192 // Use a VZEXT_LOAD which will be selected as MOVQ or XORPS+MOVLPS.
34193 // Then extract the lower 64-bits.
34194 MVT LdVT = Subtarget.hasSSE2() ? MVT::v2i64 : MVT::v4f32;
34195 SDVTList Tys = DAG.getVTList(LdVT, MVT::Other);
34196 SDValue Ops[] = { Node->getChain(), Node->getBasePtr() };
34197 SDValue Ld = DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, dl, Tys, Ops,
34198 MVT::i64, Node->getMemOperand());
34199 if (Subtarget.hasSSE2()) {
34200 SDValue Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, Ld,
34201 DAG.getIntPtrConstant(0, dl));
34202 Results.push_back(Res);
34203 Results.push_back(Ld.getValue(1));
34204 return;
34205 }
34206 // We use an alternative sequence for SSE1 that extracts as v2f32 and
34207 // then casts to i64. This avoids a 128-bit stack temporary being
34208 // created by type legalization if we were to cast v4f32->v2i64.
34209 SDValue Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2f32, Ld,
34210 DAG.getIntPtrConstant(0, dl));
34211 Res = DAG.getBitcast(MVT::i64, Res);
34212 Results.push_back(Res);
34213 Results.push_back(Ld.getValue(1));
34214 return;
34215 }
34216 if (Subtarget.hasX87()) {
34217 // First load this into an 80-bit X87 register. This will put the whole
34218 // integer into the significand.
34219 SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other);
34220 SDValue Ops[] = { Node->getChain(), Node->getBasePtr() };
34221 SDValue Result = DAG.getMemIntrinsicNode(X86ISD::FILD,
34222 dl, Tys, Ops, MVT::i64,
34223 Node->getMemOperand());
34224 SDValue Chain = Result.getValue(1);
34225
34226 // Now store the X87 register to a stack temporary and convert to i64.
34227 // This store is not atomic and doesn't need to be.
34228 // FIXME: We don't need a stack temporary if the result of the load
34229 // is already being stored. We could just directly store there.
34230 SDValue StackPtr = DAG.CreateStackTemporary(MVT::i64);
34231 int SPFI = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
34232 MachinePointerInfo MPI =
34233 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SPFI);
34234 SDValue StoreOps[] = { Chain, Result, StackPtr };
34235 Chain = DAG.getMemIntrinsicNode(
34236 X86ISD::FIST, dl, DAG.getVTList(MVT::Other), StoreOps, MVT::i64,
34237 MPI, std::nullopt /*Align*/, MachineMemOperand::MOStore);
34238
34239 // Finally load the value back from the stack temporary and return it.
34240 // This load is not atomic and doesn't need to be.
34241 // This load will be further type legalized.
34242 Result = DAG.getLoad(MVT::i64, dl, Chain, StackPtr, MPI);
34243 Results.push_back(Result);
34244 Results.push_back(Result.getValue(1));
34245 return;
34246 }
34247 }
34248 // TODO: Use MOVLPS when SSE1 is available?
34249 // Delegate to generic TypeLegalization. Situations we can really handle
34250 // should have already been dealt with by AtomicExpandPass.cpp.
34251 break;
34252 }
34253 case ISD::ATOMIC_SWAP:
34254 case ISD::ATOMIC_LOAD_ADD:
34255 case ISD::ATOMIC_LOAD_SUB:
34256 case ISD::ATOMIC_LOAD_AND:
34257 case ISD::ATOMIC_LOAD_OR:
34258 case ISD::ATOMIC_LOAD_XOR:
34259 case ISD::ATOMIC_LOAD_NAND:
34260 case ISD::ATOMIC_LOAD_MIN:
34261 case ISD::ATOMIC_LOAD_MAX:
34262 case ISD::ATOMIC_LOAD_UMIN:
34263 case ISD::ATOMIC_LOAD_UMAX:
34264 // Delegate to generic TypeLegalization. Situations we can really handle
34265 // should have already been dealt with by AtomicExpandPass.cpp.
34266 break;
34267
34268 case ISD::BITCAST: {
34269 assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
34270 EVT DstVT = N->getValueType(0);
34271 EVT SrcVT = N->getOperand(0).getValueType();
34272
34273 // If this is a bitcast from a v64i1 k-register to a i64 on a 32-bit target
34274 // we can split using the k-register rather than memory.
34275 if (SrcVT == MVT::v64i1 && DstVT == MVT::i64 && Subtarget.hasBWI()) {
34276 assert(!Subtarget.is64Bit() && "Expected 32-bit mode");
34277 SDValue Lo, Hi;
34278 std::tie(Lo, Hi) = DAG.SplitVectorOperand(N, 0);
34279 Lo = DAG.getBitcast(MVT::i32, Lo);
34280 Hi = DAG.getBitcast(MVT::i32, Hi);
34281 SDValue Res = DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Lo, Hi);
34282 Results.push_back(Res);
34283 return;
34284 }
34285
34286 if (DstVT.isVector() && SrcVT == MVT::x86mmx) {
34287 // FIXME: Use v4f32 for SSE1?
34288 assert(Subtarget.hasSSE2() && "Requires SSE2");
34289 assert(getTypeAction(*DAG.getContext(), DstVT) == TypeWidenVector &&
34290 "Unexpected type action!");
34291 EVT WideVT = getTypeToTransformTo(*DAG.getContext(), DstVT);
34292 SDValue Res = DAG.getNode(X86ISD::MOVQ2DQ, dl, MVT::v2i64,
34293 N->getOperand(0));
34294 Res = DAG.getBitcast(WideVT, Res);
34295 Results.push_back(Res);
34296 return;
34297 }
34298
34299 return;
34300 }
34301 case ISD::MGATHER: {
34302 EVT VT = N->getValueType(0);
34303 if ((VT == MVT::v2f32 || VT == MVT::v2i32) &&
34304 (Subtarget.hasVLX() || !Subtarget.hasAVX512())) {
34305 auto *Gather = cast<MaskedGatherSDNode>(N);
34306 SDValue Index = Gather->getIndex();
34307 if (Index.getValueType() != MVT::v2i64)
34308 return;
34309 assert(getTypeAction(*DAG.getContext(), VT) == TypeWidenVector &&
34310 "Unexpected type action!");
34311 EVT WideVT = getTypeToTransformTo(*DAG.getContext(), VT);
34312 SDValue Mask = Gather->getMask();
34313 assert(Mask.getValueType() == MVT::v2i1 && "Unexpected mask type");
34314 SDValue PassThru = DAG.getNode(ISD::CONCAT_VECTORS, dl, WideVT,
34315 Gather->getPassThru(),
34316 DAG.getUNDEF(VT));
34317 if (!Subtarget.hasVLX()) {
34318 // We need to widen the mask, but the instruction will only use 2
34319 // of its elements. So we can use undef.
34320 Mask = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i1, Mask,
34321 DAG.getUNDEF(MVT::v2i1));
34322 Mask = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, Mask);
34323 }
34324 SDValue Ops[] = { Gather->getChain(), PassThru, Mask,
34325 Gather->getBasePtr(), Index, Gather->getScale() };
34326 SDValue Res = DAG.getMemIntrinsicNode(
34327 X86ISD::MGATHER, dl, DAG.getVTList(WideVT, MVT::Other), Ops,
34328 Gather->getMemoryVT(), Gather->getMemOperand());
34329 Results.push_back(Res);
34330 Results.push_back(Res.getValue(1));
34331 return;
34332 }
34333 return;
34334 }
34335 case ISD::LOAD: {
34336 // Use an f64/i64 load and a scalar_to_vector for v2f32/v2i32 loads. This
34337 // avoids scalarizing in 32-bit mode. In 64-bit mode this avoids an int->fp
34338 // cast since type legalization will try to use an i64 load.
34339 MVT VT = N->getSimpleValueType(0);
34340 assert(VT.isVector() && VT.getSizeInBits() == 64 && "Unexpected VT");
34341 assert(getTypeAction(*DAG.getContext(), VT) == TypeWidenVector &&
34342 "Unexpected type action!");
34343 if (!ISD::isNON_EXTLoad(N))
34344 return;
34345 auto *Ld = cast<LoadSDNode>(N);
34346 if (Subtarget.hasSSE2()) {
34347 MVT LdVT = Subtarget.is64Bit() && VT.isInteger() ? MVT::i64 : MVT::f64;
34348 SDValue Res = DAG.getLoad(LdVT, dl, Ld->getChain(), Ld->getBasePtr(),
34349 Ld->getPointerInfo(), Ld->getOriginalAlign(),
34350 Ld->getMemOperand()->getFlags());
34351 SDValue Chain = Res.getValue(1);
34352 MVT VecVT = MVT::getVectorVT(LdVT, 2);
34353 Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, Res);
34354 EVT WideVT = getTypeToTransformTo(*DAG.getContext(), VT);
34355 Res = DAG.getBitcast(WideVT, Res);
34356 Results.push_back(Res);
34357 Results.push_back(Chain);
34358 return;
34359 }
34360 assert(Subtarget.hasSSE1() && "Expected SSE");
34361 SDVTList Tys = DAG.getVTList(MVT::v4f32, MVT::Other);
34362 SDValue Ops[] = {Ld->getChain(), Ld->getBasePtr()};
34363 SDValue Res = DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, dl, Tys, Ops,
34364 MVT::i64, Ld->getMemOperand());
34365 Results.push_back(Res);
34366 Results.push_back(Res.getValue(1));
34367 return;
34368 }
34369 case ISD::ADDRSPACECAST: {
34370 SDValue V = LowerADDRSPACECAST(SDValue(N,0), DAG);
34371 Results.push_back(V);
34372 return;
34373 }
34374 case ISD::BITREVERSE: {
34375 assert(N->getValueType(0) == MVT::i64 && "Unexpected VT!");
34376 assert(Subtarget.hasXOP() && "Expected XOP");
34377 // We can use VPPERM by copying to a vector register and back. We'll need
34378 // to move the scalar in two i32 pieces.
34379 Results.push_back(LowerBITREVERSE(SDValue(N, 0), Subtarget, DAG));
34380 return;
34381 }
34382 case ISD::EXTRACT_VECTOR_ELT: {
34383 // f16 = extract vXf16 %vec, i64 %idx
34384 assert(N->getSimpleValueType(0) == MVT::f16 &&
34385 "Unexpected Value type of EXTRACT_VECTOR_ELT!");
34386 assert(Subtarget.hasFP16() && "Expected FP16");
34387 SDValue VecOp = N->getOperand(0);
34388 EVT ExtVT = VecOp.getValueType().changeVectorElementTypeToInteger();
34389 SDValue Split = DAG.getBitcast(ExtVT, N->getOperand(0));
34390 Split = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, Split,
34391 N->getOperand(1));
34392 Split = DAG.getBitcast(MVT::f16, Split);
34393 Results.push_back(Split);
34394 return;
34395 }
34396 }
34397}
34398
34399const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
34400 switch ((X86ISD::NodeType)Opcode) {
34401 case X86ISD::FIRST_NUMBER: break;
34402#define NODE_NAME_CASE(NODE) case X86ISD::NODE: return "X86ISD::" #NODE;
34403 NODE_NAME_CASE(BSF)
34404 NODE_NAME_CASE(BSR)
34405 NODE_NAME_CASE(FSHL)
34406 NODE_NAME_CASE(FSHR)
34407 NODE_NAME_CASE(FAND)
34408 NODE_NAME_CASE(FANDN)
34409 NODE_NAME_CASE(FOR)
34410 NODE_NAME_CASE(FXOR)
34411 NODE_NAME_CASE(FILD)
34412 NODE_NAME_CASE(FIST)
34413 NODE_NAME_CASE(FP_TO_INT_IN_MEM)
34414 NODE_NAME_CASE(FLD)
34415 NODE_NAME_CASE(FST)
34416 NODE_NAME_CASE(CALL)
34417 NODE_NAME_CASE(CALL_RVMARKER)
34418 NODE_NAME_CASE(BT)
34419 NODE_NAME_CASE(CMP)
34420 NODE_NAME_CASE(FCMP)
34421 NODE_NAME_CASE(STRICT_FCMP)
34422 NODE_NAME_CASE(STRICT_FCMPS)
34423 NODE_NAME_CASE(COMI)
34424 NODE_NAME_CASE(UCOMI)
34425 NODE_NAME_CASE(CMPM)
34426 NODE_NAME_CASE(CMPMM)
34427 NODE_NAME_CASE(STRICT_CMPM)
34428 NODE_NAME_CASE(CMPMM_SAE)
34429 NODE_NAME_CASE(SETCC)
34430 NODE_NAME_CASE(SETCC_CARRY)
34431 NODE_NAME_CASE(FSETCC)
34432 NODE_NAME_CASE(FSETCCM)
34433 NODE_NAME_CASE(FSETCCM_SAE)
34434 NODE_NAME_CASE(CMOV)
34435 NODE_NAME_CASE(BRCOND)
34436 NODE_NAME_CASE(RET_FLAG)
34437 NODE_NAME_CASE(IRET)
34438 NODE_NAME_CASE(REP_STOS)
34439 NODE_NAME_CASE(REP_MOVS)
34440 NODE_NAME_CASE(GlobalBaseReg)
34441 NODE_NAME_CASE(Wrapper)
34442 NODE_NAME_CASE(WrapperRIP)
34443 NODE_NAME_CASE(MOVQ2DQ)
34444 NODE_NAME_CASE(MOVDQ2Q)
34445 NODE_NAME_CASE(MMX_MOVD2W)
34446 NODE_NAME_CASE(MMX_MOVW2D)
34447 NODE_NAME_CASE(PEXTRB)
34448 NODE_NAME_CASE(PEXTRW)
34449 NODE_NAME_CASE(INSERTPS)
34450 NODE_NAME_CASE(PINSRB)
34451 NODE_NAME_CASE(PINSRW)
34452 NODE_NAME_CASE(PSHUFB)
34453 NODE_NAME_CASE(ANDNP)
34454 NODE_NAME_CASE(BLENDI)
34455 NODE_NAME_CASE(BLENDV)
34456 NODE_NAME_CASE(HADD)
34457 NODE_NAME_CASE(HSUB)
34458 NODE_NAME_CASE(FHADD)
34459 NODE_NAME_CASE(FHSUB)
34460 NODE_NAME_CASE(CONFLICT)
34461 NODE_NAME_CASE(FMAX)
34462 NODE_NAME_CASE(FMAXS)
34463 NODE_NAME_CASE(FMAX_SAE)
34464 NODE_NAME_CASE(FMAXS_SAE)
34465 NODE_NAME_CASE(FMIN)
34466 NODE_NAME_CASE(FMINS)
34467 NODE_NAME_CASE(FMIN_SAE)
34468 NODE_NAME_CASE(FMINS_SAE)
34469 NODE_NAME_CASE(FMAXC)
34470 NODE_NAME_CASE(FMINC)
34471 NODE_NAME_CASE(FRSQRT)
34472 NODE_NAME_CASE(FRCP)
34473 NODE_NAME_CASE(EXTRQI)
34474 NODE_NAME_CASE(INSERTQI)
34475 NODE_NAME_CASE(TLSADDR)
34476 NODE_NAME_CASE(TLSBASEADDR)
34477 NODE_NAME_CASE(TLSCALL)
34478 NODE_NAME_CASE(EH_SJLJ_SETJMP)
34479 NODE_NAME_CASE(EH_SJLJ_LONGJMP)
34480 NODE_NAME_CASE(EH_SJLJ_SETUP_DISPATCH)
34481 NODE_NAME_CASE(EH_RETURN)
34482 NODE_NAME_CASE(TC_RETURN)
34483 NODE_NAME_CASE(FNSTCW16m)
34484 NODE_NAME_CASE(FLDCW16m)
34485 NODE_NAME_CASE(LCMPXCHG_DAG)
34486 NODE_NAME_CASE(LCMPXCHG8_DAG)
34487 NODE_NAME_CASE(LCMPXCHG16_DAG)
34488 NODE_NAME_CASE(LCMPXCHG16_SAVE_RBX_DAG)
34489 NODE_NAME_CASE(LADD)
34490 NODE_NAME_CASE(LSUB)
34491 NODE_NAME_CASE(LOR)
34492 NODE_NAME_CASE(LXOR)
34493 NODE_NAME_CASE(LAND)
34494 NODE_NAME_CASE(LBTS)
34495 NODE_NAME_CASE(LBTC)
34496 NODE_NAME_CASE(LBTR)
34497 NODE_NAME_CASE(LBTS_RM)
34498 NODE_NAME_CASE(LBTC_RM)
34499 NODE_NAME_CASE(LBTR_RM)
34500 NODE_NAME_CASE(AADD)
34501 NODE_NAME_CASE(AOR)
34502 NODE_NAME_CASE(AXOR)
34503 NODE_NAME_CASE(AAND)
34504 NODE_NAME_CASE(VZEXT_MOVL)
34505 NODE_NAME_CASE(VZEXT_LOAD)
34506 NODE_NAME_CASE(VEXTRACT_STORE)
34507 NODE_NAME_CASE(VTRUNC)
34508 NODE_NAME_CASE(VTRUNCS)
34509 NODE_NAME_CASE(VTRUNCUS)
34510 NODE_NAME_CASE(VMTRUNC)
34511 NODE_NAME_CASE(VMTRUNCS)
34512 NODE_NAME_CASE(VMTRUNCUS)
34513 NODE_NAME_CASE(VTRUNCSTORES)
34514 NODE_NAME_CASE(VTRUNCSTOREUS)
34515 NODE_NAME_CASE(VMTRUNCSTORES)
34516 NODE_NAME_CASE(VMTRUNCSTOREUS)
34517 NODE_NAME_CASE(VFPEXT)
34518 NODE_NAME_CASE(STRICT_VFPEXT)
34519 NODE_NAME_CASE(VFPEXT_SAE)
34520 NODE_NAME_CASE(VFPEXTS)
34521 NODE_NAME_CASE(VFPEXTS_SAE)
34522 NODE_NAME_CASE(VFPROUND)
34523 NODE_NAME_CASE(STRICT_VFPROUND)
34524 NODE_NAME_CASE(VMFPROUND)
34525 NODE_NAME_CASE(VFPROUND_RND)
34526 NODE_NAME_CASE(VFPROUNDS)
34527 NODE_NAME_CASE(VFPROUNDS_RND)
34528 NODE_NAME_CASE(VSHLDQ)
34529 NODE_NAME_CASE(VSRLDQ)
34530 NODE_NAME_CASE(VSHL)
34531 NODE_NAME_CASE(VSRL)
34532 NODE_NAME_CASE(VSRA)
34533 NODE_NAME_CASE(VSHLI)
34534 NODE_NAME_CASE(VSRLI)
34535 NODE_NAME_CASE(VSRAI)
34536 NODE_NAME_CASE(VSHLV)
34537 NODE_NAME_CASE(VSRLV)
34538 NODE_NAME_CASE(VSRAV)
34539 NODE_NAME_CASE(VROTLI)
34540 NODE_NAME_CASE(VROTRI)
34541 NODE_NAME_CASE(VPPERM)
34542 NODE_NAME_CASE(CMPP)
34543 NODE_NAME_CASE(STRICT_CMPP)
34544 NODE_NAME_CASE(PCMPEQ)
34545 NODE_NAME_CASE(PCMPGT)
34546 NODE_NAME_CASE(PHMINPOS)
34547 NODE_NAME_CASE(ADD)
34548 NODE_NAME_CASE(SUB)
34549 NODE_NAME_CASE(ADC)
34550 NODE_NAME_CASE(SBB)
34551 NODE_NAME_CASE(SMUL)
34552 NODE_NAME_CASE(UMUL)
34553 NODE_NAME_CASE(OR)
34554 NODE_NAME_CASE(XOR)
34555 NODE_NAME_CASE(AND)
34556 NODE_NAME_CASE(BEXTR)
34557 NODE_NAME_CASE(BEXTRI)
34558 NODE_NAME_CASE(BZHI)
34559 NODE_NAME_CASE(PDEP)
34560 NODE_NAME_CASE(PEXT)
34561 NODE_NAME_CASE(MUL_IMM)
34562 NODE_NAME_CASE(MOVMSK)
34563 NODE_NAME_CASE(PTEST)
34564 NODE_NAME_CASE(TESTP)
34565 NODE_NAME_CASE(KORTEST)
34566 NODE_NAME_CASE(KTEST)
34567 NODE_NAME_CASE(KADD)
34568 NODE_NAME_CASE(KSHIFTL)
34569 NODE_NAME_CASE(KSHIFTR)
34570 NODE_NAME_CASE(PACKSS)
34571 NODE_NAME_CASE(PACKUS)
34572 NODE_NAME_CASE(PALIGNR)
34573 NODE_NAME_CASE(VALIGN)
34574 NODE_NAME_CASE(VSHLD)
34575 NODE_NAME_CASE(VSHRD)
34576 NODE_NAME_CASE(VSHLDV)
34577 NODE_NAME_CASE(VSHRDV)
34578 NODE_NAME_CASE(PSHUFD)
34579 NODE_NAME_CASE(PSHUFHW)
34580 NODE_NAME_CASE(PSHUFLW)
34581 NODE_NAME_CASE(SHUFP)
34582 NODE_NAME_CASE(SHUF128)
34583 NODE_NAME_CASE(MOVLHPS)
34584 NODE_NAME_CASE(MOVHLPS)
34585 NODE_NAME_CASE(MOVDDUP)
34586 NODE_NAME_CASE(MOVSHDUP)
34587 NODE_NAME_CASE(MOVSLDUP)
34588 NODE_NAME_CASE(MOVSD)
34589 NODE_NAME_CASE(MOVSS)
34590 NODE_NAME_CASE(MOVSH)
34591 NODE_NAME_CASE(UNPCKL)
34592 NODE_NAME_CASE(UNPCKH)
34593 NODE_NAME_CASE(VBROADCAST)
34594 NODE_NAME_CASE(VBROADCAST_LOAD)
34595 NODE_NAME_CASE(VBROADCASTM)
34596 NODE_NAME_CASE(SUBV_BROADCAST_LOAD)
34597 NODE_NAME_CASE(VPERMILPV)
34598 NODE_NAME_CASE(VPERMILPI)
34599 NODE_NAME_CASE(VPERM2X128)
34600 NODE_NAME_CASE(VPERMV)
34601 NODE_NAME_CASE(VPERMV3)
34602 NODE_NAME_CASE(VPERMI)
34603 NODE_NAME_CASE(VPTERNLOG)
34604 NODE_NAME_CASE(VFIXUPIMM)
34605 NODE_NAME_CASE(VFIXUPIMM_SAE)
34606 NODE_NAME_CASE(VFIXUPIMMS)
34607 NODE_NAME_CASE(VFIXUPIMMS_SAE)
34608 NODE_NAME_CASE(VRANGE)
34609 NODE_NAME_CASE(VRANGE_SAE)
34610 NODE_NAME_CASE(VRANGES)
34611 NODE_NAME_CASE(VRANGES_SAE)
34612 NODE_NAME_CASE(PMULUDQ)
34613 NODE_NAME_CASE(PMULDQ)
34614 NODE_NAME_CASE(PSADBW)
34615 NODE_NAME_CASE(DBPSADBW)
34616 NODE_NAME_CASE(VASTART_SAVE_XMM_REGS)
34617 NODE_NAME_CASE(VAARG_64)
34618 NODE_NAME_CASE(VAARG_X32)
34619 NODE_NAME_CASE(DYN_ALLOCA)
34620 NODE_NAME_CASE(MFENCE)
34621 NODE_NAME_CASE(SEG_ALLOCA)
34622 NODE_NAME_CASE(PROBED_ALLOCA)
34623 NODE_NAME_CASE(RDRAND)
34624 NODE_NAME_CASE(RDSEED)
34625 NODE_NAME_CASE(RDPKRU)
34626 NODE_NAME_CASE(WRPKRU)
34627 NODE_NAME_CASE(VPMADDUBSW)
34628 NODE_NAME_CASE(VPMADDWD)
34629 NODE_NAME_CASE(VPSHA)
34630 NODE_NAME_CASE(VPSHL)
34631 NODE_NAME_CASE(VPCOM)
34632 NODE_NAME_CASE(VPCOMU)
34633 NODE_NAME_CASE(VPERMIL2)
34634 NODE_NAME_CASE(FMSUB)
34635 NODE_NAME_CASE(STRICT_FMSUB)
34636 NODE_NAME_CASE(FNMADD)
34637 NODE_NAME_CASE(STRICT_FNMADD)
34638 NODE_NAME_CASE(FNMSUB)
34639 NODE_NAME_CASE(STRICT_FNMSUB)
34640 NODE_NAME_CASE(FMADDSUB)
34641 NODE_NAME_CASE(FMSUBADD)
34642 NODE_NAME_CASE(FMADD_RND)
34643 NODE_NAME_CASE(FNMADD_RND)
34644 NODE_NAME_CASE(FMSUB_RND)
34645 NODE_NAME_CASE(FNMSUB_RND)
34646 NODE_NAME_CASE(FMADDSUB_RND)
34647 NODE_NAME_CASE(FMSUBADD_RND)
34648 NODE_NAME_CASE(VFMADDC)
34649 NODE_NAME_CASE(VFMADDC_RND)
34650 NODE_NAME_CASE(VFCMADDC)
34651 NODE_NAME_CASE(VFCMADDC_RND)
34652 NODE_NAME_CASE(VFMULC)
34653 NODE_NAME_CASE(VFMULC_RND)
34654 NODE_NAME_CASE(VFCMULC)
34655 NODE_NAME_CASE(VFCMULC_RND)
34656 NODE_NAME_CASE(VFMULCSH)
34657 NODE_NAME_CASE(VFMULCSH_RND)
34658 NODE_NAME_CASE(VFCMULCSH)
34659 NODE_NAME_CASE(VFCMULCSH_RND)
34660 NODE_NAME_CASE(VFMADDCSH)
34661 NODE_NAME_CASE(VFMADDCSH_RND)
34662 NODE_NAME_CASE(VFCMADDCSH)
34663 NODE_NAME_CASE(VFCMADDCSH_RND)
34664 NODE_NAME_CASE(VPMADD52H)
34665 NODE_NAME_CASE(VPMADD52L)
34666 NODE_NAME_CASE(VRNDSCALE)
34667 NODE_NAME_CASE(STRICT_VRNDSCALE)
34668 NODE_NAME_CASE(VRNDSCALE_SAE)
34669 NODE_NAME_CASE(VRNDSCALES)
34670 NODE_NAME_CASE(VRNDSCALES_SAE)
34671 NODE_NAME_CASE(VREDUCE)
34672 NODE_NAME_CASE(VREDUCE_SAE)
34673 NODE_NAME_CASE(VREDUCES)
34674 NODE_NAME_CASE(VREDUCES_SAE)
34675 NODE_NAME_CASE(VGETMANT)
34676 NODE_NAME_CASE(VGETMANT_SAE)
34677 NODE_NAME_CASE(VGETMANTS)
34678 NODE_NAME_CASE(VGETMANTS_SAE)
34679 NODE_NAME_CASE(PCMPESTR)
34680 NODE_NAME_CASE(PCMPISTR)
34681 NODE_NAME_CASE(XTEST)
34682 NODE_NAME_CASE(COMPRESS)
34683 NODE_NAME_CASE(EXPAND)
34684 NODE_NAME_CASE(SELECTS)
34685 NODE_NAME_CASE(ADDSUB)
34686 NODE_NAME_CASE(RCP14)
34687 NODE_NAME_CASE(RCP14S)
34688 NODE_NAME_CASE(RCP28)
34689 NODE_NAME_CASE(RCP28_SAE)
34690 NODE_NAME_CASE(RCP28S)
34691 NODE_NAME_CASE(RCP28S_SAE)
34692 NODE_NAME_CASE(EXP2)
34693 NODE_NAME_CASE(EXP2_SAE)
34694 NODE_NAME_CASE(RSQRT14)
34695 NODE_NAME_CASE(RSQRT14S)
34696 NODE_NAME_CASE(RSQRT28)
34697 NODE_NAME_CASE(RSQRT28_SAE)
34698 NODE_NAME_CASE(RSQRT28S)
34699 NODE_NAME_CASE(RSQRT28S_SAE)
34700 NODE_NAME_CASE(FADD_RND)
34701 NODE_NAME_CASE(FADDS)
34702 NODE_NAME_CASE(FADDS_RND)
34703 NODE_NAME_CASE(FSUB_RND)
34704 NODE_NAME_CASE(FSUBS)
34705 NODE_NAME_CASE(FSUBS_RND)
34706 NODE_NAME_CASE(FMUL_RND)
34707 NODE_NAME_CASE(FMULS)
34708 NODE_NAME_CASE(FMULS_RND)
34709 NODE_NAME_CASE(FDIV_RND)
34710 NODE_NAME_CASE(FDIVS)
34711 NODE_NAME_CASE(FDIVS_RND)
34712 NODE_NAME_CASE(FSQRT_RND)
34713 NODE_NAME_CASE(FSQRTS)
34714 NODE_NAME_CASE(FSQRTS_RND)
34715 NODE_NAME_CASE(FGETEXP)
34716 NODE_NAME_CASE(FGETEXP_SAE)
34717 NODE_NAME_CASE(FGETEXPS)
34718 NODE_NAME_CASE(FGETEXPS_SAE)
34719 NODE_NAME_CASE(SCALEF)
34720 NODE_NAME_CASE(SCALEF_RND)
34721 NODE_NAME_CASE(SCALEFS)
34722 NODE_NAME_CASE(SCALEFS_RND)
34723 NODE_NAME_CASE(MULHRS)
34724 NODE_NAME_CASE(SINT_TO_FP_RND)
34725 NODE_NAME_CASE(UINT_TO_FP_RND)
34726 NODE_NAME_CASE(CVTTP2SI)
34727 NODE_NAME_CASE(CVTTP2UI)
34728 NODE_NAME_CASE(STRICT_CVTTP2SI)
34729 NODE_NAME_CASE(STRICT_CVTTP2UI)
34730 NODE_NAME_CASE(MCVTTP2SI)
34731 NODE_NAME_CASE(MCVTTP2UI)
34732 NODE_NAME_CASE(CVTTP2SI_SAE)
34733 NODE_NAME_CASE(CVTTP2UI_SAE)
34734 NODE_NAME_CASE(CVTTS2SI)
34735 NODE_NAME_CASE(CVTTS2UI)
34736 NODE_NAME_CASE(CVTTS2SI_SAE)
34737 NODE_NAME_CASE(CVTTS2UI_SAE)
34738 NODE_NAME_CASE(CVTSI2P)
34739 NODE_NAME_CASE(CVTUI2P)
34740 NODE_NAME_CASE(STRICT_CVTSI2P)
34741 NODE_NAME_CASE(STRICT_CVTUI2P)
34742 NODE_NAME_CASE(MCVTSI2P)
34743 NODE_NAME_CASE(MCVTUI2P)
34744 NODE_NAME_CASE(VFPCLASS)
34745 NODE_NAME_CASE(VFPCLASSS)
34746 NODE_NAME_CASE(MULTISHIFT)
34747 NODE_NAME_CASE(SCALAR_SINT_TO_FP)
34748 NODE_NAME_CASE(SCALAR_SINT_TO_FP_RND)
34749 NODE_NAME_CASE(SCALAR_UINT_TO_FP)
34750 NODE_NAME_CASE(SCALAR_UINT_TO_FP_RND)
34751 NODE_NAME_CASE(CVTPS2PH)
34752 NODE_NAME_CASE(STRICT_CVTPS2PH)
34753 NODE_NAME_CASE(CVTPS2PH_SAE)
34754 NODE_NAME_CASE(MCVTPS2PH)
34755 NODE_NAME_CASE(MCVTPS2PH_SAE)
34756 NODE_NAME_CASE(CVTPH2PS)
34757 NODE_NAME_CASE(STRICT_CVTPH2PS)
34758 NODE_NAME_CASE(CVTPH2PS_SAE)
34759 NODE_NAME_CASE(CVTP2SI)
34760 NODE_NAME_CASE(CVTP2UI)
34761 NODE_NAME_CASE(MCVTP2SI)
34762 NODE_NAME_CASE(MCVTP2UI)
34763 NODE_NAME_CASE(CVTP2SI_RND)
34764 NODE_NAME_CASE(CVTP2UI_RND)
34765 NODE_NAME_CASE(CVTS2SI)
34766 NODE_NAME_CASE(CVTS2UI)
34767 NODE_NAME_CASE(CVTS2SI_RND)
34768 NODE_NAME_CASE(CVTS2UI_RND)
34769 NODE_NAME_CASE(CVTNE2PS2BF16)
34770 NODE_NAME_CASE(CVTNEPS2BF16)
34771 NODE_NAME_CASE(MCVTNEPS2BF16)
34772 NODE_NAME_CASE(DPBF16PS)
34773 NODE_NAME_CASE(LWPINS)
34774 NODE_NAME_CASE(MGATHER)
34775 NODE_NAME_CASE(MSCATTER)
34776 NODE_NAME_CASE(VPDPBUSD)
34777 NODE_NAME_CASE(VPDPBUSDS)
34778 NODE_NAME_CASE(VPDPWSSD)
34779 NODE_NAME_CASE(VPDPWSSDS)
34780 NODE_NAME_CASE(VPSHUFBITQMB)
34781 NODE_NAME_CASE(GF2P8MULB)
34782 NODE_NAME_CASE(GF2P8AFFINEQB)
34783 NODE_NAME_CASE(GF2P8AFFINEINVQB)
34784 NODE_NAME_CASE(NT_CALL)
34785 NODE_NAME_CASE(NT_BRIND)
34786 NODE_NAME_CASE(UMWAIT)
34787 NODE_NAME_CASE(TPAUSE)
34788 NODE_NAME_CASE(ENQCMD)
34789 NODE_NAME_CASE(ENQCMDS)
34790 NODE_NAME_CASE(VP2INTERSECT)
34791 NODE_NAME_CASE(VPDPBSUD)
34792 NODE_NAME_CASE(VPDPBSUDS)
34793 NODE_NAME_CASE(VPDPBUUD)
34794 NODE_NAME_CASE(VPDPBUUDS)
34795 NODE_NAME_CASE(VPDPBSSD)
34796 NODE_NAME_CASE(VPDPBSSDS)
34797 NODE_NAME_CASE(AESENC128KL)
34798 NODE_NAME_CASE(AESDEC128KL)
34799 NODE_NAME_CASE(AESENC256KL)
34800 NODE_NAME_CASE(AESDEC256KL)
34801 NODE_NAME_CASE(AESENCWIDE128KL)
34802 NODE_NAME_CASE(AESDECWIDE128KL)
34803 NODE_NAME_CASE(AESENCWIDE256KL)
34804 NODE_NAME_CASE(AESDECWIDE256KL)
34805 NODE_NAME_CASE(CMPCCXADD)
34806 NODE_NAME_CASE(TESTUI)
34807 }
34808 return nullptr;
34809#undef NODE_NAME_CASE
34810}
34811
34812/// Return true if the addressing mode represented by AM is legal for this
34813/// target, for a load/store of the specified type.
34814bool X86TargetLowering::isLegalAddressingMode(const DataLayout &DL,
34815 const AddrMode &AM, Type *Ty,
34816 unsigned AS,
34817 Instruction *I) const {
34818 // X86 supports extremely general addressing modes.
34819 CodeModel::Model M = getTargetMachine().getCodeModel();
34820
34821 // X86 allows a sign-extended 32-bit immediate field as a displacement.
34822 if (!X86::isOffsetSuitableForCodeModel(AM.BaseOffs, M, AM.BaseGV != nullptr))
34823 return false;
34824
34825 if (AM.BaseGV) {
34826 unsigned GVFlags = Subtarget.classifyGlobalReference(AM.BaseGV);
34827
34828 // If a reference to this global requires an extra load, we can't fold it.
34829 if (isGlobalStubReference(GVFlags))
34830 return false;
34831
34832 // If BaseGV requires a register for the PIC base, we cannot also have a
34833 // BaseReg specified.
34834 if (AM.HasBaseReg && isGlobalRelativeToPICBase(GVFlags))
34835 return false;
34836
34837 // If lower 4G is not available, then we must use rip-relative addressing.
34838 if ((M != CodeModel::Small || isPositionIndependent()) &&
34839 Subtarget.is64Bit() && (AM.BaseOffs || AM.Scale > 1))
34840 return false;
34841 }
34842
34843 switch (AM.Scale) {
34844 case 0:
34845 case 1:
34846 case 2:
34847 case 4:
34848 case 8:
34849 // These scales always work.
34850 break;
34851 case 3:
34852 case 5:
34853 case 9:
34854 // These scales are formed with basereg+scalereg. Only accept if there is
34855 // no basereg yet.
34856 if (AM.HasBaseReg)
34857 return false;
34858 break;
34859 default: // Other stuff never works.
34860 return false;
34861 }
34862
34863 return true;
34864}
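
// For illustration, the scale handling above boils down to the following
// predicate (a standalone sketch; isLegalX86Scale is a hypothetical helper,
// not part of this file). Scales 1, 2, 4 and 8 are directly encodable in a
// SIB byte, while 3, 5 and 9 are only reachable as Base + Index*{2,4,8},
// which consumes the base-register slot.
static bool isLegalX86Scale(unsigned Scale, bool HasBaseReg) {
  switch (Scale) {
  case 0: case 1: case 2: case 4: case 8:
    return true;        // No index, or a directly encodable SIB scale.
  case 3: case 5: case 9:
    return !HasBaseReg; // Lowered as Base + Index*(Scale-1); base must be free.
  default:
    return false;
  }
}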
34865
34866bool X86TargetLowering::isVectorShiftByScalarCheap(Type *Ty) const {
34867 unsigned Bits = Ty->getScalarSizeInBits();
34868
34869 // XOP has v16i8/v8i16/v4i32/v2i64 variable vector shifts.
34870 // Splitting for v32i8/v16i16 on XOP+AVX2 targets is still preferred.
34871 if (Subtarget.hasXOP() &&
34872 (Bits == 8 || Bits == 16 || Bits == 32 || Bits == 64))
34873 return false;
34874
34875 // AVX2 has vpsllv[dq] instructions (and other shifts) that make variable
34876 // shifts just as cheap as scalar ones.
34877 if (Subtarget.hasAVX2() && (Bits == 32 || Bits == 64))
34878 return false;
34879
34880 // AVX512BW has shifts such as vpsllvw.
34881 if (Subtarget.hasBWI() && Bits == 16)
34882 return false;
34883
34884 // Otherwise, it's significantly cheaper to shift by a scalar amount than by a
34885 // fully general vector.
34886 return true;
34887}
34888
34889bool X86TargetLowering::isBinOp(unsigned Opcode) const {
34890 switch (Opcode) {
34891 // These are non-commutative binops.
34892 // TODO: Add more X86ISD opcodes once we have test coverage.
34893 case X86ISD::ANDNP:
34894 case X86ISD::PCMPGT:
34895 case X86ISD::FMAX:
34896 case X86ISD::FMIN:
34897 case X86ISD::FANDN:
34898 case X86ISD::VPSHA:
34899 case X86ISD::VPSHL:
34900 case X86ISD::VSHLV:
34901 case X86ISD::VSRLV:
34902 case X86ISD::VSRAV:
34903 return true;
34904 }
34905
34906 return TargetLoweringBase::isBinOp(Opcode);
34907}
34908
34909bool X86TargetLowering::isCommutativeBinOp(unsigned Opcode) const {
34910 switch (Opcode) {
34911 // TODO: Add more X86ISD opcodes once we have test coverage.
34912 case X86ISD::PCMPEQ:
34913 case X86ISD::PMULDQ:
34914 case X86ISD::PMULUDQ:
34915 case X86ISD::FMAXC:
34916 case X86ISD::FMINC:
34917 case X86ISD::FAND:
34918 case X86ISD::FOR:
34919 case X86ISD::FXOR:
34920 return true;
34921 }
34922
34923 return TargetLoweringBase::isCommutativeBinOp(Opcode);
34924}
34925
34926bool X86TargetLowering::isTruncateFree(Type *Ty1, Type *Ty2) const {
34927 if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
34928 return false;
34929 unsigned NumBits1 = Ty1->getPrimitiveSizeInBits();
34930 unsigned NumBits2 = Ty2->getPrimitiveSizeInBits();
34931 return NumBits1 > NumBits2;
34932}
34933
34934bool X86TargetLowering::allowTruncateForTailCall(Type *Ty1, Type *Ty2) const {
34935 if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
34936 return false;
34937
34938 if (!isTypeLegal(EVT::getEVT(Ty1)))
34939 return false;
34940
34941 assert(Ty1->getPrimitiveSizeInBits() <= 64 && "i128 is probably not a noop");
34942
34943 // Assuming the caller doesn't have a zeroext or signext return parameter,
34944 // truncation all the way down to i1 is valid.
34945 return true;
34946}
34947
34948bool X86TargetLowering::isLegalICmpImmediate(int64_t Imm) const {
34949 return isInt<32>(Imm);
34950}
34951
34952bool X86TargetLowering::isLegalAddImmediate(int64_t Imm) const {
34953 // Can also use sub to handle negated immediates.
34954 return isInt<32>(Imm);
34955}
34956
34957bool X86TargetLowering::isLegalStoreImmediate(int64_t Imm) const {
34958 return isInt<32>(Imm);
34959}
34960
34961bool X86TargetLowering::isTruncateFree(EVT VT1, EVT VT2) const {
34962 if (!VT1.isScalarInteger() || !VT2.isScalarInteger())
34963 return false;
34964 unsigned NumBits1 = VT1.getSizeInBits();
34965 unsigned NumBits2 = VT2.getSizeInBits();
34966 return NumBits1 > NumBits2;
34967}
34968
34969bool X86TargetLowering::isZExtFree(Type *Ty1, Type *Ty2) const {
34970 // x86-64 implicitly zero-extends 32-bit results in 64-bit registers.
34971 return Ty1->isIntegerTy(32) && Ty2->isIntegerTy(64) && Subtarget.is64Bit();
34972}
34973
34974bool X86TargetLowering::isZExtFree(EVT VT1, EVT VT2) const {
34975 // x86-64 implicitly zero-extends 32-bit results in 64-bit registers.
34976 return VT1 == MVT::i32 && VT2 == MVT::i64 && Subtarget.is64Bit();
34977}
34978
34979bool X86TargetLowering::isZExtFree(SDValue Val, EVT VT2) const {
34980 EVT VT1 = Val.getValueType();
34981 if (isZExtFree(VT1, VT2))
34982 return true;
34983
34984 if (Val.getOpcode() != ISD::LOAD)
34985 return false;
34986
34987 if (!VT1.isSimple() || !VT1.isInteger() ||
34988 !VT2.isSimple() || !VT2.isInteger())
34989 return false;
34990
34991 switch (VT1.getSimpleVT().SimpleTy) {
34992 default: break;
34993 case MVT::i8:
34994 case MVT::i16:
34995 case MVT::i32:
34996 // X86 has 8, 16, and 32-bit zero-extending loads.
34997 return true;
34998 }
34999
35000 return false;
35001}
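
// Concretely, any 32-bit register write on x86-64 clears bits 63:32, so the
// i32 -> i64 zero extension reported as free above needs at most a plain
// 32-bit register move (a sketch of the effect; zextIsFreeSketch is an
// illustrative name only):
static uint64_t zextIsFreeSketch(uint32_t X) {
  return X; // No explicit extension instruction; the upper half is zeroed by HW.
}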
35002
35003bool X86TargetLowering::shouldSinkOperands(Instruction *I,
35004 SmallVectorImpl<Use *> &Ops) const {
35005 using namespace llvm::PatternMatch;
35006
35007 FixedVectorType *VTy = dyn_cast<FixedVectorType>(I->getType());
35008 if (!VTy)
35009 return false;
35010
35011 if (I->getOpcode() == Instruction::Mul &&
35012 VTy->getElementType()->isIntegerTy(64)) {
35013 for (auto &Op : I->operands()) {
35014 // Make sure we are not already sinking this operand
35015 if (any_of(Ops, [&](Use *U) { return U->get() == Op; }))
35016 continue;
35017
35018 // Look for PMULDQ pattern where the input is a sext_inreg from vXi32 or
35019 // the PMULUDQ pattern where the input is a zext_inreg from vXi32.
35020 if (Subtarget.hasSSE41() &&
35021 match(Op.get(), m_AShr(m_Shl(m_Value(), m_SpecificInt(32)),
35022 m_SpecificInt(32)))) {
35023 Ops.push_back(&cast<Instruction>(Op)->getOperandUse(0));
35024 Ops.push_back(&Op);
35025 } else if (Subtarget.hasSSE2() &&
35026 match(Op.get(),
35027 m_And(m_Value(), m_SpecificInt(UINT64_C(0xffffffff))))) {
35028 Ops.push_back(&Op);
35029 }
35030 }
35031
35032 return !Ops.empty();
35033 }
35034
35035 // A uniform shift amount in a vector shift or funnel shift may be much
35036 // cheaper than a generic variable vector shift, so make that pattern visible
35037 // to SDAG by sinking the shuffle instruction next to the shift.
35038 int ShiftAmountOpNum = -1;
35039 if (I->isShift())
35040 ShiftAmountOpNum = 1;
35041 else if (auto *II = dyn_cast<IntrinsicInst>(I)) {
35042 if (II->getIntrinsicID() == Intrinsic::fshl ||
35043 II->getIntrinsicID() == Intrinsic::fshr)
35044 ShiftAmountOpNum = 2;
35045 }
35046
35047 if (ShiftAmountOpNum == -1)
35048 return false;
35049
35050 auto *Shuf = dyn_cast<ShuffleVectorInst>(I->getOperand(ShiftAmountOpNum));
35051 if (Shuf && getSplatIndex(Shuf->getShuffleMask()) >= 0 &&
35052 isVectorShiftByScalarCheap(I->getType())) {
35053 Ops.push_back(&I->getOperandUse(ShiftAmountOpNum));
35054 return true;
35055 }
35056
35057 return false;
35058}
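
// The two operand shapes matched above correspond, element-wise, to these
// scalar identities (a sketch; the helpers are illustrative and assume the
// usual <cstdint>/<stdint.h> definitions):
static int64_t sextLow32(int64_t X) {   // What (ashr (shl X, 32), 32) computes;
  return (int64_t)(int32_t)X;           // this shape feeds the PMULDQ pattern.
}
static uint64_t zextLow32(uint64_t X) { // What (and X, 0xffffffff) computes;
  return X & UINT64_C(0xffffffff);      // this shape feeds the PMULUDQ pattern.
}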
35059
35060bool X86TargetLowering::shouldConvertPhiType(Type *From, Type *To) const {
35061 if (!Subtarget.is64Bit())
35062 return false;
35063 return TargetLowering::shouldConvertPhiType(From, To);
35064}
35065
35066bool X86TargetLowering::isVectorLoadExtDesirable(SDValue ExtVal) const {
35067 if (isa<MaskedLoadSDNode>(ExtVal.getOperand(0)))
35068 return false;
35069
35070 EVT SrcVT = ExtVal.getOperand(0).getValueType();
35071
35072 // There is no extending load for vXi1.
35073 if (SrcVT.getScalarType() == MVT::i1)
35074 return false;
35075
35076 return true;
35077}
35078
35079bool X86TargetLowering::isFMAFasterThanFMulAndFAdd(const MachineFunction &MF,
35080 EVT VT) const {
35081 if (!Subtarget.hasAnyFMA())
35082 return false;
35083
35084 VT = VT.getScalarType();
35085
35086 if (!VT.isSimple())
35087 return false;
35088
35089 switch (VT.getSimpleVT().SimpleTy) {
35090 case MVT::f16:
35091 return Subtarget.hasFP16();
35092 case MVT::f32:
35093 case MVT::f64:
35094 return true;
35095 default:
35096 break;
35097 }
35098
35099 return false;
35100}
35101
35102bool X86TargetLowering::isNarrowingProfitable(EVT VT1, EVT VT2) const {
35103 // i16 instructions are longer (0x66 prefix) and potentially slower.
35104 return !(VT1 == MVT::i32 && VT2 == MVT::i16);
35105}
35106
35107bool X86TargetLowering::shouldFoldSelectWithIdentityConstant(unsigned Opcode,
35108 EVT VT) const {
35109 // TODO: This is too general. There are cases where pre-AVX512 codegen would
35110 // benefit. The transform may also be profitable for scalar code.
35111 if (!Subtarget.hasAVX512())
35112 return false;
35113 if (!Subtarget.hasVLX() && !VT.is512BitVector())
35114 return false;
35115 if (!VT.isVector() || VT.getScalarType() == MVT::i1)
35116 return false;
35117
35118 return true;
35119}
35120
35121/// Targets can use this to indicate that they only support *some*
35122/// VECTOR_SHUFFLE operations, those with specific masks.
35123/// By default, if a target supports the VECTOR_SHUFFLE node, all mask values
35124/// are assumed to be legal.
35125bool X86TargetLowering::isShuffleMaskLegal(ArrayRef<int> Mask, EVT VT) const {
35126 if (!VT.isSimple())
35127 return false;
35128
35129 // Not for i1 vectors
35130 if (VT.getSimpleVT().getScalarType() == MVT::i1)
35131 return false;
35132
35133 // Very little shuffling can be done for 64-bit vectors right now.
35134 if (VT.getSimpleVT().getSizeInBits() == 64)
35135 return false;
35136
35137 // We only care that the types being shuffled are legal. The lowering can
35138 // handle any possible shuffle mask that results.
35139 return isTypeLegal(VT.getSimpleVT());
35140}
35141
35142bool X86TargetLowering::isVectorClearMaskLegal(ArrayRef<int> Mask,
35143 EVT VT) const {
35144 // Don't convert an 'and' into a shuffle that we don't directly support.
35145 // vpblendw and vpshufb for 256-bit vectors are not available on AVX1.
35146 if (!Subtarget.hasAVX2())
35147 if (VT == MVT::v32i8 || VT == MVT::v16i16)
35148 return false;
35149
35150 // Just delegate to the generic legality, clear masks aren't special.
35151 return isShuffleMaskLegal(Mask, VT);
35152}
35153
35154bool X86TargetLowering::areJTsAllowed(const Function *Fn) const {
35155 // If the subtarget is using thunks, we need to not generate jump tables.
35156 if (Subtarget.useIndirectThunkBranches())
35157 return false;
35158
35159 // Otherwise, fallback on the generic logic.
35160 return TargetLowering::areJTsAllowed(Fn);
35161}
35162
35163MVT X86TargetLowering::getPreferredSwitchConditionType(LLVMContext &Context,
35164 EVT ConditionVT) const {
35165 // Avoid 8 and 16 bit types because they increase the chance for unnecessary
35166 // zero-extensions.
35167 if (ConditionVT.getSizeInBits() < 32)
35168 return MVT::i32;
35169 return TargetLoweringBase::getPreferredSwitchConditionType(Context,
35170 ConditionVT);
35171}
35172
35173//===----------------------------------------------------------------------===//
35174// X86 Scheduler Hooks
35175//===----------------------------------------------------------------------===//
35176
35177 // Returns true if EFLAGS is consumed after this iterator in the rest of the
35178// basic block or any successors of the basic block.
35179static bool isEFLAGSLiveAfter(MachineBasicBlock::iterator Itr,
35180 MachineBasicBlock *BB) {
35181 // Scan forward through BB for a use/def of EFLAGS.
35182 for (const MachineInstr &mi : llvm::make_range(std::next(Itr), BB->end())) {
35183 if (mi.readsRegister(X86::EFLAGS))
35184 return true;
35185 // If we found a def, we can stop searching.
35186 if (mi.definesRegister(X86::EFLAGS))
35187 return false;
35188 }
35189
35190 // If we hit the end of the block, check whether EFLAGS is live into a
35191 // successor.
35192 for (MachineBasicBlock *Succ : BB->successors())
35193 if (Succ->isLiveIn(X86::EFLAGS))
35194 return true;
35195
35196 return false;
35197}
35198
35199/// Utility function to emit xbegin specifying the start of an RTM region.
35200static MachineBasicBlock *emitXBegin(MachineInstr &MI, MachineBasicBlock *MBB,
35201 const TargetInstrInfo *TII) {
35202 const DebugLoc &DL = MI.getDebugLoc();
35203
35204 const BasicBlock *BB = MBB->getBasicBlock();
35205 MachineFunction::iterator I = ++MBB->getIterator();
35206
35207 // For the v = xbegin(), we generate
35208 //
35209 // thisMBB:
35210 // xbegin sinkMBB
35211 //
35212 // mainMBB:
35213 // s0 = -1
35214 //
35215 // fallBB:
35216 // eax = # XABORT_DEF
35217 // s1 = eax
35218 //
35219 // sinkMBB:
35220 // v = phi(s0/mainBB, s1/fallBB)
35221
35222 MachineBasicBlock *thisMBB = MBB;
35223 MachineFunction *MF = MBB->getParent();
35224 MachineBasicBlock *mainMBB = MF->CreateMachineBasicBlock(BB);
35225 MachineBasicBlock *fallMBB = MF->CreateMachineBasicBlock(BB);
35226 MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB);
35227 MF->insert(I, mainMBB);
35228 MF->insert(I, fallMBB);
35229 MF->insert(I, sinkMBB);
35230
35231 if (isEFLAGSLiveAfter(MI, MBB)) {
35232 mainMBB->addLiveIn(X86::EFLAGS);
35233 fallMBB->addLiveIn(X86::EFLAGS);
35234 sinkMBB->addLiveIn(X86::EFLAGS);
35235 }
35236
35237 // Transfer the remainder of BB and its successor edges to sinkMBB.
35238 sinkMBB->splice(sinkMBB->begin(), MBB,
35239 std::next(MachineBasicBlock::iterator(MI)), MBB->end());
35240 sinkMBB->transferSuccessorsAndUpdatePHIs(MBB);
35241
35242 MachineRegisterInfo &MRI = MF->getRegInfo();
35243 Register DstReg = MI.getOperand(0).getReg();
35244 const TargetRegisterClass *RC = MRI.getRegClass(DstReg);
35245 Register mainDstReg = MRI.createVirtualRegister(RC);
35246 Register fallDstReg = MRI.createVirtualRegister(RC);
35247
35248 // thisMBB:
35249 // xbegin fallMBB
35250 // # fallthrough to mainMBB
35251 // # abortion to fallMBB
35252 BuildMI(thisMBB, DL, TII->get(X86::XBEGIN_4)).addMBB(fallMBB);
35253 thisMBB->addSuccessor(mainMBB);
35254 thisMBB->addSuccessor(fallMBB);
35255
35256 // mainMBB:
35257 // mainDstReg := -1
35258 BuildMI(mainMBB, DL, TII->get(X86::MOV32ri), mainDstReg).addImm(-1);
35259 BuildMI(mainMBB, DL, TII->get(X86::JMP_1)).addMBB(sinkMBB);
35260 mainMBB->addSuccessor(sinkMBB);
35261
35262 // fallMBB:
35263 // ; pseudo instruction to model hardware's definition from XABORT
35264 // EAX := XABORT_DEF
35265 // fallDstReg := EAX
35266 BuildMI(fallMBB, DL, TII->get(X86::XABORT_DEF));
35267 BuildMI(fallMBB, DL, TII->get(TargetOpcode::COPY), fallDstReg)
35268 .addReg(X86::EAX);
35269 fallMBB->addSuccessor(sinkMBB);
35270
35271 // sinkMBB:
35272 // DstReg := phi(mainDstReg/mainBB, fallDstReg/fallBB)
35273 BuildMI(*sinkMBB, sinkMBB->begin(), DL, TII->get(X86::PHI), DstReg)
35274 .addReg(mainDstReg).addMBB(mainMBB)
35275 .addReg(fallDstReg).addMBB(fallMBB);
35276
35277 MI.eraseFromParent();
35278 return sinkMBB;
35279}
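
// For reference, the value this pseudo produces follows the RTM intrinsic
// contract from <immintrin.h> (a sketch, assuming RTM support; the helper
// name is illustrative and not part of the lowering itself):
static unsigned xbeginResultSketch() {
  unsigned Status = _xbegin();     // mainMBB path: returns _XBEGIN_STARTED.
  if (Status == _XBEGIN_STARTED) {
    _xend();                       // Commit the (empty) transaction right away.
    return -1u;                    // Matches the MOV32ri -1 in mainMBB.
  }
  return Status;                   // fallMBB path: abort code the HW left in EAX.
}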
35280
35281MachineBasicBlock *
35282X86TargetLowering::EmitVAARGWithCustomInserter(MachineInstr &MI,
35283 MachineBasicBlock *MBB) const {
35284 // Emit va_arg instruction on X86-64.
35285
35286 // Operands to this pseudo-instruction:
35287 // 0 ) Output : destination address (reg)
35288 // 1-5) Input : va_list address (addr, i64mem)
35289 // 6 ) ArgSize : Size (in bytes) of vararg type
35290 // 7 ) ArgMode : 0=overflow only, 1=use gp_offset, 2=use fp_offset
35291 // 8 ) Align : Alignment of type
35292 // 9 ) EFLAGS (implicit-def)
35293
35294 assert(MI.getNumOperands() == 10 && "VAARG should have 10 operands!");
35295 static_assert(X86::AddrNumOperands == 5, "VAARG assumes 5 address operands");
35296
35297 Register DestReg = MI.getOperand(0).getReg();
35298 MachineOperand &Base = MI.getOperand(1);
35299 MachineOperand &Scale = MI.getOperand(2);
35300 MachineOperand &Index = MI.getOperand(3);
35301 MachineOperand &Disp = MI.getOperand(4);
35302 MachineOperand &Segment = MI.getOperand(5);
35303 unsigned ArgSize = MI.getOperand(6).getImm();
35304 unsigned ArgMode = MI.getOperand(7).getImm();
35305 Align Alignment = Align(MI.getOperand(8).getImm());
35306
35307 MachineFunction *MF = MBB->getParent();
35308
35309 // Memory Reference
35310 assert(MI.hasOneMemOperand() && "Expected VAARG to have one memoperand");
35311
35312 MachineMemOperand *OldMMO = MI.memoperands().front();
35313
35314 // Clone the MMO into two separate MMOs for loading and storing
35315 MachineMemOperand *LoadOnlyMMO = MF->getMachineMemOperand(
35316 OldMMO, OldMMO->getFlags() & ~MachineMemOperand::MOStore);
35317 MachineMemOperand *StoreOnlyMMO = MF->getMachineMemOperand(
35318 OldMMO, OldMMO->getFlags() & ~MachineMemOperand::MOLoad);
35319
35320 // Machine Information
35321 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
35322 MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
35323 const TargetRegisterClass *AddrRegClass =
35324 getRegClassFor(getPointerTy(MBB->getParent()->getDataLayout()));
35325 const TargetRegisterClass *OffsetRegClass = getRegClassFor(MVT::i32);
35326 const DebugLoc &DL = MI.getDebugLoc();
35327
35328 // struct va_list {
35329 // i32 gp_offset
35330 // i32 fp_offset
35331 // i64 overflow_area (address)
35332 // i64 reg_save_area (address)
35333 // }
35334 // sizeof(va_list) = 24
35335 // alignment(va_list) = 8
35336
35337 unsigned TotalNumIntRegs = 6;
35338 unsigned TotalNumXMMRegs = 8;
35339 bool UseGPOffset = (ArgMode == 1);
35340 bool UseFPOffset = (ArgMode == 2);
35341 unsigned MaxOffset = TotalNumIntRegs * 8 +
35342 (UseFPOffset ? TotalNumXMMRegs * 16 : 0);
35343
35344 /* Align ArgSize to a multiple of 8 */
35345 unsigned ArgSizeA8 = (ArgSize + 7) & ~7;
35346 bool NeedsAlign = (Alignment > 8);
35347
35348 MachineBasicBlock *thisMBB = MBB;
35349 MachineBasicBlock *overflowMBB;
35350 MachineBasicBlock *offsetMBB;
35351 MachineBasicBlock *endMBB;
35352
35353 unsigned OffsetDestReg = 0; // Argument address computed by offsetMBB
35354 unsigned OverflowDestReg = 0; // Argument address computed by overflowMBB
35355 unsigned OffsetReg = 0;
35356
35357 if (!UseGPOffset && !UseFPOffset) {
35358 // If we only pull from the overflow region, we don't create a branch.
35359 // We don't need to alter control flow.
35360 OffsetDestReg = 0; // unused
35361 OverflowDestReg = DestReg;
35362
35363 offsetMBB = nullptr;
35364 overflowMBB = thisMBB;
35365 endMBB = thisMBB;
35366 } else {
35367 // First emit code to check if gp_offset (or fp_offset) is below the bound.
35368 // If so, pull the argument from reg_save_area. (branch to offsetMBB)
35369 // If not, pull from overflow_area. (branch to overflowMBB)
35370 //
35371 // thisMBB
35372 // | .
35373 // | .
35374 // offsetMBB overflowMBB
35375 // | .
35376 // | .
35377 // endMBB
35378
35379 // Registers for the PHI in endMBB
35380 OffsetDestReg = MRI.createVirtualRegister(AddrRegClass);
35381 OverflowDestReg = MRI.createVirtualRegister(AddrRegClass);
35382
35383 const BasicBlock *LLVM_BB = MBB->getBasicBlock();
35384 overflowMBB = MF->CreateMachineBasicBlock(LLVM_BB);
35385 offsetMBB = MF->CreateMachineBasicBlock(LLVM_BB);
35386 endMBB = MF->CreateMachineBasicBlock(LLVM_BB);
35387
35388 MachineFunction::iterator MBBIter = ++MBB->getIterator();
35389
35390 // Insert the new basic blocks
35391 MF->insert(MBBIter, offsetMBB);
35392 MF->insert(MBBIter, overflowMBB);
35393 MF->insert(MBBIter, endMBB);
35394
35395 // Transfer the remainder of MBB and its successor edges to endMBB.
35396 endMBB->splice(endMBB->begin(), thisMBB,
35397 std::next(MachineBasicBlock::iterator(MI)), thisMBB->end());
35398 endMBB->transferSuccessorsAndUpdatePHIs(thisMBB);
35399
35400 // Make offsetMBB and overflowMBB successors of thisMBB
35401 thisMBB->addSuccessor(offsetMBB);
35402 thisMBB->addSuccessor(overflowMBB);
35403
35404 // endMBB is a successor of both offsetMBB and overflowMBB
35405 offsetMBB->addSuccessor(endMBB);
35406 overflowMBB->addSuccessor(endMBB);
35407
35408 // Load the offset value into a register
35409 OffsetReg = MRI.createVirtualRegister(OffsetRegClass);
35410 BuildMI(thisMBB, DL, TII->get(X86::MOV32rm), OffsetReg)
35411 .add(Base)
35412 .add(Scale)
35413 .add(Index)
35414 .addDisp(Disp, UseFPOffset ? 4 : 0)
35415 .add(Segment)
35416 .setMemRefs(LoadOnlyMMO);
35417
35418 // Check if there is enough room left to pull this argument.
35419 BuildMI(thisMBB, DL, TII->get(X86::CMP32ri))
35420 .addReg(OffsetReg)
35421 .addImm(MaxOffset + 8 - ArgSizeA8);
35422
35423 // Branch to "overflowMBB" if offset >= max
35424 // Fall through to "offsetMBB" otherwise
35425 BuildMI(thisMBB, DL, TII->get(X86::JCC_1))
35426 .addMBB(overflowMBB).addImm(X86::COND_AE);
35427 }
35428
35429 // In offsetMBB, emit code to use the reg_save_area.
35430 if (offsetMBB) {
35431 assert(OffsetReg != 0);
35432
35433 // Read the reg_save_area address.
35434 Register RegSaveReg = MRI.createVirtualRegister(AddrRegClass);
35435 BuildMI(
35436 offsetMBB, DL,
35437 TII->get(Subtarget.isTarget64BitLP64() ? X86::MOV64rm : X86::MOV32rm),
35438 RegSaveReg)
35439 .add(Base)
35440 .add(Scale)
35441 .add(Index)
35442 .addDisp(Disp, Subtarget.isTarget64BitLP64() ? 16 : 12)
35443 .add(Segment)
35444 .setMemRefs(LoadOnlyMMO);
35445
35446 if (Subtarget.isTarget64BitLP64()) {
35447 // Zero-extend the offset
35448 Register OffsetReg64 = MRI.createVirtualRegister(AddrRegClass);
35449 BuildMI(offsetMBB, DL, TII->get(X86::SUBREG_TO_REG), OffsetReg64)
35450 .addImm(0)
35451 .addReg(OffsetReg)
35452 .addImm(X86::sub_32bit);
35453
35454 // Add the offset to the reg_save_area to get the final address.
35455 BuildMI(offsetMBB, DL, TII->get(X86::ADD64rr), OffsetDestReg)
35456 .addReg(OffsetReg64)
35457 .addReg(RegSaveReg);
35458 } else {
35459 // Add the offset to the reg_save_area to get the final address.
35460 BuildMI(offsetMBB, DL, TII->get(X86::ADD32rr), OffsetDestReg)
35461 .addReg(OffsetReg)
35462 .addReg(RegSaveReg);
35463 }
35464
35465 // Compute the offset for the next argument
35466 Register NextOffsetReg = MRI.createVirtualRegister(OffsetRegClass);
35467 BuildMI(offsetMBB, DL, TII->get(X86::ADD32ri), NextOffsetReg)
35468 .addReg(OffsetReg)
35469 .addImm(UseFPOffset ? 16 : 8);
35470
35471 // Store it back into the va_list.
35472 BuildMI(offsetMBB, DL, TII->get(X86::MOV32mr))
35473 .add(Base)
35474 .add(Scale)
35475 .add(Index)
35476 .addDisp(Disp, UseFPOffset ? 4 : 0)
35477 .add(Segment)
35478 .addReg(NextOffsetReg)
35479 .setMemRefs(StoreOnlyMMO);
35480
35481 // Jump to endMBB
35482 BuildMI(offsetMBB, DL, TII->get(X86::JMP_1))
35483 .addMBB(endMBB);
35484 }
35485
35486 //
35487 // Emit code to use overflow area
35488 //
35489
35490 // Load the overflow_area address into a register.
35491 Register OverflowAddrReg = MRI.createVirtualRegister(AddrRegClass);
35492 BuildMI(overflowMBB, DL,
35493 TII->get(Subtarget.isTarget64BitLP64() ? X86::MOV64rm : X86::MOV32rm),
35494 OverflowAddrReg)
35495 .add(Base)
35496 .add(Scale)
35497 .add(Index)
35498 .addDisp(Disp, 8)
35499 .add(Segment)
35500 .setMemRefs(LoadOnlyMMO);
35501
35502 // If we need to align it, do so. Otherwise, just copy the address
35503 // to OverflowDestReg.
35504 if (NeedsAlign) {
35505 // Align the overflow address
35506 Register TmpReg = MRI.createVirtualRegister(AddrRegClass);
35507
35508 // aligned_addr = (addr + (align-1)) & ~(align-1)
35509 BuildMI(
35510 overflowMBB, DL,
35511 TII->get(Subtarget.isTarget64BitLP64() ? X86::ADD64ri32 : X86::ADD32ri),
35512 TmpReg)
35513 .addReg(OverflowAddrReg)
35514 .addImm(Alignment.value() - 1);
35515
35516 BuildMI(
35517 overflowMBB, DL,
35518 TII->get(Subtarget.isTarget64BitLP64() ? X86::AND64ri32 : X86::AND32ri),
35519 OverflowDestReg)
35520 .addReg(TmpReg)
35521 .addImm(~(uint64_t)(Alignment.value() - 1));
35522 } else {
35523 BuildMI(overflowMBB, DL, TII->get(TargetOpcode::COPY), OverflowDestReg)
35524 .addReg(OverflowAddrReg);
35525 }
35526
35527 // Compute the next overflow address after this argument.
35528 // (the overflow address should be kept 8-byte aligned)
35529 Register NextAddrReg = MRI.createVirtualRegister(AddrRegClass);
35530 BuildMI(
35531 overflowMBB, DL,
35532 TII->get(Subtarget.isTarget64BitLP64() ? X86::ADD64ri32 : X86::ADD32ri),
35533 NextAddrReg)
35534 .addReg(OverflowDestReg)
35535 .addImm(ArgSizeA8);
35536
35537 // Store the new overflow address.
35538 BuildMI(overflowMBB, DL,
35539 TII->get(Subtarget.isTarget64BitLP64() ? X86::MOV64mr : X86::MOV32mr))
35540 .add(Base)
35541 .add(Scale)
35542 .add(Index)
35543 .addDisp(Disp, 8)
35544 .add(Segment)
35545 .addReg(NextAddrReg)
35546 .setMemRefs(StoreOnlyMMO);
35547
35548 // If we branched, emit the PHI to the front of endMBB.
35549 if (offsetMBB) {
35550 BuildMI(*endMBB, endMBB->begin(), DL,
35551 TII->get(X86::PHI), DestReg)
35552 .addReg(OffsetDestReg).addMBB(offsetMBB)
35553 .addReg(OverflowDestReg).addMBB(overflowMBB);
35554 }
35555
35556 // Erase the pseudo instruction
35557 MI.eraseFromParent();
35558
35559 return endMBB;
35560}
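
// The control flow assembled above is the System V AMD64 va_arg algorithm.
// A simplified sketch for an 8-byte, 8-aligned integer argument (ArgMode == 1);
// every name below is illustrative, not part of this file:
struct VaListSketch {
  unsigned GPOffset;     // i32 gp_offset
  unsigned FPOffset;     // i32 fp_offset
  char *OverflowArgArea; // i64 overflow_area
  char *RegSaveArea;     // i64 reg_save_area
};
static void *vaArgGPSketch(VaListSketch &AP) {
  if (AP.GPOffset <= 6 * 8 - 8) {              // Same bound as the CMP32ri above.
    void *Addr = AP.RegSaveArea + AP.GPOffset; // offsetMBB: pull from reg_save_area.
    AP.GPOffset += 8;                          // Bump gp_offset for the next arg.
    return Addr;
  }
  void *Addr = AP.OverflowArgArea;             // overflowMBB: pull from the stack.
  AP.OverflowArgArea += 8;                     // Keep overflow_area 8-byte aligned.
  return Addr;
}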
35561
35562// The EFLAGS operand of SelectItr might be missing a kill marker
35563// because there were multiple uses of EFLAGS, and ISel didn't know
35564// which to mark. Figure out whether SelectItr should have had a
35565// kill marker, and set it if it should. Returns the correct kill
35566// marker value.
35567static bool checkAndUpdateEFLAGSKill(MachineBasicBlock::iterator SelectItr,
35568 MachineBasicBlock* BB,
35569 const TargetRegisterInfo* TRI) {
35570 if (isEFLAGSLiveAfter(SelectItr, BB))
35571 return false;
35572
35573 // We found a def, or hit the end of the basic block and EFLAGS wasn't live
35574 // out. SelectMI should have a kill flag on EFLAGS.
35575 SelectItr->addRegisterKilled(X86::EFLAGS, TRI);
35576 return true;
35577}
35578
35579// Return true if it is OK for this CMOV pseudo-opcode to be cascaded
35580// together with other CMOV pseudo-opcodes into a single basic-block with
35581// conditional jump around it.
35582static bool isCMOVPseudo(MachineInstr &MI) {
35583 switch (MI.getOpcode()) {
35584 case X86::CMOV_FR16:
35585 case X86::CMOV_FR16X:
35586 case X86::CMOV_FR32:
35587 case X86::CMOV_FR32X:
35588 case X86::CMOV_FR64:
35589 case X86::CMOV_FR64X:
35590 case X86::CMOV_GR8:
35591 case X86::CMOV_GR16:
35592 case X86::CMOV_GR32:
35593 case X86::CMOV_RFP32:
35594 case X86::CMOV_RFP64:
35595 case X86::CMOV_RFP80:
35596 case X86::CMOV_VR64:
35597 case X86::CMOV_VR128:
35598 case X86::CMOV_VR128X:
35599 case X86::CMOV_VR256:
35600 case X86::CMOV_VR256X:
35601 case X86::CMOV_VR512:
35602 case X86::CMOV_VK1:
35603 case X86::CMOV_VK2:
35604 case X86::CMOV_VK4:
35605 case X86::CMOV_VK8:
35606 case X86::CMOV_VK16:
35607 case X86::CMOV_VK32:
35608 case X86::CMOV_VK64:
35609 return true;
35610
35611 default:
35612 return false;
35613 }
35614}
35615
35616 // Helper function that inserts PHI nodes into SinkMBB:
35617 //   %Result(i) = phi [ %FalseValue(i), FalseMBB ], [ %TrueValue(i), TrueMBB ],
35618 // where %FalseValue(i) and %TrueValue(i) are taken from the consecutive CMOVs
35619 // in the [MIItBegin, MIItEnd) range. It returns the MachineInstrBuilder for
35620 // the last PHI node inserted.
35621static MachineInstrBuilder createPHIsForCMOVsInSinkBB(
35622 MachineBasicBlock::iterator MIItBegin, MachineBasicBlock::iterator MIItEnd,
35623 MachineBasicBlock *TrueMBB, MachineBasicBlock *FalseMBB,
35624 MachineBasicBlock *SinkMBB) {
35625 MachineFunction *MF = TrueMBB->getParent();
35626 const TargetInstrInfo *TII = MF->getSubtarget().getInstrInfo();
35627 const DebugLoc &DL = MIItBegin->getDebugLoc();
35628
35629 X86::CondCode CC = X86::CondCode(MIItBegin->getOperand(3).getImm());
35630 X86::CondCode OppCC = X86::GetOppositeBranchCondition(CC);
35631
35632 MachineBasicBlock::iterator SinkInsertionPoint = SinkMBB->begin();
35633
35634 // As we are creating the PHIs, we have to be careful if there is more than
35635 // one. Later CMOVs may reference the results of earlier CMOVs, but later
35636 // PHIs have to reference the individual true/false inputs from earlier PHIs.
35637 // That also means that PHI construction must work forward from earlier to
35638 // later, and that the code must maintain a mapping from earlier PHI's
35639 // destination registers, and the registers that went into the PHI.
35640 DenseMap<unsigned, std::pair<unsigned, unsigned>> RegRewriteTable;
35641 MachineInstrBuilder MIB;
35642
35643 for (MachineBasicBlock::iterator MIIt = MIItBegin; MIIt != MIItEnd; ++MIIt) {
35644 Register DestReg = MIIt->getOperand(0).getReg();
35645 Register Op1Reg = MIIt->getOperand(1).getReg();
35646 Register Op2Reg = MIIt->getOperand(2).getReg();
35647
35648 // If this CMOV we are generating is the opposite condition from
35649 // the jump we generated, then we have to swap the operands for the
35650 // PHI that is going to be generated.
35651 if (MIIt->getOperand(3).getImm() == OppCC)
35652 std::swap(Op1Reg, Op2Reg);
35653
35654 if (RegRewriteTable.find(Op1Reg) != RegRewriteTable.end())
35655 Op1Reg = RegRewriteTable[Op1Reg].first;
35656
35657 if (RegRewriteTable.find(Op2Reg) != RegRewriteTable.end())
35658 Op2Reg = RegRewriteTable[Op2Reg].second;
35659
35660 MIB = BuildMI(*SinkMBB, SinkInsertionPoint, DL, TII->get(X86::PHI), DestReg)
35661 .addReg(Op1Reg)
35662 .addMBB(FalseMBB)
35663 .addReg(Op2Reg)
35664 .addMBB(TrueMBB);
35665
35666 // Add this PHI to the rewrite table.
35667 RegRewriteTable[DestReg] = std::make_pair(Op1Reg, Op2Reg);
35668 }
35669
35670 return MIB;
35671}
35672
35673 // Lower cascaded selects of the form (SecondCascadedCMOV (FirstCMOV F, T, cc1), T, cc2).
35674MachineBasicBlock *
35675X86TargetLowering::EmitLoweredCascadedSelect(MachineInstr &FirstCMOV,
35676 MachineInstr &SecondCascadedCMOV,
35677 MachineBasicBlock *ThisMBB) const {
35678 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
35679 const DebugLoc &DL = FirstCMOV.getDebugLoc();
35680
35681 // We lower cascaded CMOVs such as
35682 //
35683 // (SecondCascadedCMOV (FirstCMOV F, T, cc1), T, cc2)
35684 //
35685 // to two successive branches.
35686 //
35687 // Without this, we would add a PHI between the two jumps, which ends up
35688 // creating a few copies all around. For instance, for
35689 //
35690 // (sitofp (zext (fcmp une)))
35691 //
35692 // we would generate:
35693 //
35694 // ucomiss %xmm1, %xmm0
35695 // movss <1.0f>, %xmm0
35696 // movaps %xmm0, %xmm1
35697 // jne .LBB5_2
35698 // xorps %xmm1, %xmm1
35699 // .LBB5_2:
35700 // jp .LBB5_4
35701 // movaps %xmm1, %xmm0
35702 // .LBB5_4:
35703 // retq
35704 //
35705 // because this custom-inserter would have generated:
35706 //
35707 // A
35708 // | \
35709 // | B
35710 // | /
35711 // C
35712 // | \
35713 // | D
35714 // | /
35715 // E
35716 //
35717 // A: X = ...; Y = ...
35718 // B: empty
35719 // C: Z = PHI [X, A], [Y, B]
35720 // D: empty
35721 // E: PHI [X, C], [Z, D]
35722 //
35723 // If we lower both CMOVs in a single step, we can instead generate:
35724 //
35725 // A
35726 // | \
35727 // | C
35728 // | /|
35729 // |/ |
35730 // | |
35731 // | D
35732 // | /
35733 // E
35734 //
35735 // A: X = ...; Y = ...
35736 // D: empty
35737 // E: PHI [X, A], [X, C], [Y, D]
35738 //
35739 // Which, in our sitofp/fcmp example, gives us something like:
35740 //
35741 // ucomiss %xmm1, %xmm0
35742 // movss <1.0f>, %xmm0
35743 // jne .LBB5_4
35744 // jp .LBB5_4
35745 // xorps %xmm0, %xmm0
35746 // .LBB5_4:
35747 // retq
35748 //
35749
35750 // We lower cascaded CMOV into two successive branches to the same block.
35751 // EFLAGS is used by both, so mark it as live in the second.
35752 const BasicBlock *LLVM_BB = ThisMBB->getBasicBlock();
35753 MachineFunction *F = ThisMBB->getParent();
35754 MachineBasicBlock *FirstInsertedMBB = F->CreateMachineBasicBlock(LLVM_BB);
35755 MachineBasicBlock *SecondInsertedMBB = F->CreateMachineBasicBlock(LLVM_BB);
35756 MachineBasicBlock *SinkMBB = F->CreateMachineBasicBlock(LLVM_BB);
35757
35758 MachineFunction::iterator It = ++ThisMBB->getIterator();
35759 F->insert(It, FirstInsertedMBB);
35760 F->insert(It, SecondInsertedMBB);
35761 F->insert(It, SinkMBB);
35762
35763 // For a cascaded CMOV, we lower it to two successive branches to
35764 // the same block (SinkMBB). EFLAGS is used by both, so mark it as live in
35765 // the FirstInsertedMBB.
35766 FirstInsertedMBB->addLiveIn(X86::EFLAGS);
35767
35768 // If the EFLAGS register isn't dead in the terminator, then claim that it's
35769 // live into the sink and copy blocks.
35770 const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
35771 if (!SecondCascadedCMOV.killsRegister(X86::EFLAGS) &&
35772 !checkAndUpdateEFLAGSKill(SecondCascadedCMOV, ThisMBB, TRI)) {
35773 SecondInsertedMBB->addLiveIn(X86::EFLAGS);
35774 SinkMBB->addLiveIn(X86::EFLAGS);
35775 }
35776
35777 // Transfer the remainder of ThisMBB and its successor edges to SinkMBB.
35778 SinkMBB->splice(SinkMBB->begin(), ThisMBB,
35779 std::next(MachineBasicBlock::iterator(FirstCMOV)),
35780 ThisMBB->end());
35781 SinkMBB->transferSuccessorsAndUpdatePHIs(ThisMBB);
35782
35783 // Fallthrough block for ThisMBB.
35784 ThisMBB->addSuccessor(FirstInsertedMBB);
35785 // The true block target of the first branch is always SinkMBB.
35786 ThisMBB->addSuccessor(SinkMBB);
35787 // Fallthrough block for FirstInsertedMBB.
35788 FirstInsertedMBB->addSuccessor(SecondInsertedMBB);
35789 // The true block for the branch of FirstInsertedMBB.
35790 FirstInsertedMBB->addSuccessor(SinkMBB);
35791 // This is fallthrough.
35792 SecondInsertedMBB->addSuccessor(SinkMBB);
35793
35794 // Create the conditional branch instructions.
35795 X86::CondCode FirstCC = X86::CondCode(FirstCMOV.getOperand(3).getImm());
35796 BuildMI(ThisMBB, DL, TII->get(X86::JCC_1)).addMBB(SinkMBB).addImm(FirstCC);
35797
35798 X86::CondCode SecondCC =
35799 X86::CondCode(SecondCascadedCMOV.getOperand(3).getImm());
35800 BuildMI(FirstInsertedMBB, DL, TII->get(X86::JCC_1)).addMBB(SinkMBB).addImm(SecondCC);
35801
35802 // SinkMBB:
35803 // %Result = phi [ %FalseValue, SecondInsertedMBB ], [ %TrueValue, ThisMBB ]
35804 Register DestReg = SecondCascadedCMOV.getOperand(0).getReg();
35805 Register Op1Reg = FirstCMOV.getOperand(1).getReg();
35806 Register Op2Reg = FirstCMOV.getOperand(2).getReg();
35807 MachineInstrBuilder MIB =
35808 BuildMI(*SinkMBB, SinkMBB->begin(), DL, TII->get(X86::PHI), DestReg)
35809 .addReg(Op1Reg)
35810 .addMBB(SecondInsertedMBB)
35811 .addReg(Op2Reg)
35812 .addMBB(ThisMBB);
35813
35814 // The edge from FirstInsertedMBB provides the same incoming value as the
35815 // edge from ThisMBB (the True operand of the SELECT_CC/CMOV nodes).
35816 MIB.addReg(FirstCMOV.getOperand(2).getReg()).addMBB(FirstInsertedMBB);
35817
35818 // Now remove the CMOVs.
35819 FirstCMOV.eraseFromParent();
35820 SecondCascadedCMOV.eraseFromParent();
35821
35822 return SinkMBB;
35823}
35824
35825MachineBasicBlock *
35826X86TargetLowering::EmitLoweredSelect(MachineInstr &MI,
35827 MachineBasicBlock *ThisMBB) const {
35828 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
35829 const DebugLoc &DL = MI.getDebugLoc();
35830
35831 // To "insert" a SELECT_CC instruction, we actually have to insert the
35832 // diamond control-flow pattern. The incoming instruction knows the
35833 // destination vreg to set, the condition code register to branch on, the
35834 // true/false values to select between and a branch opcode to use.
35835
35836 // ThisMBB:
35837 // ...
35838 // TrueVal = ...
35839 // cmpTY ccX, r1, r2
35840 // bCC copy1MBB
35841 // fallthrough --> FalseMBB
35842
35843 // This code lowers all pseudo-CMOV instructions. Generally it lowers these
35844 // as described above, by inserting a BB, and then making a PHI at the join
35845 // point to select the true and false operands of the CMOV in the PHI.
35846 //
35847 // The code also handles two different cases of multiple CMOV opcodes
35848 // in a row.
35849 //
35850 // Case 1:
35851 // In this case, there are multiple CMOVs in a row, all of which are based
35852 // on the same condition setting (or the exact opposite condition setting).
35853 // We can then lower all of the CMOVs using a single inserted BB, and make a
35854 // number of PHIs at the join point to model the CMOVs. The only trickiness
35855 // here is that in a case like:
35856 //
35857 // t2 = CMOV cond1 t1, f1
35858 // t3 = CMOV cond1 t2, f2
35859 //
35860 // when rewriting this into PHIs, we have to perform some renaming on the
35861 // temps since you cannot have a PHI operand refer to a PHI result earlier
35862 // in the same block. The "simple" but wrong lowering would be:
35863 //
35864 // t2 = PHI t1(BB1), f1(BB2)
35865 // t3 = PHI t2(BB1), f2(BB2)
35866 //
35867 // but clearly t2 is not defined in BB1, so that is incorrect. The proper
35868 // renaming is to note that on the path through BB1, t2 is really just a
35869 // copy of t1, and do that renaming, properly generating:
35870 //
35871 // t2 = PHI t1(BB1), f1(BB2)
35872 // t3 = PHI t1(BB1), f2(BB2)
35873 //
35874 // Case 2:
35875 // CMOV ((CMOV F, T, cc1), T, cc2) is checked here and handled by a separate
35876 // function - EmitLoweredCascadedSelect.
35877
35878 X86::CondCode CC = X86::CondCode(MI.getOperand(3).getImm());
35879 X86::CondCode OppCC = X86::GetOppositeBranchCondition(CC);
35880 MachineInstr *LastCMOV = &MI;
35881 MachineBasicBlock::iterator NextMIIt = MachineBasicBlock::iterator(MI);
35882
35883 // Check for case 1, where there are multiple CMOVs with the same condition
35884 // first. Of the two cases of multiple CMOV lowerings, case 1 reduces the
35885 // number of jumps the most.
35886
35887 if (isCMOVPseudo(MI)) {
35888 // See if we have a string of CMOVS with the same condition. Skip over
35889 // intervening debug insts.
35890 while (NextMIIt != ThisMBB->end() && isCMOVPseudo(*NextMIIt) &&
35891 (NextMIIt->getOperand(3).getImm() == CC ||
35892 NextMIIt->getOperand(3).getImm() == OppCC)) {
35893 LastCMOV = &*NextMIIt;
35894 NextMIIt = next_nodbg(NextMIIt, ThisMBB->end());
35895 }
35896 }
35897
35898 // Check for case 2, but only if we didn't already find case 1, as
35899 // indicated by LastCMOV still pointing at MI.
35900 if (LastCMOV == &MI && NextMIIt != ThisMBB->end() &&
35901 NextMIIt->getOpcode() == MI.getOpcode() &&
35902 NextMIIt->getOperand(2).getReg() == MI.getOperand(2).getReg() &&
35903 NextMIIt->getOperand(1).getReg() == MI.getOperand(0).getReg() &&
35904 NextMIIt->getOperand(1).isKill()) {
35905 return EmitLoweredCascadedSelect(MI, *NextMIIt, ThisMBB);
35906 }
35907
35908 const BasicBlock *LLVM_BB = ThisMBB->getBasicBlock();
35909 MachineFunction *F = ThisMBB->getParent();
35910 MachineBasicBlock *FalseMBB = F->CreateMachineBasicBlock(LLVM_BB);
35911 MachineBasicBlock *SinkMBB = F->CreateMachineBasicBlock(LLVM_BB);
35912
35913 MachineFunction::iterator It = ++ThisMBB->getIterator();
35914 F->insert(It, FalseMBB);
35915 F->insert(It, SinkMBB);
35916
35917 // If the EFLAGS register isn't dead in the terminator, then claim that it's
35918 // live into the sink and copy blocks.
35919 const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
35920 if (!LastCMOV->killsRegister(X86::EFLAGS) &&
35921 !checkAndUpdateEFLAGSKill(LastCMOV, ThisMBB, TRI)) {
35922 FalseMBB->addLiveIn(X86::EFLAGS);
35923 SinkMBB->addLiveIn(X86::EFLAGS);
35924 }
35925
35926 // Transfer any debug instructions inside the CMOV sequence to the sunk block.
35927 auto DbgRange = llvm::make_range(MachineBasicBlock::iterator(MI),
35928 MachineBasicBlock::iterator(LastCMOV));
35929 for (MachineInstr &MI : llvm::make_early_inc_range(DbgRange))
35930 if (MI.isDebugInstr())
35931 SinkMBB->push_back(MI.removeFromParent());
35932
35933 // Transfer the remainder of ThisMBB and its successor edges to SinkMBB.
35934 SinkMBB->splice(SinkMBB->end(), ThisMBB,
35935 std::next(MachineBasicBlock::iterator(LastCMOV)),
35936 ThisMBB->end());
35937 SinkMBB->transferSuccessorsAndUpdatePHIs(ThisMBB);
35938
35939 // Fallthrough block for ThisMBB.
35940 ThisMBB->addSuccessor(FalseMBB);
35941 // The true block target of the first (or only) branch is always a SinkMBB.
35942 ThisMBB->addSuccessor(SinkMBB);
35943 // Fallthrough block for FalseMBB.
35944 FalseMBB->addSuccessor(SinkMBB);
35945
35946 // Create the conditional branch instruction.
35947 BuildMI(ThisMBB, DL, TII->get(X86::JCC_1)).addMBB(SinkMBB).addImm(CC);
35948
35949 // SinkMBB:
35950 // %Result = phi [ %FalseValue, FalseMBB ], [ %TrueValue, ThisMBB ]
35951 // ...
35952 MachineBasicBlock::iterator MIItBegin = MachineBasicBlock::iterator(MI);
35953 MachineBasicBlock::iterator MIItEnd =
35954 std::next(MachineBasicBlock::iterator(LastCMOV));
35955 createPHIsForCMOVsInSinkBB(MIItBegin, MIItEnd, ThisMBB, FalseMBB, SinkMBB);
35956
35957 // Now remove the CMOV(s).
35958 ThisMBB->erase(MIItBegin, MIItEnd);
35959
35960 return SinkMBB;
35961}
35962
35963static unsigned getSUBriOpcode(bool IsLP64, int64_t Imm) {
35964 if (IsLP64) {
35965 if (isInt<8>(Imm))
35966 return X86::SUB64ri8;
35967 return X86::SUB64ri32;
35968 } else {
35969 if (isInt<8>(Imm))
35970 return X86::SUB32ri8;
35971 return X86::SUB32ri;
35972 }
35973}
35974
35975MachineBasicBlock *
35976X86TargetLowering::EmitLoweredProbedAlloca(MachineInstr &MI,
35977 MachineBasicBlock *MBB) const {
35978 MachineFunction *MF = MBB->getParent();
35979 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
35980 const X86FrameLowering &TFI = *Subtarget.getFrameLowering();
35981 const DebugLoc &DL = MI.getDebugLoc();
35982 const BasicBlock *LLVM_BB = MBB->getBasicBlock();
35983
35984 const unsigned ProbeSize = getStackProbeSize(*MF);
35985
35986 MachineRegisterInfo &MRI = MF->getRegInfo();
35987 MachineBasicBlock *testMBB = MF->CreateMachineBasicBlock(LLVM_BB);
35988 MachineBasicBlock *tailMBB = MF->CreateMachineBasicBlock(LLVM_BB);
35989 MachineBasicBlock *blockMBB = MF->CreateMachineBasicBlock(LLVM_BB);
35990
35991 MachineFunction::iterator MBBIter = ++MBB->getIterator();
35992 MF->insert(MBBIter, testMBB);
35993 MF->insert(MBBIter, blockMBB);
35994 MF->insert(MBBIter, tailMBB);
35995
35996 Register sizeVReg = MI.getOperand(1).getReg();
35997
35998 Register physSPReg = TFI.Uses64BitFramePtr ? X86::RSP : X86::ESP;
35999
36000 Register TmpStackPtr = MRI.createVirtualRegister(
36001 TFI.Uses64BitFramePtr ? &X86::GR64RegClass : &X86::GR32RegClass);
36002 Register FinalStackPtr = MRI.createVirtualRegister(
36003 TFI.Uses64BitFramePtr ? &X86::GR64RegClass : &X86::GR32RegClass);
36004
36005 BuildMI(*MBB, {MI}, DL, TII->get(TargetOpcode::COPY), TmpStackPtr)
36006 .addReg(physSPReg);
36007 {
36008 const unsigned Opc = TFI.Uses64BitFramePtr ? X86::SUB64rr : X86::SUB32rr;
36009 BuildMI(*MBB, {MI}, DL, TII->get(Opc), FinalStackPtr)
36010 .addReg(TmpStackPtr)
36011 .addReg(sizeVReg);
36012 }
36013
36014 // test rsp size
36015
36016 BuildMI(testMBB, DL,
36017 TII->get(TFI.Uses64BitFramePtr ? X86::CMP64rr : X86::CMP32rr))
36018 .addReg(FinalStackPtr)
36019 .addReg(physSPReg);
36020
36021 BuildMI(testMBB, DL, TII->get(X86::JCC_1))
36022 .addMBB(tailMBB)
36023 .addImm(X86::COND_GE);
36024 testMBB->addSuccessor(blockMBB);
36025 testMBB->addSuccessor(tailMBB);
36026
36027 // Touch the block, then extend it. This is the opposite order from the
36028 // static probe, which allocates and then touches; doing it this way avoids
36029 // having to probe the tail of the static alloca. Possible scenarios are:
36030 //
36031 // + ---- <- ------------ <- ------------- <- ------------ +
36032 // | |
36033 // [free probe] -> [page alloc] -> [alloc probe] -> [tail alloc] + -> [dyn probe] -> [page alloc] -> [dyn probe] -> [tail alloc] +
36034 // | |
36035 // + <- ----------- <- ------------ <- ----------- <- ------------ +
36036 //
36037 // The property we want to enforce is to never have more than [page alloc] between two probes.
36038
36039 const unsigned XORMIOpc =
36040 TFI.Uses64BitFramePtr ? X86::XOR64mi8 : X86::XOR32mi8;
36041 addRegOffset(BuildMI(blockMBB, DL, TII->get(XORMIOpc)), physSPReg, false, 0)
36042 .addImm(0);
36043
36044 BuildMI(blockMBB, DL,
36045 TII->get(getSUBriOpcode(TFI.Uses64BitFramePtr, ProbeSize)), physSPReg)
36046 .addReg(physSPReg)
36047 .addImm(ProbeSize);
36048
36049
36050 BuildMI(blockMBB, DL, TII->get(X86::JMP_1)).addMBB(testMBB);
36051 blockMBB->addSuccessor(testMBB);
36052
36053 // Replace original instruction by the expected stack ptr
36054 BuildMI(tailMBB, DL, TII->get(TargetOpcode::COPY), MI.getOperand(0).getReg())
36055 .addReg(FinalStackPtr);
36056
36057 tailMBB->splice(tailMBB->end(), MBB,
36058 std::next(MachineBasicBlock::iterator(MI)), MBB->end());
36059 tailMBB->transferSuccessorsAndUpdatePHIs(MBB);
36060 MBB->addSuccessor(testMBB);
36061
36062 // Delete the original pseudo instruction.
36063 MI.eraseFromParent();
36064
36065 // And we're done.
36066 return tailMBB;
36067}
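
// The three blocks built above form a "touch, then extend" loop. An equivalent
// sketch over plain integers (assuming <cstdint>; probedAllocaSketch and Touch
// are illustrative stand-ins, with Touch modeling the xor-to-[SP] page probe):
static uintptr_t probedAllocaSketch(uintptr_t SP, uint64_t Size,
                                    uint64_t ProbeSize,
                                    void (*Touch)(uintptr_t)) {
  uintptr_t Final = SP - Size;               // FinalStackPtr
  while ((intptr_t)SP > (intptr_t)Final) {   // testMBB: CMP + JGE to tailMBB.
    Touch(SP);                               // blockMBB: xor dword ptr [SP], 0.
    SP -= ProbeSize;                         // Never leave a whole page unprobed.
  }
  return Final;                              // tailMBB: the result is FinalStackPtr.
}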
36068
36069MachineBasicBlock *
36070X86TargetLowering::EmitLoweredSegAlloca(MachineInstr &MI,
36071 MachineBasicBlock *BB) const {
36072 MachineFunction *MF = BB->getParent();
36073 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
36074 const DebugLoc &DL = MI.getDebugLoc();
36075 const BasicBlock *LLVM_BB = BB->getBasicBlock();
36076
36077 assert(MF->shouldSplitStack());
36078
36079 const bool Is64Bit = Subtarget.is64Bit();
36080 const bool IsLP64 = Subtarget.isTarget64BitLP64();
36081
36082 const unsigned TlsReg = Is64Bit ? X86::FS : X86::GS;
36083 const unsigned TlsOffset = IsLP64 ? 0x70 : Is64Bit ? 0x40 : 0x30;
36084
36085 // BB:
36086 // ... [Till the alloca]
36087 // If stacklet is not large enough, jump to mallocMBB
36088 //
36089 // bumpMBB:
36090 // Allocate by subtracting from RSP
36091 // Jump to continueMBB
36092 //
36093 // mallocMBB:
36094 // Allocate by call to runtime
36095 //
36096 // continueMBB:
36097 // ...
36098 // [rest of original BB]
36099 //
36100
36101 MachineBasicBlock *mallocMBB = MF->CreateMachineBasicBlock(LLVM_BB);
36102 MachineBasicBlock *bumpMBB = MF->CreateMachineBasicBlock(LLVM_BB);
36103 MachineBasicBlock *continueMBB = MF->CreateMachineBasicBlock(LLVM_BB);
36104
36105 MachineRegisterInfo &MRI = MF->getRegInfo();
36106 const TargetRegisterClass *AddrRegClass =
36107 getRegClassFor(getPointerTy(MF->getDataLayout()));
36108
36109 Register mallocPtrVReg = MRI.createVirtualRegister(AddrRegClass),
36110 bumpSPPtrVReg = MRI.createVirtualRegister(AddrRegClass),
36111 tmpSPVReg = MRI.createVirtualRegister(AddrRegClass),
36112 SPLimitVReg = MRI.createVirtualRegister(AddrRegClass),
36113 sizeVReg = MI.getOperand(1).getReg(),
36114 physSPReg =
36115 IsLP64 || Subtarget.isTargetNaCl64() ? X86::RSP : X86::ESP;
36116
36117 MachineFunction::iterator MBBIter = ++BB->getIterator();
36118
36119 MF->insert(MBBIter, bumpMBB);
36120 MF->insert(MBBIter, mallocMBB);
36121 MF->insert(MBBIter, continueMBB);
36122
36123 continueMBB->splice(continueMBB->begin(), BB,
36124 std::next(MachineBasicBlock::iterator(MI)), BB->end());
36125 continueMBB->transferSuccessorsAndUpdatePHIs(BB);
36126
36127 // Add code to the main basic block to check if the stack limit has been hit,
36128 // and if so, jump to mallocMBB otherwise to bumpMBB.
36129 BuildMI(BB, DL, TII->get(TargetOpcode::COPY), tmpSPVReg).addReg(physSPReg);
36130 BuildMI(BB, DL, TII->get(IsLP64 ? X86::SUB64rr:X86::SUB32rr), SPLimitVReg)
36131 .addReg(tmpSPVReg).addReg(sizeVReg);
36132 BuildMI(BB, DL, TII->get(IsLP64 ? X86::CMP64mr:X86::CMP32mr))
36133 .addReg(0).addImm(1).addReg(0).addImm(TlsOffset).addReg(TlsReg)
36134 .addReg(SPLimitVReg);
36135 BuildMI(BB, DL, TII->get(X86::JCC_1)).addMBB(mallocMBB).addImm(X86::COND_G);
36136
36137 // bumpMBB simply decreases the stack pointer, since we know the current
36138 // stacklet has enough space.
36139 BuildMI(bumpMBB, DL, TII->get(TargetOpcode::COPY), physSPReg)
36140 .addReg(SPLimitVReg);
36141 BuildMI(bumpMBB, DL, TII->get(TargetOpcode::COPY), bumpSPPtrVReg)
36142 .addReg(SPLimitVReg);
36143 BuildMI(bumpMBB, DL, TII->get(X86::JMP_1)).addMBB(continueMBB);
36144
36145 // Calls into a routine in libgcc to allocate more space from the heap.
36146 const uint32_t *RegMask =
36147 Subtarget.getRegisterInfo()->getCallPreservedMask(*MF, CallingConv::C);
36148 if (IsLP64) {
36149 BuildMI(mallocMBB, DL, TII->get(X86::MOV64rr), X86::RDI)
36150 .addReg(sizeVReg);
36151 BuildMI(mallocMBB, DL, TII->get(X86::CALL64pcrel32))
36152 .addExternalSymbol("__morestack_allocate_stack_space")
36153 .addRegMask(RegMask)
36154 .addReg(X86::RDI, RegState::Implicit)
36155 .addReg(X86::RAX, RegState::ImplicitDefine);
36156 } else if (Is64Bit) {
36157 BuildMI(mallocMBB, DL, TII->get(X86::MOV32rr), X86::EDI)
36158 .addReg(sizeVReg);
36159 BuildMI(mallocMBB, DL, TII->get(X86::CALL64pcrel32))
36160 .addExternalSymbol("__morestack_allocate_stack_space")
36161 .addRegMask(RegMask)
36162 .addReg(X86::EDI, RegState::Implicit)
36163 .addReg(X86::EAX, RegState::ImplicitDefine);
36164 } else {
36165 BuildMI(mallocMBB, DL, TII->get(X86::SUB32ri), physSPReg).addReg(physSPReg)
36166 .addImm(12);
36167 BuildMI(mallocMBB, DL, TII->get(X86::PUSH32r)).addReg(sizeVReg);
36168 BuildMI(mallocMBB, DL, TII->get(X86::CALLpcrel32))
36169 .addExternalSymbol("__morestack_allocate_stack_space")
36170 .addRegMask(RegMask)
36171 .addReg(X86::EAX, RegState::ImplicitDefine);
36172 }
36173
36174 if (!Is64Bit)
36175 BuildMI(mallocMBB, DL, TII->get(X86::ADD32ri), physSPReg).addReg(physSPReg)
36176 .addImm(16);
36177
36178 BuildMI(mallocMBB, DL, TII->get(TargetOpcode::COPY), mallocPtrVReg)
36179 .addReg(IsLP64 ? X86::RAX : X86::EAX);
36180 BuildMI(mallocMBB, DL, TII->get(X86::JMP_1)).addMBB(continueMBB);
36181
36182 // Set up the CFG correctly.
36183 BB->addSuccessor(bumpMBB);
36184 BB->addSuccessor(mallocMBB);
36185 mallocMBB->addSuccessor(continueMBB);
36186 bumpMBB->addSuccessor(continueMBB);
36187
36188 // Take care of the PHI nodes.
36189 BuildMI(*continueMBB, continueMBB->begin(), DL, TII->get(X86::PHI),
36190 MI.getOperand(0).getReg())
36191 .addReg(mallocPtrVReg)
36192 .addMBB(mallocMBB)
36193 .addReg(bumpSPPtrVReg)
36194 .addMBB(bumpMBB);
36195
36196 // Delete the original pseudo instruction.
36197 MI.eraseFromParent();
36198
36199 // And we're done.
36200 return continueMBB;
36201}
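As an aid to reading the three blocks built above, here is the same stacklet decision written as a standalone C++ sketch. It is an illustration of the lowering's intent, not code from this file: the declaration of __morestack_allocate_stack_space (the libgcc routine named in the call above) is an assumption for the sketch, and "TlsStackLimit" stands for the per-thread limit that the real code reads with CMP64mr/CMP32mr at %fs:0x70 (LP64), %fs:0x40 (x32) or %gs:0x30 (ia32), per the TlsOffset selection at line 36083.

extern "C" void *__morestack_allocate_stack_space(unsigned long Size); // assumed signature

static char *segAllocaSketch(char *SP, long Size, char *TlsStackLimit) {
  char *Candidate = SP - Size;                  // SUB64rr/SUB32rr
  if (TlsStackLimit > Candidate)                // JCC_1 with COND_G -> mallocMBB
    return static_cast<char *>(__morestack_allocate_stack_space(Size));
  return Candidate;                             // bumpMBB: SP = Candidate
}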
36202
36203MachineBasicBlock *
36204X86TargetLowering::EmitLoweredCatchRet(MachineInstr &MI,
36205 MachineBasicBlock *BB) const {
36206 MachineFunction *MF = BB->getParent();
36207 const TargetInstrInfo &TII = *Subtarget.getInstrInfo();
36208 MachineBasicBlock *TargetMBB = MI.getOperand(0).getMBB();
36209 const DebugLoc &DL = MI.getDebugLoc();
36210
36211 assert(!isAsynchronousEHPersonality(
36212 classifyEHPersonality(MF->getFunction().getPersonalityFn())) &&
36213 "SEH does not use catchret!");
36214
36215 // Only 32-bit EH needs to worry about manually restoring stack pointers.
36216 if (!Subtarget.is32Bit())
36217 return BB;
36218
36219 // C++ EH creates a new target block to hold the restore code, and wires up
36220 // the new block to the return destination with a normal JMP_4.
36221 MachineBasicBlock *RestoreMBB =
36222 MF->CreateMachineBasicBlock(BB->getBasicBlock());
36223 assert(BB->succ_size() == 1);
36224 MF->insert(std::next(BB->getIterator()), RestoreMBB);
36225 RestoreMBB->transferSuccessorsAndUpdatePHIs(BB);
36226 BB->addSuccessor(RestoreMBB);
36227 MI.getOperand(0).setMBB(RestoreMBB);
36228
36229 // Marking this as an EH pad but not a funclet entry block causes PEI to
36230 // restore stack pointers in the block.
36231 RestoreMBB->setIsEHPad(true);
36232
36233 auto RestoreMBBI = RestoreMBB->begin();
36234 BuildMI(*RestoreMBB, RestoreMBBI, DL, TII.get(X86::JMP_4)).addMBB(TargetMBB);
36235 return BB;
36236}
36237
36238MachineBasicBlock *
36239X86TargetLowering::EmitLoweredTLSAddr(MachineInstr &MI,
36240 MachineBasicBlock *BB) const {
36241 // So, here we replace TLSADDR with the sequence:
36242 // adjust_stackdown -> TLSADDR -> adjust_stackup.
36243 // We need this because TLSADDR is lowered into calls
36244 // inside MC, therefore without the two markers shrink-wrapping
36245 // may push the prologue/epilogue past them.
36246 const TargetInstrInfo &TII = *Subtarget.getInstrInfo();
36247 const DebugLoc &DL = MI.getDebugLoc();
36248 MachineFunction &MF = *BB->getParent();
36249
36250 // Emit CALLSEQ_START right before the instruction.
36251 unsigned AdjStackDown = TII.getCallFrameSetupOpcode();
36252 MachineInstrBuilder CallseqStart =
36253 BuildMI(MF, DL, TII.get(AdjStackDown)).addImm(0).addImm(0).addImm(0);
36254 BB->insert(MachineBasicBlock::iterator(MI), CallseqStart);
36255
36256 // Emit CALLSEQ_END right after the instruction.
36257 // We don't call erase from parent because we want to keep the
36258 // original instruction around.
36259 unsigned AdjStackUp = TII.getCallFrameDestroyOpcode();
36260 MachineInstrBuilder CallseqEnd =
36261 BuildMI(MF, DL, TII.get(AdjStackUp)).addImm(0).addImm(0);
36262 BB->insertAfter(MachineBasicBlock::iterator(MI), CallseqEnd);
36263
36264 return BB;
36265}
36266
36267MachineBasicBlock *
36268X86TargetLowering::EmitLoweredTLSCall(MachineInstr &MI,
36269 MachineBasicBlock *BB) const {
36270 // This is pretty easy. We're taking the value that we received from
36271 // our load from the relocation, sticking it in either RDI (x86-64)
36272 // or EAX and doing an indirect call. The return value will then
36273 // be in the normal return register.
36274 MachineFunction *F = BB->getParent();
36275 const X86InstrInfo *TII = Subtarget.getInstrInfo();
36276 const DebugLoc &DL = MI.getDebugLoc();
36277
36278 assert(Subtarget.isTargetDarwin() && "Darwin only instr emitted?");
36279 assert(MI.getOperand(3).isGlobal() && "This should be a global");
36280
36281 // Get a register mask for the lowered call.
36282 // FIXME: The 32-bit calls have non-standard calling conventions. Use a
36283 // proper register mask.
36284 const uint32_t *RegMask =
36285 Subtarget.is64Bit() ?
36286 Subtarget.getRegisterInfo()->getDarwinTLSCallPreservedMask() :
36287 Subtarget.getRegisterInfo()->getCallPreservedMask(*F, CallingConv::C);
36288 if (Subtarget.is64Bit()) {
36289 MachineInstrBuilder MIB =
36290 BuildMI(*BB, MI, DL, TII->get(X86::MOV64rm), X86::RDI)
36291 .addReg(X86::RIP)
36292 .addImm(0)
36293 .addReg(0)
36294 .addGlobalAddress(MI.getOperand(3).getGlobal(), 0,
36295 MI.getOperand(3).getTargetFlags())
36296 .addReg(0);
36297 MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL64m));
36298 addDirectMem(MIB, X86::RDI);
36299 MIB.addReg(X86::RAX, RegState::ImplicitDefine).addRegMask(RegMask);
36300 } else if (!isPositionIndependent()) {
36301 MachineInstrBuilder MIB =
36302 BuildMI(*BB, MI, DL, TII->get(X86::MOV32rm), X86::EAX)
36303 .addReg(0)
36304 .addImm(0)
36305 .addReg(0)
36306 .addGlobalAddress(MI.getOperand(3).getGlobal(), 0,
36307 MI.getOperand(3).getTargetFlags())
36308 .addReg(0);
36309 MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL32m));
36310 addDirectMem(MIB, X86::EAX);
36311 MIB.addReg(X86::EAX, RegState::ImplicitDefine).addRegMask(RegMask);
36312 } else {
36313 MachineInstrBuilder MIB =
36314 BuildMI(*BB, MI, DL, TII->get(X86::MOV32rm), X86::EAX)
36315 .addReg(TII->getGlobalBaseReg(F))
36316 .addImm(0)
36317 .addReg(0)
36318 .addGlobalAddress(MI.getOperand(3).getGlobal(), 0,
36319 MI.getOperand(3).getTargetFlags())
36320 .addReg(0);
36321 MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL32m));
36322 addDirectMem(MIB, X86::EAX);
36323 MIB.addReg(X86::EAX, RegState::ImplicitDefine).addRegMask(RegMask);
36324 }
36325
36326 MI.eraseFromParent(); // The pseudo instruction is gone now.
36327 return BB;
36328}
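For context, the 64-bit branch above loads the address of a Darwin thread-local-variable descriptor into RDI and then calls through the descriptor's first slot (the addDirectMem call). A minimal model of that convention follows; the struct and field names are invented for illustration and this is not a definition from this file or from system headers.

struct TLVDescriptorSketch {
  void *(*Thunk)(TLVDescriptorSketch *Self); // CALL64m/CALL32m calls through this slot
  unsigned long Key;                         // opaque to the code generator
  unsigned long Offset;                      // offset within the thread's TLV block
};

// What the emitted MOV64rm + CALL64m pair amounts to: the variable's address
// comes back in the normal return register (RAX/EAX).
static void *tlvAccessSketch(TLVDescriptorSketch *Descriptor) {
  return Descriptor->Thunk(Descriptor);
}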
36329
36330static unsigned getOpcodeForIndirectThunk(unsigned RPOpc) {
36331 switch (RPOpc) {
36332 case X86::INDIRECT_THUNK_CALL32:
36333 return X86::CALLpcrel32;
36334 case X86::INDIRECT_THUNK_CALL64:
36335 return X86::CALL64pcrel32;
36336 case X86::INDIRECT_THUNK_TCRETURN32:
36337 return X86::TCRETURNdi;
36338 case X86::INDIRECT_THUNK_TCRETURN64:
36339 return X86::TCRETURNdi64;
36340 }
36341 llvm_unreachable("not indirect thunk opcode")::llvm::llvm_unreachable_internal("not indirect thunk opcode"
, "llvm/lib/Target/X86/X86ISelLowering.cpp", 36341)
;
36342}
36343
36344static const char *getIndirectThunkSymbol(const X86Subtarget &Subtarget,
36345 unsigned Reg) {
36346 if (Subtarget.useRetpolineExternalThunk()) {
36347 // When using an external thunk for retpolines, we pick names that match the
36348 // names GCC happens to use as well. This helps simplify the implementation
36349 // of the thunks for kernels where they have no easy ability to create
36350 // aliases and are doing non-trivial configuration of the thunk's body. For
36351 // example, the Linux kernel will do boot-time hot patching of the thunk
36352 // bodies and cannot easily export aliases of these to loaded modules.
36353 //
36354 // Note that at any point in the future, we may need to change the semantics
36355 // of how we implement retpolines and at that time will likely change the
36356 // name of the called thunk. Essentially, there is no hard guarantee that
36357 // LLVM will generate calls to specific thunks, we merely make a best-effort
36358 // attempt to help out kernels and other systems where duplicating the
36359 // thunks is costly.
36360 switch (Reg) {
36361 case X86::EAX:
36362 assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
36363 return "__x86_indirect_thunk_eax";
36364 case X86::ECX:
36365 assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
36366 return "__x86_indirect_thunk_ecx";
36367 case X86::EDX:
36368 assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
36369 return "__x86_indirect_thunk_edx";
36370 case X86::EDI:
36371 assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
36372 return "__x86_indirect_thunk_edi";
36373 case X86::R11:
36374 assert(Subtarget.is64Bit() && "Should not be using a 64-bit thunk!");
36375 return "__x86_indirect_thunk_r11";
36376 }
36377 llvm_unreachable("unexpected reg for external indirect thunk")::llvm::llvm_unreachable_internal("unexpected reg for external indirect thunk"
, "llvm/lib/Target/X86/X86ISelLowering.cpp", 36377)
;
36378 }
36379
36380 if (Subtarget.useRetpolineIndirectCalls() ||
36381 Subtarget.useRetpolineIndirectBranches()) {
36382 // When targeting an internal COMDAT thunk use an LLVM-specific name.
36383 switch (Reg) {
36384 case X86::EAX:
36385 assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
36386 return "__llvm_retpoline_eax";
36387 case X86::ECX:
36388 assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
36389 return "__llvm_retpoline_ecx";
36390 case X86::EDX:
36391 assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
36392 return "__llvm_retpoline_edx";
36393 case X86::EDI:
36394 assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
36395 return "__llvm_retpoline_edi";
36396 case X86::R11:
36397 assert(Subtarget.is64Bit() && "Should not be using a 64-bit thunk!");
36398 return "__llvm_retpoline_r11";
36399 }
36400 llvm_unreachable("unexpected reg for retpoline")::llvm::llvm_unreachable_internal("unexpected reg for retpoline"
, "llvm/lib/Target/X86/X86ISelLowering.cpp", 36400)
;
36401 }
36402
36403 if (Subtarget.useLVIControlFlowIntegrity()) {
36404 assert(Subtarget.is64Bit() && "Should not be using a 64-bit thunk!");
36405 return "__llvm_lvi_thunk_r11";
36406 }
36407 llvm_unreachable("getIndirectThunkSymbol() invoked without thunk feature")::llvm::llvm_unreachable_internal("getIndirectThunkSymbol() invoked without thunk feature"
, "llvm/lib/Target/X86/X86ISelLowering.cpp", 36407)
;
36408}
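The switches above only ever combine a fixed prefix with the scratch register's name. A trivial standalone sketch of that naming convention follows; the helper name is invented here and this is not the LLVM helper itself.

#include <string>

static std::string thunkNameSketch(const std::string &RegName, bool External) {
  // External (GCC-compatible) thunks: __x86_indirect_thunk_r11, ..._eax, ...
  // Internal COMDAT retpolines:       __llvm_retpoline_r11, ..._eax, ...
  return (External ? "__x86_indirect_thunk_" : "__llvm_retpoline_") + RegName;
}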
36409
36410MachineBasicBlock *
36411X86TargetLowering::EmitLoweredIndirectThunk(MachineInstr &MI,
36412 MachineBasicBlock *BB) const {
36413 // Copy the virtual register into the R11 physical register and
36414 // call the retpoline thunk.
36415 const DebugLoc &DL = MI.getDebugLoc();
36416 const X86InstrInfo *TII = Subtarget.getInstrInfo();
36417 Register CalleeVReg = MI.getOperand(0).getReg();
36418 unsigned Opc = getOpcodeForIndirectThunk(MI.getOpcode());
36419
36420 // Find an available scratch register to hold the callee. On 64-bit, we can
36421 // just use R11, but we scan for uses anyway to ensure we don't generate
36422 // incorrect code. On 32-bit, we use one of EAX, ECX, or EDX that isn't
36423 // already a register use operand to the call to hold the callee. If none
36424 // are available, use EDI instead. EDI is chosen because EBX is the PIC base
36425 // register and ESI is the base pointer to realigned stack frames with VLAs.
36426 SmallVector<unsigned, 3> AvailableRegs;
36427 if (Subtarget.is64Bit())
36428 AvailableRegs.push_back(X86::R11);
36429 else
36430 AvailableRegs.append({X86::EAX, X86::ECX, X86::EDX, X86::EDI});
36431
36432 // Zero out any registers that are already used.
36433 for (const auto &MO : MI.operands()) {
36434 if (MO.isReg() && MO.isUse())
36435 for (unsigned &Reg : AvailableRegs)
36436 if (Reg == MO.getReg())
36437 Reg = 0;
36438 }
36439
36440 // Choose the first remaining non-zero available register.
36441 unsigned AvailableReg = 0;
36442 for (unsigned MaybeReg : AvailableRegs) {
36443 if (MaybeReg) {
36444 AvailableReg = MaybeReg;
36445 break;
36446 }
36447 }
36448 if (!AvailableReg)
36449 report_fatal_error("calling convention incompatible with retpoline, no "
36450 "available registers");
36451
36452 const char *Symbol = getIndirectThunkSymbol(Subtarget, AvailableReg);
36453
36454 BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), AvailableReg)
36455 .addReg(CalleeVReg);
36456 MI.getOperand(0).ChangeToES(Symbol);
36457 MI.setDesc(TII->get(Opc));
36458 MachineInstrBuilder(*BB->getParent(), &MI)
36459 .addReg(AvailableReg, RegState::Implicit | RegState::Kill);
36460 return BB;
36461}
36462
36463/// SetJmp implies future control flow change upon calling the corresponding
36464/// LongJmp.
36465/// Instead of using the 'return' instruction, the long jump fixes the stack and
36466/// performs an indirect branch. To do so it uses the registers that were stored
36467/// in the jump buffer (when calling SetJmp).
36468/// In case the shadow stack is enabled we need to fix it as well, because some
36469/// return addresses will be skipped.
36470/// The function will save the SSP for future fixing in the function
36471/// emitLongJmpShadowStackFix.
36472/// \sa emitLongJmpShadowStackFix
36473/// \param [in] MI The temporary Machine Instruction for the builtin.
36474/// \param [in] MBB The Machine Basic Block that will be modified.
36475void X86TargetLowering::emitSetJmpShadowStackFix(MachineInstr &MI,
36476 MachineBasicBlock *MBB) const {
36477 const DebugLoc &DL = MI.getDebugLoc();
36478 MachineFunction *MF = MBB->getParent();
36479 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
36480 MachineRegisterInfo &MRI = MF->getRegInfo();
36481 MachineInstrBuilder MIB;
36482
36483 // Memory Reference.
36484 SmallVector<MachineMemOperand *, 2> MMOs(MI.memoperands_begin(),
36485 MI.memoperands_end());
36486
36487 // Initialize a register with zero.
36488 MVT PVT = getPointerTy(MF->getDataLayout());
36489 const TargetRegisterClass *PtrRC = getRegClassFor(PVT);
36490 Register ZReg = MRI.createVirtualRegister(PtrRC);
36491 unsigned XorRROpc = (PVT == MVT::i64) ? X86::XOR64rr : X86::XOR32rr;
36492 BuildMI(*MBB, MI, DL, TII->get(XorRROpc))
36493 .addDef(ZReg)
36494 .addReg(ZReg, RegState::Undef)
36495 .addReg(ZReg, RegState::Undef);
36496
36497 // Read the current SSP Register value to the zeroed register.
36498 Register SSPCopyReg = MRI.createVirtualRegister(PtrRC);
36499 unsigned RdsspOpc = (PVT == MVT::i64) ? X86::RDSSPQ : X86::RDSSPD;
36500 BuildMI(*MBB, MI, DL, TII->get(RdsspOpc), SSPCopyReg).addReg(ZReg);
36501
36502 // Write the SSP register value to offset 3 in input memory buffer.
36503 unsigned PtrStoreOpc = (PVT == MVT::i64) ? X86::MOV64mr : X86::MOV32mr;
36504 MIB = BuildMI(*MBB, MI, DL, TII->get(PtrStoreOpc));
36505 const int64_t SSPOffset = 3 * PVT.getStoreSize();
36506 const unsigned MemOpndSlot = 1;
36507 for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
36508 if (i == X86::AddrDisp)
36509 MIB.addDisp(MI.getOperand(MemOpndSlot + i), SSPOffset);
36510 else
36511 MIB.add(MI.getOperand(MemOpndSlot + i));
36512 }
36513 MIB.addReg(SSPCopyReg);
36514 MIB.setMemRefs(MMOs);
36515}
36516
36517MachineBasicBlock *
36518X86TargetLowering::emitEHSjLjSetJmp(MachineInstr &MI,
36519 MachineBasicBlock *MBB) const {
36520 const DebugLoc &DL = MI.getDebugLoc();
36521 MachineFunction *MF = MBB->getParent();
36522 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
36523 const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
36524 MachineRegisterInfo &MRI = MF->getRegInfo();
36525
36526 const BasicBlock *BB = MBB->getBasicBlock();
36527 MachineFunction::iterator I = ++MBB->getIterator();
36528
36529 // Memory Reference
36530 SmallVector<MachineMemOperand *, 2> MMOs(MI.memoperands_begin(),
36531 MI.memoperands_end());
36532
36533 unsigned DstReg;
36534 unsigned MemOpndSlot = 0;
36535
36536 unsigned CurOp = 0;
36537
36538 DstReg = MI.getOperand(CurOp++).getReg();
36539 const TargetRegisterClass *RC = MRI.getRegClass(DstReg);
36540 assert(TRI->isTypeLegalForClass(*RC, MVT::i32) && "Invalid destination!");
36541 (void)TRI;
36542 Register mainDstReg = MRI.createVirtualRegister(RC);
36543 Register restoreDstReg = MRI.createVirtualRegister(RC);
36544
36545 MemOpndSlot = CurOp;
36546
36547 MVT PVT = getPointerTy(MF->getDataLayout());
36548 assert((PVT == MVT::i64 || PVT == MVT::i32) &&
36549 "Invalid Pointer Size!");
36550
36551 // For v = setjmp(buf), we generate
36552 //
36553 // thisMBB:
36554 // buf[LabelOffset] = restoreMBB <-- takes address of restoreMBB
36555 // SjLjSetup restoreMBB
36556 //
36557 // mainMBB:
36558 // v_main = 0
36559 //
36560 // sinkMBB:
36561 // v = phi(main, restore)
36562 //
36563 // restoreMBB:
36564 // if base pointer being used, load it from frame
36565 // v_restore = 1
36566
36567 MachineBasicBlock *thisMBB = MBB;
36568 MachineBasicBlock *mainMBB = MF->CreateMachineBasicBlock(BB);
36569 MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB);
36570 MachineBasicBlock *restoreMBB = MF->CreateMachineBasicBlock(BB);
36571 MF->insert(I, mainMBB);
36572 MF->insert(I, sinkMBB);
36573 MF->push_back(restoreMBB);
36574 restoreMBB->setMachineBlockAddressTaken();
36575
36576 MachineInstrBuilder MIB;
36577
36578 // Transfer the remainder of BB and its successor edges to sinkMBB.
36579 sinkMBB->splice(sinkMBB->begin(), MBB,
36580 std::next(MachineBasicBlock::iterator(MI)), MBB->end());
36581 sinkMBB->transferSuccessorsAndUpdatePHIs(MBB);
36582
36583 // thisMBB:
36584 unsigned PtrStoreOpc = 0;
36585 unsigned LabelReg = 0;
36586 const int64_t LabelOffset = 1 * PVT.getStoreSize();
36587 bool UseImmLabel = (MF->getTarget().getCodeModel() == CodeModel::Small) &&
36588 !isPositionIndependent();
36589
36590 // Prepare IP either in reg or imm.
36591 if (!UseImmLabel) {
36592 PtrStoreOpc = (PVT == MVT::i64) ? X86::MOV64mr : X86::MOV32mr;
36593 const TargetRegisterClass *PtrRC = getRegClassFor(PVT);
36594 LabelReg = MRI.createVirtualRegister(PtrRC);
36595 if (Subtarget.is64Bit()) {
36596 MIB = BuildMI(*thisMBB, MI, DL, TII->get(X86::LEA64r), LabelReg)
36597 .addReg(X86::RIP)
36598 .addImm(0)
36599 .addReg(0)
36600 .addMBB(restoreMBB)
36601 .addReg(0);
36602 } else {
36603 const X86InstrInfo *XII = static_cast<const X86InstrInfo*>(TII);
36604 MIB = BuildMI(*thisMBB, MI, DL, TII->get(X86::LEA32r), LabelReg)
36605 .addReg(XII->getGlobalBaseReg(MF))
36606 .addImm(0)
36607 .addReg(0)
36608 .addMBB(restoreMBB, Subtarget.classifyBlockAddressReference())
36609 .addReg(0);
36610 }
36611 } else
36612 PtrStoreOpc = (PVT == MVT::i64) ? X86::MOV64mi32 : X86::MOV32mi;
36613 // Store IP
36614 MIB = BuildMI(*thisMBB, MI, DL, TII->get(PtrStoreOpc));
36615 for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
36616 if (i == X86::AddrDisp)
36617 MIB.addDisp(MI.getOperand(MemOpndSlot + i), LabelOffset);
36618 else
36619 MIB.add(MI.getOperand(MemOpndSlot + i));
36620 }
36621 if (!UseImmLabel)
36622 MIB.addReg(LabelReg);
36623 else
36624 MIB.addMBB(restoreMBB);
36625 MIB.setMemRefs(MMOs);
36626
36627 if (MF->getMMI().getModule()->getModuleFlag("cf-protection-return")) {
36628 emitSetJmpShadowStackFix(MI, thisMBB);
36629 }
36630
36631 // Setup
36632 MIB = BuildMI(*thisMBB, MI, DL, TII->get(X86::EH_SjLj_Setup))
36633 .addMBB(restoreMBB);
36634
36635 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
36636 MIB.addRegMask(RegInfo->getNoPreservedMask());
36637 thisMBB->addSuccessor(mainMBB);
36638 thisMBB->addSuccessor(restoreMBB);
36639
36640 // mainMBB:
36641 // EAX = 0
36642 BuildMI(mainMBB, DL, TII->get(X86::MOV32r0), mainDstReg);
36643 mainMBB->addSuccessor(sinkMBB);
36644
36645 // sinkMBB:
36646 BuildMI(*sinkMBB, sinkMBB->begin(), DL,
36647 TII->get(X86::PHI), DstReg)
36648 .addReg(mainDstReg).addMBB(mainMBB)
36649 .addReg(restoreDstReg).addMBB(restoreMBB);
36650
36651 // restoreMBB:
36652 if (RegInfo->hasBasePointer(*MF)) {
36653 const bool Uses64BitFramePtr =
36654 Subtarget.isTarget64BitLP64() || Subtarget.isTargetNaCl64();
36655 X86MachineFunctionInfo *X86FI = MF->getInfo<X86MachineFunctionInfo>();
36656 X86FI->setRestoreBasePointer(MF);
36657 Register FramePtr = RegInfo->getFrameRegister(*MF);
36658 Register BasePtr = RegInfo->getBaseRegister();
36659 unsigned Opm = Uses64BitFramePtr ? X86::MOV64rm : X86::MOV32rm;
36660 addRegOffset(BuildMI(restoreMBB, DL, TII->get(Opm), BasePtr),
36661 FramePtr, true, X86FI->getRestoreBasePointerOffset())
36662 .setMIFlag(MachineInstr::FrameSetup);
36663 }
36664 BuildMI(restoreMBB, DL, TII->get(X86::MOV32ri), restoreDstReg).addImm(1);
36665 BuildMI(restoreMBB, DL, TII->get(X86::JMP_1)).addMBB(sinkMBB);
36666 restoreMBB->addSuccessor(sinkMBB);
36667
36668 MI.eraseFromParent();
36669 return sinkMBB;
36670}
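The mainMBB/restoreMBB split above is what gives the builtin its familiar return values: 0 on the direct path, 1 when control re-enters through the restore block. A minimal usage sketch, assuming the GCC-style __builtin_setjmp/__builtin_longjmp pair that these EH_SjLj pseudos back (the buffer and function names are made up for the example):

static void *SketchBuf[5]; // the GCC builtin requires a five-word buffer

static int setJmpSketch() {
  if (__builtin_setjmp(SketchBuf) == 0) {
    // mainMBB path: v_main = 0, the direct fall-through from setjmp.
    return 0;
  }
  // restoreMBB path: v_restore = 1, reached via __builtin_longjmp(SketchBuf, 1).
  return 1;
}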
36671
36672/// Fix the shadow stack using the previously saved SSP pointer.
36673/// \sa emitSetJmpShadowStackFix
36674/// \param [in] MI The temporary Machine Instruction for the builtin.
36675/// \param [in] MBB The Machine Basic Block that will be modified.
36676/// \return The sink MBB that will perform the future indirect branch.
36677MachineBasicBlock *
36678X86TargetLowering::emitLongJmpShadowStackFix(MachineInstr &MI,
36679 MachineBasicBlock *MBB) const {
36680 const DebugLoc &DL = MI.getDebugLoc();
36681 MachineFunction *MF = MBB->getParent();
36682 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
36683 MachineRegisterInfo &MRI = MF->getRegInfo();
36684
36685 // Memory Reference
36686 SmallVector<MachineMemOperand *, 2> MMOs(MI.memoperands_begin(),
36687 MI.memoperands_end());
36688
36689 MVT PVT = getPointerTy(MF->getDataLayout());
36690 const TargetRegisterClass *PtrRC = getRegClassFor(PVT);
36691
36692 // checkSspMBB:
36693 // xor vreg1, vreg1
36694 // rdssp vreg1
36695 // test vreg1, vreg1
36696 // je sinkMBB # Jump if Shadow Stack is not supported
36697 // fallMBB:
36698 // mov buf+24/12(%rip), vreg2
36699 // sub vreg1, vreg2
36700 // jbe sinkMBB # No need to fix the Shadow Stack
36701 // fixShadowMBB:
36702 // shr 3/2, vreg2
36703 // incssp vreg2 # fix the SSP according to the lower 8 bits
36704 // shr 8, vreg2
36705 // je sinkMBB
36706 // fixShadowLoopPrepareMBB:
36707 // shl vreg2
36708 // mov 128, vreg3
36709 // fixShadowLoopMBB:
36710 // incssp vreg3
36711 // dec vreg2
36712 // jne fixShadowLoopMBB # Iterate until you finish fixing
36713 // # the Shadow Stack
36714 // sinkMBB:
36715
36716 MachineFunction::iterator I = ++MBB->getIterator();
36717 const BasicBlock *BB = MBB->getBasicBlock();
36718
36719 MachineBasicBlock *checkSspMBB = MF->CreateMachineBasicBlock(BB);
36720 MachineBasicBlock *fallMBB = MF->CreateMachineBasicBlock(BB);
36721 MachineBasicBlock *fixShadowMBB = MF->CreateMachineBasicBlock(BB);
36722 MachineBasicBlock *fixShadowLoopPrepareMBB = MF->CreateMachineBasicBlock(BB);
36723 MachineBasicBlock *fixShadowLoopMBB = MF->CreateMachineBasicBlock(BB);
36724 MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB);
36725 MF->insert(I, checkSspMBB);
36726 MF->insert(I, fallMBB);
36727 MF->insert(I, fixShadowMBB);
36728 MF->insert(I, fixShadowLoopPrepareMBB);
36729 MF->insert(I, fixShadowLoopMBB);
36730 MF->insert(I, sinkMBB);
36731
36732 // Transfer the remainder of BB and its successor edges to sinkMBB.
36733 sinkMBB->splice(sinkMBB->begin(), MBB, MachineBasicBlock::iterator(MI),
36734 MBB->end());
36735 sinkMBB->transferSuccessorsAndUpdatePHIs(MBB);
36736
36737 MBB->addSuccessor(checkSspMBB);
36738
36739 // Initialize a register with zero.
36740 Register ZReg = MRI.createVirtualRegister(&X86::GR32RegClass);
36741 BuildMI(checkSspMBB, DL, TII->get(X86::MOV32r0), ZReg);
36742
36743 if (PVT == MVT::i64) {
36744 Register TmpZReg = MRI.createVirtualRegister(PtrRC);
36745 BuildMI(checkSspMBB, DL, TII->get(X86::SUBREG_TO_REG), TmpZReg)
36746 .addImm(0)
36747 .addReg(ZReg)
36748 .addImm(X86::sub_32bit);
36749 ZReg = TmpZReg;
36750 }
36751
36752 // Read the current SSP Register value to the zeroed register.
36753 Register SSPCopyReg = MRI.createVirtualRegister(PtrRC);
36754 unsigned RdsspOpc = (PVT == MVT::i64) ? X86::RDSSPQ : X86::RDSSPD;
36755 BuildMI(checkSspMBB, DL, TII->get(RdsspOpc), SSPCopyReg).addReg(ZReg);
36756
36757 // Check whether the result of the SSP register is zero and jump directly
36758 // to the sink.
36759 unsigned TestRROpc = (PVT == MVT::i64) ? X86::TEST64rr : X86::TEST32rr;
36760 BuildMI(checkSspMBB, DL, TII->get(TestRROpc))
36761 .addReg(SSPCopyReg)
36762 .addReg(SSPCopyReg);
36763 BuildMI(checkSspMBB, DL, TII->get(X86::JCC_1)).addMBB(sinkMBB).addImm(X86::COND_E);
36764 checkSspMBB->addSuccessor(sinkMBB);
36765 checkSspMBB->addSuccessor(fallMBB);
36766
36767 // Reload the previously saved SSP register value.
36768 Register PrevSSPReg = MRI.createVirtualRegister(PtrRC);
36769 unsigned PtrLoadOpc = (PVT == MVT::i64) ? X86::MOV64rm : X86::MOV32rm;
36770 const int64_t SPPOffset = 3 * PVT.getStoreSize();
36771 MachineInstrBuilder MIB =
36772 BuildMI(fallMBB, DL, TII->get(PtrLoadOpc), PrevSSPReg);
36773 for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
36774 const MachineOperand &MO = MI.getOperand(i);
36775 if (i == X86::AddrDisp)
36776 MIB.addDisp(MO, SPPOffset);
36777 else if (MO.isReg()) // Don't add the whole operand, we don't want to
36778 // preserve kill flags.
36779 MIB.addReg(MO.getReg());
36780 else
36781 MIB.add(MO);
36782 }
36783 MIB.setMemRefs(MMOs);
36784
36785 // Subtract the current SSP from the previous SSP.
36786 Register SspSubReg = MRI.createVirtualRegister(PtrRC);
36787 unsigned SubRROpc = (PVT == MVT::i64) ? X86::SUB64rr : X86::SUB32rr;
36788 BuildMI(fallMBB, DL, TII->get(SubRROpc), SspSubReg)
36789 .addReg(PrevSSPReg)
36790 .addReg(SSPCopyReg);
36791
36792 // Jump to sink in case PrevSSPReg <= SSPCopyReg.
36793 BuildMI(fallMBB, DL, TII->get(X86::JCC_1)).addMBB(sinkMBB).addImm(X86::COND_BE);
36794 fallMBB->addSuccessor(sinkMBB);
36795 fallMBB->addSuccessor(fixShadowMBB);
36796
36797 // Shift right by 2/3 for 32/64 because incssp multiplies the argument by 4/8.
36798 unsigned ShrRIOpc = (PVT == MVT::i64) ? X86::SHR64ri : X86::SHR32ri;
36799 unsigned Offset = (PVT == MVT::i64) ? 3 : 2;
36800 Register SspFirstShrReg = MRI.createVirtualRegister(PtrRC);
36801 BuildMI(fixShadowMBB, DL, TII->get(ShrRIOpc), SspFirstShrReg)
36802 .addReg(SspSubReg)
36803 .addImm(Offset);
36804
36805 // Increase the SSP using only the lower 8 bits of the delta.
36806 unsigned IncsspOpc = (PVT == MVT::i64) ? X86::INCSSPQ : X86::INCSSPD;
36807 BuildMI(fixShadowMBB, DL, TII->get(IncsspOpc)).addReg(SspFirstShrReg);
36808
36809 // Reset the lower 8 bits.
36810 Register SspSecondShrReg = MRI.createVirtualRegister(PtrRC);
36811 BuildMI(fixShadowMBB, DL, TII->get(ShrRIOpc), SspSecondShrReg)
36812 .addReg(SspFirstShrReg)
36813 .addImm(8);
36814
36815 // Jump if the result of the shift is zero.
36816 BuildMI(fixShadowMBB, DL, TII->get(X86::JCC_1)).addMBB(sinkMBB).addImm(X86::COND_E);
36817 fixShadowMBB->addSuccessor(sinkMBB);
36818 fixShadowMBB->addSuccessor(fixShadowLoopPrepareMBB);
36819
36820 // Do a single shift left.
36821 unsigned ShlR1Opc = (PVT == MVT::i64) ? X86::SHL64r1 : X86::SHL32r1;
36822 Register SspAfterShlReg = MRI.createVirtualRegister(PtrRC);
36823 BuildMI(fixShadowLoopPrepareMBB, DL, TII->get(ShlR1Opc), SspAfterShlReg)
36824 .addReg(SspSecondShrReg);
36825
36826 // Save the value 128 to a register (will be used next with incssp).
36827 Register Value128InReg = MRI.createVirtualRegister(PtrRC);
36828 unsigned MovRIOpc = (PVT == MVT::i64) ? X86::MOV64ri32 : X86::MOV32ri;
36829 BuildMI(fixShadowLoopPrepareMBB, DL, TII->get(MovRIOpc), Value128InReg)
36830 .addImm(128);
36831 fixShadowLoopPrepareMBB->addSuccessor(fixShadowLoopMBB);
36832
36833 // Since incssp only looks at the lower 8 bits, we might need to do several
36834 // iterations of incssp until we finish fixing the shadow stack.
36835 Register DecReg = MRI.createVirtualRegister(PtrRC);
36836 Register CounterReg = MRI.createVirtualRegister(PtrRC);
36837 BuildMI(fixShadowLoopMBB, DL, TII->get(X86::PHI), CounterReg)
36838 .addReg(SspAfterShlReg)
36839 .addMBB(fixShadowLoopPrepareMBB)
36840 .addReg(DecReg)
36841 .addMBB(fixShadowLoopMBB);
36842
36843 // Every iteration we increase the SSP by 128.
36844 BuildMI(fixShadowLoopMBB, DL, TII->get(IncsspOpc)).addReg(Value128InReg);
36845
36846 // Every iteration we decrement the counter by 1.
36847 unsigned DecROpc = (PVT == MVT::i64) ? X86::DEC64r : X86::DEC32r;
36848 BuildMI(fixShadowLoopMBB, DL, TII->get(DecROpc), DecReg).addReg(CounterReg);
36849
36850 // Jump if the counter is not zero yet.
36851 BuildMI(fixShadowLoopMBB, DL, TII->get(X86::JCC_1)).addMBB(fixShadowLoopMBB).addImm(X86::COND_NE);
36852 fixShadowLoopMBB->addSuccessor(sinkMBB);
36853 fixShadowLoopMBB->addSuccessor(fixShadowLoopMBB);
36854
36855 return sinkMBB;
36856}
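To make the fixShadow* arithmetic above easier to follow, here is the same computation as plain C++ for the 64-bit case. This is an illustrative sketch only: incsspStub is a stand-in for INCSSPQ, which honours only the low 8 bits of its operand, and the shifts mirror the SHR/SHL and 128-per-iteration loop emitted above.

static void incsspStub(unsigned long long) {} // model of INCSSPQ: pops (N & 0xff) entries

static void fixShadowStackSketch(unsigned long long PrevSSP,
                                 unsigned long long CurSSP) {
  unsigned long long Delta = (PrevSSP - CurSSP) >> 3; // 8-byte entries to skip
  incsspStub(Delta);                     // consumes Delta & 0xff entries
  unsigned long long Rest = Delta >> 8;  // what the single incssp could not cover
  if (Rest) {
    unsigned long long Counter = Rest << 1; // each unit of 256 entries = 2 x incssp(128)
    do
      incsspStub(128);
    while (--Counter);
  }
}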
36857
36858MachineBasicBlock *
36859X86TargetLowering::emitEHSjLjLongJmp(MachineInstr &MI,
36860 MachineBasicBlock *MBB) const {
36861 const DebugLoc &DL = MI.getDebugLoc();
36862 MachineFunction *MF = MBB->getParent();
36863 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
36864 MachineRegisterInfo &MRI = MF->getRegInfo();
36865
36866 // Memory Reference
36867 SmallVector<MachineMemOperand *, 2> MMOs(MI.memoperands_begin(),
36868 MI.memoperands_end());
36869
36870 MVT PVT = getPointerTy(MF->getDataLayout());
36871 assert((PVT == MVT::i64 || PVT == MVT::i32) &&
36872 "Invalid Pointer Size!");
36873
36874 const TargetRegisterClass *RC =
36875 (PVT == MVT::i64) ? &X86::GR64RegClass : &X86::GR32RegClass;
36876 Register Tmp = MRI.createVirtualRegister(RC);
36877 // Since FP is only updated here but NOT referenced, it's treated as GPR.
36878 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
36879 Register FP = (PVT == MVT::i64) ? X86::RBP : X86::EBP;
36880 Register SP = RegInfo->getStackRegister();
36881
36882 MachineInstrBuilder MIB;
36883
36884 const int64_t LabelOffset = 1 * PVT.getStoreSize();
36885 const int64_t SPOffset = 2 * PVT.getStoreSize();
36886
36887 unsigned PtrLoadOpc = (PVT == MVT::i64) ? X86::MOV64rm : X86::MOV32rm;
36888 unsigned IJmpOpc = (PVT == MVT::i64) ? X86::JMP64r : X86::JMP32r;
36889
36890 MachineBasicBlock *thisMBB = MBB;
36891
36892 // When CET and the shadow stack are enabled, we need to fix the Shadow Stack.
36893 if (MF->getMMI().getModule()->getModuleFlag("cf-protection-return")) {
36894 thisMBB = emitLongJmpShadowStackFix(MI, thisMBB);
36895 }
36896
36897 // Reload FP
36898 MIB = BuildMI(*thisMBB, MI, DL, TII->get(PtrLoadOpc), FP);
36899 for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
36900 const MachineOperand &MO = MI.getOperand(i);
36901 if (MO.isReg()) // Don't add the whole operand, we don't want to
36902 // preserve kill flags.
36903 MIB.addReg(MO.getReg());
36904 else
36905 MIB.add(MO);
36906 }
36907 MIB.setMemRefs(MMOs);
36908
36909 // Reload IP
36910 MIB = BuildMI(*thisMBB, MI, DL, TII->get(PtrLoadOpc), Tmp);
36911 for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
36912 const MachineOperand &MO = MI.getOperand(i);
36913 if (i == X86::AddrDisp)
36914 MIB.addDisp(MO, LabelOffset);
36915 else if (MO.isReg()) // Don't add the whole operand, we don't want to
36916 // preserve kill flags.
36917 MIB.addReg(MO.getReg());
36918 else
36919 MIB.add(MO);
36920 }
36921 MIB.setMemRefs(MMOs);
36922
36923 // Reload SP
36924 MIB = BuildMI(*thisMBB, MI, DL, TII->get(PtrLoadOpc), SP);
36925 for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
36926 if (i == X86::AddrDisp)
36927 MIB.addDisp(MI.getOperand(i), SPOffset);
36928 else
36929 MIB.add(MI.getOperand(i)); // We can preserve the kill flags here, it's
36930 // the last instruction of the expansion.
36931 }
36932 MIB.setMemRefs(MMOs);
36933
36934 // Jump
36935 BuildMI(*thisMBB, MI, DL, TII->get(IJmpOpc)).addReg(Tmp);
36936
36937 MI.eraseFromParent();
36938 return thisMBB;
36939}
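Taken together, the reloads above (FP at displacement 0, IP at LabelOffset = 1 * pointer size, SP at SPOffset = 2 * pointer size) and the SSP slot at 3 * pointer size used by the shadow-stack helpers describe the buffer layout this lowering assumes. As an illustration only (the struct and field names are invented here, 64-bit case):

struct SjLjBufferSketch {
  void *FramePtr;  // slot 0: reloaded into RBP/EBP by emitEHSjLjLongJmp
  void *Label;     // slot 1: restore address stored by emitEHSjLjSetJmp
  void *StackPtr;  // slot 2: reloaded into RSP/ESP before the indirect jump
  void *ShadowSP;  // slot 3: saved SSP when "cf-protection-return" is set
};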
36940
36941void X86TargetLowering::SetupEntryBlockForSjLj(MachineInstr &MI,
36942 MachineBasicBlock *MBB,
36943 MachineBasicBlock *DispatchBB,
36944 int FI) const {
36945 const DebugLoc &DL = MI.getDebugLoc();
36946 MachineFunction *MF = MBB->getParent();
36947 MachineRegisterInfo *MRI = &MF->getRegInfo();
36948 const X86InstrInfo *TII = Subtarget.getInstrInfo();
36949
36950 MVT PVT = getPointerTy(MF->getDataLayout());
36951 assert((PVT == MVT::i64 || PVT == MVT::i32) && "Invalid Pointer Size!");
36952
36953 unsigned Op = 0;
36954 unsigned VR = 0;
36955
36956 bool UseImmLabel = (MF->getTarget().getCodeModel() == CodeModel::Small) &&
36957 !isPositionIndependent();
36958
36959 if (UseImmLabel) {
36960 Op = (PVT == MVT::i64) ? X86::MOV64mi32 : X86::MOV32mi;
36961 } else {
36962 const TargetRegisterClass *TRC =
36963 (PVT == MVT::i64) ? &X86::GR64RegClass : &X86::GR32RegClass;
36964 VR = MRI->createVirtualRegister(TRC);
36965 Op = (PVT == MVT::i64) ? X86::MOV64mr : X86::MOV32mr;
36966
36967 if (Subtarget.is64Bit())
36968 BuildMI(*MBB, MI, DL, TII->get(X86::LEA64r), VR)
36969 .addReg(X86::RIP)
36970 .addImm(1)
36971 .addReg(0)
36972 .addMBB(DispatchBB)
36973 .addReg(0);
36974 else
36975 BuildMI(*MBB, MI, DL, TII->get(X86::LEA32r), VR)
36976 .addReg(0) /* TII->getGlobalBaseReg(MF) */
36977 .addImm(1)
36978 .addReg(0)
36979 .addMBB(DispatchBB, Subtarget.classifyBlockAddressReference())
36980 .addReg(0);
36981 }
36982
36983 MachineInstrBuilder MIB = BuildMI(*MBB, MI, DL, TII->get(Op));
36984 addFrameReference(MIB, FI, Subtarget.is64Bit() ? 56 : 36);
36985 if (UseImmLabel)
36986 MIB.addMBB(DispatchBB);
36987 else
36988 MIB.addReg(VR);
36989}
36990
36991MachineBasicBlock *
36992X86TargetLowering::EmitSjLjDispatchBlock(MachineInstr &MI,
36993 MachineBasicBlock *BB) const {
36994 const DebugLoc &DL = MI.getDebugLoc();
36995 MachineFunction *MF = BB->getParent();
36996 MachineRegisterInfo *MRI = &MF->getRegInfo();
36997 const X86InstrInfo *TII = Subtarget.getInstrInfo();
36998 int FI = MF->getFrameInfo().getFunctionContextIndex();
36999
37000 // Get a mapping of the call site numbers to all of the landing pads they're
37001 // associated with.
37002 DenseMap<unsigned, SmallVector<MachineBasicBlock *, 2>> CallSiteNumToLPad;
37003 unsigned MaxCSNum = 0;
37004 for (auto &MBB : *MF) {
37005 if (!MBB.isEHPad())
37006 continue;
37007
37008 MCSymbol *Sym = nullptr;
37009 for (const auto &MI : MBB) {
37010 if (MI.isDebugInstr())
37011 continue;
37012
37013 assert(MI.isEHLabel() && "expected EH_LABEL");
37014 Sym = MI.getOperand(0).getMCSymbol();
37015 break;
37016 }
37017
37018 if (!MF->hasCallSiteLandingPad(Sym))
37019 continue;
37020
37021 for (unsigned CSI : MF->getCallSiteLandingPad(Sym)) {
37022 CallSiteNumToLPad[CSI].push_back(&MBB);
37023 MaxCSNum = std::max(MaxCSNum, CSI);
37024 }
37025 }
37026
37027 // Get an ordered list of the machine basic blocks for the jump table.
37028 std::vector<MachineBasicBlock *> LPadList;
37029 SmallPtrSet<MachineBasicBlock *, 32> InvokeBBs;
37030 LPadList.reserve(CallSiteNumToLPad.size());
37031
37032 for (unsigned CSI = 1; CSI <= MaxCSNum; ++CSI) {
37033 for (auto &LP : CallSiteNumToLPad[CSI]) {
37034 LPadList.push_back(LP);
37035 InvokeBBs.insert(LP->pred_begin(), LP->pred_end());
37036 }
37037 }
37038
37039 assert(!LPadList.empty() &&
37040 "No landing pad destinations for the dispatch jump table!");
37041
37042 // Create the MBBs for the dispatch code.
37043
37044 // Shove the dispatch's address into the return slot in the function context.
37045 MachineBasicBlock *DispatchBB = MF->CreateMachineBasicBlock();
37046 DispatchBB->setIsEHPad(true);
37047
37048 MachineBasicBlock *TrapBB = MF->CreateMachineBasicBlock();
37049 BuildMI(TrapBB, DL, TII->get(X86::TRAP));
37050 DispatchBB->addSuccessor(TrapBB);
37051
37052 MachineBasicBlock *DispContBB = MF->CreateMachineBasicBlock();
37053 DispatchBB->addSuccessor(DispContBB);
37054
37055 // Insert MBBs.
37056 MF->push_back(DispatchBB);
37057 MF->push_back(DispContBB);
37058 MF->push_back(TrapBB);
37059
37060 // Insert code into the entry block that creates and registers the function
37061 // context.
37062 SetupEntryBlockForSjLj(MI, BB, DispatchBB, FI);
37063
37064 // Create the jump table and associated information
37065 unsigned JTE = getJumpTableEncoding();
37066 MachineJumpTableInfo *JTI = MF->getOrCreateJumpTableInfo(JTE);
37067 unsigned MJTI = JTI->createJumpTableIndex(LPadList);
37068
37069 const X86RegisterInfo &RI = TII->getRegisterInfo();
37070 // Add a register mask with no preserved registers. This results in all
37071 // registers being marked as clobbered.
37072 if (RI.hasBasePointer(*MF)) {
37073 const bool FPIs64Bit =
37074 Subtarget.isTarget64BitLP64() || Subtarget.isTargetNaCl64();
37075 X86MachineFunctionInfo *MFI = MF->getInfo<X86MachineFunctionInfo>();
37076 MFI->setRestoreBasePointer(MF);
37077
37078 Register FP = RI.getFrameRegister(*MF);
37079 Register BP = RI.getBaseRegister();
37080 unsigned Op = FPIs64Bit ? X86::MOV64rm : X86::MOV32rm;
37081 addRegOffset(BuildMI(DispatchBB, DL, TII->get(Op), BP), FP, true,
37082 MFI->getRestoreBasePointerOffset())
37083 .addRegMask(RI.getNoPreservedMask());
37084 } else {
37085 BuildMI(DispatchBB, DL, TII->get(X86::NOOP))
37086 .addRegMask(RI.getNoPreservedMask());
37087 }
37088
37089 // IReg is used as an index in a memory operand and therefore can't be SP
37090 Register IReg = MRI->createVirtualRegister(&X86::GR32_NOSPRegClass);
37091 addFrameReference(BuildMI(DispatchBB, DL, TII->get(X86::MOV32rm), IReg), FI,
37092 Subtarget.is64Bit() ? 8 : 4);
37093 BuildMI(DispatchBB, DL, TII->get(X86::CMP32ri))
37094 .addReg(IReg)
37095 .addImm(LPadList.size());
37096 BuildMI(DispatchBB, DL, TII->get(X86::JCC_1)).addMBB(TrapBB).addImm(X86::COND_AE);
37097
37098 if (Subtarget.is64Bit()) {
37099 Register BReg = MRI->createVirtualRegister(&X86::GR64RegClass);
37100 Register IReg64 = MRI->createVirtualRegister(&X86::GR64_NOSPRegClass);
37101
37102 // leaq .LJTI0_0(%rip), BReg
37103 BuildMI(DispContBB, DL, TII->get(X86::LEA64r), BReg)
37104 .addReg(X86::RIP)
37105 .addImm(1)
37106 .addReg(0)
37107 .addJumpTableIndex(MJTI)
37108 .addReg(0);
37109 // movzx IReg64, IReg
37110 BuildMI(DispContBB, DL, TII->get(TargetOpcode::SUBREG_TO_REG), IReg64)
37111 .addImm(0)
37112 .addReg(IReg)
37113 .addImm(X86::sub_32bit);
37114
37115 switch (JTE) {
37116 case MachineJumpTableInfo::EK_BlockAddress:
37117 // jmpq *(BReg,IReg64,8)
37118 BuildMI(DispContBB, DL, TII->get(X86::JMP64m))
37119 .addReg(BReg)
37120 .addImm(8)
37121 .addReg(IReg64)
37122 .addImm(0)
37123 .addReg(0);
37124 break;
37125 case MachineJumpTableInfo::EK_LabelDifference32: {
37126 Register OReg = MRI->createVirtualRegister(&X86::GR32RegClass);
37127 Register OReg64 = MRI->createVirtualRegister(&X86::GR64RegClass);
37128 Register TReg = MRI->createVirtualRegister(&X86::GR64RegClass);
37129
37130 // movl (BReg,IReg64,4), OReg
37131 BuildMI(DispContBB, DL, TII->get(X86::MOV32rm), OReg)
37132 .addReg(BReg)
37133 .addImm(4)
37134 .addReg(IReg64)
37135 .addImm(0)
37136 .addReg(0);
37137 // movsx OReg64, OReg
37138 BuildMI(DispContBB, DL, TII->get(X86::MOVSX64rr32), OReg64).addReg(OReg);
37139 // addq BReg, OReg64, TReg
37140 BuildMI(DispContBB, DL, TII->get(X86::ADD64rr), TReg)
37141 .addReg(OReg64)
37142 .addReg(BReg);
37143 // jmpq *TReg
37144 BuildMI(DispContBB, DL, TII->get(X86::JMP64r)).addReg(TReg);
37145 break;
37146 }
37147 default:
37148 llvm_unreachable("Unexpected jump table encoding")::llvm::llvm_unreachable_internal("Unexpected jump table encoding"
, "llvm/lib/Target/X86/X86ISelLowering.cpp", 37148)
;
37149 }
37150 } else {
37151 // jmpl *.LJTI0_0(,IReg,4)
37152 BuildMI(DispContBB, DL, TII->get(X86::JMP32m))
37153 .addReg(0)
37154 .addImm(4)
37155 .addReg(IReg)
37156 .addJumpTableIndex(MJTI)
37157 .addReg(0);
37158 }
37159
37160 // Add the jump table entries as successors to the MBB.
37161 SmallPtrSet<MachineBasicBlock *, 8> SeenMBBs;
37162 for (auto &LP : LPadList)
37163 if (SeenMBBs.insert(LP).second)
37164 DispContBB->addSuccessor(LP);
37165
37166 // N.B. the order the invoke BBs are processed in doesn't matter here.
37167 SmallVector<MachineBasicBlock *, 64> MBBLPads;
37168 const MCPhysReg *SavedRegs = MF->getRegInfo().getCalleeSavedRegs();
37169 for (MachineBasicBlock *MBB : InvokeBBs) {
37170 // Remove the landing pad successor from the invoke block and replace it
37171 // with the new dispatch block.
37172 // Keep a copy of Successors since it's modified inside the loop.
37173 SmallVector<MachineBasicBlock *, 8> Successors(MBB->succ_rbegin(),
37174 MBB->succ_rend());
37175 // FIXME: Avoid quadratic complexity.
37176 for (auto *MBBS : Successors) {
37177 if (MBBS->isEHPad()) {
37178 MBB->removeSuccessor(MBBS);
37179 MBBLPads.push_back(MBBS);
37180 }
37181 }
37182
37183 MBB->addSuccessor(DispatchBB);
37184
37185 // Find the invoke call and mark all of the callee-saved registers as
37186 // 'implicit defined' so that they're spilled. This prevents code from
37187 // moving instructions to before the EH block, where they will never be
37188 // executed.
37189 for (auto &II : reverse(*MBB)) {
37190 if (!II.isCall())
37191 continue;
37192
37193 DenseMap<unsigned, bool> DefRegs;
37194 for (auto &MOp : II.operands())
37195 if (MOp.isReg())
37196 DefRegs[MOp.getReg()] = true;
37197
37198 MachineInstrBuilder MIB(*MF, &II);
37199 for (unsigned RegIdx = 0; SavedRegs[RegIdx]; ++RegIdx) {
37200 unsigned Reg = SavedRegs[RegIdx];
37201 if (!DefRegs[Reg])
37202 MIB.addReg(Reg, RegState::ImplicitDefine | RegState::Dead);
37203 }
37204
37205 break;
37206 }
37207 }
37208
37209 // Mark all former landing pads as non-landing pads. The dispatch is the only
37210 // landing pad now.
37211 for (auto &LP : MBBLPads)
37212 LP->setIsEHPad(false);
37213
37214 // The instruction is gone now.
37215 MI.eraseFromParent();
37216 return BB;
37217}
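The bounds check and jump-table branch built in DispatchBB/DispContBB amount to the following, modelled here with an ordinary function-pointer table; the names are invented for the sketch and this is not how the final machine code is expressed.

static void dispatchSketch(unsigned CallSiteIdx, void (*const LPads[])(),
                           unsigned NumLPads) {
  if (CallSiteIdx >= NumLPads) // CMP32ri + JCC_1 with COND_AE
    __builtin_trap();          // TrapBB (X86::TRAP)
  LPads[CallSiteIdx]();        // DispContBB: indirect jump through the table entry
}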
37218
37219MachineBasicBlock *
37220X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
37221 MachineBasicBlock *BB) const {
37222 MachineFunction *MF = BB->getParent();
37223 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
37224 const DebugLoc &DL = MI.getDebugLoc();
37225
37226 auto TMMImmToTMMReg = [](unsigned Imm) {
37227 assert (Imm < 8 && "Illegal tmm index");
37228 return X86::TMM0 + Imm;
37229 };
37230 switch (MI.getOpcode()) {
37231 default: llvm_unreachable("Unexpected instr type to insert")::llvm::llvm_unreachable_internal("Unexpected instr type to insert"
, "llvm/lib/Target/X86/X86ISelLowering.cpp", 37231)
;
37232 case X86::TLS_addr32:
37233 case X86::TLS_addr64:
37234 case X86::TLS_addrX32:
37235 case X86::TLS_base_addr32:
37236 case X86::TLS_base_addr64:
37237 case X86::TLS_base_addrX32:
37238 return EmitLoweredTLSAddr(MI, BB);
37239 case X86::INDIRECT_THUNK_CALL32:
37240 case X86::INDIRECT_THUNK_CALL64:
37241 case X86::INDIRECT_THUNK_TCRETURN32:
37242 case X86::INDIRECT_THUNK_TCRETURN64:
37243 return EmitLoweredIndirectThunk(MI, BB);
37244 case X86::CATCHRET:
37245 return EmitLoweredCatchRet(MI, BB);
37246 case X86::SEG_ALLOCA_32:
37247 case X86::SEG_ALLOCA_64:
37248 return EmitLoweredSegAlloca(MI, BB);
37249 case X86::PROBED_ALLOCA_32:
37250 case X86::PROBED_ALLOCA_64:
37251 return EmitLoweredProbedAlloca(MI, BB);
37252 case X86::TLSCall_32:
37253 case X86::TLSCall_64:
37254 return EmitLoweredTLSCall(MI, BB);
37255 case X86::CMOV_FR16:
37256 case X86::CMOV_FR16X:
37257 case X86::CMOV_FR32:
37258 case X86::CMOV_FR32X:
37259 case X86::CMOV_FR64:
37260 case X86::CMOV_FR64X:
37261 case X86::CMOV_GR8:
37262 case X86::CMOV_GR16:
37263 case X86::CMOV_GR32:
37264 case X86::CMOV_RFP32:
37265 case X86::CMOV_RFP64:
37266 case X86::CMOV_RFP80:
37267 case X86::CMOV_VR64:
37268 case X86::CMOV_VR128:
37269 case X86::CMOV_VR128X:
37270 case X86::CMOV_VR256:
37271 case X86::CMOV_VR256X:
37272 case X86::CMOV_VR512:
37273 case X86::CMOV_VK1:
37274 case X86::CMOV_VK2:
37275 case X86::CMOV_VK4:
37276 case X86::CMOV_VK8:
37277 case X86::CMOV_VK16:
37278 case X86::CMOV_VK32:
37279 case X86::CMOV_VK64:
37280 return EmitLoweredSelect(MI, BB);
37281
37282 case X86::RDFLAGS32:
37283 case X86::RDFLAGS64: {
37284 unsigned PushF =
37285 MI.getOpcode() == X86::RDFLAGS32 ? X86::PUSHF32 : X86::PUSHF64;
37286 unsigned Pop = MI.getOpcode() == X86::RDFLAGS32 ? X86::POP32r : X86::POP64r;
37287 MachineInstr *Push = BuildMI(*BB, MI, DL, TII->get(PushF));
37288 // Permit reads of the EFLAGS and DF registers without them being defined.
37289 // This intrinsic exists to read external processor state in flags, such as
37290 // the trap flag, interrupt flag, and direction flag, none of which are
37291 // modeled by the backend.
37292 assert(Push->getOperand(2).getReg() == X86::EFLAGS &&
37293 "Unexpected register in operand!");
37294 Push->getOperand(2).setIsUndef();
37295 assert(Push->getOperand(3).getReg() == X86::DF &&
37296 "Unexpected register in operand!");
37297 Push->getOperand(3).setIsUndef();
37298 BuildMI(*BB, MI, DL, TII->get(Pop), MI.getOperand(0).getReg());
37299
37300 MI.eraseFromParent(); // The pseudo is gone now.
37301 return BB;
37302 }
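For reference, the PUSHF/POP pair emitted for RDFLAGS corresponds to the usual read-the-flags idiom. A minimal sketch, assuming a GCC-compatible compiler on x86-64 (the function name is made up and this is not how the pseudo itself is defined):

static unsigned long readFlagsSketch() {
  unsigned long Flags;
  asm volatile("pushfq\n\tpopq %0" : "=r"(Flags)); // PUSHF64 then POP64r
  return Flags;
}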
37303
37304 case X86::WRFLAGS32:
37305 case X86::WRFLAGS64: {
37306 unsigned Push =
37307 MI.getOpcode() == X86::WRFLAGS32 ? X86::PUSH32r : X86::PUSH64r;
37308 unsigned PopF =
37309 MI.getOpcode() == X86::WRFLAGS32 ? X86::POPF32 : X86::POPF64;
37310 BuildMI(*BB, MI, DL, TII->get(Push)).addReg(MI.getOperand(0).getReg());
37311 BuildMI(*BB, MI, DL, TII->get(PopF));
37312
37313 MI.eraseFromParent(); // The pseudo is gone now.
37314 return BB;
37315 }
37316
37317 case X86::FP32_TO_INT16_IN_MEM:
37318 case X86::FP32_TO_INT32_IN_MEM:
37319 case X86::FP32_TO_INT64_IN_MEM:
37320 case X86::FP64_TO_INT16_IN_MEM:
37321 case X86::FP64_TO_INT32_IN_MEM:
37322 case X86::FP64_TO_INT64_IN_MEM:
37323 case X86::FP80_TO_INT16_IN_MEM:
37324 case X86::FP80_TO_INT32_IN_MEM:
37325 case X86::FP80_TO_INT64_IN_MEM: {
37326 // Change the floating point control register to use "round towards zero"
37327 // mode when truncating to an integer value.
37328 int OrigCWFrameIdx =
37329 MF->getFrameInfo().CreateStackObject(2, Align(2), false);
37330 addFrameReference(BuildMI(*BB, MI, DL,
37331 TII->get(X86::FNSTCW16m)), OrigCWFrameIdx);
37332
37333 // Load the old value of the control word...
37334 Register OldCW = MF->getRegInfo().createVirtualRegister(&X86::GR32RegClass);
37335 addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOVZX32rm16), OldCW),
37336 OrigCWFrameIdx);
37337
37338 // OR 0b11 into bits 10 and 11. 0b11 is the encoding for round toward zero.
37339 Register NewCW = MF->getRegInfo().createVirtualRegister(&X86::GR32RegClass);
37340 BuildMI(*BB, MI, DL, TII->get(X86::OR32ri), NewCW)
37341 .addReg(OldCW, RegState::Kill).addImm(0xC00);
37342
37343 // Extract to 16 bits.
37344 Register NewCW16 =
37345 MF->getRegInfo().createVirtualRegister(&X86::GR16RegClass);
37346 BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), NewCW16)
37347 .addReg(NewCW, RegState::Kill, X86::sub_16bit);
37348
37349 // Prepare memory for FLDCW.
37350 int NewCWFrameIdx =
37351 MF->getFrameInfo().CreateStackObject(2, Align(2), false);
37352 addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16mr)),
37353 NewCWFrameIdx)
37354 .addReg(NewCW16, RegState::Kill);
37355
37356 // Reload the modified control word now...
37357 addFrameReference(BuildMI(*BB, MI, DL,
37358 TII->get(X86::FLDCW16m)), NewCWFrameIdx);
37359
37360 // Get the X86 opcode to use.
37361 unsigned Opc;
37362 switch (MI.getOpcode()) {
37363 default: llvm_unreachable("illegal opcode!");
37364 case X86::FP32_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m32; break;
37365 case X86::FP32_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m32; break;
37366 case X86::FP32_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m32; break;
37367 case X86::FP64_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m64; break;
37368 case X86::FP64_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m64; break;
37369 case X86::FP64_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m64; break;
37370 case X86::FP80_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m80; break;
37371 case X86::FP80_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m80; break;
37372 case X86::FP80_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m80; break;
37373 }
37374
37375 X86AddressMode AM = getAddressFromInstr(&MI, 0);
37376 addFullAddress(BuildMI(*BB, MI, DL, TII->get(Opc)), AM)
37377 .addReg(MI.getOperand(X86::AddrNumOperands).getReg());
37378
37379 // Reload the original control word now.
37380 addFrameReference(BuildMI(*BB, MI, DL,
37381 TII->get(X86::FLDCW16m)), OrigCWFrameIdx);
37382
37383 MI.eraseFromParent(); // The pseudo instruction is gone now.
37384 return BB;
37385 }
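As a hedged aside on the rounding-mode dance above (not part of the original file, and 0x037F is just the commonly seen post-FNINIT control word): the x87 rounding-control field lives in bits 11:10, so OR-ing in 0xC00 forces round-toward-zero while leaving every other field alone.
// Illustrative only, assuming the default control word 0x037F:
// RC (bits 11:10) becomes 0b11 == round toward zero; nothing else changes.
static_assert((0x037F | 0x0C00) == 0x0F7F, "RC field forced to truncate");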
37386
37387 // xbegin
37388 case X86::XBEGIN:
37389 return emitXBegin(MI, BB, Subtarget.getInstrInfo());
37390
37391 case X86::VAARG_64:
37392 case X86::VAARG_X32:
37393 return EmitVAARGWithCustomInserter(MI, BB);
37394
37395 case X86::EH_SjLj_SetJmp32:
37396 case X86::EH_SjLj_SetJmp64:
37397 return emitEHSjLjSetJmp(MI, BB);
37398
37399 case X86::EH_SjLj_LongJmp32:
37400 case X86::EH_SjLj_LongJmp64:
37401 return emitEHSjLjLongJmp(MI, BB);
37402
37403 case X86::Int_eh_sjlj_setup_dispatch:
37404 return EmitSjLjDispatchBlock(MI, BB);
37405
37406 case TargetOpcode::STATEPOINT:
37407 // As an implementation detail, STATEPOINT shares the STACKMAP format at
37408 // this point in the process. We diverge later.
37409 return emitPatchPoint(MI, BB);
37410
37411 case TargetOpcode::STACKMAP:
37412 case TargetOpcode::PATCHPOINT:
37413 return emitPatchPoint(MI, BB);
37414
37415 case TargetOpcode::PATCHABLE_EVENT_CALL:
37416 case TargetOpcode::PATCHABLE_TYPED_EVENT_CALL:
37417 return BB;
37418
37419 case X86::LCMPXCHG8B: {
37420 const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
37421 // In addition to the four E[ABCD] registers implied by its encoding, CMPXCHG8B
37422 // requires a memory operand. If the current architecture is i686 and the
37423 // current function needs a base pointer - which is ESI on i686 - the register
37424 // allocator cannot allocate registers for an address of the form
37425 // X(%reg, %reg, Y): there would never be enough unreserved registers during
37426 // regalloc (without the base pointer the only option would be
37427 // X(%edi, %esi, Y)).
37428 // We help the register allocator by precomputing the address in a new vreg
37429 // using LEA.
37430
37431 // If it is not i686 or there is no base pointer - nothing to do here.
37432 if (!Subtarget.is32Bit() || !TRI->hasBasePointer(*MF))
37433 return BB;
37434
37435 // Even though this code does not necessarily need the base pointer to
37436 // be ESI, we check for that. The reason: if this assert fails, the
37437 // compiler's base pointer handling has changed in a way that most
37438 // probably has to be addressed here as well.
37439 assert(TRI->getBaseRegister() == X86::ESI &&
37440 "LCMPXCHG8B custom insertion for i686 is written with X86::ESI as a "
37441 "base pointer in mind");
37442
37443 MachineRegisterInfo &MRI = MF->getRegInfo();
37444 MVT SPTy = getPointerTy(MF->getDataLayout());
37445 const TargetRegisterClass *AddrRegClass = getRegClassFor(SPTy);
37446 Register computedAddrVReg = MRI.createVirtualRegister(AddrRegClass);
37447
37448 X86AddressMode AM = getAddressFromInstr(&MI, 0);
37449 // Regalloc does not need any help when the memory operand of CMPXCHG8B
37450 // does not use an index register.
37451 if (AM.IndexReg == X86::NoRegister)
37452 return BB;
37453
37454 // After X86TargetLowering::ReplaceNodeResults CMPXCHG8B is glued to its
37455 // four operand definitions that are E[ABCD] registers. We skip them and
37456 // then insert the LEA.
37457 MachineBasicBlock::reverse_iterator RMBBI(MI.getReverseIterator());
37458 while (RMBBI != BB->rend() && (RMBBI->definesRegister(X86::EAX) ||
37459 RMBBI->definesRegister(X86::EBX) ||
37460 RMBBI->definesRegister(X86::ECX) ||
37461 RMBBI->definesRegister(X86::EDX))) {
37462 ++RMBBI;
37463 }
37464 MachineBasicBlock::iterator MBBI(RMBBI);
37465 addFullAddress(
37466 BuildMI(*BB, *MBBI, DL, TII->get(X86::LEA32r), computedAddrVReg), AM);
37467
37468 setDirectAddressInInstr(&MI, 0, computedAddrVReg);
37469
37470 return BB;
37471 }
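A hedged sketch of why the LEA helps (the operand 8(%edi,%ebp,4) is invented for illustration, not taken from real output): CMPXCHG8B already ties up EAX, EBX, ECX and EDX, and with ESI reserved as the base pointer a two-register address can leave regalloc with nothing to spare, so the address is collapsed into one virtual register first.
// Illustrative only - hypothetical before/after, not actual codegen output:
//   lock cmpxchg8b 8(%edi,%ebp,4)   ; needs two free address registers
// becomes
//   leal 8(%edi,%ebp,4), %reg       ; address precomputed into one vreg
//   lock cmpxchg8b (%reg)           ; memory operand now needs only one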
37472 case X86::LCMPXCHG16B_NO_RBX: {
37473 const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
37474 Register BasePtr = TRI->getBaseRegister();
37475 if (TRI->hasBasePointer(*MF) &&
37476 (BasePtr == X86::RBX || BasePtr == X86::EBX)) {
37477 if (!BB->isLiveIn(BasePtr))
37478 BB->addLiveIn(BasePtr);
37479 // Save RBX into a virtual register.
37480 Register SaveRBX =
37481 MF->getRegInfo().createVirtualRegister(&X86::GR64RegClass);
37482 BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), SaveRBX)
37483 .addReg(X86::RBX);
37484 Register Dst = MF->getRegInfo().createVirtualRegister(&X86::GR64RegClass);
37485 MachineInstrBuilder MIB =
37486 BuildMI(*BB, MI, DL, TII->get(X86::LCMPXCHG16B_SAVE_RBX), Dst);
37487 for (unsigned Idx = 0; Idx < X86::AddrNumOperands; ++Idx)
37488 MIB.add(MI.getOperand(Idx));
37489 MIB.add(MI.getOperand(X86::AddrNumOperands));
37490 MIB.addReg(SaveRBX);
37491 } else {
37492 // Simple case, just copy the virtual register to RBX.
37493 BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), X86::RBX)
37494 .add(MI.getOperand(X86::AddrNumOperands));
37495 MachineInstrBuilder MIB =
37496 BuildMI(*BB, MI, DL, TII->get(X86::LCMPXCHG16B));
37497 for (unsigned Idx = 0; Idx < X86::AddrNumOperands; ++Idx)
37498 MIB.add(MI.getOperand(Idx));
37499 }
37500 MI.eraseFromParent();
37501 return BB;
37502 }
37503 case X86::MWAITX: {
37504 const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
37505 Register BasePtr = TRI->getBaseRegister();
37506 bool IsRBX = (BasePtr == X86::RBX || BasePtr == X86::EBX);
37507 // If there is no need to save the base pointer, we generate MWAITXrrr;
37508 // otherwise we generate the pseudo MWAITX_SAVE_RBX.
37509 if (!IsRBX || !TRI->hasBasePointer(*MF)) {
37510 BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), X86::ECX)
37511 .addReg(MI.getOperand(0).getReg());
37512 BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), X86::EAX)
37513 .addReg(MI.getOperand(1).getReg());
37514 BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), X86::EBX)
37515 .addReg(MI.getOperand(2).getReg());
37516 BuildMI(*BB, MI, DL, TII->get(X86::MWAITXrrr));
37517 MI.eraseFromParent();
37518 } else {
37519 if (!BB->isLiveIn(BasePtr)) {
37520 BB->addLiveIn(BasePtr);
37521 }
37522 // Parameters can be copied into ECX and EAX but not EBX yet.
37523 BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), X86::ECX)
37524 .addReg(MI.getOperand(0).getReg());
37525 BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), X86::EAX)
37526 .addReg(MI.getOperand(1).getReg());
37527 assert(Subtarget.is64Bit() && "Expected 64-bit mode!");
37528 // Save RBX into a virtual register.
37529 Register SaveRBX =
37530 MF->getRegInfo().createVirtualRegister(&X86::GR64RegClass);
37531 BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), SaveRBX)
37532 .addReg(X86::RBX);
37533 // Generate mwaitx pseudo.
37534 Register Dst = MF->getRegInfo().createVirtualRegister(&X86::GR64RegClass);
37535 BuildMI(*BB, MI, DL, TII->get(X86::MWAITX_SAVE_RBX))
37536 .addDef(Dst) // Destination tied in with SaveRBX.
37537 .addReg(MI.getOperand(2).getReg()) // input value of EBX.
37538 .addUse(SaveRBX); // Save of base pointer.
37539 MI.eraseFromParent();
37540 }
37541 return BB;
37542 }
37543 case TargetOpcode::PREALLOCATED_SETUP: {
37544 assert(Subtarget.is32Bit() && "preallocated only used in 32-bit");
37545 auto MFI = MF->getInfo<X86MachineFunctionInfo>();
37546 MFI->setHasPreallocatedCall(true);
37547 int64_t PreallocatedId = MI.getOperand(0).getImm();
37548 size_t StackAdjustment = MFI->getPreallocatedStackSize(PreallocatedId);
37549 assert(StackAdjustment != 0 && "0 stack adjustment");
37550 LLVM_DEBUG(dbgs() << "PREALLOCATED_SETUP stack adjustment "
37551 << StackAdjustment << "\n");
37552 BuildMI(*BB, MI, DL, TII->get(X86::SUB32ri), X86::ESP)
37553 .addReg(X86::ESP)
37554 .addImm(StackAdjustment);
37555 MI.eraseFromParent();
37556 return BB;
37557 }
37558 case TargetOpcode::PREALLOCATED_ARG: {
37559 assert(Subtarget.is32Bit() && "preallocated calls only used in 32-bit");
37560 int64_t PreallocatedId = MI.getOperand(1).getImm();
37561 int64_t ArgIdx = MI.getOperand(2).getImm();
37562 auto MFI = MF->getInfo<X86MachineFunctionInfo>();
37563 size_t ArgOffset = MFI->getPreallocatedArgOffsets(PreallocatedId)[ArgIdx];
37564 LLVM_DEBUG(dbgs() << "PREALLOCATED_ARG arg index " << ArgIdx
37565 << ", arg offset " << ArgOffset << "\n");
37566 // stack pointer + offset
37567 addRegOffset(
37568 BuildMI(*BB, MI, DL, TII->get(X86::LEA32r), MI.getOperand(0).getReg()),
37569 X86::ESP, false, ArgOffset);
37570 MI.eraseFromParent();
37571 return BB;
37572 }
37573 case X86::PTDPBSSD:
37574 case X86::PTDPBSUD:
37575 case X86::PTDPBUSD:
37576 case X86::PTDPBUUD:
37577 case X86::PTDPBF16PS:
37578 case X86::PTDPFP16PS: {
37579 unsigned Opc;
37580 switch (MI.getOpcode()) {
37581 default: llvm_unreachable("illegal opcode!");
37582 case X86::PTDPBSSD: Opc = X86::TDPBSSD; break;
37583 case X86::PTDPBSUD: Opc = X86::TDPBSUD; break;
37584 case X86::PTDPBUSD: Opc = X86::TDPBUSD; break;
37585 case X86::PTDPBUUD: Opc = X86::TDPBUUD; break;
37586 case X86::PTDPBF16PS: Opc = X86::TDPBF16PS; break;
37587 case X86::PTDPFP16PS: Opc = X86::TDPFP16PS; break;
37588 }
37589
37590 MachineInstrBuilder MIB = BuildMI(*BB, MI, DL, TII->get(Opc));
37591 MIB.addReg(TMMImmToTMMReg(MI.getOperand(0).getImm()), RegState::Define);
37592 MIB.addReg(TMMImmToTMMReg(MI.getOperand(0).getImm()), RegState::Undef);
37593 MIB.addReg(TMMImmToTMMReg(MI.getOperand(1).getImm()), RegState::Undef);
37594 MIB.addReg(TMMImmToTMMReg(MI.getOperand(2).getImm()), RegState::Undef);
37595
37596 MI.eraseFromParent(); // The pseudo is gone now.
37597 return BB;
37598 }
37599 case X86::PTILEZERO: {
37600 unsigned Imm = MI.getOperand(0).getImm();
37601 BuildMI(*BB, MI, DL, TII->get(X86::TILEZERO), TMMImmToTMMReg(Imm));
37602 MI.eraseFromParent(); // The pseudo is gone now.
37603 return BB;
37604 }
37605 case X86::PTILELOADD:
37606 case X86::PTILELOADDT1:
37607 case X86::PTILESTORED: {
37608 unsigned Opc;
37609 switch (MI.getOpcode()) {
37610 default: llvm_unreachable("illegal opcode!");
37611 case X86::PTILELOADD: Opc = X86::TILELOADD; break;
37612 case X86::PTILELOADDT1: Opc = X86::TILELOADDT1; break;
37613 case X86::PTILESTORED: Opc = X86::TILESTORED; break;
37614 }
37615
37616 MachineInstrBuilder MIB = BuildMI(*BB, MI, DL, TII->get(Opc));
37617 unsigned CurOp = 0;
37618 if (Opc != X86::TILESTORED)
37619 MIB.addReg(TMMImmToTMMReg(MI.getOperand(CurOp++).getImm()),
37620 RegState::Define);
37621
37622 MIB.add(MI.getOperand(CurOp++)); // base
37623 MIB.add(MI.getOperand(CurOp++)); // scale
37624 MIB.add(MI.getOperand(CurOp++)); // index -- stride
37625 MIB.add(MI.getOperand(CurOp++)); // displacement
37626 MIB.add(MI.getOperand(CurOp++)); // segment
37627
37628 if (Opc == X86::TILESTORED)
37629 MIB.addReg(TMMImmToTMMReg(MI.getOperand(CurOp++).getImm()),
37630 RegState::Undef);
37631
37632 MI.eraseFromParent(); // The pseudo is gone now.
37633 return BB;
37634 }
37635 }
37636}
37637
37638//===----------------------------------------------------------------------===//
37639// X86 Optimization Hooks
37640//===----------------------------------------------------------------------===//
37641
37642bool
37643X86TargetLowering::targetShrinkDemandedConstant(SDValue Op,
37644 const APInt &DemandedBits,
37645 const APInt &DemandedElts,
37646 TargetLoweringOpt &TLO) const {
37647 EVT VT = Op.getValueType();
37648 unsigned Opcode = Op.getOpcode();
37649 unsigned EltSize = VT.getScalarSizeInBits();
37650
37651 if (VT.isVector()) {
37652 // If the constant is only all signbits in the active bits, then we should
37653 // extend it to the entire constant to allow it to act as a boolean constant
37654 // vector.
37655 auto NeedsSignExtension = [&](SDValue V, unsigned ActiveBits) {
37656 if (!ISD::isBuildVectorOfConstantSDNodes(V.getNode()))
37657 return false;
37658 for (unsigned i = 0, e = V.getNumOperands(); i != e; ++i) {
37659 if (!DemandedElts[i] || V.getOperand(i).isUndef())
37660 continue;
37661 const APInt &Val = V.getConstantOperandAPInt(i);
37662 if (Val.getBitWidth() > Val.getNumSignBits() &&
37663 Val.trunc(ActiveBits).getNumSignBits() == ActiveBits)
37664 return true;
37665 }
37666 return false;
37667 };
37668 // For vectors - if we have a constant, then try to sign extend.
37669 // TODO: Handle AND/ANDN cases.
37670 unsigned ActiveBits = DemandedBits.getActiveBits();
37671 if (EltSize > ActiveBits && EltSize > 1 && isTypeLegal(VT) &&
37672 (Opcode == ISD::OR || Opcode == ISD::XOR) &&
37673 NeedsSignExtension(Op.getOperand(1), ActiveBits)) {
37674 EVT ExtSVT = EVT::getIntegerVT(*TLO.DAG.getContext(), ActiveBits);
37675 EVT ExtVT = EVT::getVectorVT(*TLO.DAG.getContext(), ExtSVT,
37676 VT.getVectorNumElements());
37677 SDValue NewC =
37678 TLO.DAG.getNode(ISD::SIGN_EXTEND_INREG, SDLoc(Op), VT,
37679 Op.getOperand(1), TLO.DAG.getValueType(ExtVT));
37680 SDValue NewOp =
37681 TLO.DAG.getNode(Opcode, SDLoc(Op), VT, Op.getOperand(0), NewC);
37682 return TLO.CombineTo(Op, NewOp);
37683 }
37684 return false;
37685 }
37686
37687 // Only optimize Ands to prevent shrinking a constant that could be
37688 // matched by movzx.
37689 if (Opcode != ISD::AND)
37690 return false;
37691
37692 // Make sure the RHS really is a constant.
37693 ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1));
37694 if (!C)
37695 return false;
37696
37697 const APInt &Mask = C->getAPIntValue();
37698
37699 // Clear all non-demanded bits initially.
37700 APInt ShrunkMask = Mask & DemandedBits;
37701
37702 // Find the width of the shrunk mask.
37703 unsigned Width = ShrunkMask.getActiveBits();
37704
37705 // If the mask is all 0s there's nothing to do here.
37706 if (Width == 0)
37707 return false;
37708
37709 // Find the next power of 2 width, rounding up to a byte.
37710 Width = PowerOf2Ceil(std::max(Width, 8U));
37711 // Truncate the width to size to handle illegal types.
37712 Width = std::min(Width, EltSize);
37713
37714 // Calculate a possible zero extend mask for this constant.
37715 APInt ZeroExtendMask = APInt::getLowBitsSet(EltSize, Width);
37716
37717 // If we aren't changing the mask, just return true to keep it and prevent
37718 // the caller from optimizing.
37719 if (ZeroExtendMask == Mask)
37720 return true;
37721
37722 // Make sure the new mask can be represented by a combination of mask bits
37723 // and non-demanded bits.
37724 if (!ZeroExtendMask.isSubsetOf(Mask | ~DemandedBits))
37725 return false;
37726
37727 // Replace the constant with the zero extend mask.
37728 SDLoc DL(Op);
37729 SDValue NewC = TLO.DAG.getConstant(ZeroExtendMask, DL, VT);
37730 SDValue NewOp = TLO.DAG.getNode(ISD::AND, DL, VT, Op.getOperand(0), NewC);
37731 return TLO.CombineTo(Op, NewOp);
37732}
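A worked trace of the scalar AND path may make the mask-widening clearer; the numbers are invented, not taken from a real compile. With Mask = 0x1FF and DemandedBits = 0xFF, the shrunk mask has 8 active bits, which rounds up to the byte-sized zero-extend mask 0xFF; that differs from the original mask yet is covered by Mask | ~DemandedBits, so the AND constant is rewritten to the movzx-friendly 0xFF.
// Illustrative trace, assuming a 32-bit AND (values are made up):
//   Mask = 0x1FF, DemandedBits = 0xFF
//   ShrunkMask = 0xFF -> Width = 8 -> PowerOf2Ceil(max(8, 8)) = 8
//   ZeroExtendMask = 0xFF != Mask and is a subset of Mask | ~DemandedBits,
//   so the AND constant becomes 0xFF.
static_assert((0xFFu & ~(0x1FFu | ~0xFFu)) == 0,
              "0xFF is a subset of Mask | ~DemandedBits");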
37733
37734void X86TargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
37735 KnownBits &Known,
37736 const APInt &DemandedElts,
37737 const SelectionDAG &DAG,
37738 unsigned Depth) const {
37739 unsigned BitWidth = Known.getBitWidth();
37740 unsigned NumElts = DemandedElts.getBitWidth();
37741 unsigned Opc = Op.getOpcode();
37742 EVT VT = Op.getValueType();
37743 assert((Opc >= ISD::BUILTIN_OP_END ||
37744 Opc == ISD::INTRINSIC_WO_CHAIN ||
37745 Opc == ISD::INTRINSIC_W_CHAIN ||
37746 Opc == ISD::INTRINSIC_VOID) &&
37747 "Should use MaskedValueIsZero if you don't know whether Op"
37748 " is a target node!");
37749
37750 Known.resetAll();
37751 switch (Opc) {
37752 default: break;
37753 case X86ISD::SETCC:
37754 Known.Zero.setBitsFrom(1);
37755 break;
37756 case X86ISD::MOVMSK: {
37757 unsigned NumLoBits = Op.getOperand(0).getValueType().getVectorNumElements();
37758 Known.Zero.setBitsFrom(NumLoBits);
37759 break;
37760 }
37761 case X86ISD::PEXTRB:
37762 case X86ISD::PEXTRW: {
37763 SDValue Src = Op.getOperand(0);
37764 EVT SrcVT = Src.getValueType();
37765 APInt DemandedElt = APInt::getOneBitSet(SrcVT.getVectorNumElements(),
37766 Op.getConstantOperandVal(1));
37767 Known = DAG.computeKnownBits(Src, DemandedElt, Depth + 1);
37768 Known = Known.anyextOrTrunc(BitWidth);
37769 Known.Zero.setBitsFrom(SrcVT.getScalarSizeInBits());
37770 break;
37771 }
37772 case X86ISD::VSRAI:
37773 case X86ISD::VSHLI:
37774 case X86ISD::VSRLI: {
37775 unsigned ShAmt = Op.getConstantOperandVal(1);
37776 if (ShAmt >= VT.getScalarSizeInBits()) {
37777 // Out of range logical bit shifts are guaranteed to be zero.
37778 // Out of range arithmetic bit shifts splat the sign bit.
37779 if (Opc != X86ISD::VSRAI) {
37780 Known.setAllZero();
37781 break;
37782 }
37783
37784 ShAmt = VT.getScalarSizeInBits() - 1;
37785 }
37786
37787 Known = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
37788 if (Opc == X86ISD::VSHLI) {
37789 Known.Zero <<= ShAmt;
37790 Known.One <<= ShAmt;
37791 // Low bits are known zero.
37792 Known.Zero.setLowBits(ShAmt);
37793 } else if (Opc == X86ISD::VSRLI) {
37794 Known.Zero.lshrInPlace(ShAmt);
37795 Known.One.lshrInPlace(ShAmt);
37796 // High bits are known zero.
37797 Known.Zero.setHighBits(ShAmt);
37798 } else {
37799 Known.Zero.ashrInPlace(ShAmt);
37800 Known.One.ashrInPlace(ShAmt);
37801 }
37802 break;
37803 }
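A short illustrative note on the shift cases above (the element width and shift amount are arbitrary): for a v4i32 VSHLI by 8 the low 8 bits of each element become known zero, for VSRLI by 8 the high 8 bits do, and an out-of-range VSRAI is clamped to 31 so each element is a splat of its sign bit.
// Illustrative only: left-shifting always clears the low ShAmt bits,
// mirroring Known.Zero.setLowBits(ShAmt) above.
static_assert(((0xFFFFFFFFu << 8) & 0xFFu) == 0,
              "VSHLI leaves the low ShAmt bits known zero");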
37804 case X86ISD::PACKUS: {
37805 // PACKUS is just a truncation if the upper half is zero.
37806 APInt DemandedLHS, DemandedRHS;
37807 getPackDemandedElts(VT, DemandedElts, DemandedLHS, DemandedRHS);
37808
37809 Known.One = APInt::getAllOnes(BitWidth * 2);
37810 Known.Zero = APInt::getAllOnes(BitWidth * 2);
37811
37812 KnownBits Known2;
37813 if (!!DemandedLHS) {
37814 Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedLHS, Depth + 1);
37815 Known = KnownBits::commonBits(Known, Known2);
37816 }
37817 if (!!DemandedRHS) {
37818 Known2 = DAG.computeKnownBits(Op.getOperand(1), DemandedRHS, Depth + 1);
37819 Known = KnownBits::commonBits(Known, Known2);
37820 }
37821
37822 if (Known.countMinLeadingZeros() < BitWidth)
37823 Known.resetAll();
37824 Known = Known.trunc(BitWidth);
37825 break;
37826 }
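For context on the PACKUS check, a hedged example using the v8i16 to v16i8 pairing (just one possible instance): PACKUSWB saturates each i16 to [0, 255], so the pack is only a plain truncation when the upper 8 bits of every demanded source element are known zero, which is what the countMinLeadingZeros() requirement enforces at the doubled bit width.
// Illustrative only: 0x00AB packs to 0xAB unchanged, but a value such as
// 0x01AB would saturate to 0xFF rather than truncate to 0xAB.
static_assert((0x01ABu & 0xFFu) == 0xABu && 0x01ABu > 0xFFu,
              "truncation and saturation disagree once the high byte is set");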
37827 case X86ISD::VBROADCAST: {
37828 SDValue Src = Op.getOperand(0);
37829 if (!Src.getSimpleValueType().isVector()) {
37830 Known = DAG.computeKnownBits(Src, Depth + 1);
37831 return;
37832 }
37833 break;
37834 }
37835 case X86ISD::AND: {
37836 if (Op.getResNo() == 0) {
37837 KnownBits Known2;
37838 Known = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
37839 Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
37840 Known &= Known2;
37841 }
37842 break;
37843 }
37844 case X86ISD::ANDNP: {
37845 KnownBits Known2;
37846 Known = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
37847 Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
37848
37849 // ANDNP = (~X & Y);
37850 Known.One &= Known2.Zero;
37851 Known.Zero |= Known2.One;
37852 break;
37853 }
37854 case X86ISD::FOR: {
37855 KnownBits Known2;
37856 Known = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
37857 Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
37858
37859 Known |= Known2;
37860 break;
37861 }
37862 case X86ISD::PSADBW: {
37863 assert(VT.getScalarType() == MVT::i64 &&
37864 Op.getOperand(0).getValueType().getScalarType() == MVT::i8 &&
37865 "Unexpected PSADBW types");
37866
37867 // PSADBW - fills low 16 bits and zeros upper 48 bits of each i64 result.
37868 Known.Zero.setBitsFrom(16);
37869 break;
37870 }
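A brief justification for the constant 16 above, with illustrative arithmetic: each 64-bit PSADBW lane is a sum of eight absolute byte differences, so it can never exceed 8 * 255 = 2040 and always fits in the low 16 bits.
// Illustrative only: the largest possible lane value still fits in 16 bits.
static_assert(8 * 255 < (1 << 16), "PSADBW lane result fits in 16 bits");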
37871 case X86ISD::PMULUDQ: {
37872 KnownBits Known2;
37873 Known = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
37874 Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
37875
37876 Known = Known.trunc(BitWidth / 2).zext(BitWidth);
37877 Known2 = Known2.trunc(BitWidth / 2).zext(BitWidth);
37878 Known = KnownBits::mul(Known, Known2);
37879 break;
37880 }
37881 case X86ISD::CMOV: {
37882 Known = DAG.computeKnownBits(Op.getOperand(1), Depth + 1);
37883 // If we don't know any bits, early out.
37884 if (Known.isUnknown())
37885 break;
37886 KnownBits Known2 = DAG.computeKnownBits(Op.getOperand(0), Depth + 1);
37887
37888 // Only known if known in both the LHS and RHS.
37889 Known = KnownBits::commonBits(Known, Known2);
37890 break;
37891 }
37892 case X86ISD::BEXTR:
37893 case X86ISD::BEXTRI: {
37894 SDValue Op0 = Op.getOperand(0);
37895 SDValue Op1 = Op.getOperand(1);
37896
37897 if (auto* Cst1 = dyn_cast<ConstantSDNode>(Op1)) {
37898 unsigned Shift = Cst1->getAPIntValue().extractBitsAsZExtValue(8, 0);
37899 unsigned Length = Cst1->getAPIntValue().extractBitsAsZExtValue(8, 8);
37900
37901 // If the length is 0, the result is 0.
37902 if (Length == 0) {
37903 Known.setAllZero();
37904 break;
37905 }
37906
37907 if ((Shift + Length) <= BitWidth) {
37908 Known = DAG.computeKnownBits(Op0, Depth + 1);
37909 Known = Known.extractBits(Length, Shift);
37910 Known = Known.zextOrTrunc(BitWidth);
37911 }
37912 }
37913 break;
37914 }
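To illustrate the control decoding above (0x0804 is an arbitrary example value): bits 7:0 of the BEXTR control hold the start bit and bits 15:8 the length, so 0x0804 extracts 8 bits starting at bit 4, i.e. bits [11:4] of the source, zero-extended into the result.
// Illustrative only: decoding the hypothetical control value 0x0804.
static_assert(((0x0804 >> 0) & 0xFF) == 4 && ((0x0804 >> 8) & 0xFF) == 8,
              "start = 4, length = 8 -> extract bits [11:4]");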
37915 case X86ISD::PDEP: {
37916 KnownBits Known2;
37917 Known = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
37918 Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
37919 // Zeros are retained from the mask operand. But not ones.
37920 Known.One.clearAllBits();
37921 // The result will have at least as many trailing zeros as the non-mask
37922 // operand since bits can only map to the same or higher bit position.
37923 Known.Zero.setLowBits(Known2.countMinTrailingZeros());
37924 break;
37925 }
37926 case X86ISD::PEXT: {
37927 Known = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
37928 // The result has as many leading zeros as the number of zeroes in the mask.
37929 unsigned Count = Known.Zero.countPopulation();
37930 Known.Zero = APInt::getHighBitsSet(BitWidth, Count);
37931 Known.One.clearAllBits();
37932 break;
37933 }
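A hedged example for the PDEP/PEXT cases, using the arbitrary 32-bit mask 0x0000FF00: PDEP scatters the low 8 source bits into bit positions 8..15, so every position where the mask is zero stays zero, and PEXT gathers bits 8..15 into the low byte, leaving at least 32 - popcount(mask) = 24 known leading zeros.
// Illustrative only, mask = 0x0000FF00 (a made-up example):
//   PDEP(src, mask): bits outside 8..15 are zero (the mask's zeros survive).
//   PEXT(src, mask): the result fits in 8 bits, so bits 8..31 are known zero.
static_assert(32 - 8 == 24, "leading zeros implied by an 8-bit mask");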
37934 case X86ISD::VTRUNC:
37935 case X86ISD::VTRUNCS:
37936 case X86ISD::VTRUNCUS:
37937 case X86ISD::CVTSI2P:
37938 case X86ISD::CVTUI2P:
37939 case X86ISD::CVTP2SI:
37940 case X86ISD::CVTP2UI:
37941 case X86ISD::MCVTP2SI:
37942 case X86ISD::MCVTP2UI:
37943 case X86ISD::CVTTP2SI:
37944 case X86ISD::CVTTP2UI:
37945 case X86ISD::MCVTTP2SI:
37946 case X86ISD::MCVTTP2UI:
37947 case X86ISD::MCVTSI2P:
37948 case X86ISD::MCVTUI2P:
37949 case X86ISD::VFPROUND:
37950 case X86ISD::VMFPROUND:
37951 case X86ISD::CVTPS2PH:
37952 case X86ISD::MCVTPS2PH: {
37953 // Truncations/Conversions - upper elements are known zero.
37954 EVT SrcVT = Op.getOperand(0).getValueType();
37955 if (SrcVT.isVector()) {
37956 unsigned NumSrcElts = SrcVT.getVectorNumElements();
37957 if (NumElts > NumSrcElts &&
37958 DemandedElts.countTrailingZeros() >= NumSrcElts)
37959 Known.setAllZero();
37960 }
37961 break;
37962 }
37963 case X86ISD::STRICT_CVTTP2SI:
37964 case X86ISD::STRICT_CVTTP2UI:
37965 case X86ISD::STRICT_CVTSI2P:
37966 case X86ISD::STRICT_CVTUI2P:
37967 case X86ISD::STRICT_VFPROUND:
37968 case X86ISD::STRICT_CVTPS2PH: {
37969 // Strict Conversions - upper elements are known zero.
37970 EVT SrcVT = Op.getOperand(1).getValueType();
37971 if (SrcVT.isVector()) {
37972 unsigned NumSrcElts = SrcVT.getVectorNumElements();
37973 if (NumElts > NumSrcElts &&
37974 DemandedElts.countTrailingZeros() >= NumSrcElts)
37975 Known.setAllZero();
37976 }
37977 break;
37978 }
37979 case X86ISD::MOVQ2DQ: {
37980 // Move from MMX to XMM. Upper half of XMM should be 0.
37981 if (DemandedElts.countTrailingZeros() >= (NumElts / 2))
37982 Known.setAllZero();
37983 break;
37984 }
37985 case X86ISD::VBROADCAST_LOAD: {
37986 APInt UndefElts;
37987 SmallVector<APInt, 16> EltBits;
37988 if (getTargetConstantBitsFromNode(Op, BitWidth, UndefElts, EltBits,
37989 /*AllowWholeUndefs*/ false,
37990 /*AllowPartialUndefs*/ false)) {
37991 Known.Zero.setAllBits();
37992 Known.One.setAllBits();
37993 for (unsigned I = 0; I != NumElts; ++I) {
37994 if (!DemandedElts[I])
37995 continue;
37996 if (UndefElts[I]) {
37997 Known.resetAll();
37998 break;
37999 }
38000 KnownBits Known2 = KnownBits::makeConstant(EltBits[I]);
38001 Known = KnownBits::commonBits(Known, Known2);
38002 }
38003 return;
38004 }
38005 break;
38006 }
38007 }
38008
38009 // Handle target shuffles.
38010 // TODO - use resolveTargetShuffleInputs once we can limit recursive depth.
38011 if (isTargetShuffle(Opc)) {
38012 SmallVector<int, 64> Mask;
38013 SmallVector<SDValue, 2> Ops;
38014 if (getTargetShuffleMask(Op.getNode(), VT.getSimpleVT(), true, Ops, Mask)) {
38015 unsigned NumOps = Ops.size();
38016 unsigned NumElts = VT.getVectorNumElements();
38017 if (Mask.size() == NumElts) {
38018 SmallVector<APInt, 2> DemandedOps(NumOps, APInt(NumElts, 0));
38019 Known.Zero.setAllBits(); Known.One.setAllBits();
38020 for (unsigned i = 0; i != NumElts; ++i) {
38021 if (!DemandedElts[i])
38022 continue;
38023 int M = Mask[i];
38024 if (M == SM_SentinelUndef) {
38025 // For UNDEF elements, we don't know anything about the common state
38026 // of the shuffle result.
38027 Known.resetAll();
38028 break;
38029 }
38030 if (M == SM_SentinelZero) {
38031 Known.One.clearAllBits();
38032 continue;
38033 }
38034 assert(0 <= M && (unsigned)M < (NumOps * NumElts) &&
38035 "Shuffle index out of range");
38036
38037 unsigned OpIdx = (unsigned)M / NumElts;
38038 unsigned EltIdx = (unsigned)M % NumElts;
38039 if (Ops[OpIdx].getValueType() != VT) {
38040 // TODO - handle target shuffle ops with different value types.
38041 Known.resetAll();
38042 break;
38043 }
38044 DemandedOps[OpIdx].setBit(EltIdx);
38045 }
38046 // Known bits are the values that are shared by every demanded element.
38047 for (unsigned i = 0; i != NumOps && !Known.isUnknown(); ++i) {
38048 if (!DemandedOps[i])
38049 continue;
38050 KnownBits Known2 =
38051 DAG.computeKnownBits(Ops[i], DemandedOps[i], Depth + 1);
38052 Known = KnownBits::commonBits(Known, Known2);
38053 }
38054 }
38055 }
38056 }
38057}
38058
38059unsigned X86TargetLowering::ComputeNumSignBitsForTargetNode(
38060 SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG,
38061 unsigned Depth) const {
38062 EVT VT = Op.getValueType();
38063 unsigned VTBits = VT.getScalarSizeInBits();
38064 unsigned Opcode = Op.getOpcode();
38065 switch (Opcode) {
38066 case X86ISD::SETCC_CARRY:
38067 // SETCC_CARRY sets the dest to ~0 for true or 0 for false.
38068 return VTBits;
38069
38070 case X86ISD::VTRUNC: {
38071 SDValue Src = Op.getOperand(0);
38072 MVT SrcVT = Src.getSimpleValueType();
38073 unsigned NumSrcBits = SrcVT.getScalarSizeInBits();
38074 assert(VTBits < NumSrcBits && "Illegal truncation input type");
38075 APInt DemandedSrc = DemandedElts.zextOrTrunc(SrcVT.getVectorNumElements());
38076 unsigned Tmp = DAG.ComputeNumSignBits(Src, DemandedSrc, Depth + 1);
38077 if (Tmp > (NumSrcBits - VTBits))
38078 return Tmp - (NumSrcBits - VTBits);
38079 return 1;
38080 }
38081
38082 case X86ISD::PACKSS: {
38083 // PACKSS is just a truncation if the sign bits extend to the packed size.
38084 APInt DemandedLHS, DemandedRHS;
38085 getPackDemandedElts(Op.getValueType(), DemandedElts, DemandedLHS,
38086 DemandedRHS);
38087
38088 unsigned SrcBits = Op.getOperand(0).getScalarValueSizeInBits();
38089 unsigned Tmp0 = SrcBits, Tmp1 = SrcBits;
38090 if (!!DemandedLHS)
38091 Tmp0 = DAG.ComputeNumSignBits(Op.getOperand(0), DemandedLHS, Depth + 1);
38092 if (!!DemandedRHS)
38093 Tmp1 = DAG.ComputeNumSignBits(Op.getOperand(1), DemandedRHS, Depth + 1);
38094 unsigned Tmp = std::min(Tmp0, Tmp1);
38095 if (Tmp > (SrcBits - VTBits))
38096 return Tmp - (SrcBits - VTBits);
38097 return 1;
38098 }
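A worked example of the PACKSS rule, assuming a v8i16 to v16i8 pack with invented sign-bit counts: a source with 12 known sign bits keeps 12 - (16 - 8) = 4 sign bits in each packed byte, while a source with only 8 sign bits falls through to the conservative answer of 1.
// Illustrative only: SrcBits = 16, VTBits = 8.
static_assert(12 - (16 - 8) == 4, "sign bits surviving a PACKSSWB-style pack");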
38099
38100 case X86ISD::VBROADCAST: {
38101 SDValue Src = Op.getOperand(0);
38102 if (!Src.getSimpleValueType().isVector())
38103 return DAG.ComputeNumSignBits(Src, Depth + 1);
38104 break;
38105 }
38106
38107 case X86ISD::VSHLI: {
38108 SDValue Src = Op.getOperand(0);
38109 const APInt &ShiftVal = Op.getConstantOperandAPInt(1);
38110 if (ShiftVal.uge(VTBits))
38111 return VTBits; // Shifted all bits out --> zero.
38112 unsigned Tmp = DAG.ComputeNumSignBits(Src, DemandedElts, Depth + 1);
38113 if (ShiftVal.uge(Tmp))
38114 return 1; // Shifted all sign bits out --> unknown.
38115 return Tmp - ShiftVal.getZExtValue();
38116 }
38117
38118 case X86ISD::VSRAI: {
38119 SDValue Src = Op.getOperand(0);
38120 APInt ShiftVal = Op.getConstantOperandAPInt(1);
38121 if (ShiftVal.uge(VTBits - 1))
38122 return VTBits; // Sign splat.
38123 unsigned Tmp = DAG.ComputeNumSignBits(Src, DemandedElts, Depth + 1);
38124 ShiftVal += Tmp;
38125 return ShiftVal.uge(VTBits) ? VTBits : ShiftVal.getZExtValue();
38126 }
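An illustrative instance of the VSRAI rule above (the counts are arbitrary): if a v4i32 source is already known to have 20 sign bits, an arithmetic shift right by 5 yields min(32, 20 + 5) = 25 sign bits, and any shift of 31 or more turns the element into a pure sign splat.
// Illustrative only: replaying the arithmetic of the VSRAI case.
static_assert(20 + 5 == 25 && 25 <= 32, "sign bits grow by the shift amount");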
38127
38128 case X86ISD::FSETCC:
38129 // cmpss/cmpsd return zero/all-bits result values in the bottom element.
38130 if (VT == MVT::f32 || VT == MVT::f64 ||
38131 ((VT == MVT::v4f32 || VT == MVT::v2f64) && DemandedElts == 1))
38132 return VTBits;
38133 break;
38134
38135 case X86ISD::PCMPGT:
38136 case X86ISD::PCMPEQ:
38137 case X86ISD::CMPP:
38138 case X86ISD::VPCOM:
38139 case X86ISD::VPCOMU:
38140 // Vector compares return zero/all-bits result values.
38141 return VTBits;
38142
38143 case X86ISD::ANDNP: {
38144 unsigned Tmp0 =
38145 DAG.ComputeNumSignBits(Op.getOperand(0), DemandedElts, Depth + 1);
38146 if (Tmp0 == 1) return 1; // Early out.
38147 unsigned Tmp1 =
38148 DAG.ComputeNumSignBits(Op.getOperand(1), DemandedElts, Depth + 1);
38149 return std::min(Tmp0, Tmp1);
38150 }
38151
38152 case X86ISD::CMOV: {
38153 unsigned Tmp0 = DAG.ComputeNumSignBits(Op.getOperand(0), Depth+1);
38154 if (Tmp0 == 1) return 1; // Early out.
38155 unsigned Tmp1 = DAG.ComputeNumSignBits(Op.getOperand(1), Depth+1);
38156 return std::min(Tmp0, Tmp1);
38157 }
38158 }
38159
38160 // Handle target shuffles.
38161 // TODO - use resolveTargetShuffleInputs once we can limit recursive depth.
38162 if (isTargetShuffle(Opcode)) {
38163 SmallVector<int, 64> Mask;
38164 SmallVector<SDValue, 2> Ops;
38165 if (getTargetShuffleMask(Op.getNode(), VT.getSimpleVT(), true, Ops, Mask)) {
38166 unsigned NumOps = Ops.size();
38167 unsigned NumElts = VT.getVectorNumElements();
38168 if (Mask.size() == NumElts) {
38169 SmallVector<APInt, 2> DemandedOps(NumOps, APInt(NumElts, 0));
38170 for (unsigned i = 0; i != NumElts; ++i) {
38171 if (!DemandedElts[i])
38172 continue;
38173 int M = Mask[i];
38174 if (M == SM_SentinelUndef) {
38175 // For UNDEF elements, we don't know anything about the common state
38176 // of the shuffle result.
38177 return 1;
38178 } else if (M == SM_SentinelZero) {
38179 // Zero = all sign bits.
38180 continue;
38181 }
38182 assert(0 <= M && (unsigned)M < (NumOps * NumElts) &&
38183 "Shuffle index out of range");
38184
38185 unsigned OpIdx = (unsigned)M / NumElts;
38186 unsigned EltIdx = (unsigned)M % NumElts;
38187 if (Ops[OpIdx].getValueType() != VT) {
38188 // TODO - handle target shuffle ops with different value types.
38189 return 1;
38190 }
38191 DemandedOps[OpIdx].setBit(EltIdx);
38192 }
38193 unsigned Tmp0 = VTBits;
38194 for (unsigned i = 0; i != NumOps && Tmp0 > 1; ++i) {
38195 if (!DemandedOps[i])
38196 continue;
38197 unsigned Tmp1 =
38198 DAG.ComputeNumSignBits(Ops[i], DemandedOps[i], Depth + 1);
38199 Tmp0 = std::min(Tmp0, Tmp1);
38200 }
38201 return Tmp0;
38202 }
38203 }
38204 }
38205
38206 // Fallback case.
38207 return 1;
38208}
38209
38210SDValue X86TargetLowering::unwrapAddress(SDValue N) const {
38211 if (N->getOpcode() == X86ISD::Wrapper || N->getOpcode() == X86ISD::WrapperRIP)
38212 return N->getOperand(0);
38213 return N;
38214}
38215
38216// Helper to look for a normal load that can be narrowed into a vzload with the
38217// specified VT and memory VT. Returns SDValue() on failure.
38218static SDValue narrowLoadToVZLoad(LoadSDNode *LN, MVT MemVT, MVT VT,
38219 SelectionDAG &DAG) {
38220 // Can't if the load is volatile or atomic.
38221 if (!LN->isSimple())
38222 return SDValue();
38223
38224 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
38225 SDValue Ops[] = {LN->getChain(), LN->getBasePtr()};
38226 return DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, SDLoc(LN), Tys, Ops, MemVT,
38227 LN->getPointerInfo(), LN->getOriginalAlign(),
38228 LN->getMemOperand()->getFlags());
38229}
38230
38231// Attempt to match a combined shuffle mask against supported unary shuffle
38232// instructions.
38233// TODO: Investigate sharing more of this with shuffle lowering.
38234static bool matchUnaryShuffle(MVT MaskVT, ArrayRef<int> Mask,
38235 bool AllowFloatDomain, bool AllowIntDomain,
38236 SDValue V1, const SelectionDAG &DAG,
38237 const X86Subtarget &Subtarget, unsigned &Shuffle,
38238 MVT &SrcVT, MVT &DstVT) {
38239 unsigned NumMaskElts = Mask.size();
38240 unsigned MaskEltSize = MaskVT.getScalarSizeInBits();
38241
38242 // Match against a VZEXT_MOVL vXi32 and vXi16 zero-extending instruction.
38243 if (Mask[0] == 0 &&
38244 (MaskEltSize == 32 || (MaskEltSize == 16 && Subtarget.hasFP16()))) {
38245 if ((isUndefOrZero(Mask[1]) && isUndefInRange(Mask, 2, NumMaskElts - 2)) ||
38246 (V1.getOpcode() == ISD::SCALAR_TO_VECTOR &&
38247 isUndefOrZeroInRange(Mask, 1, NumMaskElts - 1))) {
38248 Shuffle = X86ISD::VZEXT_MOVL;
38249 if (MaskEltSize == 16)
38250 SrcVT = DstVT = MaskVT.changeVectorElementType(MVT::f16);
38251 else
38252 SrcVT = DstVT = !Subtarget.hasSSE2() ? MVT::v4f32 : MaskVT;
38253 return true;
38254 }
38255 }
38256
38257 // Match against an ANY/ZERO_EXTEND_VECTOR_INREG instruction.
38258 // TODO: Add 512-bit vector support (split AVX512F and AVX512BW).
38259 if (AllowIntDomain && ((MaskVT.is128BitVector() && Subtarget.hasSSE41()) ||
38260 (MaskVT.is256BitVector() && Subtarget.hasInt256()))) {
38261 unsigned MaxScale = 64 / MaskEltSize;
38262 for (unsigned Scale = 2; Scale <= MaxScale; Scale *= 2) {
38263 bool MatchAny = true;
38264 bool MatchZero = true;
38265 unsigned NumDstElts = NumMaskElts / Scale;
38266 for (unsigned i = 0; i != NumDstElts && (MatchAny || MatchZero); ++i) {
38267 if (!isUndefOrEqual(Mask[i * Scale], (int)i)) {
38268 MatchAny = MatchZero = false;
38269 break;
38270 }
38271 MatchAny &= isUndefInRange(Mask, (i * Scale) + 1, Scale - 1);
38272 MatchZero &= isUndefOrZeroInRange(Mask, (i * Scale) + 1, Scale - 1);
38273 }
38274 if (MatchAny || MatchZero) {
38275 assert(MatchZero && "Failed to match zext but matched aext?");
38276 unsigned SrcSize = std::max(128u, NumDstElts * MaskEltSize);
38277 MVT ScalarTy = MaskVT.isInteger() ? MaskVT.getScalarType() :
38278 MVT::getIntegerVT(MaskEltSize);
38279 SrcVT = MVT::getVectorVT(ScalarTy, SrcSize / MaskEltSize);
38280
38281 Shuffle = unsigned(MatchAny ? ISD::ANY_EXTEND : ISD::ZERO_EXTEND);
38282 if (SrcVT.getVectorNumElements() != NumDstElts)
38283 Shuffle = DAG.getOpcode_EXTEND_VECTOR_INREG(Shuffle);
38284
38285 DstVT = MVT::getIntegerVT(Scale * MaskEltSize);
38286 DstVT = MVT::getVectorVT(DstVT, NumDstElts);
38287 return true;
38288 }
38289 }
38290 }
38291
38292 // Match against a VZEXT_MOVL instruction; SSE1 only supports 32-bit elements (MOVSS).
38293 if (((MaskEltSize == 32) || (MaskEltSize == 64 && Subtarget.hasSSE2()) ||
38294 (MaskEltSize == 16 && Subtarget.hasFP16())) &&
38295 isUndefOrEqual(Mask[0], 0) &&
38296 isUndefOrZeroInRange(Mask, 1, NumMaskElts - 1)) {
38297 Shuffle = X86ISD::VZEXT_MOVL;
38298 if (MaskEltSize == 16)
38299 SrcVT = DstVT = MaskVT.changeVectorElementType(MVT::f16);
38300 else
38301 SrcVT = DstVT = !Subtarget.hasSSE2() ? MVT::v4f32 : MaskVT;
38302 return true;
38303 }
38304
38305 // Check if we have SSE3, which lets us use MOVDDUP etc. These
38306 // instructions are no slower than UNPCKLPD but have the option to
38307 // fold the input operand, even from an unaligned memory load.
38308 if (MaskVT.is128BitVector() && Subtarget.hasSSE3() && AllowFloatDomain) {
38309 if (isTargetShuffleEquivalent(MaskVT, Mask, {0, 0}, DAG, V1)) {
38310 Shuffle = X86ISD::MOVDDUP;
38311 SrcVT = DstVT = MVT::v2f64;
38312 return true;
38313 }
38314 if (isTargetShuffleEquivalent(MaskVT, Mask, {0, 0, 2, 2}, DAG, V1)) {
38315 Shuffle = X86ISD::MOVSLDUP;
38316 SrcVT = DstVT = MVT::v4f32;
38317 return true;
38318 }
38319 if (isTargetShuffleEquivalent(MaskVT, Mask, {1, 1, 3, 3}, DAG, V1)) {
38320 Shuffle = X86ISD::MOVSHDUP;
38321 SrcVT = DstVT = MVT::v4f32;
38322 return true;
38323 }
38324 }
38325
38326 if (MaskVT.is256BitVector() && AllowFloatDomain) {
38327 assert(Subtarget.hasAVX() && "AVX required for 256-bit vector shuffles");
38328 if (isTargetShuffleEquivalent(MaskVT, Mask, {0, 0, 2, 2}, DAG, V1)) {
38329 Shuffle = X86ISD::MOVDDUP;
38330 SrcVT = DstVT = MVT::v4f64;
38331 return true;
38332 }
38333 if (isTargetShuffleEquivalent(MaskVT, Mask, {0, 0, 2, 2, 4, 4, 6, 6}, DAG,
38334 V1)) {
38335 Shuffle = X86ISD::MOVSLDUP;
38336 SrcVT = DstVT = MVT::v8f32;
38337 return true;
38338 }
38339 if (isTargetShuffleEquivalent(MaskVT, Mask, {1, 1, 3, 3, 5, 5, 7, 7}, DAG,
38340 V1)) {
38341 Shuffle = X86ISD::MOVSHDUP;
38342 SrcVT = DstVT = MVT::v8f32;
38343 return true;
38344 }
38345 }
38346
38347 if (MaskVT.is512BitVector() && AllowFloatDomain) {
38348 assert(Subtarget.hasAVX512() &&
38349 "AVX512 required for 512-bit vector shuffles");
38350 if (isTargetShuffleEquivalent(MaskVT, Mask, {0, 0, 2, 2, 4, 4, 6, 6}, DAG,
38351 V1)) {
38352 Shuffle = X86ISD::MOVDDUP;
38353 SrcVT = DstVT = MVT::v8f64;
38354 return true;
38355 }
38356 if (isTargetShuffleEquivalent(
38357 MaskVT, Mask,
38358 {0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14}, DAG, V1)) {
38359 Shuffle = X86ISD::MOVSLDUP;
38360 SrcVT = DstVT = MVT::v16f32;
38361 return true;
38362 }
38363 if (isTargetShuffleEquivalent(
38364 MaskVT, Mask,
38365 {1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15}, DAG, V1)) {
38366 Shuffle = X86ISD::MOVSHDUP;
38367 SrcVT = DstVT = MVT::v16f32;
38368 return true;
38369 }
38370 }
38371
38372 return false;
38373}
38374
38375// Attempt to match a combined shuffle mask against supported unary immediate
38376// permute instructions.
38377// TODO: Investigate sharing more of this with shuffle lowering.
38378static bool matchUnaryPermuteShuffle(MVT MaskVT, ArrayRef<int> Mask,
38379 const APInt &Zeroable,
38380 bool AllowFloatDomain, bool AllowIntDomain,
38381 const SelectionDAG &DAG,
38382 const X86Subtarget &Subtarget,
38383 unsigned &Shuffle, MVT &ShuffleVT,
38384 unsigned &PermuteImm) {
38385 unsigned NumMaskElts = Mask.size();
38386 unsigned InputSizeInBits = MaskVT.getSizeInBits();
38387 unsigned MaskScalarSizeInBits = InputSizeInBits / NumMaskElts;
38388 MVT MaskEltVT = MVT::getIntegerVT(MaskScalarSizeInBits);
38389 bool ContainsZeros = isAnyZero(Mask);
38390
38391 // Handle VPERMI/VPERMILPD vXi64/vXf64 patterns.
38392 if (!ContainsZeros && MaskScalarSizeInBits == 64) {
38393 // Check for lane crossing permutes.
38394 if (is128BitLaneCrossingShuffleMask(MaskEltVT, Mask)) {
38395 // PERMPD/PERMQ permutes within a 256-bit vector (AVX2+).
38396 if (Subtarget.hasAVX2() && MaskVT.is256BitVector()) {
38397 Shuffle = X86ISD::VPERMI;
38398 ShuffleVT = (AllowFloatDomain ? MVT::v4f64 : MVT::v4i64);
38399 PermuteImm = getV4X86ShuffleImm(Mask);
38400 return true;
38401 }
38402 if (Subtarget.hasAVX512() && MaskVT.is512BitVector()) {
38403 SmallVector<int, 4> RepeatedMask;
38404 if (is256BitLaneRepeatedShuffleMask(MVT::v8f64, Mask, RepeatedMask)) {
38405 Shuffle = X86ISD::VPERMI;
38406 ShuffleVT = (AllowFloatDomain ? MVT::v8f64 : MVT::v8i64);
38407 PermuteImm = getV4X86ShuffleImm(RepeatedMask);
38408 return true;
38409 }
38410 }
38411 } else if (AllowFloatDomain && Subtarget.hasAVX()) {
38412 // VPERMILPD can permute with a non-repeating shuffle.
38413 Shuffle = X86ISD::VPERMILPI;
38414 ShuffleVT = MVT::getVectorVT(MVT::f64, Mask.size());
38415 PermuteImm = 0;
38416 for (int i = 0, e = Mask.size(); i != e; ++i) {
38417 int M = Mask[i];
38418 if (M == SM_SentinelUndef)
38419 continue;
38420 assert(((M / 2) == (i / 2)) && "Out of range shuffle mask index");
38421 PermuteImm |= (M & 1) << i;
38422 }
38423 return true;
38424 }
38425 }
38426
38427 // Handle PSHUFD/VPERMILPI vXi32/vXf32 repeated patterns.
38428 // AVX introduced the VPERMILPD/VPERMILPS float permutes; before then we
38429 // had to use 2-input SHUFPD/SHUFPS shuffles (not handled here).
38430 if ((MaskScalarSizeInBits == 64 || MaskScalarSizeInBits == 32) &&
38431 !ContainsZeros && (AllowIntDomain || Subtarget.hasAVX())) {
38432 SmallVector<int, 4> RepeatedMask;
38433 if (is128BitLaneRepeatedShuffleMask(MaskEltVT, Mask, RepeatedMask)) {
38434 // Narrow the repeated mask to create 32-bit element permutes.
38435 SmallVector<int, 4> WordMask = RepeatedMask;
38436 if (MaskScalarSizeInBits == 64)
38437 narrowShuffleMaskElts(2, RepeatedMask, WordMask);
38438
38439 Shuffle = (AllowIntDomain ? X86ISD::PSHUFD : X86ISD::VPERMILPI);
38440 ShuffleVT = (AllowIntDomain ? MVT::i32 : MVT::f32);
38441 ShuffleVT = MVT::getVectorVT(ShuffleVT, InputSizeInBits / 32);
38442 PermuteImm = getV4X86ShuffleImm(WordMask);
38443 return true;
38444 }
38445 }
38446
38447 // Handle PSHUFLW/PSHUFHW vXi16 repeated patterns.
38448 if (!ContainsZeros && AllowIntDomain && MaskScalarSizeInBits == 16 &&
38449 ((MaskVT.is128BitVector() && Subtarget.hasSSE2()) ||
38450 (MaskVT.is256BitVector() && Subtarget.hasAVX2()) ||
38451 (MaskVT.is512BitVector() && Subtarget.hasBWI()))) {
38452 SmallVector<int, 4> RepeatedMask;
38453 if (is128BitLaneRepeatedShuffleMask(MaskEltVT, Mask, RepeatedMask)) {
38454 ArrayRef<int> LoMask(RepeatedMask.data() + 0, 4);
38455 ArrayRef<int> HiMask(RepeatedMask.data() + 4, 4);
38456
38457 // PSHUFLW: permute lower 4 elements only.
38458 if (isUndefOrInRange(LoMask, 0, 4) &&
38459 isSequentialOrUndefInRange(HiMask, 0, 4, 4)) {
38460 Shuffle = X86ISD::PSHUFLW;
38461 ShuffleVT = MVT::getVectorVT(MVT::i16, InputSizeInBits / 16);
38462 PermuteImm = getV4X86ShuffleImm(LoMask);
38463 return true;
38464 }
38465
38466 // PSHUFHW: permute upper 4 elements only.
38467 if (isUndefOrInRange(HiMask, 4, 8) &&
38468 isSequentialOrUndefInRange(LoMask, 0, 4, 0)) {
38469 // Offset the HiMask so that we can create the shuffle immediate.
38470 int OffsetHiMask[4];
38471 for (int i = 0; i != 4; ++i)
38472 OffsetHiMask[i] = (HiMask[i] < 0 ? HiMask[i] : HiMask[i] - 4);
38473
38474 Shuffle = X86ISD::PSHUFHW;
38475 ShuffleVT = MVT::getVectorVT(MVT::i16, InputSizeInBits / 16);
38476 PermuteImm = getV4X86ShuffleImm(OffsetHiMask);
38477 return true;
38478 }
38479 }
38480 }
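To make the PSHUFHW rebasing concrete (the mask is invented for illustration): a repeated v8i16 mask of {0,1,2,3, 5,4,7,6} keeps the low half in place, so the high half is rebased by subtracting 4 to {1,0,3,2}, which the usual 2-bits-per-lane shuffle immediate encodes as 0xB1.
// Illustrative only: index i is stored at bit position 2*i of the immediate.
static_assert((1 << 0 | 0 << 2 | 3 << 4 | 2 << 6) == 0xB1,
              "{1,0,3,2} encodes to the PSHUFHW immediate 0xB1");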
38481
38482 // Attempt to match against byte/bit shifts.
38483 if (AllowIntDomain &&
38484 ((MaskVT.is128BitVector() && Subtarget.hasSSE2()) ||
38485 (MaskVT.is256BitVector() && Subtarget.hasAVX2()) ||
38486 (MaskVT.is512BitVector() && Subtarget.hasAVX512()))) {
38487 int ShiftAmt = matchShuffleAsShift(ShuffleVT, Shuffle, MaskScalarSizeInBits,
38488 Mask, 0, Zeroable, Subtarget);
38489 if (0 < ShiftAmt && (!ShuffleVT.is512BitVector() || Subtarget.hasBWI() ||
38490 32 <= ShuffleVT.getScalarSizeInBits())) {
38491 PermuteImm = (unsigned)ShiftAmt;
38492 return true;
38493 }
38494 }
38495
38496 // Attempt to match against bit rotates.
38497 if (!ContainsZeros && AllowIntDomain && MaskScalarSizeInBits < 64 &&
38498 ((MaskVT.is128BitVector() && Subtarget.hasXOP()) ||
38499 Subtarget.hasAVX512())) {
38500 int RotateAmt = matchShuffleAsBitRotate(ShuffleVT, MaskScalarSizeInBits,
38501 Subtarget, Mask);
38502 if (0 < RotateAmt) {
38503 Shuffle = X86ISD::VROTLI;
38504 PermuteImm = (unsigned)RotateAmt;
38505 return true;
38506 }
38507 }
38508
38509 return false;
38510}
38511
38512// Attempt to match a combined unary shuffle mask against supported binary
38513// shuffle instructions.
38514// TODO: Investigate sharing more of this with shuffle lowering.
38515static bool matchBinaryShuffle(MVT MaskVT, ArrayRef<int> Mask,
38516 bool AllowFloatDomain, bool AllowIntDomain,
38517 SDValue &V1, SDValue &V2, const SDLoc &DL,
38518 SelectionDAG &DAG, const X86Subtarget &Subtarget,
38519 unsigned &Shuffle, MVT &SrcVT, MVT &DstVT,
38520 bool IsUnary) {
38521 unsigned NumMaskElts = Mask.size();
38522 unsigned EltSizeInBits = MaskVT.getScalarSizeInBits();
38523 unsigned SizeInBits = MaskVT.getSizeInBits();
38524
38525 if (MaskVT.is128BitVector()) {
38526 if (isTargetShuffleEquivalent(MaskVT, Mask, {0, 0}, DAG) &&
38527 AllowFloatDomain) {
38528 V2 = V1;
38529 V1 = (SM_SentinelUndef == Mask[0] ? DAG.getUNDEF(MVT::v4f32) : V1);
38530 Shuffle = Subtarget.hasSSE2() ? X86ISD::UNPCKL : X86ISD::MOVLHPS;
38531 SrcVT = DstVT = Subtarget.hasSSE2() ? MVT::v2f64 : MVT::v4f32;
38532 return true;
38533 }
38534 if (isTargetShuffleEquivalent(MaskVT, Mask, {1, 1}, DAG) &&
38535 AllowFloatDomain) {
38536 V2 = V1;
38537 Shuffle = Subtarget.hasSSE2() ? X86ISD::UNPCKH : X86ISD::MOVHLPS;
38538 SrcVT = DstVT = Subtarget.hasSSE2() ? MVT::v2f64 : MVT::v4f32;
38539 return true;
38540 }
38541 if (isTargetShuffleEquivalent(MaskVT, Mask, {0, 3}, DAG) &&
38542 Subtarget.hasSSE2() && (AllowFloatDomain || !Subtarget.hasSSE41())) {
38543 std::swap(V1, V2);
38544 Shuffle = X86ISD::MOVSD;
38545 SrcVT = DstVT = MVT::v2f64;
38546 return true;
38547 }
38548 if (isTargetShuffleEquivalent(MaskVT, Mask, {4, 1, 2, 3}, DAG) &&
38549 (AllowFloatDomain || !Subtarget.hasSSE41())) {
38550 Shuffle = X86ISD::MOVSS;
38551 SrcVT = DstVT = MVT::v4f32;
38552 return true;
38553 }
38554 if (isTargetShuffleEquivalent(MaskVT, Mask, {8, 1, 2, 3, 4, 5, 6, 7},
38555 DAG) &&
38556 Subtarget.hasFP16()) {
38557 Shuffle = X86ISD::MOVSH;
38558 SrcVT = DstVT = MVT::v8f16;
38559 return true;
38560 }
38561 }
38562
38563 // Attempt to match against either a unary or binary PACKSS/PACKUS shuffle.
38564 if (((MaskVT == MVT::v8i16 || MaskVT == MVT::v16i8) && Subtarget.hasSSE2()) ||
38565 ((MaskVT == MVT::v16i16 || MaskVT == MVT::v32i8) && Subtarget.hasInt256()) ||
38566 ((MaskVT == MVT::v32i16 || MaskVT == MVT::v64i8) && Subtarget.hasBWI())) {
38567 if (matchShuffleWithPACK(MaskVT, SrcVT, V1, V2, Shuffle, Mask, DAG,
38568 Subtarget)) {
38569 DstVT = MaskVT;
38570 return true;
38571 }
38572 }
38573
38574 // Attempt to match against either a unary or binary UNPCKL/UNPCKH shuffle.
38575 if ((MaskVT == MVT::v4f32 && Subtarget.hasSSE1()) ||
38576 (MaskVT.is128BitVector() && Subtarget.hasSSE2()) ||
38577 (MaskVT.is256BitVector() && 32 <= EltSizeInBits && Subtarget.hasAVX()) ||
38578 (MaskVT.is256BitVector() && Subtarget.hasAVX2()) ||
38579 (MaskVT.is512BitVector() && Subtarget.hasAVX512())) {
38580 if (matchShuffleWithUNPCK(MaskVT, V1, V2, Shuffle, IsUnary, Mask, DL, DAG,
38581 Subtarget)) {
38582 SrcVT = DstVT = MaskVT;
38583 if (MaskVT.is256BitVector() && !Subtarget.hasAVX2())
38584 SrcVT = DstVT = (32 == EltSizeInBits ? MVT::v8f32 : MVT::v4f64);
38585 return true;
38586 }
38587 }
38588
38589 // Attempt to match against an OR if we're performing a blend shuffle and the
38590 // non-blended source element is zero in each case.
38591 // TODO: Handle cases where V1/V2 sizes don't match SizeInBits.
38592 if (SizeInBits == V1.getValueSizeInBits() &&
38593 SizeInBits == V2.getValueSizeInBits() &&
38594 (EltSizeInBits % V1.getScalarValueSizeInBits()) == 0 &&
38595 (EltSizeInBits % V2.getScalarValueSizeInBits()) == 0) {
38596 bool IsBlend = true;
38597 unsigned NumV1Elts = V1.getValueType().getVectorNumElements();
38598 unsigned NumV2Elts = V2.getValueType().getVectorNumElements();
38599 unsigned Scale1 = NumV1Elts / NumMaskElts;
38600 unsigned Scale2 = NumV2Elts / NumMaskElts;
38601 APInt DemandedZeroV1 = APInt::getZero(NumV1Elts);
38602 APInt DemandedZeroV2 = APInt::getZero(NumV2Elts);
38603 for (unsigned i = 0; i != NumMaskElts; ++i) {
38604 int M = Mask[i];
38605 if (M == SM_SentinelUndef)
38606 continue;
38607 if (M == SM_SentinelZero) {
38608 DemandedZeroV1.setBits(i * Scale1, (i + 1) * Scale1);
38609 DemandedZeroV2.setBits(i * Scale2, (i + 1) * Scale2);
38610 continue;
38611 }
38612 if (M == (int)i) {
38613 DemandedZeroV2.setBits(i * Scale2, (i + 1) * Scale2);
38614 continue;
38615 }
38616 if (M == (int)(i + NumMaskElts)) {
38617 DemandedZeroV1.setBits(i * Scale1, (i + 1) * Scale1);
38618 continue;
38619 }
38620 IsBlend = false;
38621 break;
38622 }
38623 if (IsBlend) {
38624 if (DAG.MaskedVectorIsZero(V1, DemandedZeroV1) &&
38625 DAG.MaskedVectorIsZero(V2, DemandedZeroV2)) {
38626 Shuffle = ISD::OR;
38627 SrcVT = DstVT = MaskVT.changeTypeToInteger();
38628 return true;
38629 }
38630 if (NumV1Elts == NumV2Elts && NumV1Elts == NumMaskElts) {
38631 // FIXME: handle mismatched sizes?
38632 // TODO: investigate if `ISD::OR` handling in
38633 // `TargetLowering::SimplifyDemandedVectorElts` can be improved instead.
38634 auto computeKnownBitsElementWise = [&DAG](SDValue V) {
38635 unsigned NumElts = V.getValueType().getVectorNumElements();
38636 KnownBits Known(NumElts);
38637 for (unsigned EltIdx = 0; EltIdx != NumElts; ++EltIdx) {
38638 APInt Mask = APInt::getOneBitSet(NumElts, EltIdx);
38639 KnownBits PeepholeKnown = DAG.computeKnownBits(V, Mask);
38640 if (PeepholeKnown.isZero())
38641 Known.Zero.setBit(EltIdx);
38642 if (PeepholeKnown.isAllOnes())
38643 Known.One.setBit(EltIdx);
38644 }
38645 return Known;
38646 };
38647
38648 KnownBits V1Known = computeKnownBitsElementWise(V1);
38649 KnownBits V2Known = computeKnownBitsElementWise(V2);
38650
38651 for (unsigned i = 0; i != NumMaskElts && IsBlend; ++i) {
38652 int M = Mask[i];
38653 if (M == SM_SentinelUndef)
38654 continue;
38655 if (M == SM_SentinelZero) {
38656 IsBlend &= V1Known.Zero[i] && V2Known.Zero[i];
38657 continue;
38658 }
38659 if (M == (int)i) {
38660 IsBlend &= V2Known.Zero[i] || V1Known.One[i];
38661 continue;
38662 }
38663 if (M == (int)(i + NumMaskElts)) {
38664 IsBlend &= V1Known.Zero[i] || V2Known.One[i];
38665 continue;
38666 }
38667 llvm_unreachable("will not get here.");
38668 }
38669 if (IsBlend) {
38670 Shuffle = ISD::OR;
38671 SrcVT = DstVT = MaskVT.changeTypeToInteger();
38672 return true;
38673 }
38674 }
38675 }
38676 }
38677
38678 return false;
38679}
38680
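As a standalone aside (not part of the listing), the blend-as-OR case above relies on a simple scalar fact: if every lane the blend takes from V1 is known zero in V2 and vice versa, the per-lane select equals a plain bitwise OR, which is why ISD::OR can be emitted. A minimal sketch with hypothetical 4-lane values:

#include <cassert>
#include <cstdint>

int main() {
  // Blend selector per lane: true = take V1, false = take V2.
  bool TakeV1[4] = {true, false, true, false};
  // Each lane that is *not* selected is zero in the other operand.
  uint32_t V1[4] = {0xAAAA0000u, 0u, 0x12345678u, 0u};
  uint32_t V2[4] = {0u, 0x0000BBBBu, 0u, 0xDEADBEEFu};
  for (int i = 0; i != 4; ++i) {
    uint32_t Blend = TakeV1[i] ? V1[i] : V2[i];
    assert(Blend == (V1[i] | V2[i])); // select == OR when the other lane is 0
  }
  return 0;
}
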
38681static bool matchBinaryPermuteShuffle(
38682 MVT MaskVT, ArrayRef<int> Mask, const APInt &Zeroable,
38683 bool AllowFloatDomain, bool AllowIntDomain, SDValue &V1, SDValue &V2,
38684 const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget,
38685 unsigned &Shuffle, MVT &ShuffleVT, unsigned &PermuteImm) {
38686 unsigned NumMaskElts = Mask.size();
38687 unsigned EltSizeInBits = MaskVT.getScalarSizeInBits();
38688
38689 // Attempt to match against VALIGND/VALIGNQ rotate.
38690 if (AllowIntDomain && (EltSizeInBits == 64 || EltSizeInBits == 32) &&
38691 ((MaskVT.is128BitVector() && Subtarget.hasVLX()) ||
38692 (MaskVT.is256BitVector() && Subtarget.hasVLX()) ||
38693 (MaskVT.is512BitVector() && Subtarget.hasAVX512()))) {
38694 if (!isAnyZero(Mask)) {
38695 int Rotation = matchShuffleAsElementRotate(V1, V2, Mask);
38696 if (0 < Rotation) {
38697 Shuffle = X86ISD::VALIGN;
38698 if (EltSizeInBits == 64)
38699 ShuffleVT = MVT::getVectorVT(MVT::i64, MaskVT.getSizeInBits() / 64);
38700 else
38701 ShuffleVT = MVT::getVectorVT(MVT::i32, MaskVT.getSizeInBits() / 32);
38702 PermuteImm = Rotation;
38703 return true;
38704 }
38705 }
38706 }
38707
38708 // Attempt to match against PALIGNR byte rotate.
38709 if (AllowIntDomain && ((MaskVT.is128BitVector() && Subtarget.hasSSSE3()) ||
38710 (MaskVT.is256BitVector() && Subtarget.hasAVX2()) ||
38711 (MaskVT.is512BitVector() && Subtarget.hasBWI()))) {
38712 int ByteRotation = matchShuffleAsByteRotate(MaskVT, V1, V2, Mask);
38713 if (0 < ByteRotation) {
38714 Shuffle = X86ISD::PALIGNR;
38715 ShuffleVT = MVT::getVectorVT(MVT::i8, MaskVT.getSizeInBits() / 8);
38716 PermuteImm = ByteRotation;
38717 return true;
38718 }
38719 }
38720
38721 // Attempt to combine to X86ISD::BLENDI.
38722 if ((NumMaskElts <= 8 && ((Subtarget.hasSSE41() && MaskVT.is128BitVector()) ||
38723 (Subtarget.hasAVX() && MaskVT.is256BitVector()))) ||
38724 (MaskVT == MVT::v16i16 && Subtarget.hasAVX2())) {
38725 uint64_t BlendMask = 0;
38726 bool ForceV1Zero = false, ForceV2Zero = false;
38727 SmallVector<int, 8> TargetMask(Mask);
38728 if (matchShuffleAsBlend(V1, V2, TargetMask, Zeroable, ForceV1Zero,
38729 ForceV2Zero, BlendMask)) {
38730 if (MaskVT == MVT::v16i16) {
38731 // We can only use v16i16 PBLENDW if the lanes are repeated.
38732 SmallVector<int, 8> RepeatedMask;
38733 if (isRepeatedTargetShuffleMask(128, MaskVT, TargetMask,
38734 RepeatedMask)) {
38735 assert(RepeatedMask.size() == 8 &&
38736 "Repeated mask size doesn't match!");
38737 PermuteImm = 0;
38738 for (int i = 0; i < 8; ++i)
38739 if (RepeatedMask[i] >= 8)
38740 PermuteImm |= 1 << i;
38741 V1 = ForceV1Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V1;
38742 V2 = ForceV2Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V2;
38743 Shuffle = X86ISD::BLENDI;
38744 ShuffleVT = MaskVT;
38745 return true;
38746 }
38747 } else {
38748 V1 = ForceV1Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V1;
38749 V2 = ForceV2Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V2;
38750 PermuteImm = (unsigned)BlendMask;
38751 Shuffle = X86ISD::BLENDI;
38752 ShuffleVT = MaskVT;
38753 return true;
38754 }
38755 }
38756 }
38757
38758 // Attempt to combine to INSERTPS, but only if it has elements that need to
38759 // be set to zero.
38760 if (AllowFloatDomain && EltSizeInBits == 32 && Subtarget.hasSSE41() &&
38761 MaskVT.is128BitVector() && isAnyZero(Mask) &&
38762 matchShuffleAsInsertPS(V1, V2, PermuteImm, Zeroable, Mask, DAG)) {
38763 Shuffle = X86ISD::INSERTPS;
38764 ShuffleVT = MVT::v4f32;
38765 return true;
38766 }
38767
38768 // Attempt to combine to SHUFPD.
38769 if (AllowFloatDomain && EltSizeInBits == 64 &&
38770 ((MaskVT.is128BitVector() && Subtarget.hasSSE2()) ||
38771 (MaskVT.is256BitVector() && Subtarget.hasAVX()) ||
38772 (MaskVT.is512BitVector() && Subtarget.hasAVX512()))) {
38773 bool ForceV1Zero = false, ForceV2Zero = false;
38774 if (matchShuffleWithSHUFPD(MaskVT, V1, V2, ForceV1Zero, ForceV2Zero,
38775 PermuteImm, Mask, Zeroable)) {
38776 V1 = ForceV1Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V1;
38777 V2 = ForceV2Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V2;
38778 Shuffle = X86ISD::SHUFP;
38779 ShuffleVT = MVT::getVectorVT(MVT::f64, MaskVT.getSizeInBits() / 64);
38780 return true;
38781 }
38782 }
38783
38784 // Attempt to combine to SHUFPS.
38785 if (AllowFloatDomain && EltSizeInBits == 32 &&
38786 ((MaskVT.is128BitVector() && Subtarget.hasSSE1()) ||
38787 (MaskVT.is256BitVector() && Subtarget.hasAVX()) ||
38788 (MaskVT.is512BitVector() && Subtarget.hasAVX512()))) {
38789 SmallVector<int, 4> RepeatedMask;
38790 if (isRepeatedTargetShuffleMask(128, MaskVT, Mask, RepeatedMask)) {
38791 // Match each half of the repeated mask, to determine if it's just
38792 // referencing one of the vectors, is zeroable or entirely undef.
38793 auto MatchHalf = [&](unsigned Offset, int &S0, int &S1) {
38794 int M0 = RepeatedMask[Offset];
38795 int M1 = RepeatedMask[Offset + 1];
38796
38797 if (isUndefInRange(RepeatedMask, Offset, 2)) {
38798 return DAG.getUNDEF(MaskVT);
38799 } else if (isUndefOrZeroInRange(RepeatedMask, Offset, 2)) {
38800 S0 = (SM_SentinelUndef == M0 ? -1 : 0);
38801 S1 = (SM_SentinelUndef == M1 ? -1 : 1);
38802 return getZeroVector(MaskVT, Subtarget, DAG, DL);
38803 } else if (isUndefOrInRange(M0, 0, 4) && isUndefOrInRange(M1, 0, 4)) {
38804 S0 = (SM_SentinelUndef == M0 ? -1 : M0 & 3);
38805 S1 = (SM_SentinelUndef == M1 ? -1 : M1 & 3);
38806 return V1;
38807 } else if (isUndefOrInRange(M0, 4, 8) && isUndefOrInRange(M1, 4, 8)) {
38808 S0 = (SM_SentinelUndef == M0 ? -1 : M0 & 3);
38809 S1 = (SM_SentinelUndef == M1 ? -1 : M1 & 3);
38810 return V2;
38811 }
38812
38813 return SDValue();
38814 };
38815
38816 int ShufMask[4] = {-1, -1, -1, -1};
38817 SDValue Lo = MatchHalf(0, ShufMask[0], ShufMask[1]);
38818 SDValue Hi = MatchHalf(2, ShufMask[2], ShufMask[3]);
38819
38820 if (Lo && Hi) {
38821 V1 = Lo;
38822 V2 = Hi;
38823 Shuffle = X86ISD::SHUFP;
38824 ShuffleVT = MVT::getVectorVT(MVT::f32, MaskVT.getSizeInBits() / 32);
38825 PermuteImm = getV4X86ShuffleImm(ShufMask);
38826 return true;
38827 }
38828 }
38829 }
38830
38831 // Attempt to combine to INSERTPS more generally if X86ISD::SHUFP failed.
38832 if (AllowFloatDomain && EltSizeInBits == 32 && Subtarget.hasSSE41() &&
38833 MaskVT.is128BitVector() &&
38834 matchShuffleAsInsertPS(V1, V2, PermuteImm, Zeroable, Mask, DAG)) {
38835 Shuffle = X86ISD::INSERTPS;
38836 ShuffleVT = MVT::v4f32;
38837 return true;
38838 }
38839
38840 return false;
38841}
38842
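As an illustrative aside (not part of the listing), several of the PermuteImm values above (the SHUFPS path and getV4X86ShuffleImm in particular) pack four 2-bit lane selectors into a single immediate byte. A standalone sketch of that packing, with a locally named helper and an example mask:

#include <cassert>

// Pack a 4-element shuffle mask (values 0..3, -1 for undef) into the
// 2-bits-per-lane immediate format used by PSHUFD/SHUFPS-style opcodes.
static unsigned packV4ShuffleImm(const int Mask[4]) {
  unsigned Imm = 0;
  for (int i = 0; i != 4; ++i) {
    int M = Mask[i] < 0 ? 0 : Mask[i]; // treat undef lanes as 0
    Imm |= (unsigned)(M & 3) << (i * 2);
  }
  return Imm;
}

int main() {
  // The identity mask {0, 1, 2, 3} packs to 0b11'10'01'00 == 0xE4.
  int Identity[4] = {0, 1, 2, 3};
  assert(packV4ShuffleImm(Identity) == 0xE4);
  return 0;
}
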
38843static SDValue combineX86ShuffleChainWithExtract(
38844 ArrayRef<SDValue> Inputs, SDValue Root, ArrayRef<int> BaseMask, int Depth,
38845 bool HasVariableMask, bool AllowVariableCrossLaneMask,
38846 bool AllowVariablePerLaneMask, SelectionDAG &DAG,
38847 const X86Subtarget &Subtarget);
38848
38849/// Combine an arbitrary chain of shuffles into a single instruction if
38850/// possible.
38851///
38852/// This is the leaf of the recursive combine below. When we have found some
38853/// chain of single-use x86 shuffle instructions and accumulated the combined
38854/// shuffle mask represented by them, this will try to pattern match that mask
38855/// into either a single instruction if there is a special purpose instruction
38856/// for this operation, or into a PSHUFB instruction which is a fully general
38857/// instruction but should only be used to replace chains over a certain depth.
38858static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
38859 ArrayRef<int> BaseMask, int Depth,
38860 bool HasVariableMask,
38861 bool AllowVariableCrossLaneMask,
38862 bool AllowVariablePerLaneMask,
38863 SelectionDAG &DAG,
38864 const X86Subtarget &Subtarget) {
38865 assert(!BaseMask.empty() && "Cannot combine an empty shuffle mask!");
38866 assert((Inputs.size() == 1 || Inputs.size() == 2) &&
38867 "Unexpected number of shuffle inputs!");
38868
38869 SDLoc DL(Root);
38870 MVT RootVT = Root.getSimpleValueType();
38871 unsigned RootSizeInBits = RootVT.getSizeInBits();
38872 unsigned NumRootElts = RootVT.getVectorNumElements();
38873
38874 // Canonicalize shuffle input op to the requested type.
38875 auto CanonicalizeShuffleInput = [&](MVT VT, SDValue Op) {
38876 if (VT.getSizeInBits() > Op.getValueSizeInBits())
38877 Op = widenSubVector(Op, false, Subtarget, DAG, DL, VT.getSizeInBits());
38878 else if (VT.getSizeInBits() < Op.getValueSizeInBits())
38879 Op = extractSubVector(Op, 0, DAG, DL, VT.getSizeInBits());
38880 return DAG.getBitcast(VT, Op);
38881 };
38882
38883 // Find the inputs that enter the chain. Note that multiple uses are OK
38884 // here; we're not going to remove the operands we find.
38885 bool UnaryShuffle = (Inputs.size() == 1);
38886 SDValue V1 = peekThroughBitcasts(Inputs[0]);
38887 SDValue V2 = (UnaryShuffle ? DAG.getUNDEF(V1.getValueType())
38888 : peekThroughBitcasts(Inputs[1]));
38889
38890 MVT VT1 = V1.getSimpleValueType();
38891 MVT VT2 = V2.getSimpleValueType();
38892 assert((RootSizeInBits % VT1.getSizeInBits()) == 0 &&
38893 (RootSizeInBits % VT2.getSizeInBits()) == 0 && "Vector size mismatch");
38894
38895 SDValue Res;
38896
38897 unsigned NumBaseMaskElts = BaseMask.size();
38898 if (NumBaseMaskElts == 1) {
38899 assert(BaseMask[0] == 0 && "Invalid shuffle index found!");
38900 return CanonicalizeShuffleInput(RootVT, V1);
38901 }
38902
38903 bool OptForSize = DAG.shouldOptForSize();
38904 unsigned BaseMaskEltSizeInBits = RootSizeInBits / NumBaseMaskElts;
38905 bool FloatDomain = VT1.isFloatingPoint() || VT2.isFloatingPoint() ||
38906 (RootVT.isFloatingPoint() && Depth >= 1) ||
38907 (RootVT.is256BitVector() && !Subtarget.hasAVX2());
38908
38909 // Don't combine if we are an AVX512/EVEX target and the mask element size
38910 // is different from the root element size - this would prevent writemasks
38911 // from being reused.
38912 bool IsMaskedShuffle = false;
38913 if (RootSizeInBits == 512 || (Subtarget.hasVLX() && RootSizeInBits >= 128)) {
38914 if (Root.hasOneUse() && Root->use_begin()->getOpcode() == ISD::VSELECT &&
38915 Root->use_begin()->getOperand(0).getScalarValueSizeInBits() == 1) {
38916 IsMaskedShuffle = true;
38917 }
38918 }
38919
38920 // If we are shuffling a splat (and not introducing zeros) then we can just
38921 // use it directly. This works for smaller elements as well, as they already
38922 // repeat across each mask element.
38923 if (UnaryShuffle && !isAnyZero(BaseMask) &&
38924 V1.getValueSizeInBits() >= RootSizeInBits &&
38925 (BaseMaskEltSizeInBits % V1.getScalarValueSizeInBits()) == 0 &&
38926 DAG.isSplatValue(V1, /*AllowUndefs*/ false)) {
38927 return CanonicalizeShuffleInput(RootVT, V1);
38928 }
38929
38930 SmallVector<int, 64> Mask(BaseMask);
38931
38932 // See if the shuffle is a hidden identity shuffle - repeated args in HOPs
38933 // etc. can be simplified.
38934 if (VT1 == VT2 && VT1.getSizeInBits() == RootSizeInBits && VT1.isVector()) {
38935 SmallVector<int> ScaledMask, IdentityMask;
38936 unsigned NumElts = VT1.getVectorNumElements();
38937 if (Mask.size() <= NumElts &&
38938 scaleShuffleElements(Mask, NumElts, ScaledMask)) {
38939 for (unsigned i = 0; i != NumElts; ++i)
38940 IdentityMask.push_back(i);
38941 if (isTargetShuffleEquivalent(RootVT, ScaledMask, IdentityMask, DAG, V1,
38942 V2))
38943 return CanonicalizeShuffleInput(RootVT, V1);
38944 }
38945 }
38946
38947 // Handle 128/256-bit lane shuffles of 512-bit vectors.
38948 if (RootVT.is512BitVector() &&
38949 (NumBaseMaskElts == 2 || NumBaseMaskElts == 4)) {
38950 // If the upper subvectors are zeroable, then an extract+insert is better
38951 // than using X86ISD::SHUF128. The insertion is free, even if it has
38952 // to zero the upper subvectors.
38953 if (isUndefOrZeroInRange(Mask, 1, NumBaseMaskElts - 1)) {
38954 if (Depth == 0 && Root.getOpcode() == ISD::INSERT_SUBVECTOR)
38955 return SDValue(); // Nothing to do!
38956 assert(isInRange(Mask[0], 0, NumBaseMaskElts) &&
38957 "Unexpected lane shuffle");
38958 Res = CanonicalizeShuffleInput(RootVT, V1);
38959 unsigned SubIdx = Mask[0] * (NumRootElts / NumBaseMaskElts);
38960 bool UseZero = isAnyZero(Mask);
38961 Res = extractSubVector(Res, SubIdx, DAG, DL, BaseMaskEltSizeInBits);
38962 return widenSubVector(Res, UseZero, Subtarget, DAG, DL, RootSizeInBits);
38963 }
38964
38965 // Narrow shuffle mask to v4x128.
38966 SmallVector<int, 4> ScaledMask;
38967 assert((BaseMaskEltSizeInBits % 128) == 0 && "Illegal mask size");
38968 narrowShuffleMaskElts(BaseMaskEltSizeInBits / 128, Mask, ScaledMask);
38969
38970 // Try to lower to vshuf64x2/vshuf32x4.
38971 auto MatchSHUF128 = [&](MVT ShuffleVT, const SDLoc &DL,
38972 ArrayRef<int> ScaledMask, SDValue V1, SDValue V2,
38973 SelectionDAG &DAG) {
38974 unsigned PermMask = 0;
38975 // Ensure elements came from the same Op.
38976 SDValue Ops[2] = {DAG.getUNDEF(ShuffleVT), DAG.getUNDEF(ShuffleVT)};
38977 for (int i = 0; i < 4; ++i) {
38978 assert(ScaledMask[i] >= -1 && "Illegal shuffle sentinel value");
38979 if (ScaledMask[i] < 0)
38980 continue;
38981
38982 SDValue Op = ScaledMask[i] >= 4 ? V2 : V1;
38983 unsigned OpIndex = i / 2;
38984 if (Ops[OpIndex].isUndef())
38985 Ops[OpIndex] = Op;
38986 else if (Ops[OpIndex] != Op)
38987 return SDValue();
38988
38989 // Convert the 128-bit shuffle mask selection values into 128-bit
38990 // selection bits defined by a vshuf64x2 instruction's immediate control
38991 // byte.
38992 PermMask |= (ScaledMask[i] % 4) << (i * 2);
38993 }
38994
38995 return DAG.getNode(X86ISD::SHUF128, DL, ShuffleVT,
38996 CanonicalizeShuffleInput(ShuffleVT, Ops[0]),
38997 CanonicalizeShuffleInput(ShuffleVT, Ops[1]),
38998 DAG.getTargetConstant(PermMask, DL, MVT::i8));
38999 };
39000
39001 // FIXME: Is there a better way to do this? is256BitLaneRepeatedShuffleMask
39002 // doesn't work because our mask is for 128 bits and we don't have an MVT
39003 // to match that.
39004 bool PreferPERMQ = UnaryShuffle && isUndefOrInRange(ScaledMask[0], 0, 2) &&
39005 isUndefOrInRange(ScaledMask[1], 0, 2) &&
39006 isUndefOrInRange(ScaledMask[2], 2, 4) &&
39007 isUndefOrInRange(ScaledMask[3], 2, 4) &&
39008 (ScaledMask[0] < 0 || ScaledMask[2] < 0 ||
39009 ScaledMask[0] == (ScaledMask[2] % 2)) &&
39010 (ScaledMask[1] < 0 || ScaledMask[3] < 0 ||
39011 ScaledMask[1] == (ScaledMask[3] % 2));
39012
39013 if (!isAnyZero(ScaledMask) && !PreferPERMQ) {
39014 if (Depth == 0 && Root.getOpcode() == X86ISD::SHUF128)
39015 return SDValue(); // Nothing to do!
39016 MVT ShuffleVT = (FloatDomain ? MVT::v8f64 : MVT::v8i64);
39017 if (SDValue V = MatchSHUF128(ShuffleVT, DL, ScaledMask, V1, V2, DAG))
39018 return DAG.getBitcast(RootVT, V);
39019 }
39020 }
39021
39022 // Handle 128-bit lane shuffles of 256-bit vectors.
39023 if (RootVT.is256BitVector() && NumBaseMaskElts == 2) {
39024 // If the upper half is zeroable, then an extract+insert is better
39025 // than using X86ISD::VPERM2X128. The insertion is free, even if it has to
39026 // zero the upper half.
39027 if (isUndefOrZero(Mask[1])) {
39028 if (Depth == 0 && Root.getOpcode() == ISD::INSERT_SUBVECTOR)
39029 return SDValue(); // Nothing to do!
39030 assert(isInRange(Mask[0], 0, 2) && "Unexpected lane shuffle");
39031 Res = CanonicalizeShuffleInput(RootVT, V1);
39032 Res = extract128BitVector(Res, Mask[0] * (NumRootElts / 2), DAG, DL);
39033 return widenSubVector(Res, Mask[1] == SM_SentinelZero, Subtarget, DAG, DL,
39034 256);
39035 }
39036
39037 // If we're inserting the low subvector, an insert-subvector 'concat'
39038 // pattern is quicker than VPERM2X128.
39039 // TODO: Add AVX2 support instead of VPERMQ/VPERMPD.
39040 if (BaseMask[0] == 0 && (BaseMask[1] == 0 || BaseMask[1] == 2) &&
39041 !Subtarget.hasAVX2()) {
39042 if (Depth == 0 && Root.getOpcode() == ISD::INSERT_SUBVECTOR)
39043 return SDValue(); // Nothing to do!
39044 SDValue Lo = CanonicalizeShuffleInput(RootVT, V1);
39045 SDValue Hi = CanonicalizeShuffleInput(RootVT, BaseMask[1] == 0 ? V1 : V2);
39046 Hi = extractSubVector(Hi, 0, DAG, DL, 128);
39047 return insertSubVector(Lo, Hi, NumRootElts / 2, DAG, DL, 128);
39048 }
39049
39050 if (Depth == 0 && Root.getOpcode() == X86ISD::VPERM2X128)
39051 return SDValue(); // Nothing to do!
39052
39053 // If we have AVX2, prefer to use VPERMQ/VPERMPD for unary shuffles unless
39054 // we need to use the zeroing feature.
39055 // Prefer blends for sequential shuffles unless we are optimizing for size.
39056 if (UnaryShuffle &&
39057 !(Subtarget.hasAVX2() && isUndefOrInRange(Mask, 0, 2)) &&
39058 (OptForSize || !isSequentialOrUndefOrZeroInRange(Mask, 0, 2, 0))) {
39059 unsigned PermMask = 0;
39060 PermMask |= ((Mask[0] < 0 ? 0x8 : (Mask[0] & 1)) << 0);
39061 PermMask |= ((Mask[1] < 0 ? 0x8 : (Mask[1] & 1)) << 4);
39062 return DAG.getNode(
39063 X86ISD::VPERM2X128, DL, RootVT, CanonicalizeShuffleInput(RootVT, V1),
39064 DAG.getUNDEF(RootVT), DAG.getTargetConstant(PermMask, DL, MVT::i8));
39065 }
39066
39067 if (Depth == 0 && Root.getOpcode() == X86ISD::SHUF128)
39068 return SDValue(); // Nothing to do!
39069
39070 // TODO - handle AVX512VL cases with X86ISD::SHUF128.
39071 if (!UnaryShuffle && !IsMaskedShuffle) {
39072 assert(llvm::all_of(Mask, [](int M) { return 0 <= M && M < 4; }) &&
39073 "Unexpected shuffle sentinel value");
39074 // Prefer blends to X86ISD::VPERM2X128.
39075 if (!((Mask[0] == 0 && Mask[1] == 3) || (Mask[0] == 2 && Mask[1] == 1))) {
39076 unsigned PermMask = 0;
39077 PermMask |= ((Mask[0] & 3) << 0);
39078 PermMask |= ((Mask[1] & 3) << 4);
39079 SDValue LHS = isInRange(Mask[0], 0, 2) ? V1 : V2;
39080 SDValue RHS = isInRange(Mask[1], 0, 2) ? V1 : V2;
39081 return DAG.getNode(X86ISD::VPERM2X128, DL, RootVT,
39082 CanonicalizeShuffleInput(RootVT, LHS),
39083 CanonicalizeShuffleInput(RootVT, RHS),
39084 DAG.getTargetConstant(PermMask, DL, MVT::i8));
39085 }
39086 }
39087 }
39088
39089 // For masks that have been widened to 128-bit elements or more,
39090 // narrow back down to 64-bit elements.
39091 if (BaseMaskEltSizeInBits > 64) {
39092 assert((BaseMaskEltSizeInBits % 64) == 0 && "Illegal mask size");
39093 int MaskScale = BaseMaskEltSizeInBits / 64;
39094 SmallVector<int, 64> ScaledMask;
39095 narrowShuffleMaskElts(MaskScale, Mask, ScaledMask);
39096 Mask = std::move(ScaledMask);
39097 }
39098
39099 // For masked shuffles, we're trying to match the root width for better
39100 // writemask folding; attempt to scale the mask.
39101 // TODO - variable shuffles might need this to be widened again.
39102 if (IsMaskedShuffle && NumRootElts > Mask.size()) {
39103 assert((NumRootElts % Mask.size()) == 0 && "Illegal mask size");
39104 int MaskScale = NumRootElts / Mask.size();
39105 SmallVector<int, 64> ScaledMask;
39106 narrowShuffleMaskElts(MaskScale, Mask, ScaledMask);
39107 Mask = std::move(ScaledMask);
39108 }
39109
39110 unsigned NumMaskElts = Mask.size();
39111 unsigned MaskEltSizeInBits = RootSizeInBits / NumMaskElts;
39112
39113 // Determine the effective mask value type.
39114 FloatDomain &= (32 <= MaskEltSizeInBits);
39115 MVT MaskVT = FloatDomain ? MVT::getFloatingPointVT(MaskEltSizeInBits)
39116 : MVT::getIntegerVT(MaskEltSizeInBits);
39117 MaskVT = MVT::getVectorVT(MaskVT, NumMaskElts);
39118
39119 // Only allow legal mask types.
39120 if (!DAG.getTargetLoweringInfo().isTypeLegal(MaskVT))
39121 return SDValue();
39122
39123 // Attempt to match the mask against known shuffle patterns.
39124 MVT ShuffleSrcVT, ShuffleVT;
39125 unsigned Shuffle, PermuteImm;
39126
39127 // Which shuffle domains are permitted?
39128 // Permit domain crossing at higher combine depths.
39129 // TODO: Should we indicate which domain is preferred if both are allowed?
39130 bool AllowFloatDomain = FloatDomain || (Depth >= 3);
39131 bool AllowIntDomain = (!FloatDomain || (Depth >= 3)) && Subtarget.hasSSE2() &&
39132 (!MaskVT.is256BitVector() || Subtarget.hasAVX2());
39133
39134 // Determine zeroable mask elements.
39135 APInt KnownUndef, KnownZero;
39136 resolveZeroablesFromTargetShuffle(Mask, KnownUndef, KnownZero);
39137 APInt Zeroable = KnownUndef | KnownZero;
39138
39139 if (UnaryShuffle) {
39140 // Attempt to match against broadcast-from-vector.
39141 // Limit AVX1 to cases where we're loading+broadcasting a scalar element.
39142 if ((Subtarget.hasAVX2() ||
39143 (Subtarget.hasAVX() && 32 <= MaskEltSizeInBits)) &&
39144 (!IsMaskedShuffle || NumRootElts == NumMaskElts)) {
39145 if (isUndefOrEqual(Mask, 0)) {
39146 if (V1.getValueType() == MaskVT &&
39147 V1.getOpcode() == ISD::SCALAR_TO_VECTOR &&
39148 X86::mayFoldLoad(V1.getOperand(0), Subtarget)) {
39149 if (Depth == 0 && Root.getOpcode() == X86ISD::VBROADCAST)
39150 return SDValue(); // Nothing to do!
39151 Res = V1.getOperand(0);
39152 Res = DAG.getNode(X86ISD::VBROADCAST, DL, MaskVT, Res);
39153 return DAG.getBitcast(RootVT, Res);
39154 }
39155 if (Subtarget.hasAVX2()) {
39156 if (Depth == 0 && Root.getOpcode() == X86ISD::VBROADCAST)
39157 return SDValue(); // Nothing to do!
39158 Res = CanonicalizeShuffleInput(MaskVT, V1);
39159 Res = DAG.getNode(X86ISD::VBROADCAST, DL, MaskVT, Res);
39160 return DAG.getBitcast(RootVT, Res);
39161 }
39162 }
39163 }
39164
39165 if (matchUnaryShuffle(MaskVT, Mask, AllowFloatDomain, AllowIntDomain, V1,
39166 DAG, Subtarget, Shuffle, ShuffleSrcVT, ShuffleVT) &&
39167 (!IsMaskedShuffle ||
39168 (NumRootElts == ShuffleVT.getVectorNumElements()))) {
39169 if (Depth == 0 && Root.getOpcode() == Shuffle)
39170 return SDValue(); // Nothing to do!
39171 Res = CanonicalizeShuffleInput(ShuffleSrcVT, V1);
39172 Res = DAG.getNode(Shuffle, DL, ShuffleVT, Res);
39173 return DAG.getBitcast(RootVT, Res);
39174 }
39175
39176 if (matchUnaryPermuteShuffle(MaskVT, Mask, Zeroable, AllowFloatDomain,
39177 AllowIntDomain, DAG, Subtarget, Shuffle, ShuffleVT,
39178 PermuteImm) &&
39179 (!IsMaskedShuffle ||
39180 (NumRootElts == ShuffleVT.getVectorNumElements()))) {
39181 if (Depth == 0 && Root.getOpcode() == Shuffle)
39182 return SDValue(); // Nothing to do!
39183 Res = CanonicalizeShuffleInput(ShuffleVT, V1);
39184 Res = DAG.getNode(Shuffle, DL, ShuffleVT, Res,
39185 DAG.getTargetConstant(PermuteImm, DL, MVT::i8));
39186 return DAG.getBitcast(RootVT, Res);
39187 }
39188 }
39189
39190 // Attempt to combine to INSERTPS, but only if the inserted element has come
39191 // from a scalar.
39192 // TODO: Handle other insertions here as well?
39193 if (!UnaryShuffle && AllowFloatDomain && RootSizeInBits == 128 &&
39194 Subtarget.hasSSE41() &&
39195 !isTargetShuffleEquivalent(MaskVT, Mask, {4, 1, 2, 3}, DAG)) {
39196 if (MaskEltSizeInBits == 32) {
39197 SDValue SrcV1 = V1, SrcV2 = V2;
39198 if (matchShuffleAsInsertPS(SrcV1, SrcV2, PermuteImm, Zeroable, Mask,
39199 DAG) &&
39200 SrcV2.getOpcode() == ISD::SCALAR_TO_VECTOR) {
39201 if (Depth == 0 && Root.getOpcode() == X86ISD::INSERTPS)
39202 return SDValue(); // Nothing to do!
39203 Res = DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32,
39204 CanonicalizeShuffleInput(MVT::v4f32, SrcV1),
39205 CanonicalizeShuffleInput(MVT::v4f32, SrcV2),
39206 DAG.getTargetConstant(PermuteImm, DL, MVT::i8));
39207 return DAG.getBitcast(RootVT, Res);
39208 }
39209 }
39210 if (MaskEltSizeInBits == 64 &&
39211 isTargetShuffleEquivalent(MaskVT, Mask, {0, 2}, DAG) &&
39212 V2.getOpcode() == ISD::SCALAR_TO_VECTOR &&
39213 V2.getScalarValueSizeInBits() <= 32) {
39214 if (Depth == 0 && Root.getOpcode() == X86ISD::INSERTPS)
39215 return SDValue(); // Nothing to do!
39216 PermuteImm = (/*DstIdx*/ 2 << 4) | (/*SrcIdx*/ 0 << 0);
39217 Res = DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32,
39218 CanonicalizeShuffleInput(MVT::v4f32, V1),
39219 CanonicalizeShuffleInput(MVT::v4f32, V2),
39220 DAG.getTargetConstant(PermuteImm, DL, MVT::i8));
39221 return DAG.getBitcast(RootVT, Res);
39222 }
39223 }
39224
39225 SDValue NewV1 = V1; // Save operands in case early exit happens.
39226 SDValue NewV2 = V2;
39227 if (matchBinaryShuffle(MaskVT, Mask, AllowFloatDomain, AllowIntDomain, NewV1,
39228 NewV2, DL, DAG, Subtarget, Shuffle, ShuffleSrcVT,
39229 ShuffleVT, UnaryShuffle) &&
39230 (!IsMaskedShuffle || (NumRootElts == ShuffleVT.getVectorNumElements()))) {
39231 if (Depth == 0 && Root.getOpcode() == Shuffle)
39232 return SDValue(); // Nothing to do!
39233 NewV1 = CanonicalizeShuffleInput(ShuffleSrcVT, NewV1);
39234 NewV2 = CanonicalizeShuffleInput(ShuffleSrcVT, NewV2);
39235 Res = DAG.getNode(Shuffle, DL, ShuffleVT, NewV1, NewV2);
39236 return DAG.getBitcast(RootVT, Res);
39237 }
39238
39239 NewV1 = V1; // Save operands in case early exit happens.
39240 NewV2 = V2;
39241 if (matchBinaryPermuteShuffle(MaskVT, Mask, Zeroable, AllowFloatDomain,
39242 AllowIntDomain, NewV1, NewV2, DL, DAG,
39243 Subtarget, Shuffle, ShuffleVT, PermuteImm) &&
39244 (!IsMaskedShuffle || (NumRootElts == ShuffleVT.getVectorNumElements()))) {
39245 if (Depth == 0 && Root.getOpcode() == Shuffle)
39246 return SDValue(); // Nothing to do!
39247 NewV1 = CanonicalizeShuffleInput(ShuffleVT, NewV1);
39248 NewV2 = CanonicalizeShuffleInput(ShuffleVT, NewV2);
39249 Res = DAG.getNode(Shuffle, DL, ShuffleVT, NewV1, NewV2,
39250 DAG.getTargetConstant(PermuteImm, DL, MVT::i8));
39251 return DAG.getBitcast(RootVT, Res);
39252 }
39253
39254 // Typically from here on, we need an integer version of MaskVT.
39255 MVT IntMaskVT = MVT::getIntegerVT(MaskEltSizeInBits);
39256 IntMaskVT = MVT::getVectorVT(IntMaskVT, NumMaskElts);
39257
39258 // Annoyingly, SSE4A instructions don't map into the above match helpers.
39259 if (Subtarget.hasSSE4A() && AllowIntDomain && RootSizeInBits == 128) {
39260 uint64_t BitLen, BitIdx;
39261 if (matchShuffleAsEXTRQ(IntMaskVT, V1, V2, Mask, BitLen, BitIdx,
39262 Zeroable)) {
39263 if (Depth == 0 && Root.getOpcode() == X86ISD::EXTRQI)
39264 return SDValue(); // Nothing to do!
39265 V1 = CanonicalizeShuffleInput(IntMaskVT, V1);
39266 Res = DAG.getNode(X86ISD::EXTRQI, DL, IntMaskVT, V1,
39267 DAG.getTargetConstant(BitLen, DL, MVT::i8),
39268 DAG.getTargetConstant(BitIdx, DL, MVT::i8));
39269 return DAG.getBitcast(RootVT, Res);
39270 }
39271
39272 if (matchShuffleAsINSERTQ(IntMaskVT, V1, V2, Mask, BitLen, BitIdx)) {
39273 if (Depth == 0 && Root.getOpcode() == X86ISD::INSERTQI)
39274 return SDValue(); // Nothing to do!
39275 V1 = CanonicalizeShuffleInput(IntMaskVT, V1);
39276 V2 = CanonicalizeShuffleInput(IntMaskVT, V2);
39277 Res = DAG.getNode(X86ISD::INSERTQI, DL, IntMaskVT, V1, V2,
39278 DAG.getTargetConstant(BitLen, DL, MVT::i8),
39279 DAG.getTargetConstant(BitIdx, DL, MVT::i8));
39280 return DAG.getBitcast(RootVT, Res);
39281 }
39282 }
39283
39284 // Match shuffle against TRUNCATE patterns.
39285 if (AllowIntDomain && MaskEltSizeInBits < 64 && Subtarget.hasAVX512()) {
39286 // Match against a VTRUNC instruction, accounting for src/dst sizes.
39287 if (matchShuffleAsVTRUNC(ShuffleSrcVT, ShuffleVT, IntMaskVT, Mask, Zeroable,
39288 Subtarget)) {
39289 bool IsTRUNCATE = ShuffleVT.getVectorNumElements() ==
39290 ShuffleSrcVT.getVectorNumElements();
39291 unsigned Opc =
39292 IsTRUNCATE ? (unsigned)ISD::TRUNCATE : (unsigned)X86ISD::VTRUNC;
39293 if (Depth == 0 && Root.getOpcode() == Opc)
39294 return SDValue(); // Nothing to do!
39295 V1 = CanonicalizeShuffleInput(ShuffleSrcVT, V1);
39296 Res = DAG.getNode(Opc, DL, ShuffleVT, V1);
39297 if (ShuffleVT.getSizeInBits() < RootSizeInBits)
39298 Res = widenSubVector(Res, true, Subtarget, DAG, DL, RootSizeInBits);
39299 return DAG.getBitcast(RootVT, Res);
39300 }
39301
39302 // Do we need a more general binary truncation pattern?
39303 if (RootSizeInBits < 512 &&
39304 ((RootVT.is256BitVector() && Subtarget.useAVX512Regs()) ||
39305 (RootVT.is128BitVector() && Subtarget.hasVLX())) &&
39306 (MaskEltSizeInBits > 8 || Subtarget.hasBWI()) &&
39307 isSequentialOrUndefInRange(Mask, 0, NumMaskElts, 0, 2)) {
39308 // Bail if this was already a truncation or PACK node.
39309 // We sometimes fail to match PACK if we demand known undef elements.
39310 if (Depth == 0 && (Root.getOpcode() == ISD::TRUNCATE ||
39311 Root.getOpcode() == X86ISD::PACKSS ||
39312 Root.getOpcode() == X86ISD::PACKUS))
39313 return SDValue(); // Nothing to do!
39314 ShuffleSrcVT = MVT::getIntegerVT(MaskEltSizeInBits * 2);
39315 ShuffleSrcVT = MVT::getVectorVT(ShuffleSrcVT, NumMaskElts / 2);
39316 V1 = CanonicalizeShuffleInput(ShuffleSrcVT, V1);
39317 V2 = CanonicalizeShuffleInput(ShuffleSrcVT, V2);
39318 ShuffleSrcVT = MVT::getIntegerVT(MaskEltSizeInBits * 2);
39319 ShuffleSrcVT = MVT::getVectorVT(ShuffleSrcVT, NumMaskElts);
39320 Res = DAG.getNode(ISD::CONCAT_VECTORS, DL, ShuffleSrcVT, V1, V2);
39321 Res = DAG.getNode(ISD::TRUNCATE, DL, IntMaskVT, Res);
39322 return DAG.getBitcast(RootVT, Res);
39323 }
39324 }
39325
39326 // Don't try to re-form single instruction chains under any circumstances now
39327 // that we've done encoding canonicalization for them.
39328 if (Depth < 1)
39329 return SDValue();
39330
39331 // Depth threshold above which we can efficiently use variable mask shuffles.
39332 int VariableCrossLaneShuffleDepth =
39333 Subtarget.hasFastVariableCrossLaneShuffle() ? 1 : 2;
39334 int VariablePerLaneShuffleDepth =
39335 Subtarget.hasFastVariablePerLaneShuffle() ? 1 : 2;
39336 AllowVariableCrossLaneMask &=
39337 (Depth >= VariableCrossLaneShuffleDepth) || HasVariableMask;
39338 AllowVariablePerLaneMask &=
39339 (Depth >= VariablePerLaneShuffleDepth) || HasVariableMask;
39340 // VPERMI2W/VPERMI2B are 3 uops on Skylake and Icelake so we require a
39341 // higher depth before combining them.
39342 bool AllowBWIVPERMV3 =
39343 (Depth >= (VariableCrossLaneShuffleDepth + 2) || HasVariableMask);
39344
39345 bool MaskContainsZeros = isAnyZero(Mask);
39346
39347 if (is128BitLaneCrossingShuffleMask(MaskVT, Mask)) {
39348 // If we have a single input lane-crossing shuffle then lower to VPERMV.
39349 if (UnaryShuffle && AllowVariableCrossLaneMask && !MaskContainsZeros) {
39350 if (Subtarget.hasAVX2() &&
39351 (MaskVT == MVT::v8f32 || MaskVT == MVT::v8i32)) {
39352 SDValue VPermMask = getConstVector(Mask, IntMaskVT, DAG, DL, true);
39353 Res = CanonicalizeShuffleInput(MaskVT, V1);
39354 Res = DAG.getNode(X86ISD::VPERMV, DL, MaskVT, VPermMask, Res);
39355 return DAG.getBitcast(RootVT, Res);
39356 }
39357 // AVX512 variants (non-VLX will pad to 512-bit shuffles).
39358 if ((Subtarget.hasAVX512() &&
39359 (MaskVT == MVT::v8f64 || MaskVT == MVT::v8i64 ||
39360 MaskVT == MVT::v16f32 || MaskVT == MVT::v16i32)) ||
39361 (Subtarget.hasBWI() &&
39362 (MaskVT == MVT::v16i16 || MaskVT == MVT::v32i16)) ||
39363 (Subtarget.hasVBMI() &&
39364 (MaskVT == MVT::v32i8 || MaskVT == MVT::v64i8))) {
39365 V1 = CanonicalizeShuffleInput(MaskVT, V1);
39366 V2 = DAG.getUNDEF(MaskVT);
39367 Res = lowerShuffleWithPERMV(DL, MaskVT, Mask, V1, V2, Subtarget, DAG);
39368 return DAG.getBitcast(RootVT, Res);
39369 }
39370 }
39371
39372 // Lower a unary+zero lane-crossing shuffle as VPERMV3 with a zero
39373 // vector as the second source (non-VLX will pad to 512-bit shuffles).
39374 if (UnaryShuffle && AllowVariableCrossLaneMask &&
39375 ((Subtarget.hasAVX512() &&
39376 (MaskVT == MVT::v8f64 || MaskVT == MVT::v8i64 ||
39377 MaskVT == MVT::v4f64 || MaskVT == MVT::v4i64 ||
39378 MaskVT == MVT::v8f32 || MaskVT == MVT::v8i32 ||
39379 MaskVT == MVT::v16f32 || MaskVT == MVT::v16i32)) ||
39380 (Subtarget.hasBWI() && AllowBWIVPERMV3 &&
39381 (MaskVT == MVT::v16i16 || MaskVT == MVT::v32i16)) ||
39382 (Subtarget.hasVBMI() && AllowBWIVPERMV3 &&
39383 (MaskVT == MVT::v32i8 || MaskVT == MVT::v64i8)))) {
39384 // Adjust shuffle mask - replace SM_SentinelZero with second source index.
39385 for (unsigned i = 0; i != NumMaskElts; ++i)
39386 if (Mask[i] == SM_SentinelZero)
39387 Mask[i] = NumMaskElts + i;
39388 V1 = CanonicalizeShuffleInput(MaskVT, V1);
39389 V2 = getZeroVector(MaskVT, Subtarget, DAG, DL);
39390 Res = lowerShuffleWithPERMV(DL, MaskVT, Mask, V1, V2, Subtarget, DAG);
39391 return DAG.getBitcast(RootVT, Res);
39392 }
39393
39394 // If that failed and either input is extracted then try to combine as a
39395 // shuffle with the larger type.
39396 if (SDValue WideShuffle = combineX86ShuffleChainWithExtract(
39397 Inputs, Root, BaseMask, Depth, HasVariableMask,
39398 AllowVariableCrossLaneMask, AllowVariablePerLaneMask, DAG,
39399 Subtarget))
39400 return WideShuffle;
39401
39402 // If we have a dual input lane-crossing shuffle then lower to VPERMV3,
39403 // (non-VLX will pad to 512-bit shuffles).
39404 if (AllowVariableCrossLaneMask && !MaskContainsZeros &&
39405 ((Subtarget.hasAVX512() &&
39406 (MaskVT == MVT::v8f64 || MaskVT == MVT::v8i64 ||
39407 MaskVT == MVT::v4f64 || MaskVT == MVT::v4i64 ||
39408 MaskVT == MVT::v16f32 || MaskVT == MVT::v16i32 ||
39409 MaskVT == MVT::v8f32 || MaskVT == MVT::v8i32)) ||
39410 (Subtarget.hasBWI() && AllowBWIVPERMV3 &&
39411 (MaskVT == MVT::v16i16 || MaskVT == MVT::v32i16)) ||
39412 (Subtarget.hasVBMI() && AllowBWIVPERMV3 &&
39413 (MaskVT == MVT::v32i8 || MaskVT == MVT::v64i8)))) {
39414 V1 = CanonicalizeShuffleInput(MaskVT, V1);
39415 V2 = CanonicalizeShuffleInput(MaskVT, V2);
39416 Res = lowerShuffleWithPERMV(DL, MaskVT, Mask, V1, V2, Subtarget, DAG);
39417 return DAG.getBitcast(RootVT, Res);
39418 }
39419 return SDValue();
39420 }
39421
39422 // See if we can combine a single input shuffle with zeros to a bit-mask,
39423 // which is much simpler than any shuffle.
39424 if (UnaryShuffle && MaskContainsZeros && AllowVariablePerLaneMask &&
39425 isSequentialOrUndefOrZeroInRange(Mask, 0, NumMaskElts, 0) &&
39426 DAG.getTargetLoweringInfo().isTypeLegal(MaskVT)) {
39427 APInt Zero = APInt::getZero(MaskEltSizeInBits);
39428 APInt AllOnes = APInt::getAllOnes(MaskEltSizeInBits);
39429 APInt UndefElts(NumMaskElts, 0);
39430 SmallVector<APInt, 64> EltBits(NumMaskElts, Zero);
39431 for (unsigned i = 0; i != NumMaskElts; ++i) {
39432 int M = Mask[i];
39433 if (M == SM_SentinelUndef) {
39434 UndefElts.setBit(i);
39435 continue;
39436 }
39437 if (M == SM_SentinelZero)
39438 continue;
39439 EltBits[i] = AllOnes;
39440 }
39441 SDValue BitMask = getConstVector(EltBits, UndefElts, MaskVT, DAG, DL);
39442 Res = CanonicalizeShuffleInput(MaskVT, V1);
39443 unsigned AndOpcode =
39444 MaskVT.isFloatingPoint() ? unsigned(X86ISD::FAND) : unsigned(ISD::AND);
39445 Res = DAG.getNode(AndOpcode, DL, MaskVT, Res, BitMask);
39446 return DAG.getBitcast(RootVT, Res);
39447 }
39448
39449 // If we have a single input shuffle with different shuffle patterns in the
39450 // 128-bit lanes, use the variable mask form of VPERMILPS.
39451 // TODO: Combine other mask types at higher depths.
39452 if (UnaryShuffle && AllowVariablePerLaneMask && !MaskContainsZeros &&
39453 ((MaskVT == MVT::v8f32 && Subtarget.hasAVX()) ||
39454 (MaskVT == MVT::v16f32 && Subtarget.hasAVX512()))) {
39455 SmallVector<SDValue, 16> VPermIdx;
39456 for (int M : Mask) {
39457 SDValue Idx =
39458 M < 0 ? DAG.getUNDEF(MVT::i32) : DAG.getConstant(M % 4, DL, MVT::i32);
39459 VPermIdx.push_back(Idx);
39460 }
39461 SDValue VPermMask = DAG.getBuildVector(IntMaskVT, DL, VPermIdx);
39462 Res = CanonicalizeShuffleInput(MaskVT, V1);
39463 Res = DAG.getNode(X86ISD::VPERMILPV, DL, MaskVT, Res, VPermMask);
39464 return DAG.getBitcast(RootVT, Res);
39465 }
39466
39467 // With XOP, binary shuffles of 128/256-bit floating point vectors can combine
39468 // to VPERMIL2PD/VPERMIL2PS.
39469 if (AllowVariablePerLaneMask && Subtarget.hasXOP() &&
39470 (MaskVT == MVT::v2f64 || MaskVT == MVT::v4f64 || MaskVT == MVT::v4f32 ||
39471 MaskVT == MVT::v8f32)) {
39472 // VPERMIL2 Operation.
39473 // Bits[3] - Match Bit.
39474 // Bits[2:1] - (Per Lane) PD Shuffle Mask.
39475 // Bits[2:0] - (Per Lane) PS Shuffle Mask.
39476 unsigned NumLanes = MaskVT.getSizeInBits() / 128;
39477 unsigned NumEltsPerLane = NumMaskElts / NumLanes;
39478 SmallVector<int, 8> VPerm2Idx;
39479 unsigned M2ZImm = 0;
39480 for (int M : Mask) {
39481 if (M == SM_SentinelUndef) {
39482 VPerm2Idx.push_back(-1);
39483 continue;
39484 }
39485 if (M == SM_SentinelZero) {
39486 M2ZImm = 2;
39487 VPerm2Idx.push_back(8);
39488 continue;
39489 }
39490 int Index = (M % NumEltsPerLane) + ((M / NumMaskElts) * NumEltsPerLane);
39491 Index = (MaskVT.getScalarSizeInBits() == 64 ? Index << 1 : Index);
39492 VPerm2Idx.push_back(Index);
39493 }
39494 V1 = CanonicalizeShuffleInput(MaskVT, V1);
39495 V2 = CanonicalizeShuffleInput(MaskVT, V2);
39496 SDValue VPerm2MaskOp = getConstVector(VPerm2Idx, IntMaskVT, DAG, DL, true);
39497 Res = DAG.getNode(X86ISD::VPERMIL2, DL, MaskVT, V1, V2, VPerm2MaskOp,
39498 DAG.getTargetConstant(M2ZImm, DL, MVT::i8));
39499 return DAG.getBitcast(RootVT, Res);
39500 }
39501
39502 // If we have 3 or more shuffle instructions or a chain involving a variable
39503 // mask, we can replace them with a single PSHUFB instruction profitably.
39504 // Intel's manuals suggest only using PSHUFB if doing so replaces 5
39505 // instructions, but in practice PSHUFB tends to be *very* fast so we're
39506 // more aggressive.
39507 if (UnaryShuffle && AllowVariablePerLaneMask &&
39508 ((RootVT.is128BitVector() && Subtarget.hasSSSE3()) ||
39509 (RootVT.is256BitVector() && Subtarget.hasAVX2()) ||
39510 (RootVT.is512BitVector() && Subtarget.hasBWI()))) {
39511 SmallVector<SDValue, 16> PSHUFBMask;
39512 int NumBytes = RootVT.getSizeInBits() / 8;
39513 int Ratio = NumBytes / NumMaskElts;
39514 for (int i = 0; i < NumBytes; ++i) {
39515 int M = Mask[i / Ratio];
39516 if (M == SM_SentinelUndef) {
39517 PSHUFBMask.push_back(DAG.getUNDEF(MVT::i8));
39518 continue;
39519 }
39520 if (M == SM_SentinelZero) {
39521 PSHUFBMask.push_back(DAG.getConstant(0x80, DL, MVT::i8));
39522 continue;
39523 }
39524 M = Ratio * M + i % Ratio;
39525 assert((M / 16) == (i / 16) && "Lane crossing detected");
39526 PSHUFBMask.push_back(DAG.getConstant(M, DL, MVT::i8));
39527 }
39528 MVT ByteVT = MVT::getVectorVT(MVT::i8, NumBytes);
39529 Res = CanonicalizeShuffleInput(ByteVT, V1);
39530 SDValue PSHUFBMaskOp = DAG.getBuildVector(ByteVT, DL, PSHUFBMask);
39531 Res = DAG.getNode(X86ISD::PSHUFB, DL, ByteVT, Res, PSHUFBMaskOp);
39532 return DAG.getBitcast(RootVT, Res);
39533 }
39534
39535 // With XOP, if we have a 128-bit binary input shuffle we can always combine
39536 // to VPPERM. We match the depth requirement of PSHUFB - VPPERM is never
39537 // slower than PSHUFB on targets that support both.
39538 if (AllowVariablePerLaneMask && RootVT.is128BitVector() &&
39539 Subtarget.hasXOP()) {
39540 // VPPERM Mask Operation
39541 // Bits[4:0] - Byte Index (0 - 31)
39542 // Bits[7:5] - Permute Operation (0 - Source byte, 4 - ZERO)
39543 SmallVector<SDValue, 16> VPPERMMask;
39544 int NumBytes = 16;
39545 int Ratio = NumBytes / NumMaskElts;
39546 for (int i = 0; i < NumBytes; ++i) {
39547 int M = Mask[i / Ratio];
39548 if (M == SM_SentinelUndef) {
39549 VPPERMMask.push_back(DAG.getUNDEF(MVT::i8));
39550 continue;
39551 }
39552 if (M == SM_SentinelZero) {
39553 VPPERMMask.push_back(DAG.getConstant(0x80, DL, MVT::i8));
39554 continue;
39555 }
39556 M = Ratio * M + i % Ratio;
39557 VPPERMMask.push_back(DAG.getConstant(M, DL, MVT::i8));
39558 }
39559 MVT ByteVT = MVT::v16i8;
39560 V1 = CanonicalizeShuffleInput(ByteVT, V1);
39561 V2 = CanonicalizeShuffleInput(ByteVT, V2);
39562 SDValue VPPERMMaskOp = DAG.getBuildVector(ByteVT, DL, VPPERMMask);
39563 Res = DAG.getNode(X86ISD::VPPERM, DL, ByteVT, V1, V2, VPPERMMaskOp);
39564 return DAG.getBitcast(RootVT, Res);
39565 }
39566
39567 // If that failed and either input is extracted then try to combine as a
39568 // shuffle with the larger type.
39569 if (SDValue WideShuffle = combineX86ShuffleChainWithExtract(
39570 Inputs, Root, BaseMask, Depth, HasVariableMask,
39571 AllowVariableCrossLaneMask, AllowVariablePerLaneMask, DAG, Subtarget))
39572 return WideShuffle;
39573
39574 // If we have a dual input shuffle then lower to VPERMV3,
39575 // (non-VLX will pad to 512-bit shuffles)
39576 if (!UnaryShuffle && AllowVariablePerLaneMask && !MaskContainsZeros &&
39577 ((Subtarget.hasAVX512() &&
39578 (MaskVT == MVT::v2f64 || MaskVT == MVT::v4f64 || MaskVT == MVT::v8f64 ||
39579 MaskVT == MVT::v2i64 || MaskVT == MVT::v4i64 || MaskVT == MVT::v8i64 ||
39580 MaskVT == MVT::v4f32 || MaskVT == MVT::v4i32 || MaskVT == MVT::v8f32 ||
39581 MaskVT == MVT::v8i32 || MaskVT == MVT::v16f32 ||
39582 MaskVT == MVT::v16i32)) ||
39583 (Subtarget.hasBWI() && AllowBWIVPERMV3 &&
39584 (MaskVT == MVT::v8i16 || MaskVT == MVT::v16i16 ||
39585 MaskVT == MVT::v32i16)) ||
39586 (Subtarget.hasVBMI() && AllowBWIVPERMV3 &&
39587 (MaskVT == MVT::v16i8 || MaskVT == MVT::v32i8 ||
39588 MaskVT == MVT::v64i8)))) {
39589 V1 = CanonicalizeShuffleInput(MaskVT, V1);
39590 V2 = CanonicalizeShuffleInput(MaskVT, V2);
39591 Res = lowerShuffleWithPERMV(DL, MaskVT, Mask, V1, V2, Subtarget, DAG);
39592 return DAG.getBitcast(RootVT, Res);
39593 }
39594
39595 // Failed to find any combines.
39596 return SDValue();
39597}
39598
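As a standalone aside (not part of the listing), the PSHUFB/VPPERM lowering above expands each element of a wider shuffle mask into Ratio byte selectors, using 0x80 to request a zeroed byte. A minimal sketch of that expansion, with local sentinel stand-ins and a hypothetical 4 x 32-bit mask:

#include <cassert>
#include <vector>

int main() {
  const int Undef = -1, Zero = -2;   // local stand-ins for the SM_ sentinels
  int Mask[4] = {2, Zero, 0, Undef}; // hypothetical 4 x 32-bit shuffle mask
  const int NumBytes = 16, Ratio = NumBytes / 4;
  std::vector<int> ByteMask;
  for (int i = 0; i < NumBytes; ++i) {
    int M = Mask[i / Ratio];
    if (M == Undef) { ByteMask.push_back(Undef); continue; }
    if (M == Zero)  { ByteMask.push_back(0x80);  continue; } // zero the byte
    ByteMask.push_back(Ratio * M + i % Ratio); // byte index into the source
  }
  // Element 0 selects dword 2, i.e. source bytes 8..11.
  assert(ByteMask[0] == 8 && ByteMask[3] == 11);
  // Element 1 is zeroed, so all four of its byte selectors are 0x80.
  assert(ByteMask[4] == 0x80 && ByteMask[7] == 0x80);
  return 0;
}
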
39599// Combine an arbitrary chain of shuffles + extract_subvectors into a single
39600// instruction if possible.
39601//
39602// Wrapper for combineX86ShuffleChain that extends the shuffle mask to a larger
39603// type size to attempt to combine:
39604// shuffle(extract_subvector(x,c1),extract_subvector(y,c2),m1)
39605// -->
39606// extract_subvector(shuffle(x,y,m2),0)
39607static SDValue combineX86ShuffleChainWithExtract(
39608 ArrayRef<SDValue> Inputs, SDValue Root, ArrayRef<int> BaseMask, int Depth,
39609 bool HasVariableMask, bool AllowVariableCrossLaneMask,
39610 bool AllowVariablePerLaneMask, SelectionDAG &DAG,
39611 const X86Subtarget &Subtarget) {
39612 unsigned NumMaskElts = BaseMask.size();
39613 unsigned NumInputs = Inputs.size();
39614 if (NumInputs == 0)
39615 return SDValue();
39616
39617 EVT RootVT = Root.getValueType();
39618 unsigned RootSizeInBits = RootVT.getSizeInBits();
39619 assert((RootSizeInBits % NumMaskElts) == 0 && "Unexpected root shuffle mask");
39620
39621 // Bail if we have any smaller inputs.
39622 if (llvm::any_of(Inputs, [RootSizeInBits](SDValue Input) {
39623 return Input.getValueSizeInBits() < RootSizeInBits;
39624 }))
39625 return SDValue();
39626
39627 SmallVector<SDValue, 4> WideInputs(Inputs.begin(), Inputs.end());
39628 SmallVector<unsigned, 4> Offsets(NumInputs, 0);
39629
39630 // Peek through subvectors.
39631 // TODO: Support inter-mixed EXTRACT_SUBVECTORs + BITCASTs?
39632 unsigned WideSizeInBits = RootSizeInBits;
39633 for (unsigned i = 0; i != NumInputs; ++i) {
39634 SDValue &Src = WideInputs[i];
39635 unsigned &Offset = Offsets[i];
39636 Src = peekThroughBitcasts(Src);
39637 EVT BaseVT = Src.getValueType();
39638 while (Src.getOpcode() == ISD::EXTRACT_SUBVECTOR) {
39639 Offset += Src.getConstantOperandVal(1);
39640 Src = Src.getOperand(0);
39641 }
39642 WideSizeInBits = std::max(WideSizeInBits,
39643 (unsigned)Src.getValueSizeInBits());
39644 assert((Offset % BaseVT.getVectorNumElements()) == 0 &&
39645 "Unexpected subvector extraction");
39646 Offset /= BaseVT.getVectorNumElements();
39647 Offset *= NumMaskElts;
39648 }
39649
39650 // Bail if we're always extracting from the lowest subvectors;
39651 // combineX86ShuffleChain should match this for the current width.
39652 if (llvm::all_of(Offsets, [](unsigned Offset) { return Offset == 0; }))
39653 return SDValue();
39654
39655 unsigned Scale = WideSizeInBits / RootSizeInBits;
39656 assert((WideSizeInBits % RootSizeInBits) == 0 &&
39657 "Unexpected subvector extraction");
39658
39659 // If the src vector types aren't the same, see if we can extend
39660 // them to match each other.
39661 // TODO: Support different scalar types?
39662 EVT WideSVT = WideInputs[0].getValueType().getScalarType();
39663 if (llvm::any_of(WideInputs, [&WideSVT, &DAG](SDValue Op) {
39664 return !DAG.getTargetLoweringInfo().isTypeLegal(Op.getValueType()) ||
39665 Op.getValueType().getScalarType() != WideSVT;
39666 }))
39667 return SDValue();
39668
39669 // Create new mask for larger type.
39670 for (unsigned i = 1; i != NumInputs; ++i)
39671 Offsets[i] += i * Scale * NumMaskElts;
39672
39673 SmallVector<int, 64> WideMask(BaseMask);
39674 for (int &M : WideMask) {
39675 if (M < 0)
39676 continue;
39677 M = (M % NumMaskElts) + Offsets[M / NumMaskElts];
39678 }
39679 WideMask.append((Scale - 1) * NumMaskElts, SM_SentinelUndef);
39680
39681 // Remove unused/repeated shuffle source ops.
39682 resolveTargetShuffleInputsAndMask(WideInputs, WideMask);
39683 assert(!WideInputs.empty() && "Shuffle with no inputs detected");
39684
39685 if (WideInputs.size() > 2)
39686 return SDValue();
39687
39688 // Increase depth for every upper subvector we've peeked through.
39689 Depth += count_if(Offsets, [](unsigned Offset) { return Offset > 0; });
39690
39691 // Attempt to combine wider chain.
39692 // TODO: Can we use a better Root?
39693 SDValue WideRoot = WideInputs.front().getValueSizeInBits() >
39694 WideInputs.back().getValueSizeInBits()
39695 ? WideInputs.front()
39696 : WideInputs.back();
39697 if (SDValue WideShuffle =
39698 combineX86ShuffleChain(WideInputs, WideRoot, WideMask, Depth,
39699 HasVariableMask, AllowVariableCrossLaneMask,
39700 AllowVariablePerLaneMask, DAG, Subtarget)) {
39701 WideShuffle =
39702 extractSubVector(WideShuffle, 0, DAG, SDLoc(Root), RootSizeInBits);
39703 return DAG.getBitcast(RootVT, WideShuffle);
39704 }
39705 return SDValue();
39706}
39707
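As an illustrative aside (not part of the listing), the wrapper above rebases each narrow mask element onto the widened inputs with M = (M % NumMaskElts) + Offsets[M / NumMaskElts]. A small numeric sketch, with hypothetical values chosen only to show the arithmetic:

#include <cassert>

int main() {
  const int NumMaskElts = 4; // width of the narrow shuffle mask
  // Hypothetical per-input offsets after peeking through extract_subvector:
  // both inputs came from the upper half of 8-element vectors, and the
  // second input is additionally shifted by Scale * NumMaskElts.
  int Offsets[2] = {4, 12};
  int M = 6; // narrow mask: element 2 taken from input 1
  int Wide = (M % NumMaskElts) + Offsets[M / NumMaskElts];
  assert(Wide == 14); // element 2 of input 1, rebased onto the wide inputs
  return 0;
}
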
39708// Canonicalize the combined shuffle mask chain with horizontal ops.
39709// NOTE: This may update the Ops and Mask.
39710static SDValue canonicalizeShuffleMaskWithHorizOp(
39711 MutableArrayRef<SDValue> Ops, MutableArrayRef<int> Mask,
39712 unsigned RootSizeInBits, const SDLoc &DL, SelectionDAG &DAG,
39713 const X86Subtarget &Subtarget) {
39714 if (Mask.empty() || Ops.empty())
39715 return SDValue();
39716
39717 SmallVector<SDValue> BC;
39718 for (SDValue Op : Ops)
39719 BC.push_back(peekThroughBitcasts(Op));
39720
39721 // All ops must be the same horizop + type.
39722 SDValue BC0 = BC[0];
39723 EVT VT0 = BC0.getValueType();
39724 unsigned Opcode0 = BC0.getOpcode();
39725 if (VT0.getSizeInBits() != RootSizeInBits || llvm::any_of(BC, [&](SDValue V) {
39726 return V.getOpcode() != Opcode0 || V.getValueType() != VT0;
39727 }))
39728 return SDValue();
39729
39730 bool isHoriz = (Opcode0 == X86ISD::FHADD || Opcode0 == X86ISD::HADD ||
39731 Opcode0 == X86ISD::FHSUB || Opcode0 == X86ISD::HSUB);
39732 bool isPack = (Opcode0 == X86ISD::PACKSS || Opcode0 == X86ISD::PACKUS);
39733 if (!isHoriz && !isPack)
39734 return SDValue();
39735
39736 // Do all ops have a single use?
39737 bool OneUseOps = llvm::all_of(Ops, [](SDValue Op) {
39738 return Op.hasOneUse() &&
39739 peekThroughBitcasts(Op) == peekThroughOneUseBitcasts(Op);
39740 });
39741
39742 int NumElts = VT0.getVectorNumElements();
39743 int NumLanes = VT0.getSizeInBits() / 128;
39744 int NumEltsPerLane = NumElts / NumLanes;
39745 int NumHalfEltsPerLane = NumEltsPerLane / 2;
39746 MVT SrcVT = BC0.getOperand(0).getSimpleValueType();
39747 unsigned EltSizeInBits = RootSizeInBits / Mask.size();
39748
39749 if (NumEltsPerLane >= 4 &&
39750 (isPack || shouldUseHorizontalOp(Ops.size() == 1, DAG, Subtarget))) {
39751 SmallVector<int> LaneMask, ScaledMask;
39752 if (isRepeatedTargetShuffleMask(128, EltSizeInBits, Mask, LaneMask) &&
39753 scaleShuffleElements(LaneMask, 4, ScaledMask)) {
39754 // See if we can remove the shuffle by resorting the HOP chain so that
39755 // the HOP args are pre-shuffled.
39756 // TODO: Generalize to any sized/depth chain.
39757 // TODO: Add support for PACKSS/PACKUS.
39758 if (isHoriz) {
39759 // Attempt to find a HOP(HOP(X,Y),HOP(Z,W)) source operand.
39760 auto GetHOpSrc = [&](int M) {
39761 if (M == SM_SentinelUndef)
39762 return DAG.getUNDEF(VT0);
39763 if (M == SM_SentinelZero)
39764 return getZeroVector(VT0.getSimpleVT(), Subtarget, DAG, DL);
39765 SDValue Src0 = BC[M / 4];
39766 SDValue Src1 = Src0.getOperand((M % 4) >= 2);
39767 if (Src1.getOpcode() == Opcode0 && Src0->isOnlyUserOf(Src1.getNode()))
39768 return Src1.getOperand(M % 2);
39769 return SDValue();
39770 };
39771 SDValue M0 = GetHOpSrc(ScaledMask[0]);
39772 SDValue M1 = GetHOpSrc(ScaledMask[1]);
39773 SDValue M2 = GetHOpSrc(ScaledMask[2]);
39774 SDValue M3 = GetHOpSrc(ScaledMask[3]);
39775 if (M0 && M1 && M2 && M3) {
39776 SDValue LHS = DAG.getNode(Opcode0, DL, SrcVT, M0, M1);
39777 SDValue RHS = DAG.getNode(Opcode0, DL, SrcVT, M2, M3);
39778 return DAG.getNode(Opcode0, DL, VT0, LHS, RHS);
39779 }
39780 }
39781 // shuffle(hop(x,y),hop(z,w)) -> permute(hop(x,z)) etc.
39782 if (Ops.size() >= 2) {
39783 SDValue LHS, RHS;
39784 auto GetHOpSrc = [&](int M, int &OutM) {
39785 // TODO: Support SM_SentinelZero
39786 if (M < 0)
39787 return M == SM_SentinelUndef;
39788 SDValue Src = BC[M / 4].getOperand((M % 4) >= 2);
39789 if (!LHS || LHS == Src) {
39790 LHS = Src;
39791 OutM = (M % 2);
39792 return true;
39793 }
39794 if (!RHS || RHS == Src) {
39795 RHS = Src;
39796 OutM = (M % 2) + 2;
39797 return true;
39798 }
39799 return false;
39800 };
39801 int PostMask[4] = {-1, -1, -1, -1};
39802 if (GetHOpSrc(ScaledMask[0], PostMask[0]) &&
39803 GetHOpSrc(ScaledMask[1], PostMask[1]) &&
39804 GetHOpSrc(ScaledMask[2], PostMask[2]) &&
39805 GetHOpSrc(ScaledMask[3], PostMask[3])) {
39806 LHS = DAG.getBitcast(SrcVT, LHS);
39807 RHS = DAG.getBitcast(SrcVT, RHS ? RHS : LHS);
39808 SDValue Res = DAG.getNode(Opcode0, DL, VT0, LHS, RHS);
39809 // Use SHUFPS for the permute so this will work on SSE3 targets,
39810 // shuffle combining and domain handling will simplify this later on.
39811 MVT ShuffleVT = MVT::getVectorVT(MVT::f32, RootSizeInBits / 32);
39812 Res = DAG.getBitcast(ShuffleVT, Res);
39813 return DAG.getNode(X86ISD::SHUFP, DL, ShuffleVT, Res, Res,
39814 getV4X86ShuffleImm8ForMask(PostMask, DL, DAG));
39815 }
39816 }
39817 }
39818 }
39819
39820 if (2 < Ops.size())
39821 return SDValue();
39822
39823 SDValue BC1 = BC[BC.size() - 1];
39824 if (Mask.size() == VT0.getVectorNumElements()) {
39825 // Canonicalize binary shuffles of horizontal ops that use the
39826 // same sources to an unary shuffle.
39827 // TODO: Try to perform this fold even if the shuffle remains.
39828 if (Ops.size() == 2) {
39829 auto ContainsOps = [](SDValue HOp, SDValue Op) {
39830 return Op == HOp.getOperand(0) || Op == HOp.getOperand(1);
39831 };
39832 // Commute if all BC0's ops are contained in BC1.
39833 if (ContainsOps(BC1, BC0.getOperand(0)) &&
39834 ContainsOps(BC1, BC0.getOperand(1))) {
39835 ShuffleVectorSDNode::commuteMask(Mask);
39836 std::swap(Ops[0], Ops[1]);
39837 std::swap(BC0, BC1);
39838 }
39839
39840 // If BC1 can be represented by BC0, then convert to unary shuffle.
39841 if (ContainsOps(BC0, BC1.getOperand(0)) &&
39842 ContainsOps(BC0, BC1.getOperand(1))) {
39843 for (int &M : Mask) {
39844 if (M < NumElts) // BC0 element or UNDEF/Zero sentinel.
39845 continue;
39846 int SubLane = ((M % NumEltsPerLane) >= NumHalfEltsPerLane) ? 1 : 0;
39847 M -= NumElts + (SubLane * NumHalfEltsPerLane);
39848 if (BC1.getOperand(SubLane) != BC0.getOperand(0))
39849 M += NumHalfEltsPerLane;
39850 }
39851 }
39852 }
39853
39854 // Canonicalize unary horizontal ops to only refer to lower halves.
39855 for (int i = 0; i != NumElts; ++i) {
39856 int &M = Mask[i];
39857 if (isUndefOrZero(M))
39858 continue;
39859 if (M < NumElts && BC0.getOperand(0) == BC0.getOperand(1) &&
39860 (M % NumEltsPerLane) >= NumHalfEltsPerLane)
39861 M -= NumHalfEltsPerLane;
39862 if (NumElts <= M && BC1.getOperand(0) == BC1.getOperand(1) &&
39863 (M % NumEltsPerLane) >= NumHalfEltsPerLane)
39864 M -= NumHalfEltsPerLane;
39865 }
39866 }
39867
39868 // Combine binary shuffle of 2 similar 'Horizontal' instructions into a
39869 // single instruction. Attempt to match a v2X64 repeating shuffle pattern that
39870 // represents the LHS/RHS inputs for the lower/upper halves.
39871 SmallVector<int, 16> TargetMask128, WideMask128;
39872 if (isRepeatedTargetShuffleMask(128, EltSizeInBits, Mask, TargetMask128) &&
39873 scaleShuffleElements(TargetMask128, 2, WideMask128)) {
39874 assert(isUndefOrZeroOrInRange(WideMask128, 0, 4) && "Illegal shuffle");
39875 bool SingleOp = (Ops.size() == 1);
39876 if (isPack || OneUseOps ||
39877 shouldUseHorizontalOp(SingleOp, DAG, Subtarget)) {
39878 SDValue Lo = isInRange(WideMask128[0], 0, 2) ? BC0 : BC1;
39879 SDValue Hi = isInRange(WideMask128[1], 0, 2) ? BC0 : BC1;
39880 Lo = Lo.getOperand(WideMask128[0] & 1);
39881 Hi = Hi.getOperand(WideMask128[1] & 1);
39882 if (SingleOp) {
39883 SDValue Undef = DAG.getUNDEF(SrcVT);
39884 SDValue Zero = getZeroVector(SrcVT, Subtarget, DAG, DL);
39885 Lo = (WideMask128[0] == SM_SentinelZero ? Zero : Lo);
39886 Hi = (WideMask128[1] == SM_SentinelZero ? Zero : Hi);
39887 Lo = (WideMask128[0] == SM_SentinelUndef ? Undef : Lo);
39888 Hi = (WideMask128[1] == SM_SentinelUndef ? Undef : Hi);
39889 }
39890 return DAG.getNode(Opcode0, DL, VT0, Lo, Hi);
39891 }
39892 }
39893
39894 return SDValue();
39895}
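// Illustrative example (editorial sketch, not in the original file), assuming
// the profitability checks above pass: for shuffle(HADD(a,b), HADD(c,d)) whose
// repeated 128-bit mask scales to WideMask128 = {0, 3}, the lower half takes
// operand (0 & 1) = 0 of BC0 and the upper half takes operand (3 & 1) = 1 of
// BC1, so the whole pattern collapses to a single HADD(a, d).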
39896
39897// Attempt to constant fold all of the constant source ops.
39898// Returns true if the entire shuffle is folded to a constant.
39899// TODO: Extend this to merge multiple constant Ops and update the mask.
39900static SDValue combineX86ShufflesConstants(ArrayRef<SDValue> Ops,
39901 ArrayRef<int> Mask, SDValue Root,
39902 bool HasVariableMask,
39903 SelectionDAG &DAG,
39904 const X86Subtarget &Subtarget) {
39905 MVT VT = Root.getSimpleValueType();
39906
39907 unsigned SizeInBits = VT.getSizeInBits();
39908 unsigned NumMaskElts = Mask.size();
39909 unsigned MaskSizeInBits = SizeInBits / NumMaskElts;
39910 unsigned NumOps = Ops.size();
39911
39912 // Extract constant bits from each source op.
39913 bool OneUseConstantOp = false;
39914 SmallVector<APInt, 16> UndefEltsOps(NumOps);
39915 SmallVector<SmallVector<APInt, 16>, 16> RawBitsOps(NumOps);
39916 for (unsigned i = 0; i != NumOps; ++i) {
39917 SDValue SrcOp = Ops[i];
39918 OneUseConstantOp |= SrcOp.hasOneUse();
39919 if (!getTargetConstantBitsFromNode(SrcOp, MaskSizeInBits, UndefEltsOps[i],
39920 RawBitsOps[i]))
39921 return SDValue();
39922 }
39923
39924 // If we're optimizing for size, only fold if at least one of the constants is
39925 // only used once or the combined shuffle has included a variable mask
39926 // shuffle, this is to avoid constant pool bloat.
39927 bool IsOptimizingSize = DAG.shouldOptForSize();
39928 if (IsOptimizingSize && !OneUseConstantOp && !HasVariableMask)
39929 return SDValue();
39930
39931 // Shuffle the constant bits according to the mask.
39932 SDLoc DL(Root);
39933 APInt UndefElts(NumMaskElts, 0);
39934 APInt ZeroElts(NumMaskElts, 0);
39935 APInt ConstantElts(NumMaskElts, 0);
39936 SmallVector<APInt, 8> ConstantBitData(NumMaskElts,
39937 APInt::getZero(MaskSizeInBits));
39938 for (unsigned i = 0; i != NumMaskElts; ++i) {
39939 int M = Mask[i];
39940 if (M == SM_SentinelUndef) {
39941 UndefElts.setBit(i);
39942 continue;
39943 } else if (M == SM_SentinelZero) {
39944 ZeroElts.setBit(i);
39945 continue;
39946 }
39947 assert(0 <= M && M < (int)(NumMaskElts * NumOps));
39948
39949 unsigned SrcOpIdx = (unsigned)M / NumMaskElts;
39950 unsigned SrcMaskIdx = (unsigned)M % NumMaskElts;
39951
39952 auto &SrcUndefElts = UndefEltsOps[SrcOpIdx];
39953 if (SrcUndefElts[SrcMaskIdx]) {
39954 UndefElts.setBit(i);
39955 continue;
39956 }
39957
39958 auto &SrcEltBits = RawBitsOps[SrcOpIdx];
39959 APInt &Bits = SrcEltBits[SrcMaskIdx];
39960 if (!Bits) {
39961 ZeroElts.setBit(i);
39962 continue;
39963 }
39964
39965 ConstantElts.setBit(i);
39966 ConstantBitData[i] = Bits;
39967 }
39968 assert((UndefElts | ZeroElts | ConstantElts).isAllOnes());
39969
39970 // Attempt to create a zero vector.
39971 if ((UndefElts | ZeroElts).isAllOnes())
39972 return getZeroVector(Root.getSimpleValueType(), Subtarget, DAG, DL);
39973
39974 // Create the constant data.
39975 MVT MaskSVT;
39976 if (VT.isFloatingPoint() && (MaskSizeInBits == 32 || MaskSizeInBits == 64))
39977 MaskSVT = MVT::getFloatingPointVT(MaskSizeInBits);
39978 else
39979 MaskSVT = MVT::getIntegerVT(MaskSizeInBits);
39980
39981 MVT MaskVT = MVT::getVectorVT(MaskSVT, NumMaskElts);
39982 if (!DAG.getTargetLoweringInfo().isTypeLegal(MaskVT))
39983 return SDValue();
39984
39985 SDValue CstOp = getConstVector(ConstantBitData, UndefElts, MaskVT, DAG, DL);
39986 return DAG.getBitcast(VT, CstOp);
39987}
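// Illustrative example (editorial sketch, not in the original file): with a
// single constant source op <1, 2, 3, 4> and Mask = {3, SM_SentinelZero, 0,
// SM_SentinelUndef}, the loop above classifies the elements as
//   i = 0: M = 3 -> SrcOpIdx = 0, SrcMaskIdx = 3 -> constant 4
//   i = 1: zero sentinel                         -> ZeroElts bit set
//   i = 2: M = 0 -> SrcOpIdx = 0, SrcMaskIdx = 0 -> constant 1
//   i = 3: undef sentinel                        -> UndefElts bit set
// and the fold returns the constant vector <4, 0, 1, undef> bitcast to VT.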
39988
39989namespace llvm {
39990 namespace X86 {
39991 enum {
39992 MaxShuffleCombineDepth = 8
39993 };
39994 }
39995} // namespace llvm
39996
39997/// Fully generic combining of x86 shuffle instructions.
39998///
39999/// This should be the last combine run over the x86 shuffle instructions. Once
40000/// they have been fully optimized, this will recursively consider all chains
40001/// of single-use shuffle instructions, build a generic model of the cumulative
40002/// shuffle operation, and check for simpler instructions which implement this
40003/// operation. We use this primarily for two purposes:
40004///
40005/// 1) Collapse generic shuffles to specialized single instructions when
40006/// equivalent. In most cases, this is just an encoding size win, but
40007/// sometimes we will collapse multiple generic shuffles into a single
40008/// special-purpose shuffle.
40009/// 2) Look for sequences of shuffle instructions with 3 or more total
40010/// instructions, and replace them with the slightly more expensive SSSE3
40011/// PSHUFB instruction if available. We do this as the last combining step
40012/// to ensure we avoid using PSHUFB if we can implement the shuffle with
40013/// a suitable short sequence of other instructions. The PSHUFB will either
40014/// use a register or have to read from memory and so is slightly (but only
40015/// slightly) more expensive than the other shuffle instructions.
40016///
40017/// Because this is inherently a quadratic operation (for each shuffle in
40018/// a chain, we recurse up the chain), the depth is limited to 8 instructions.
40019/// This should never be an issue in practice as the shuffle lowering doesn't
40020/// produce sequences of more than 8 instructions.
40021///
40022/// FIXME: We will currently miss some cases where the redundant shuffling
40023/// would simplify under the threshold for PSHUFB formation because of
40024/// combine-ordering. To fix this, we should do the redundant instruction
40025/// combining in this recursive walk.
40026static SDValue combineX86ShufflesRecursively(
40027 ArrayRef<SDValue> SrcOps, int SrcOpIndex, SDValue Root,
40028 ArrayRef<int> RootMask, ArrayRef<const SDNode *> SrcNodes, unsigned Depth,
40029 unsigned MaxDepth, bool HasVariableMask, bool AllowVariableCrossLaneMask,
40030 bool AllowVariablePerLaneMask, SelectionDAG &DAG,
40031 const X86Subtarget &Subtarget) {
40032 assert(RootMask.size() > 0 &&
40033 (RootMask.size() > 1 || (RootMask[0] == 0 && SrcOpIndex == 0)) &&
40034 "Illegal shuffle root mask");
40035 MVT RootVT = Root.getSimpleValueType();
40036 assert(RootVT.isVector() && "Shuffles operate on vector types!");
40037 unsigned RootSizeInBits = RootVT.getSizeInBits();
40038
40039 // Bound the depth of our recursive combine because this is ultimately
40040 // quadratic in nature.
40041 if (Depth >= MaxDepth)
40042 return SDValue();
40043
40044 // Directly rip through bitcasts to find the underlying operand.
40045 SDValue Op = SrcOps[SrcOpIndex];
40046 Op = peekThroughOneUseBitcasts(Op);
40047
40048 EVT VT = Op.getValueType();
40049 if (!VT.isVector() || !VT.isSimple())
40050 return SDValue(); // Bail if we hit a non-simple non-vector.
40051
40052 // FIXME: Just bail on f16 for now.
40053 if (VT.getVectorElementType() == MVT::f16)
40054 return SDValue();
40055
40056 assert((RootSizeInBits % VT.getSizeInBits()) == 0 &&
40057 "Can only combine shuffles upto size of the root op.");
40058
40059 // Create a demanded elts mask from the referenced elements of Op.
40060 APInt OpDemandedElts = APInt::getZero(RootMask.size());
40061 for (int M : RootMask) {
40062 int BaseIdx = RootMask.size() * SrcOpIndex;
40063 if (isInRange(M, BaseIdx, BaseIdx + RootMask.size()))
40064 OpDemandedElts.setBit(M - BaseIdx);
40065 }
40066 if (RootSizeInBits != VT.getSizeInBits()) {
40067 // Op is smaller than Root - extract the demanded elts for the subvector.
40068 unsigned Scale = RootSizeInBits / VT.getSizeInBits();
40069 unsigned NumOpMaskElts = RootMask.size() / Scale;
40070 assert((RootMask.size() % Scale) == 0 && "Root mask size mismatch");
40071 assert(OpDemandedElts
40072 .extractBits(RootMask.size() - NumOpMaskElts, NumOpMaskElts)
40073 .isZero() &&
40074 "Out of range elements referenced in root mask");
40075 OpDemandedElts = OpDemandedElts.extractBits(NumOpMaskElts, 0);
40076 }
40077 OpDemandedElts =
40078 APIntOps::ScaleBitMask(OpDemandedElts, VT.getVectorNumElements());
40079
40080 // Extract target shuffle mask and resolve sentinels and inputs.
40081 SmallVector<int, 64> OpMask;
40082 SmallVector<SDValue, 2> OpInputs;
40083 APInt OpUndef, OpZero;
40084 bool IsOpVariableMask = isTargetShuffleVariableMask(Op.getOpcode());
40085 if (getTargetShuffleInputs(Op, OpDemandedElts, OpInputs, OpMask, OpUndef,
40086 OpZero, DAG, Depth, false)) {
40087 // Shuffle inputs must not be larger than the shuffle result.
40088 // TODO: Relax this for single input faux shuffles (e.g. trunc).
40089 if (llvm::any_of(OpInputs, [VT](SDValue OpInput) {
40090 return OpInput.getValueSizeInBits() > VT.getSizeInBits();
40091 }))
40092 return SDValue();
40093 } else if (Op.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
40094 (RootSizeInBits % Op.getOperand(0).getValueSizeInBits()) == 0 &&
40095 !isNullConstant(Op.getOperand(1))) {
40096 SDValue SrcVec = Op.getOperand(0);
40097 int ExtractIdx = Op.getConstantOperandVal(1);
40098 unsigned NumElts = VT.getVectorNumElements();
40099 OpInputs.assign({SrcVec});
40100 OpMask.assign(NumElts, SM_SentinelUndef);
40101 std::iota(OpMask.begin(), OpMask.end(), ExtractIdx);
40102 OpZero = OpUndef = APInt::getNullValue(NumElts);
40103 } else {
40104 return SDValue();
40105 }
40106
40107 // If the shuffle result was smaller than the root, we need to adjust the
40108 // mask indices and pad the mask with undefs.
40109 if (RootSizeInBits > VT.getSizeInBits()) {
40110 unsigned NumSubVecs = RootSizeInBits / VT.getSizeInBits();
40111 unsigned OpMaskSize = OpMask.size();
40112 if (OpInputs.size() > 1) {
40113 unsigned PaddedMaskSize = NumSubVecs * OpMaskSize;
40114 for (int &M : OpMask) {
40115 if (M < 0)
40116 continue;
40117 int EltIdx = M % OpMaskSize;
40118 int OpIdx = M / OpMaskSize;
40119 M = (PaddedMaskSize * OpIdx) + EltIdx;
40120 }
40121 }
40122 OpZero = OpZero.zext(NumSubVecs * OpMaskSize);
40123 OpUndef = OpUndef.zext(NumSubVecs * OpMaskSize);
40124 OpMask.append((NumSubVecs - 1) * OpMaskSize, SM_SentinelUndef);
40125 }
40126
40127 SmallVector<int, 64> Mask;
40128 SmallVector<SDValue, 16> Ops;
40129
40130 // We don't need to merge masks if the root is empty.
40131 bool EmptyRoot = (Depth == 0) && (RootMask.size() == 1);
40132 if (EmptyRoot) {
40133 // Only resolve zeros if it will remove an input, otherwise we might end
40134 // up in an infinite loop.
40135 bool ResolveKnownZeros = true;
40136 if (!OpZero.isZero()) {
40137 APInt UsedInputs = APInt::getZero(OpInputs.size());
40138 for (int i = 0, e = OpMask.size(); i != e; ++i) {
40139 int M = OpMask[i];
40140 if (OpUndef[i] || OpZero[i] || isUndefOrZero(M))
40141 continue;
40142 UsedInputs.setBit(M / OpMask.size());
40143 if (UsedInputs.isAllOnes()) {
40144 ResolveKnownZeros = false;
40145 break;
40146 }
40147 }
40148 }
40149 resolveTargetShuffleFromZeroables(OpMask, OpUndef, OpZero,
40150 ResolveKnownZeros);
40151
40152 Mask = OpMask;
40153 Ops.append(OpInputs.begin(), OpInputs.end());
40154 } else {
40155 resolveTargetShuffleFromZeroables(OpMask, OpUndef, OpZero);
40156
40157 // Add the inputs to the Ops list, avoiding duplicates.
40158 Ops.append(SrcOps.begin(), SrcOps.end());
40159
40160 auto AddOp = [&Ops](SDValue Input, int InsertionPoint) -> int {
40161 // Attempt to find an existing match.
40162 SDValue InputBC = peekThroughBitcasts(Input);
40163 for (int i = 0, e = Ops.size(); i < e; ++i)
40164 if (InputBC == peekThroughBitcasts(Ops[i]))
40165 return i;
40166 // Match failed - should we replace an existing Op?
40167 if (InsertionPoint >= 0) {
40168 Ops[InsertionPoint] = Input;
40169 return InsertionPoint;
40170 }
40171 // Add to the end of the Ops list.
40172 Ops.push_back(Input);
40173 return Ops.size() - 1;
40174 };
40175
40176 SmallVector<int, 2> OpInputIdx;
40177 for (SDValue OpInput : OpInputs)
40178 OpInputIdx.push_back(
40179 AddOp(OpInput, OpInputIdx.empty() ? SrcOpIndex : -1));
40180
40181 assert(((RootMask.size() > OpMask.size() &&
40182 RootMask.size() % OpMask.size() == 0) ||
40183 (OpMask.size() > RootMask.size() &&
40184 OpMask.size() % RootMask.size() == 0) ||
40185 OpMask.size() == RootMask.size()) &&
40186 "The smaller number of elements must divide the larger.");
40187
40188 // This function can be performance-critical, so we rely on the power-of-2
40189 // knowledge that we have about the mask sizes to replace div/rem ops with
40190 // bit-masks and shifts.
40191 assert(isPowerOf2_32(RootMask.size()) &&
40192 "Non-power-of-2 shuffle mask sizes");
40193 assert(isPowerOf2_32(OpMask.size()) && "Non-power-of-2 shuffle mask sizes");
40194 unsigned RootMaskSizeLog2 = countTrailingZeros(RootMask.size());
40195 unsigned OpMaskSizeLog2 = countTrailingZeros(OpMask.size());
40196
40197 unsigned MaskWidth = std::max<unsigned>(OpMask.size(), RootMask.size());
40198 unsigned RootRatio =
40199 std::max<unsigned>(1, OpMask.size() >> RootMaskSizeLog2);
40200 unsigned OpRatio = std::max<unsigned>(1, RootMask.size() >> OpMaskSizeLog2);
40201 assert((RootRatio == 1 || OpRatio == 1) &&
40202 "Must not have a ratio for both incoming and op masks!");
40203
40204 assert(isPowerOf2_32(MaskWidth) && "Non-power-of-2 shuffle mask sizes");
40205 assert(isPowerOf2_32(RootRatio) && "Non-power-of-2 shuffle mask sizes");
40206 assert(isPowerOf2_32(OpRatio) && "Non-power-of-2 shuffle mask sizes");
40207 unsigned RootRatioLog2 = countTrailingZeros(RootRatio);
40208 unsigned OpRatioLog2 = countTrailingZeros(OpRatio);
40209
40210 Mask.resize(MaskWidth, SM_SentinelUndef);
40211
40212 // Merge this shuffle operation's mask into our accumulated mask. Note that
40213 // this shuffle's mask will be the first applied to the input, followed by
40214 // the root mask to get us all the way to the root value arrangement. The
40215 // reason for this order is that we are recursing up the operation chain.
40216 for (unsigned i = 0; i < MaskWidth; ++i) {
40217 unsigned RootIdx = i >> RootRatioLog2;
40218 if (RootMask[RootIdx] < 0) {
40219 // This is a zero or undef lane, we're done.
40220 Mask[i] = RootMask[RootIdx];
40221 continue;
40222 }
40223
40224 unsigned RootMaskedIdx =
40225 RootRatio == 1
40226 ? RootMask[RootIdx]
40227 : (RootMask[RootIdx] << RootRatioLog2) + (i & (RootRatio - 1));
40228
40229 // Just insert the scaled root mask value if it references an input other
40230 // than the SrcOp we're currently inserting.
40231 if ((RootMaskedIdx < (SrcOpIndex * MaskWidth)) ||
40232 (((SrcOpIndex + 1) * MaskWidth) <= RootMaskedIdx)) {
40233 Mask[i] = RootMaskedIdx;
40234 continue;
40235 }
40236
40237 RootMaskedIdx = RootMaskedIdx & (MaskWidth - 1);
40238 unsigned OpIdx = RootMaskedIdx >> OpRatioLog2;
40239 if (OpMask[OpIdx] < 0) {
40240 // The incoming lanes are zero or undef, it doesn't matter which ones we
40241 // are using.
40242 Mask[i] = OpMask[OpIdx];
40243 continue;
40244 }
40245
40246 // Ok, we have non-zero lanes, map them through to one of the Op's inputs.
40247 unsigned OpMaskedIdx = OpRatio == 1 ? OpMask[OpIdx]
40248 : (OpMask[OpIdx] << OpRatioLog2) +
40249 (RootMaskedIdx & (OpRatio - 1));
40250
40251 OpMaskedIdx = OpMaskedIdx & (MaskWidth - 1);
40252 int InputIdx = OpMask[OpIdx] / (int)OpMask.size();
40253 assert(0 <= OpInputIdx[InputIdx] && "Unknown target shuffle input");
40254 OpMaskedIdx += OpInputIdx[InputIdx] * MaskWidth;
40255
40256 Mask[i] = OpMaskedIdx;
40257 }
40258 }
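// Illustrative example (editorial sketch, not in the original file), assuming
// a single source op at SrcOpIndex 0 whose one input resolves to slot 0: with
// RootMask = {2, 0, 1, 3} and OpMask = {1, 0, 3, 2, 5, 4, 7, 6} we get
// MaskWidth = 8, RootRatio = 2, OpRatio = 1, and the merge loop computes e.g.
//   i = 0: RootMaskedIdx = (2 << 1) + 0 = 4 -> OpMask[4] = 5
//   i = 3: RootMaskedIdx = (0 << 1) + 1 = 1 -> OpMask[1] = 0
// giving Mask = {5, 4, 1, 0, 3, 2, 7, 6}, i.e. OpMask applied to the input
// first and RootMask applied to its result, as the comment above describes.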
40259
40260 // Remove unused/repeated shuffle source ops.
40261 resolveTargetShuffleInputsAndMask(Ops, Mask);
40262
40263 // Handle the all undef/zero/ones cases early.
40264 if (all_of(Mask, [](int Idx) { return Idx == SM_SentinelUndef; }))
40265 return DAG.getUNDEF(RootVT);
40266 if (all_of(Mask, [](int Idx) { return Idx < 0; }))
40267 return getZeroVector(RootVT, Subtarget, DAG, SDLoc(Root));
40268 if (Ops.size() == 1 && ISD::isBuildVectorAllOnes(Ops[0].getNode()) &&
40269 !llvm::is_contained(Mask, SM_SentinelZero))
40270 return getOnesVector(RootVT, DAG, SDLoc(Root));
40271
40272 assert(!Ops.empty() && "Shuffle with no inputs detected");
40273 HasVariableMask |= IsOpVariableMask;
40274
40275 // Update the list of shuffle nodes that have been combined so far.
40276 SmallVector<const SDNode *, 16> CombinedNodes(SrcNodes.begin(),
40277 SrcNodes.end());
40278 CombinedNodes.push_back(Op.getNode());
40279
40280 // See if we can recurse into each shuffle source op (if it's a target
40281 // shuffle). The source op should only be generally combined if it either has
40282 // a single use (i.e. current Op) or all its users have already been combined,
40283 // if not then we can still combine but should prevent generation of variable
40284 // shuffles to avoid constant pool bloat.
40285 // Don't recurse if we already have more source ops than we can combine in
40286 // the remaining recursion depth.
40287 if (Ops.size() < (MaxDepth - Depth)) {
40288 for (int i = 0, e = Ops.size(); i < e; ++i) {
40289 // For empty roots, we need to resolve zeroable elements before combining
40290 // them with other shuffles.
40291 SmallVector<int, 64> ResolvedMask = Mask;
40292 if (EmptyRoot)
40293 resolveTargetShuffleFromZeroables(ResolvedMask, OpUndef, OpZero);
40294 bool AllowCrossLaneVar = false;
40295 bool AllowPerLaneVar = false;
40296 if (Ops[i].getNode()->hasOneUse() ||
40297 SDNode::areOnlyUsersOf(CombinedNodes, Ops[i].getNode())) {
40298 AllowCrossLaneVar = AllowVariableCrossLaneMask;
40299 AllowPerLaneVar = AllowVariablePerLaneMask;
40300 }
40301 if (SDValue Res = combineX86ShufflesRecursively(
40302 Ops, i, Root, ResolvedMask, CombinedNodes, Depth + 1, MaxDepth,
40303 HasVariableMask, AllowCrossLaneVar, AllowPerLaneVar, DAG,
40304 Subtarget))
40305 return Res;
40306 }
40307 }
40308
40309 // Attempt to constant fold all of the constant source ops.
40310 if (SDValue Cst = combineX86ShufflesConstants(
40311 Ops, Mask, Root, HasVariableMask, DAG, Subtarget))
40312 return Cst;
40313
40314 // If constant fold failed and we only have constants - then we have
40315 // multiple uses by a single non-variable shuffle - just bail.
40316 if (Depth == 0 && llvm::all_of(Ops, [&](SDValue Op) {
40317 APInt UndefElts;
40318 SmallVector<APInt> RawBits;
40319 unsigned EltSizeInBits = RootSizeInBits / Mask.size();
40320 return getTargetConstantBitsFromNode(Op, EltSizeInBits, UndefElts,
40321 RawBits);
40322 })) {
40323 return SDValue();
40324 }
40325
40326 // Canonicalize the combined shuffle mask chain with horizontal ops.
40327 // NOTE: This will update the Ops and Mask.
40328 if (SDValue HOp = canonicalizeShuffleMaskWithHorizOp(
40329 Ops, Mask, RootSizeInBits, SDLoc(Root), DAG, Subtarget))
40330 return DAG.getBitcast(RootVT, HOp);
40331
40332 // Try to refine our inputs given our knowledge of target shuffle mask.
40333 for (auto I : enumerate(Ops)) {
40334 int OpIdx = I.index();
40335 SDValue &Op = I.value();
40336
40337 // What range of shuffle mask element values results in picking from Op?
40338 int Lo = OpIdx * Mask.size();
40339 int Hi = Lo + Mask.size();
40340
40341 // Which elements of Op do we demand, given the mask's granularity?
40342 APInt OpDemandedElts(Mask.size(), 0);
40343 for (int MaskElt : Mask) {
40344 if (isInRange(MaskElt, Lo, Hi)) { // Picks from Op?
40345 int OpEltIdx = MaskElt - Lo;
40346 OpDemandedElts.setBit(OpEltIdx);
40347 }
40348 }
40349
40350 // Is the shuffle result smaller than the root?
40351 if (Op.getValueSizeInBits() < RootSizeInBits) {
40352 // We padded the mask with undefs. But we now need to undo that.
40353 unsigned NumExpectedVectorElts = Mask.size();
40354 unsigned EltSizeInBits = RootSizeInBits / NumExpectedVectorElts;
40355 unsigned NumOpVectorElts = Op.getValueSizeInBits() / EltSizeInBits;
40356 assert(!OpDemandedElts.extractBits(
40357 NumExpectedVectorElts - NumOpVectorElts, NumOpVectorElts) &&
40358 "Demanding the virtual undef widening padding?");
40359 OpDemandedElts = OpDemandedElts.trunc(NumOpVectorElts); // NUW
40360 }
40361
40362 // The Op itself may be of different VT, so we need to scale the mask.
40363 unsigned NumOpElts = Op.getValueType().getVectorNumElements();
40364 APInt OpScaledDemandedElts = APIntOps::ScaleBitMask(OpDemandedElts, NumOpElts);
40365
40366 // Can this operand be simplified any further, given its demanded elements?
40367 if (SDValue NewOp =
40368 DAG.getTargetLoweringInfo().SimplifyMultipleUseDemandedVectorElts(
40369 Op, OpScaledDemandedElts, DAG))
40370 Op = NewOp;
40371 }
40372 // FIXME: should we rerun resolveTargetShuffleInputsAndMask() now?
40373
40374 // Widen any subvector shuffle inputs we've collected.
40375 // TODO: Remove this to avoid generating temporary nodes, we should only
40376 // widen once combineX86ShuffleChain has found a match.
40377 if (any_of(Ops, [RootSizeInBits](SDValue Op) {
40378 return Op.getValueSizeInBits() < RootSizeInBits;
40379 })) {
40380 for (SDValue &Op : Ops)
40381 if (Op.getValueSizeInBits() < RootSizeInBits)
40382 Op = widenSubVector(Op, false, Subtarget, DAG, SDLoc(Op),
40383 RootSizeInBits);
40384 // Reresolve - we might have repeated subvector sources.
40385 resolveTargetShuffleInputsAndMask(Ops, Mask);
40386 }
40387
40388 // We can only combine unary and binary shuffle mask cases.
40389 if (Ops.size() <= 2) {
40390 // Minor canonicalization of the accumulated shuffle mask to make it easier
40391 // to match below. All this does is detect masks with sequential pairs of
40392 // elements, and shrink them to the half-width mask. It does this in a loop
40393 // so it will reduce the size of the mask to the minimal width mask which
40394 // performs an equivalent shuffle.
40395 while (Mask.size() > 1) {
40396 SmallVector<int, 64> WidenedMask;
40397 if (!canWidenShuffleElements(Mask, WidenedMask))
40398 break;
40399 Mask = std::move(WidenedMask);
40400 }
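// Illustrative example (editorial sketch, not in the original file): the mask
// {0, 1, 6, 7, 4, 5, 2, 3} widens to {0, 3, 2, 1} on the first pass because
// every even/odd pair is sequential and starts on an even element, and the
// loop then stops since {0, 3, 2, 1} has no such pairs left.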
40401
40402 // Canonicalization of binary shuffle masks to improve pattern matching by
40403 // commuting the inputs.
40404 if (Ops.size() == 2 && canonicalizeShuffleMaskWithCommute(Mask)) {
40405 ShuffleVectorSDNode::commuteMask(Mask);
40406 std::swap(Ops[0], Ops[1]);
40407 }
40408
40409 // Try to combine into a single shuffle instruction.
40410 if (SDValue Shuffle = combineX86ShuffleChain(
40411 Ops, Root, Mask, Depth, HasVariableMask, AllowVariableCrossLaneMask,
40412 AllowVariablePerLaneMask, DAG, Subtarget))
40413 return Shuffle;
40414
40415 // If all the operands come from the same larger vector, fallthrough and try
40416 // to use combineX86ShuffleChainWithExtract.
40417 SDValue LHS = peekThroughBitcasts(Ops.front());
40418 SDValue RHS = peekThroughBitcasts(Ops.back());
40419 if (Ops.size() != 2 || !Subtarget.hasAVX2() || RootSizeInBits != 128 ||
40420 (RootSizeInBits / Mask.size()) != 64 ||
40421 LHS.getOpcode() != ISD::EXTRACT_SUBVECTOR ||
40422 RHS.getOpcode() != ISD::EXTRACT_SUBVECTOR ||
40423 LHS.getOperand(0) != RHS.getOperand(0))
40424 return SDValue();
40425 }
40426
40427 // If that failed and any input is extracted then try to combine as a
40428 // shuffle with the larger type.
40429 return combineX86ShuffleChainWithExtract(
40430 Ops, Root, Mask, Depth, HasVariableMask, AllowVariableCrossLaneMask,
40431 AllowVariablePerLaneMask, DAG, Subtarget);
40432}
40433
40434/// Helper entry wrapper to combineX86ShufflesRecursively.
40435static SDValue combineX86ShufflesRecursively(SDValue Op, SelectionDAG &DAG,
40436 const X86Subtarget &Subtarget) {
40437 return combineX86ShufflesRecursively(
40438 {Op}, 0, Op, {0}, {}, /*Depth*/ 0, X86::MaxShuffleCombineDepth,
40439 /*HasVarMask*/ false,
40440 /*AllowCrossLaneVarMask*/ true, /*AllowPerLaneVarMask*/ true, DAG,
40441 Subtarget);
40442}
40443
40444/// Get the PSHUF-style mask from PSHUF node.
40445///
40446/// This is a very minor wrapper around getTargetShuffleMask to ease forming v4
40447/// PSHUF-style masks that can be reused with such instructions.
40448static SmallVector<int, 4> getPSHUFShuffleMask(SDValue N) {
40449 MVT VT = N.getSimpleValueType();
40450 SmallVector<int, 4> Mask;
40451 SmallVector<SDValue, 2> Ops;
40452 bool HaveMask =
40453 getTargetShuffleMask(N.getNode(), VT, false, Ops, Mask);
40454 (void)HaveMask;
40455 assert(HaveMask);
40456
40457 // If we have more than 128-bits, only the low 128-bits of shuffle mask
40458 // matter. Check that the upper masks are repeats and remove them.
40459 if (VT.getSizeInBits() > 128) {
40460 int LaneElts = 128 / VT.getScalarSizeInBits();
40461#ifndef NDEBUG
40462 for (int i = 1, NumLanes = VT.getSizeInBits() / 128; i < NumLanes; ++i)
40463 for (int j = 0; j < LaneElts; ++j)
40464 assert(Mask[j] == Mask[i * LaneElts + j] - (LaneElts * i) &&
40465 "Mask doesn't repeat in high 128-bit lanes!");
40466#endif
40467 Mask.resize(LaneElts);
40468 }
40469
40470 switch (N.getOpcode()) {
40471 case X86ISD::PSHUFD:
40472 return Mask;
40473 case X86ISD::PSHUFLW:
40474 Mask.resize(4);
40475 return Mask;
40476 case X86ISD::PSHUFHW:
40477 Mask.erase(Mask.begin(), Mask.begin() + 4);
40478 for (int &M : Mask)
40479 M -= 4;
40480 return Mask;
40481 default:
40482 llvm_unreachable("No valid shuffle instruction found!")::llvm::llvm_unreachable_internal("No valid shuffle instruction found!"
, "llvm/lib/Target/X86/X86ISelLowering.cpp", 40482)
;
40483 }
40484}
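// Illustrative example (editorial sketch, not in the original file): for a
// v8i16 PSHUFHW whose full mask is {0, 1, 2, 3, 7, 6, 5, 4}, the switch above
// drops the identity low half and rebases the high half, returning the
// 4-element mask {3, 2, 1, 0} that such instructions encode as an immediate.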
40485
40486/// Search for a combinable shuffle across a chain ending in pshufd.
40487///
40488/// We walk up the chain and look for a combinable shuffle, skipping over
40489/// shuffles that we could hoist this shuffle's transformation past without
40490/// altering anything.
40491static SDValue
40492combineRedundantDWordShuffle(SDValue N, MutableArrayRef<int> Mask,
40493 SelectionDAG &DAG) {
40494 assert(N.getOpcode() == X86ISD::PSHUFD &&
40495 "Called with something other than an x86 128-bit half shuffle!");
40496 SDLoc DL(N);
40497
40498 // Walk up a single-use chain looking for a combinable shuffle. Keep a stack
40499 // of the shuffles in the chain so that we can form a fresh chain to replace
40500 // this one.
40501 SmallVector<SDValue, 8> Chain;
40502 SDValue V = N.getOperand(0);
40503 for (; V.hasOneUse(); V = V.getOperand(0)) {
40504 switch (V.getOpcode()) {
40505 default:
40506 return SDValue(); // Nothing combined!
40507
40508 case ISD::BITCAST:
40509 // Skip bitcasts as we always know the type for the target specific
40510 // instructions.
40511 continue;
40512
40513 case X86ISD::PSHUFD:
40514 // Found another dword shuffle.
40515 break;
40516
40517 case X86ISD::PSHUFLW:
40518 // Check that the low words (being shuffled) are the identity in the
40519 // dword shuffle, and the high words are self-contained.
40520 if (Mask[0] != 0 || Mask[1] != 1 ||
40521 !(Mask[2] >= 2 && Mask[2] < 4 && Mask[3] >= 2 && Mask[3] < 4))
40522 return SDValue();
40523
40524 Chain.push_back(V);
40525 continue;
40526
40527 case X86ISD::PSHUFHW:
40528 // Check that the high words (being shuffled) are the identity in the
40529 // dword shuffle, and the low words are self-contained.
40530 if (Mask[2] != 2 || Mask[3] != 3 ||
40531 !(Mask[0] >= 0 && Mask[0] < 2 && Mask[1] >= 0 && Mask[1] < 2))
40532 return SDValue();
40533
40534 Chain.push_back(V);
40535 continue;
40536
40537 case X86ISD::UNPCKL:
40538 case X86ISD::UNPCKH:
40539 // For either i8 -> i16 or i16 -> i32 unpacks, we can combine a dword
40540 // shuffle into a preceding word shuffle.
40541 if (V.getSimpleValueType().getVectorElementType() != MVT::i8 &&
40542 V.getSimpleValueType().getVectorElementType() != MVT::i16)
40543 return SDValue();
40544
40545 // Search for a half-shuffle which we can combine with.
40546 unsigned CombineOp =
40547 V.getOpcode() == X86ISD::UNPCKL ? X86ISD::PSHUFLW : X86ISD::PSHUFHW;
40548 if (V.getOperand(0) != V.getOperand(1) ||
40549 !V->isOnlyUserOf(V.getOperand(0).getNode()))
40550 return SDValue();
40551 Chain.push_back(V);
40552 V = V.getOperand(0);
40553 do {
40554 switch (V.getOpcode()) {
40555 default:
40556 return SDValue(); // Nothing to combine.
40557
40558 case X86ISD::PSHUFLW:
40559 case X86ISD::PSHUFHW:
40560 if (V.getOpcode() == CombineOp)
40561 break;
40562
40563 Chain.push_back(V);
40564
40565 [[fallthrough]];
40566 case ISD::BITCAST:
40567 V = V.getOperand(0);
40568 continue;
40569 }
40570 break;
40571 } while (V.hasOneUse());
40572 break;
40573 }
40574 // Break out of the loop if we break out of the switch.
40575 break;
40576 }
40577
40578 if (!V.hasOneUse())
40579 // We fell out of the loop without finding a viable combining instruction.
40580 return SDValue();
40581
40582 // Merge this node's mask and our incoming mask.
40583 SmallVector<int, 4> VMask = getPSHUFShuffleMask(V);
40584 for (int &M : Mask)
40585 M = VMask[M];
40586 V = DAG.getNode(V.getOpcode(), DL, V.getValueType(), V.getOperand(0),
40587 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
40588
40589 // Rebuild the chain around this new shuffle.
40590 while (!Chain.empty()) {
40591 SDValue W = Chain.pop_back_val();
40592
40593 if (V.getValueType() != W.getOperand(0).getValueType())
40594 V = DAG.getBitcast(W.getOperand(0).getValueType(), V);
40595
40596 switch (W.getOpcode()) {
40597 default:
40598 llvm_unreachable("Only PSHUF and UNPCK instructions get here!")::llvm::llvm_unreachable_internal("Only PSHUF and UNPCK instructions get here!"
, "llvm/lib/Target/X86/X86ISelLowering.cpp", 40598)
;
40599
40600 case X86ISD::UNPCKL:
40601 case X86ISD::UNPCKH:
40602 V = DAG.getNode(W.getOpcode(), DL, W.getValueType(), V, V);
40603 break;
40604
40605 case X86ISD::PSHUFD:
40606 case X86ISD::PSHUFLW:
40607 case X86ISD::PSHUFHW:
40608 V = DAG.getNode(W.getOpcode(), DL, W.getValueType(), V, W.getOperand(1));
40609 break;
40610 }
40611 }
40612 if (V.getValueType() != N.getValueType())
40613 V = DAG.getBitcast(N.getValueType(), V);
40614
40615 // Return the new chain to replace N.
40616 return V;
40617}
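// Illustrative example (editorial sketch, not in the original file): if N is a
// PSHUFD with mask {1, 0, 3, 2} and its single-use operand V is another PSHUFD
// with mask {2, 3, 0, 1}, the merge above maps each M to VMask[M], producing
// {3, 2, 1, 0}, and a single PSHUFD of V's input with that combined mask
// replaces the two-shuffle chain.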
40618
40619// Attempt to commute shufps LHS loads:
40620// permilps(shufps(load(),x)) --> permilps(shufps(x,load()))
40621static SDValue combineCommutableSHUFP(SDValue N, MVT VT, const SDLoc &DL,
40622 SelectionDAG &DAG) {
40623 // TODO: Add vXf64 support.
40624 if (VT != MVT::v4f32 && VT != MVT::v8f32 && VT != MVT::v16f32)
40625 return SDValue();
40626
40627 // SHUFP(LHS, RHS) -> SHUFP(RHS, LHS) iff LHS is foldable + RHS is not.
40628 auto commuteSHUFP = [&VT, &DL, &DAG](SDValue Parent, SDValue V) {
40629 if (V.getOpcode() != X86ISD::SHUFP || !Parent->isOnlyUserOf(V.getNode()))
40630 return SDValue();
40631 SDValue N0 = V.getOperand(0);
40632 SDValue N1 = V.getOperand(1);
40633 unsigned Imm = V.getConstantOperandVal(2);
40634 const X86Subtarget &Subtarget = DAG.getSubtarget<X86Subtarget>();
40635 if (!X86::mayFoldLoad(peekThroughOneUseBitcasts(N0), Subtarget) ||
40636 X86::mayFoldLoad(peekThroughOneUseBitcasts(N1), Subtarget))
40637 return SDValue();
40638 Imm = ((Imm & 0x0F) << 4) | ((Imm & 0xF0) >> 4);
40639 return DAG.getNode(X86ISD::SHUFP, DL, VT, N1, N0,
40640 DAG.getTargetConstant(Imm, DL, MVT::i8));
40641 };
40642
40643 switch (N.getOpcode()) {
40644 case X86ISD::VPERMILPI:
40645 if (SDValue NewSHUFP = commuteSHUFP(N, N.getOperand(0))) {
40646 unsigned Imm = N.getConstantOperandVal(1);
40647 return DAG.getNode(X86ISD::VPERMILPI, DL, VT, NewSHUFP,
40648 DAG.getTargetConstant(Imm ^ 0xAA, DL, MVT::i8));
40649 }
40650 break;
40651 case X86ISD::SHUFP: {
40652 SDValue N0 = N.getOperand(0);
40653 SDValue N1 = N.getOperand(1);
40654 unsigned Imm = N.getConstantOperandVal(2);
40655 if (N0 == N1) {
40656 if (SDValue NewSHUFP = commuteSHUFP(N, N0))
40657 return DAG.getNode(X86ISD::SHUFP, DL, VT, NewSHUFP, NewSHUFP,
40658 DAG.getTargetConstant(Imm ^ 0xAA, DL, MVT::i8));
40659 } else if (SDValue NewSHUFP = commuteSHUFP(N, N0)) {
40660 return DAG.getNode(X86ISD::SHUFP, DL, VT, NewSHUFP, N1,
40661 DAG.getTargetConstant(Imm ^ 0x0A, DL, MVT::i8));
40662 } else if (SDValue NewSHUFP = commuteSHUFP(N, N1)) {
40663 return DAG.getNode(X86ISD::SHUFP, DL, VT, N0, NewSHUFP,
40664 DAG.getTargetConstant(Imm ^ 0xA0, DL, MVT::i8));
40665 }
40666 break;
40667 }
40668 }
40669
40670 return SDValue();
40671}
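// Illustrative note (editorial, not in the original file): SHUFP's 8-bit
// immediate selects the low result elements from the first operand (low
// nibble) and the high result elements from the second (high nibble), so
// commuting the operands swaps the nibbles, e.g. Imm = 0xE4 becomes
// ((0xE4 & 0x0F) << 4) | ((0xE4 & 0xF0) >> 4) = 0x4E.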
40672
40673// Canonicalize SHUFFLE(BINOP(X,Y)) -> BINOP(SHUFFLE(X),SHUFFLE(Y)).
40674static SDValue canonicalizeShuffleWithBinOps(SDValue N, SelectionDAG &DAG,
40675 const SDLoc &DL) {
40676 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
40677 EVT ShuffleVT = N.getValueType();
40678
40679 auto IsMergeableWithShuffle = [&DAG](SDValue Op, bool FoldLoad = false) {
40680 // AllZeros/AllOnes constants are freely shuffled and will peek through
40681 // bitcasts. Other constant build vectors do not peek through bitcasts. Only
40682 // merge with target shuffles if it has one use so shuffle combining is
40683 // likely to kick in. Shuffles of splats are expected to be removed.
40684 return ISD::isBuildVectorAllOnes(Op.getNode()) ||
40685 ISD::isBuildVectorAllZeros(Op.getNode()) ||
40686 ISD::isBuildVectorOfConstantSDNodes(Op.getNode()) ||
40687 ISD::isBuildVectorOfConstantFPSDNodes(Op.getNode()) ||
40688 (isTargetShuffle(Op.getOpcode()) && Op->hasOneUse()) ||
40689 (FoldLoad && isShuffleFoldableLoad(Op)) ||
40690 DAG.isSplatValue(Op, /*AllowUndefs*/ false);
40691 };
40692 auto IsSafeToMoveShuffle = [ShuffleVT](SDValue Op, unsigned BinOp) {
40693 // Ensure we only shuffle whole vector src elements, unless its a logical
40694 // binops where we can more aggressively move shuffles from dst to src.
40695 return BinOp == ISD::AND || BinOp == ISD::OR || BinOp == ISD::XOR ||
40696 BinOp == X86ISD::ANDNP ||
40697 (Op.getScalarValueSizeInBits() <= ShuffleVT.getScalarSizeInBits());
40698 };
40699
40700 unsigned Opc = N.getOpcode();
40701 switch (Opc) {
40702 // Unary and Unary+Permute Shuffles.
40703 case X86ISD::PSHUFB: {
40704 // Don't merge PSHUFB if it contains zero'd elements.
40705 SmallVector<int> Mask;
40706 SmallVector<SDValue> Ops;
40707 if (!getTargetShuffleMask(N.getNode(), ShuffleVT.getSimpleVT(), false, Ops,
40708 Mask))
40709 break;
40710 [[fallthrough]];
40711 }
40712 case X86ISD::VBROADCAST:
40713 case X86ISD::MOVDDUP:
40714 case X86ISD::PSHUFD:
40715 case X86ISD::PSHUFHW:
40716 case X86ISD::PSHUFLW:
40717 case X86ISD::VPERMI:
40718 case X86ISD::VPERMILPI: {
40719 if (N.getOperand(0).getValueType() == ShuffleVT &&
40720 N->isOnlyUserOf(N.getOperand(0).getNode())) {
40721 SDValue N0 = peekThroughOneUseBitcasts(N.getOperand(0));
40722 unsigned SrcOpcode = N0.getOpcode();
40723 if (TLI.isBinOp(SrcOpcode) && IsSafeToMoveShuffle(N0, SrcOpcode)) {
40724 SDValue Op00 = peekThroughOneUseBitcasts(N0.getOperand(0));
40725 SDValue Op01 = peekThroughOneUseBitcasts(N0.getOperand(1));
40726 if (IsMergeableWithShuffle(Op00, Opc != X86ISD::PSHUFB) ||
40727 IsMergeableWithShuffle(Op01, Opc != X86ISD::PSHUFB)) {
40728 SDValue LHS, RHS;
40729 Op00 = DAG.getBitcast(ShuffleVT, Op00);
40730 Op01 = DAG.getBitcast(ShuffleVT, Op01);
40731 if (N.getNumOperands() == 2) {
40732 LHS = DAG.getNode(Opc, DL, ShuffleVT, Op00, N.getOperand(1));
40733 RHS = DAG.getNode(Opc, DL, ShuffleVT, Op01, N.getOperand(1));
40734 } else {
40735 LHS = DAG.getNode(Opc, DL, ShuffleVT, Op00);
40736 RHS = DAG.getNode(Opc, DL, ShuffleVT, Op01);
40737 }
40738 EVT OpVT = N0.getValueType();
40739 return DAG.getBitcast(ShuffleVT,
40740 DAG.getNode(SrcOpcode, DL, OpVT,
40741 DAG.getBitcast(OpVT, LHS),
40742 DAG.getBitcast(OpVT, RHS)));
40743 }
40744 }
40745 }
40746 break;
40747 }
40748 // Binary and Binary+Permute Shuffles.
40749 case X86ISD::INSERTPS: {
40750 // Don't merge INSERTPS if it contains zero'd elements.
40751 unsigned InsertPSMask = N.getConstantOperandVal(2);
40752 unsigned ZeroMask = InsertPSMask & 0xF;
40753 if (ZeroMask != 0)
40754 break;
40755 [[fallthrough]];
40756 }
40757 case X86ISD::MOVSD:
40758 case X86ISD::MOVSS:
40759 case X86ISD::BLENDI:
40760 case X86ISD::SHUFP:
40761 case X86ISD::UNPCKH:
40762 case X86ISD::UNPCKL: {
40763 if (N->isOnlyUserOf(N.getOperand(0).getNode()) &&
40764 N->isOnlyUserOf(N.getOperand(1).getNode())) {
40765 SDValue N0 = peekThroughOneUseBitcasts(N.getOperand(0));
40766 SDValue N1 = peekThroughOneUseBitcasts(N.getOperand(1));
40767 unsigned SrcOpcode = N0.getOpcode();
40768 if (TLI.isBinOp(SrcOpcode) && N1.getOpcode() == SrcOpcode &&
40769 IsSafeToMoveShuffle(N0, SrcOpcode) &&
40770 IsSafeToMoveShuffle(N1, SrcOpcode)) {
40771 SDValue Op00 = peekThroughOneUseBitcasts(N0.getOperand(0));
40772 SDValue Op10 = peekThroughOneUseBitcasts(N1.getOperand(0));
40773 SDValue Op01 = peekThroughOneUseBitcasts(N0.getOperand(1));
40774 SDValue Op11 = peekThroughOneUseBitcasts(N1.getOperand(1));
40775 // Ensure the total number of shuffles doesn't increase by folding this
40776 // shuffle through to the source ops.
40777 if (((IsMergeableWithShuffle(Op00) && IsMergeableWithShuffle(Op10)) ||
40778 (IsMergeableWithShuffle(Op01) && IsMergeableWithShuffle(Op11))) ||
40779 ((IsMergeableWithShuffle(Op00) || IsMergeableWithShuffle(Op10)) &&
40780 (IsMergeableWithShuffle(Op01) || IsMergeableWithShuffle(Op11)))) {
40781 SDValue LHS, RHS;
40782 Op00 = DAG.getBitcast(ShuffleVT, Op00);
40783 Op10 = DAG.getBitcast(ShuffleVT, Op10);
40784 Op01 = DAG.getBitcast(ShuffleVT, Op01);
40785 Op11 = DAG.getBitcast(ShuffleVT, Op11);
40786 if (N.getNumOperands() == 3) {
40787 LHS = DAG.getNode(Opc, DL, ShuffleVT, Op00, Op10, N.getOperand(2));
40788 RHS = DAG.getNode(Opc, DL, ShuffleVT, Op01, Op11, N.getOperand(2));
40789 } else {
40790 LHS = DAG.getNode(Opc, DL, ShuffleVT, Op00, Op10);
40791 RHS = DAG.getNode(Opc, DL, ShuffleVT, Op01, Op11);
40792 }
40793 EVT OpVT = N0.getValueType();
40794 return DAG.getBitcast(ShuffleVT,
40795 DAG.getNode(SrcOpcode, DL, OpVT,
40796 DAG.getBitcast(OpVT, LHS),
40797 DAG.getBitcast(OpVT, RHS)));
40798 }
40799 }
40800 }
40801 break;
40802 }
40803 }
40804 return SDValue();
40805}
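// Illustrative standalone sketch (editorial, not part of X86ISelLowering.cpp):
// the canonicalization above relies on the fact that an elementwise binary op
// commutes with a per-element permutation, i.e.
//   shuffle(binop(X, Y)) == binop(shuffle(X), shuffle(Y)).
// The real combine additionally checks one-use and mergeability so the total
// shuffle count does not grow; the helper names below are hypothetical.
#include <array>
#include <cassert>
#include <cstddef>

template <std::size_t N>
std::array<int, N> shuffle(const std::array<int, N> &V,
                           const std::array<int, N> &Mask) {
  std::array<int, N> R{};
  for (std::size_t I = 0; I != N; ++I)
    R[I] = V[Mask[I]]; // pick the source element selected by the mask
  return R;
}

template <std::size_t N>
std::array<int, N> add(const std::array<int, N> &A,
                       const std::array<int, N> &B) {
  std::array<int, N> R{};
  for (std::size_t I = 0; I != N; ++I)
    R[I] = A[I] + B[I]; // any elementwise binop behaves the same way
  return R;
}

int main() {
  std::array<int, 4> X{1, 2, 3, 4}, Y{10, 20, 30, 40}, M{3, 1, 2, 0};
  assert(shuffle(add(X, Y), M) == add(shuffle(X, M), shuffle(Y, M)));
  return 0;
}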
40806
40807/// Attempt to fold vpermf128(op(),op()) -> op(vpermf128(),vpermf128()).
40808static SDValue canonicalizeLaneShuffleWithRepeatedOps(SDValue V,
40809 SelectionDAG &DAG,
40810 const SDLoc &DL) {
40811 assert(V.getOpcode() == X86ISD::VPERM2X128 && "Unknown lane shuffle");
40812
40813 MVT VT = V.getSimpleValueType();
40814 SDValue Src0 = peekThroughBitcasts(V.getOperand(0));
40815 SDValue Src1 = peekThroughBitcasts(V.getOperand(1));
40816 unsigned SrcOpc0 = Src0.getOpcode();
40817 unsigned SrcOpc1 = Src1.getOpcode();
40818 EVT SrcVT0 = Src0.getValueType();
40819 EVT SrcVT1 = Src1.getValueType();
40820
40821 if (!Src1.isUndef() && (SrcVT0 != SrcVT1 || SrcOpc0 != SrcOpc1))
40822 return SDValue();
40823
40824 switch (SrcOpc0) {
40825 case X86ISD::MOVDDUP: {
40826 SDValue LHS = Src0.getOperand(0);
40827 SDValue RHS = Src1.isUndef() ? Src1 : Src1.getOperand(0);
40828 SDValue Res =
40829 DAG.getNode(X86ISD::VPERM2X128, DL, SrcVT0, LHS, RHS, V.getOperand(2));
40830 Res = DAG.getNode(SrcOpc0, DL, SrcVT0, Res);
40831 return DAG.getBitcast(VT, Res);
40832 }
40833 case X86ISD::VPERMILPI:
40834 // TODO: Handle v4f64 permutes with different low/high lane masks.
40835 if (SrcVT0 == MVT::v4f64) {
40836 uint64_t Mask = Src0.getConstantOperandVal(1);
40837 if ((Mask & 0x3) != ((Mask >> 2) & 0x3))
40838 break;
40839 }
40840 [[fallthrough]];
40841 case X86ISD::VSHLI:
40842 case X86ISD::VSRLI:
40843 case X86ISD::VSRAI:
40844 case X86ISD::PSHUFD:
40845 if (Src1.isUndef() || Src0.getOperand(1) == Src1.getOperand(1)) {
40846 SDValue LHS = Src0.getOperand(0);
40847 SDValue RHS = Src1.isUndef() ? Src1 : Src1.getOperand(0);
40848 SDValue Res = DAG.getNode(X86ISD::VPERM2X128, DL, SrcVT0, LHS, RHS,
40849 V.getOperand(2));
40850 Res = DAG.getNode(SrcOpc0, DL, SrcVT0, Res, Src0.getOperand(1));
40851 return DAG.getBitcast(VT, Res);
40852 }
40853 break;
40854 }
40855
40856 return SDValue();
40857}
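// Illustrative sketch (editorial, not part of the analyzed source): the v4f64
// guard above. For VPERMILPD-style permutes of v4f64, immediate bits [1:0]
// control the low 128-bit lane and bits [3:2] the high lane, so the fold only
// fires when both lanes apply the same in-lane permute and the permute can be
// hoisted across the lane shuffle.
#include <cstdio>

static bool isLaneUniformV4F64Imm(unsigned Mask) {
  return (Mask & 0x3) == ((Mask >> 2) & 0x3);
}

int main() {
  std::printf("imm 0x5 lane-uniform: %d\n", isLaneUniformV4F64Imm(0x5)); // 1
  std::printf("imm 0x6 lane-uniform: %d\n", isLaneUniformV4F64Imm(0x6)); // 0
  return 0;
}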
40858
40859/// Try to combine x86 target specific shuffles.
40860static SDValue combineTargetShuffle(SDValue N, SelectionDAG &DAG,
40861 TargetLowering::DAGCombinerInfo &DCI,
40862 const X86Subtarget &Subtarget) {
40863 SDLoc DL(N);
40864 MVT VT = N.getSimpleValueType();
40865 SmallVector<int, 4> Mask;
40866 unsigned Opcode = N.getOpcode();
40867
40868 if (SDValue R = combineCommutableSHUFP(N, VT, DL, DAG))
40869 return R;
40870
40871 // Handle specific target shuffles.
40872 switch (Opcode) {
40873 case X86ISD::MOVDDUP: {
40874 SDValue Src = N.getOperand(0);
40875 // Turn a 128-bit MOVDDUP of a full vector load into movddup+vzload.
40876 if (VT == MVT::v2f64 && Src.hasOneUse() &&
40877 ISD::isNormalLoad(Src.getNode())) {
40878 LoadSDNode *LN = cast<LoadSDNode>(Src);
40879 if (SDValue VZLoad = narrowLoadToVZLoad(LN, MVT::f64, MVT::v2f64, DAG)) {
40880 SDValue Movddup = DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v2f64, VZLoad);
40881 DCI.CombineTo(N.getNode(), Movddup);
40882 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), VZLoad.getValue(1));
40883 DCI.recursivelyDeleteUnusedNodes(LN);
40884 return N; // Return N so it doesn't get rechecked!
40885 }
40886 }
40887
40888 return SDValue();
40889 }
40890 case X86ISD::VBROADCAST: {
40891 SDValue Src = N.getOperand(0);
40892 SDValue BC = peekThroughBitcasts(Src);
40893 EVT SrcVT = Src.getValueType();
40894 EVT BCVT = BC.getValueType();
40895
40896 // If broadcasting from another shuffle, attempt to simplify it.
40897 // TODO - we really need a general SimplifyDemandedVectorElts mechanism.
40898 if (isTargetShuffle(BC.getOpcode()) &&
40899 VT.getScalarSizeInBits() % BCVT.getScalarSizeInBits() == 0) {
40900 unsigned Scale = VT.getScalarSizeInBits() / BCVT.getScalarSizeInBits();
40901 SmallVector<int, 16> DemandedMask(BCVT.getVectorNumElements(),
40902 SM_SentinelUndef);
40903 for (unsigned i = 0; i != Scale; ++i)
40904 DemandedMask[i] = i;
40905 if (SDValue Res = combineX86ShufflesRecursively(
40906 {BC}, 0, BC, DemandedMask, {}, /*Depth*/ 0,
40907 X86::MaxShuffleCombineDepth,
40908 /*HasVarMask*/ false, /*AllowCrossLaneVarMask*/ true,
40909 /*AllowPerLaneVarMask*/ true, DAG, Subtarget))
40910 return DAG.getNode(X86ISD::VBROADCAST, DL, VT,
40911 DAG.getBitcast(SrcVT, Res));
40912 }
40913
40914 // broadcast(bitcast(src)) -> bitcast(broadcast(src))
40915 // 32-bit targets have to bitcast i64 to f64, so better to bitcast upward.
40916 if (Src.getOpcode() == ISD::BITCAST &&
40917 SrcVT.getScalarSizeInBits() == BCVT.getScalarSizeInBits() &&
40918 DAG.getTargetLoweringInfo().isTypeLegal(BCVT) &&
40919 FixedVectorType::isValidElementType(
40920 BCVT.getScalarType().getTypeForEVT(*DAG.getContext()))) {
40921 EVT NewVT = EVT::getVectorVT(*DAG.getContext(), BCVT.getScalarType(),
40922 VT.getVectorNumElements());
40923 return DAG.getBitcast(VT, DAG.getNode(X86ISD::VBROADCAST, DL, NewVT, BC));
40924 }
40925
40926 // vbroadcast(bitcast(vbroadcast(src))) -> bitcast(vbroadcast(src))
40927 // If we're re-broadcasting a smaller type then broadcast with that type and
40928 // bitcast.
40929 // TODO: Do this for any splat?
40930 if (Src.getOpcode() == ISD::BITCAST &&
40931 (BC.getOpcode() == X86ISD::VBROADCAST ||
40932 BC.getOpcode() == X86ISD::VBROADCAST_LOAD) &&
40933 (VT.getScalarSizeInBits() % BCVT.getScalarSizeInBits()) == 0 &&
40934 (VT.getSizeInBits() % BCVT.getSizeInBits()) == 0) {
40935 MVT NewVT =
40936 MVT::getVectorVT(BCVT.getSimpleVT().getScalarType(),
40937 VT.getSizeInBits() / BCVT.getScalarSizeInBits());
40938 return DAG.getBitcast(VT, DAG.getNode(X86ISD::VBROADCAST, DL, NewVT, BC));
40939 }
40940
40941 // Reduce broadcast source vector to lowest 128-bits.
40942 if (SrcVT.getSizeInBits() > 128)
40943 return DAG.getNode(X86ISD::VBROADCAST, DL, VT,
40944 extract128BitVector(Src, 0, DAG, DL));
40945
40946 // broadcast(scalar_to_vector(x)) -> broadcast(x).
40947 if (Src.getOpcode() == ISD::SCALAR_TO_VECTOR)
40948 return DAG.getNode(X86ISD::VBROADCAST, DL, VT, Src.getOperand(0));
40949
40950 // broadcast(extract_vector_elt(x, 0)) -> broadcast(x).
40951 if (Src.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
40952 isNullConstant(Src.getOperand(1)) &&
40953 DAG.getTargetLoweringInfo().isTypeLegal(
40954 Src.getOperand(0).getValueType()))
40955 return DAG.getNode(X86ISD::VBROADCAST, DL, VT, Src.getOperand(0));
40956
40957 // Share broadcast with the longest vector and extract low subvector (free).
40958 // Ensure the same SDValue from the SDNode use is being used.
40959 for (SDNode *User : Src->uses())
40960 if (User != N.getNode() && User->getOpcode() == X86ISD::VBROADCAST &&
40961 Src == User->getOperand(0) &&
40962 User->getValueSizeInBits(0).getFixedValue() >
40963 VT.getFixedSizeInBits()) {
40964 return extractSubVector(SDValue(User, 0), 0, DAG, DL,
40965 VT.getSizeInBits());
40966 }
40967
40968 // vbroadcast(scalarload X) -> vbroadcast_load X
40969 // For float loads, extract other uses of the scalar from the broadcast.
40970 if (!SrcVT.isVector() && (Src.hasOneUse() || VT.isFloatingPoint()) &&
40971 ISD::isNormalLoad(Src.getNode())) {
40972 LoadSDNode *LN = cast<LoadSDNode>(Src);
40973 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
40974 SDValue Ops[] = { LN->getChain(), LN->getBasePtr() };
40975 SDValue BcastLd =
40976 DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, DL, Tys, Ops,
40977 LN->getMemoryVT(), LN->getMemOperand());
40978 // If the load value is used only by N, replace it via CombineTo N.
40979 bool NoReplaceExtract = Src.hasOneUse();
40980 DCI.CombineTo(N.getNode(), BcastLd);
40981 if (NoReplaceExtract) {
40982 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BcastLd.getValue(1));
40983 DCI.recursivelyDeleteUnusedNodes(LN);
40984 } else {
40985 SDValue Scl = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, SrcVT, BcastLd,
40986 DAG.getIntPtrConstant(0, DL));
40987 DCI.CombineTo(LN, Scl, BcastLd.getValue(1));
40988 }
40989 return N; // Return N so it doesn't get rechecked!
40990 }
40991
40992 // Due to isTypeDesirableForOp, we won't always shrink a load truncated to
40993 // i16. So shrink it ourselves if we can make a broadcast_load.
40994 if (SrcVT == MVT::i16 && Src.getOpcode() == ISD::TRUNCATE &&
40995 Src.hasOneUse() && Src.getOperand(0).hasOneUse()) {
40996 assert(Subtarget.hasAVX2() && "Expected AVX2");
40997 SDValue TruncIn = Src.getOperand(0);
40998
40999 // If this is a truncate of a non-extending load we can just narrow it to
41000 // use a broadcast_load.
41001 if (ISD::isNormalLoad(TruncIn.getNode())) {
41002 LoadSDNode *LN = cast<LoadSDNode>(TruncIn);
41004 // Unless it's volatile or atomic.
41004 if (LN->isSimple()) {
41005 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
41006 SDValue Ops[] = { LN->getChain(), LN->getBasePtr() };
41007 SDValue BcastLd = DAG.getMemIntrinsicNode(
41008 X86ISD::VBROADCAST_LOAD, DL, Tys, Ops, MVT::i16,
41009 LN->getPointerInfo(), LN->getOriginalAlign(),
41010 LN->getMemOperand()->getFlags());
41011 DCI.CombineTo(N.getNode(), BcastLd);
41012 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BcastLd.getValue(1));
41013 DCI.recursivelyDeleteUnusedNodes(Src.getNode());
41014 return N; // Return N so it doesn't get rechecked!
41015 }
41016 }
41017
41018 // If this is a truncate of an i16 extload, we can directly replace it.
41019 if (ISD::isUNINDEXEDLoad(Src.getOperand(0).getNode()) &&
41020 ISD::isEXTLoad(Src.getOperand(0).getNode())) {
41021 LoadSDNode *LN = cast<LoadSDNode>(Src.getOperand(0));
41022 if (LN->getMemoryVT().getSizeInBits() == 16) {
41023 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
41024 SDValue Ops[] = { LN->getChain(), LN->getBasePtr() };
41025 SDValue BcastLd =
41026 DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, DL, Tys, Ops,
41027 LN->getMemoryVT(), LN->getMemOperand());
41028 DCI.CombineTo(N.getNode(), BcastLd);
41029 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BcastLd.getValue(1));
41030 DCI.recursivelyDeleteUnusedNodes(Src.getNode());
41031 return N; // Return N so it doesn't get rechecked!
41032 }
41033 }
41034
41035 // If this is a truncate of a load that has been shifted right, we can
41036 // offset the pointer and use a narrower load.
41037 if (TruncIn.getOpcode() == ISD::SRL &&
41038 TruncIn.getOperand(0).hasOneUse() &&
41039 isa<ConstantSDNode>(TruncIn.getOperand(1)) &&
41040 ISD::isNormalLoad(TruncIn.getOperand(0).getNode())) {
41041 LoadSDNode *LN = cast<LoadSDNode>(TruncIn.getOperand(0));
41042 unsigned ShiftAmt = TruncIn.getConstantOperandVal(1);
41043 // Make sure the shift amount and the load size are divisible by 16.
41044 // Don't do this if the load is volatile or atomic.
41045 if (ShiftAmt % 16 == 0 && TruncIn.getValueSizeInBits() % 16 == 0 &&
41046 LN->isSimple()) {
41047 unsigned Offset = ShiftAmt / 8;
41048 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
41049 SDValue Ptr = DAG.getMemBasePlusOffset(LN->getBasePtr(),
41050 TypeSize::Fixed(Offset), DL);
41051 SDValue Ops[] = { LN->getChain(), Ptr };
41052 SDValue BcastLd = DAG.getMemIntrinsicNode(
41053 X86ISD::VBROADCAST_LOAD, DL, Tys, Ops, MVT::i16,
41054 LN->getPointerInfo().getWithOffset(Offset),
41055 LN->getOriginalAlign(),
41056 LN->getMemOperand()->getFlags());
41057 DCI.CombineTo(N.getNode(), BcastLd);
41058 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BcastLd.getValue(1));
41059 DCI.recursivelyDeleteUnusedNodes(Src.getNode());
41060 return N; // Return N so it doesn't get rechecked!
41061 }
41062 }
41063 }
41064
41065 // vbroadcast(vzload X) -> vbroadcast_load X
41066 if (Src.getOpcode() == X86ISD::VZEXT_LOAD && Src.hasOneUse()) {
41067 MemSDNode *LN = cast<MemIntrinsicSDNode>(Src);
41068 if (LN->getMemoryVT().getSizeInBits() == VT.getScalarSizeInBits()) {
41069 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
41070 SDValue Ops[] = { LN->getChain(), LN->getBasePtr() };
41071 SDValue BcastLd =
41072 DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, DL, Tys, Ops,
41073 LN->getMemoryVT(), LN->getMemOperand());
41074 DCI.CombineTo(N.getNode(), BcastLd);
41075 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BcastLd.getValue(1));
41076 DCI.recursivelyDeleteUnusedNodes(LN);
41077 return N; // Return N so it doesn't get rechecked!
41078 }
41079 }
41080
41081 // vbroadcast(vector load X) -> vbroadcast_load
41082 if ((SrcVT == MVT::v2f64 || SrcVT == MVT::v4f32 || SrcVT == MVT::v2i64 ||
41083 SrcVT == MVT::v4i32) &&
41084 Src.hasOneUse() && ISD::isNormalLoad(Src.getNode())) {
41085 LoadSDNode *LN = cast<LoadSDNode>(Src);
41086 // Unless the load is volatile or atomic.
41087 if (LN->isSimple()) {
41088 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
41089 SDValue Ops[] = {LN->getChain(), LN->getBasePtr()};
41090 SDValue BcastLd = DAG.getMemIntrinsicNode(
41091 X86ISD::VBROADCAST_LOAD, DL, Tys, Ops, SrcVT.getScalarType(),
41092 LN->getPointerInfo(), LN->getOriginalAlign(),
41093 LN->getMemOperand()->getFlags());
41094 DCI.CombineTo(N.getNode(), BcastLd);
41095 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BcastLd.getValue(1));
41096 DCI.recursivelyDeleteUnusedNodes(LN);
41097 return N; // Return N so it doesn't get rechecked!
41098 }
41099 }
41100
41101 return SDValue();
41102 }
41103 case X86ISD::VZEXT_MOVL: {
41104 SDValue N0 = N.getOperand(0);
41105
41106 // If this is a vzmovl of a full vector load, replace it with a vzload, unless
41107 // the load is volatile.
41108 if (N0.hasOneUse() && ISD::isNormalLoad(N0.getNode())) {
41109 auto *LN = cast<LoadSDNode>(N0);
41110 if (SDValue VZLoad =
41111 narrowLoadToVZLoad(LN, VT.getVectorElementType(), VT, DAG)) {
41112 DCI.CombineTo(N.getNode(), VZLoad);
41113 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), VZLoad.getValue(1));
41114 DCI.recursivelyDeleteUnusedNodes(LN);
41115 return N;
41116 }
41117 }
41118
41119 // If this is a VZEXT_MOVL of a VBROADCAST_LOAD, we don't need the broadcast
41120 // and can just use a VZEXT_LOAD.
41121 // FIXME: Is there some way to do this with SimplifyDemandedVectorElts?
41122 if (N0.hasOneUse() && N0.getOpcode() == X86ISD::VBROADCAST_LOAD) {
41123 auto *LN = cast<MemSDNode>(N0);
41124 if (VT.getScalarSizeInBits() == LN->getMemoryVT().getSizeInBits()) {
41125 SDVTList Tys = DAG.getVTList(VT, MVT::Other);
41126 SDValue Ops[] = {LN->getChain(), LN->getBasePtr()};
41127 SDValue VZLoad =
41128 DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, DL, Tys, Ops,
41129 LN->getMemoryVT(), LN->getMemOperand());
41130 DCI.CombineTo(N.getNode(), VZLoad);
41131 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), VZLoad.getValue(1));
41132 DCI.recursivelyDeleteUnusedNodes(LN);
41133 return N;
41134 }
41135 }
41136
41137 // Turn (v2i64 (vzext_movl (scalar_to_vector (i64 X)))) into
41138 // (v2i64 (bitcast (v4i32 (vzext_movl (scalar_to_vector (i32 (trunc X)))))))
41139 // if the upper bits of the i64 are zero.
41140 if (N0.hasOneUse() && N0.getOpcode() == ISD::SCALAR_TO_VECTOR &&
41141 N0.getOperand(0).hasOneUse() &&
41142 N0.getOperand(0).getValueType() == MVT::i64) {
41143 SDValue In = N0.getOperand(0);
41144 APInt Mask = APInt::getHighBitsSet(64, 32);
41145 if (DAG.MaskedValueIsZero(In, Mask)) {
41146 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, In);
41147 MVT VecVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() * 2);
41148 SDValue SclVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Trunc);
41149 SDValue Movl = DAG.getNode(X86ISD::VZEXT_MOVL, DL, VecVT, SclVec);
41150 return DAG.getBitcast(VT, Movl);
41151 }
41152 }
41153
41154 // Load a scalar integer constant directly to XMM instead of transferring an
41155 // immediate value from GPR.
41156 // vzext_movl (scalar_to_vector C) --> load [C,0...]
41157 if (N0.getOpcode() == ISD::SCALAR_TO_VECTOR) {
41158 if (auto *C = dyn_cast<ConstantSDNode>(N0.getOperand(0))) {
41159 // Create a vector constant - scalar constant followed by zeros.
41160 EVT ScalarVT = N0.getOperand(0).getValueType();
41161 Type *ScalarTy = ScalarVT.getTypeForEVT(*DAG.getContext());
41162 unsigned NumElts = VT.getVectorNumElements();
41163 Constant *Zero = ConstantInt::getNullValue(ScalarTy);
41164 SmallVector<Constant *, 32> ConstantVec(NumElts, Zero);
41165 ConstantVec[0] = const_cast<ConstantInt *>(C->getConstantIntValue());
41166
41167 // Load the vector constant from constant pool.
41168 MVT PVT = DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout());
41169 SDValue CP = DAG.getConstantPool(ConstantVector::get(ConstantVec), PVT);
41170 MachinePointerInfo MPI =
41171 MachinePointerInfo::getConstantPool(DAG.getMachineFunction());
41172 Align Alignment = cast<ConstantPoolSDNode>(CP)->getAlign();
41173 return DAG.getLoad(VT, DL, DAG.getEntryNode(), CP, MPI, Alignment,
41174 MachineMemOperand::MOLoad);
41175 }
41176 }
41177
41178 // Pull subvector inserts into undef through VZEXT_MOVL by making it an
41179 // insert into a zero vector. This helps get VZEXT_MOVL closer to
41180 // scalar_to_vectors where 256/512 are canonicalized to an insert and a
41181 // 128-bit scalar_to_vector. This reduces the number of isel patterns.
41182 if (!DCI.isBeforeLegalizeOps() && N0.hasOneUse()) {
41183 SDValue V = peekThroughOneUseBitcasts(N0);
41184
41185 if (V.getOpcode() == ISD::INSERT_SUBVECTOR && V.getOperand(0).isUndef() &&
41186 isNullConstant(V.getOperand(2))) {
41187 SDValue In = V.getOperand(1);
41188 MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(),
41189 In.getValueSizeInBits() /
41190 VT.getScalarSizeInBits());
41191 In = DAG.getBitcast(SubVT, In);
41192 SDValue Movl = DAG.getNode(X86ISD::VZEXT_MOVL, DL, SubVT, In);
41193 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT,
41194 getZeroVector(VT, Subtarget, DAG, DL), Movl,
41195 V.getOperand(2));
41196 }
41197 }
41198
41199 return SDValue();
41200 }
41201 case X86ISD::BLENDI: {
41202 SDValue N0 = N.getOperand(0);
41203 SDValue N1 = N.getOperand(1);
41204
41205 // blend(bitcast(x),bitcast(y)) -> bitcast(blend(x,y)) to narrower types.
41206 // TODO: Handle MVT::v16i16 repeated blend mask.
41207 if (N0.getOpcode() == ISD::BITCAST && N1.getOpcode() == ISD::BITCAST &&
41208 N0.getOperand(0).getValueType() == N1.getOperand(0).getValueType()) {
41209 MVT SrcVT = N0.getOperand(0).getSimpleValueType();
41210 if ((VT.getScalarSizeInBits() % SrcVT.getScalarSizeInBits()) == 0 &&
41211 SrcVT.getScalarSizeInBits() >= 32) {
41212 unsigned BlendMask = N.getConstantOperandVal(2);
41213 unsigned Size = VT.getVectorNumElements();
41214 unsigned Scale = VT.getScalarSizeInBits() / SrcVT.getScalarSizeInBits();
41215 BlendMask = scaleVectorShuffleBlendMask(BlendMask, Size, Scale);
41216 return DAG.getBitcast(
41217 VT, DAG.getNode(X86ISD::BLENDI, DL, SrcVT, N0.getOperand(0),
41218 N1.getOperand(0),
41219 DAG.getTargetConstant(BlendMask, DL, MVT::i8)));
41220 }
41221 }
41222 return SDValue();
41223 }
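// Illustrative sketch (editorial, not part of the analyzed source): the mask
// scaling the BLENDI case above relies on, assuming scaleVectorShuffleBlendMask
// replicates each selection bit Scale times when the blend is rewritten over
// narrower elements. For example, a v4i64 blend mask 0b0101 becomes the v8i32
// mask 0b00110011.
#include <cassert>
#include <cstdint>

static std::uint64_t scaleBlendMaskSketch(std::uint64_t Mask, int Size,
                                          int Scale) {
  std::uint64_t Scaled = 0;
  for (int I = 0; I != Size; ++I)
    if (Mask & (1ull << I))
      Scaled |= ((1ull << Scale) - 1) << (I * Scale); // widen selection bit I
  return Scaled;
}

int main() {
  assert(scaleBlendMaskSketch(0b0101, /*Size=*/4, /*Scale=*/2) == 0b00110011);
  return 0;
}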
41224 case X86ISD::SHUFP: {
41225 // Fold shufps(shuffle(x),shuffle(y)) -> shufps(x,y).
41226 // This is a more relaxed shuffle combiner that can ignore oneuse limits.
41227 // TODO: Support types other than v4f32.
41228 if (VT == MVT::v4f32) {
41229 bool Updated = false;
41230 SmallVector<int> Mask;
41231 SmallVector<SDValue> Ops;
41232 if (getTargetShuffleMask(N.getNode(), VT, false, Ops, Mask) &&
41233 Ops.size() == 2) {
41234 for (int i = 0; i != 2; ++i) {
41235 SmallVector<SDValue> SubOps;
41236 SmallVector<int> SubMask, SubScaledMask;
41237 SDValue Sub = peekThroughBitcasts(Ops[i]);
41238 // TODO: Scaling might be easier if we specify the demanded elts.
41239 if (getTargetShuffleInputs(Sub, SubOps, SubMask, DAG, 0, false) &&
41240 scaleShuffleElements(SubMask, 4, SubScaledMask) &&
41241 SubOps.size() == 1 && isUndefOrInRange(SubScaledMask, 0, 4)) {
41242 int Ofs = i * 2;
41243 Mask[Ofs + 0] = SubScaledMask[Mask[Ofs + 0] % 4] + (i * 4);
41244 Mask[Ofs + 1] = SubScaledMask[Mask[Ofs + 1] % 4] + (i * 4);
41245 Ops[i] = DAG.getBitcast(VT, SubOps[0]);
41246 Updated = true;
41247 }
41248 }
41249 }
41250 if (Updated) {
41251 for (int &M : Mask)
41252 M %= 4;
41253 Ops.push_back(getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
41254 return DAG.getNode(X86ISD::SHUFP, DL, VT, Ops);
41255 }
41256 }
41257 return SDValue();
41258 }
41259 case X86ISD::VPERMI: {
41260 // vpermi(bitcast(x)) -> bitcast(vpermi(x)) for same number of elements.
41261 // TODO: Remove when we have preferred domains in combineX86ShuffleChain.
41262 SDValue N0 = N.getOperand(0);
41263 SDValue N1 = N.getOperand(1);
41264 unsigned EltSizeInBits = VT.getScalarSizeInBits();
41265 if (N0.getOpcode() == ISD::BITCAST &&
41266 N0.getOperand(0).getScalarValueSizeInBits() == EltSizeInBits) {
41267 SDValue Src = N0.getOperand(0);
41268 EVT SrcVT = Src.getValueType();
41269 SDValue Res = DAG.getNode(X86ISD::VPERMI, DL, SrcVT, Src, N1);
41270 return DAG.getBitcast(VT, Res);
41271 }
41272 return SDValue();
41273 }
41274 case X86ISD::VPERM2X128: {
41275 // Fold vperm2x128(bitcast(x),bitcast(y),c) -> bitcast(vperm2x128(x,y,c)).
41276 SDValue LHS = N->getOperand(0);
41277 SDValue RHS = N->getOperand(1);
41278 if (LHS.getOpcode() == ISD::BITCAST &&
41279 (RHS.getOpcode() == ISD::BITCAST || RHS.isUndef())) {
41280 EVT SrcVT = LHS.getOperand(0).getValueType();
41281 if (RHS.isUndef() || SrcVT == RHS.getOperand(0).getValueType()) {
41282 return DAG.getBitcast(VT, DAG.getNode(X86ISD::VPERM2X128, DL, SrcVT,
41283 DAG.getBitcast(SrcVT, LHS),
41284 DAG.getBitcast(SrcVT, RHS),
41285 N->getOperand(2)));
41286 }
41287 }
41288
41289 // Fold vperm2x128(op(),op()) -> op(vperm2x128(),vperm2x128()).
41290 if (SDValue Res = canonicalizeLaneShuffleWithRepeatedOps(N, DAG, DL))
41291 return Res;
41292
41293 // Fold vperm2x128 subvector shuffle with an inner concat pattern.
41294 // vperm2x128(concat(X,Y),concat(Z,W)) --> concat X,Y etc.
41295 auto FindSubVector128 = [&](unsigned Idx) {
41296 if (Idx > 3)
41297 return SDValue();
41298 SDValue Src = peekThroughBitcasts(N.getOperand(Idx < 2 ? 0 : 1));
41299 SmallVector<SDValue> SubOps;
41300 if (collectConcatOps(Src.getNode(), SubOps, DAG) && SubOps.size() == 2)
41301 return SubOps[Idx & 1];
41302 unsigned NumElts = Src.getValueType().getVectorNumElements();
41303 if ((Idx & 1) == 1 && Src.getOpcode() == ISD::INSERT_SUBVECTOR &&
41304 Src.getOperand(1).getValueSizeInBits() == 128 &&
41305 Src.getConstantOperandAPInt(2) == (NumElts / 2)) {
41306 return Src.getOperand(1);
41307 }
41308 return SDValue();
41309 };
41310 unsigned Imm = N.getConstantOperandVal(2);
41311 if (SDValue SubLo = FindSubVector128(Imm & 0x0F)) {
41312 if (SDValue SubHi = FindSubVector128((Imm & 0xF0) >> 4)) {
41313 MVT SubVT = VT.getHalfNumVectorElementsVT();
41314 SubLo = DAG.getBitcast(SubVT, SubLo);
41315 SubHi = DAG.getBitcast(SubVT, SubHi);
41316 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, SubLo, SubHi);
41317 }
41318 }
41319 return SDValue();
41320 }
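// Illustrative sketch (editorial, not part of the analyzed source): how the
// VPERM2X128 immediate is consumed by the FindSubVector128 lambda above. Each
// nibble selects one 128-bit half of the result: bit 1 picks the source
// operand, bit 0 picks that operand's low/high half, and any higher bit
// (e.g. the zeroing bit) gives Idx > 3, which makes the fold bail out.
#include <cstdio>

static void decodeVPerm2X128Nibble(unsigned Idx) {
  if (Idx > 3) {
    std::printf("nibble %u: zeroed/unsupported half, no fold\n", Idx);
    return;
  }
  std::printf("nibble %u: operand %u, %s 128-bit half\n", Idx,
              Idx < 2 ? 0u : 1u, (Idx & 1) ? "high" : "low");
}

int main() {
  unsigned Imm = 0x31; // selects the high half of op0 and the high half of op1
  decodeVPerm2X128Nibble(Imm & 0x0F);        // low 128 bits of the result
  decodeVPerm2X128Nibble((Imm & 0xF0) >> 4); // high 128 bits of the result
  return 0;
}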
41321 case X86ISD::PSHUFD:
41322 case X86ISD::PSHUFLW:
41323 case X86ISD::PSHUFHW:
41324 Mask = getPSHUFShuffleMask(N);
41325 assert(Mask.size() == 4);
41326 break;
41327 case X86ISD::MOVSD:
41328 case X86ISD::MOVSH:
41329 case X86ISD::MOVSS: {
41330 SDValue N0 = N.getOperand(0);
41331 SDValue N1 = N.getOperand(1);
41332
41333 // Canonicalize scalar FPOps:
41334 // MOVS*(N0, OP(N0, N1)) --> MOVS*(N0, SCALAR_TO_VECTOR(OP(N0[0], N1[0])))
41335 // If commutable, allow OP(N1[0], N0[0]).
41336 unsigned Opcode1 = N1.getOpcode();
41337 if (Opcode1 == ISD::FADD || Opcode1 == ISD::FMUL || Opcode1 == ISD::FSUB ||
41338 Opcode1 == ISD::FDIV) {
41339 SDValue N10 = N1.getOperand(0);
41340 SDValue N11 = N1.getOperand(1);
41341 if (N10 == N0 ||
41342 (N11 == N0 && (Opcode1 == ISD::FADD || Opcode1 == ISD::FMUL))) {
41343 if (N10 != N0)
41344 std::swap(N10, N11);
41345 MVT SVT = VT.getVectorElementType();
41346 SDValue ZeroIdx = DAG.getIntPtrConstant(0, DL);
41347 N10 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, SVT, N10, ZeroIdx);
41348 N11 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, SVT, N11, ZeroIdx);
41349 SDValue Scl = DAG.getNode(Opcode1, DL, SVT, N10, N11);
41350 SDValue SclVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VT, Scl);
41351 return DAG.getNode(Opcode, DL, VT, N0, SclVec);
41352 }
41353 }
41354
41355 return SDValue();
41356 }
41357 case X86ISD::INSERTPS: {
41358 assert(VT == MVT::v4f32 && "INSERTPS ValueType must be MVT::v4f32");
41359 SDValue Op0 = N.getOperand(0);
41360 SDValue Op1 = N.getOperand(1);
41361 unsigned InsertPSMask = N.getConstantOperandVal(2);
41362 unsigned SrcIdx = (InsertPSMask >> 6) & 0x3;
41363 unsigned DstIdx = (InsertPSMask >> 4) & 0x3;
41364 unsigned ZeroMask = InsertPSMask & 0xF;
41365
41366 // If we zero out all elements from Op0 then we don't need to reference it.
41367 if (((ZeroMask | (1u << DstIdx)) == 0xF) && !Op0.isUndef())
41368 return DAG.getNode(X86ISD::INSERTPS, DL, VT, DAG.getUNDEF(VT), Op1,
41369 DAG.getTargetConstant(InsertPSMask, DL, MVT::i8));
41370
41371 // If we zero out the element from Op1 then we don't need to reference it.
41372 if ((ZeroMask & (1u << DstIdx)) && !Op1.isUndef())
41373 return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, DAG.getUNDEF(VT),
41374 DAG.getTargetConstant(InsertPSMask, DL, MVT::i8));
41375
41376 // Attempt to merge insertps Op1 with an inner target shuffle node.
41377 SmallVector<int, 8> TargetMask1;
41378 SmallVector<SDValue, 2> Ops1;
41379 APInt KnownUndef1, KnownZero1;
41380 if (getTargetShuffleAndZeroables(Op1, TargetMask1, Ops1, KnownUndef1,
41381 KnownZero1)) {
41382 if (KnownUndef1[SrcIdx] || KnownZero1[SrcIdx]) {
41383 // Zero/UNDEF insertion - zero out element and remove dependency.
41384 InsertPSMask |= (1u << DstIdx);
41385 return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, DAG.getUNDEF(VT),
41386 DAG.getTargetConstant(InsertPSMask, DL, MVT::i8));
41387 }
41388 // Update insertps mask srcidx and reference the source input directly.
41389 int M = TargetMask1[SrcIdx];
41390 assert(0 <= M && M < 8 && "Shuffle index out of range");
41391 InsertPSMask = (InsertPSMask & 0x3f) | ((M & 0x3) << 6);
41392 Op1 = Ops1[M < 4 ? 0 : 1];
41393 return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, Op1,
41394 DAG.getTargetConstant(InsertPSMask, DL, MVT::i8));
41395 }
41396
41397 // Attempt to merge insertps Op0 with an inner target shuffle node.
41398 SmallVector<int, 8> TargetMask0;
41399 SmallVector<SDValue, 2> Ops0;
41400 APInt KnownUndef0, KnownZero0;
41401 if (getTargetShuffleAndZeroables(Op0, TargetMask0, Ops0, KnownUndef0,
41402 KnownZero0)) {
41403 bool Updated = false;
41404 bool UseInput00 = false;
41405 bool UseInput01 = false;
41406 for (int i = 0; i != 4; ++i) {
41407 if ((InsertPSMask & (1u << i)) || (i == (int)DstIdx)) {
41408 // No change if element is already zero or the inserted element.
41409 continue;
41410 }
41411
41412 if (KnownUndef0[i] || KnownZero0[i]) {
41413 // If the target mask is undef/zero then we must zero the element.
41414 InsertPSMask |= (1u << i);
41415 Updated = true;
41416 continue;
41417 }
41418
41419 // The input vector element must be inline.
41420 int M = TargetMask0[i];
41421 if (M != i && M != (i + 4))
41422 return SDValue();
41423
41424 // Determine which inputs of the target shuffle we're using.
41425 UseInput00 |= (0 <= M && M < 4);
41426 UseInput01 |= (4 <= M);
41427 }
41428
41429 // If we're not using both inputs of the target shuffle then use the
41430 // referenced input directly.
41431 if (UseInput00 && !UseInput01) {
41432 Updated = true;
41433 Op0 = Ops0[0];
41434 } else if (!UseInput00 && UseInput01) {
41435 Updated = true;
41436 Op0 = Ops0[1];
41437 }
41438
41439 if (Updated)
41440 return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, Op1,
41441 DAG.getTargetConstant(InsertPSMask, DL, MVT::i8));
41442 }
41443
41444 // If we're inserting an element from a vbroadcast load, fold the
41445 // load into the X86insertps instruction. We need to convert the scalar
41446 // load to a vector and clear the source lane of the INSERTPS control.
41447 if (Op1.getOpcode() == X86ISD::VBROADCAST_LOAD && Op1.hasOneUse()) {
41448 auto *MemIntr = cast<MemIntrinsicSDNode>(Op1);
41449 if (MemIntr->getMemoryVT().getScalarSizeInBits() == 32) {
41450 SDValue Load = DAG.getLoad(MVT::f32, DL, MemIntr->getChain(),
41451 MemIntr->getBasePtr(),
41452 MemIntr->getMemOperand());
41453 SDValue Insert = DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0,
41454 DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VT,
41455 Load),
41456 DAG.getTargetConstant(InsertPSMask & 0x3f, DL, MVT::i8));
41457 DAG.ReplaceAllUsesOfValueWith(SDValue(MemIntr, 1), Load.getValue(1));
41458 return Insert;
41459 }
41460 }
41461
41462 return SDValue();
41463 }
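// Illustrative sketch (editorial, not part of the analyzed source): the
// INSERTPS control byte decoded exactly as in the case above - bits [7:6]
// select the source element of Op1, bits [5:4] the destination lane in Op0,
// and bits [3:0] zero out individual result lanes.
#include <cstdio>

int main() {
  unsigned InsertPSMask = 0x8A;                 // 0b10'00'1010
  unsigned SrcIdx = (InsertPSMask >> 6) & 0x3;  // element 2 of Op1
  unsigned DstIdx = (InsertPSMask >> 4) & 0x3;  // written into lane 0 of Op0
  unsigned ZeroMask = InsertPSMask & 0xF;       // lanes 1 and 3 zeroed
  std::printf("src=%u dst=%u zero=0x%X\n", SrcIdx, DstIdx, ZeroMask);
  return 0;
}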
41464 default:
41465 return SDValue();
41466 }
41467
41468 // Nuke no-op shuffles that show up after combining.
41469 if (isNoopShuffleMask(Mask))
41470 return N.getOperand(0);
41471
41472 // Look for simplifications involving one or two shuffle instructions.
41473 SDValue V = N.getOperand(0);
41474 switch (N.getOpcode()) {
41475 default:
41476 break;
41477 case X86ISD::PSHUFLW:
41478 case X86ISD::PSHUFHW:
41479 assert(VT.getVectorElementType() == MVT::i16 && "Bad word shuffle type!");
41480
41481 // See if this reduces to a PSHUFD which is no more expensive and can
41482 // combine with more operations. Note that it has to at least flip the
41483 // dwords as otherwise it would have been removed as a no-op.
41484 if (ArrayRef(Mask).equals({2, 3, 0, 1})) {
41485 int DMask[] = {0, 1, 2, 3};
41486 int DOffset = N.getOpcode() == X86ISD::PSHUFLW ? 0 : 2;
41487 DMask[DOffset + 0] = DOffset + 1;
41488 DMask[DOffset + 1] = DOffset + 0;
41489 MVT DVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() / 2);
41490 V = DAG.getBitcast(DVT, V);
41491 V = DAG.getNode(X86ISD::PSHUFD, DL, DVT, V,
41492 getV4X86ShuffleImm8ForMask(DMask, DL, DAG));
41493 return DAG.getBitcast(VT, V);
41494 }
41495
41496 // Look for shuffle patterns which can be implemented as a single unpack.
41497 // FIXME: This doesn't handle the location of the PSHUFD generically, and
41498 // only works when we have a PSHUFD followed by two half-shuffles.
41499 if (Mask[0] == Mask[1] && Mask[2] == Mask[3] &&
41500 (V.getOpcode() == X86ISD::PSHUFLW ||
41501 V.getOpcode() == X86ISD::PSHUFHW) &&
41502 V.getOpcode() != N.getOpcode() &&
41503 V.hasOneUse() && V.getOperand(0).hasOneUse()) {
41504 SDValue D = peekThroughOneUseBitcasts(V.getOperand(0));
41505 if (D.getOpcode() == X86ISD::PSHUFD) {
41506 SmallVector<int, 4> VMask = getPSHUFShuffleMask(V);
41507 SmallVector<int, 4> DMask = getPSHUFShuffleMask(D);
41508 int NOffset = N.getOpcode() == X86ISD::PSHUFLW ? 0 : 4;
41509 int VOffset = V.getOpcode() == X86ISD::PSHUFLW ? 0 : 4;
41510 int WordMask[8];
41511 for (int i = 0; i < 4; ++i) {
41512 WordMask[i + NOffset] = Mask[i] + NOffset;
41513 WordMask[i + VOffset] = VMask[i] + VOffset;
41514 }
41515 // Map the word mask through the DWord mask.
41516 int MappedMask[8];
41517 for (int i = 0; i < 8; ++i)
41518 MappedMask[i] = 2 * DMask[WordMask[i] / 2] + WordMask[i] % 2;
41519 if (ArrayRef(MappedMask).equals({0, 0, 1, 1, 2, 2, 3, 3}) ||
41520 ArrayRef(MappedMask).equals({4, 4, 5, 5, 6, 6, 7, 7})) {
41521 // We can replace all three shuffles with an unpack.
41522 V = DAG.getBitcast(VT, D.getOperand(0));
41523 return DAG.getNode(MappedMask[0] == 0 ? X86ISD::UNPCKL
41524 : X86ISD::UNPCKH,
41525 DL, VT, V, V);
41526 }
41527 }
41528 }
41529
41530 break;
41531
41532 case X86ISD::PSHUFD:
41533 if (SDValue NewN = combineRedundantDWordShuffle(N, Mask, DAG))
41534 return NewN;
41535
41536 break;
41537 }
41538
41539 return SDValue();
41540}
41541
41542/// Checks if the shuffle mask takes subsequent elements
41543/// alternately from two vectors.
41544/// For example <0, 5, 2, 7> or <8, 1, 10, 3, 12, 5, 14, 7> are both correct.
41545static bool isAddSubOrSubAddMask(ArrayRef<int> Mask, bool &Op0Even) {
41546
41547 int ParitySrc[2] = {-1, -1};
41548 unsigned Size = Mask.size();
41549 for (unsigned i = 0; i != Size; ++i) {
41550 int M = Mask[i];
41551 if (M < 0)
41552 continue;
41553
41554 // Make sure we are using the matching element from the input.
41555 if ((M % Size) != i)
41556 return false;
41557
41558 // Make sure we use the same input for all elements of the same parity.
41559 int Src = M / Size;
41560 if (ParitySrc[i % 2] >= 0 && ParitySrc[i % 2] != Src)
41561 return false;
41562 ParitySrc[i % 2] = Src;
41563 }
41564
41565 // Make sure each input is used.
41566 if (ParitySrc[0] < 0 || ParitySrc[1] < 0 || ParitySrc[0] == ParitySrc[1])
41567 return false;
41568
41569 Op0Even = ParitySrc[0] == 0;
41570 return true;
41571}
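// Illustrative standalone sketch (editorial, not part of the analyzed source):
// the parity check above, re-run on the masks quoted in the doc comment plus a
// mask that fails because both parities read the same input.
#include <cassert>
#include <vector>

static bool isAddSubOrSubAddMaskSketch(const std::vector<int> &Mask,
                                       bool &Op0Even) {
  int ParitySrc[2] = {-1, -1};
  int Size = (int)Mask.size();
  for (int I = 0; I != Size; ++I) {
    int M = Mask[I];
    if (M < 0)
      continue;
    if ((M % Size) != I)  // must use the matching element of the input
      return false;
    int Src = M / Size;   // which of the two inputs this element comes from
    if (ParitySrc[I % 2] >= 0 && ParitySrc[I % 2] != Src)
      return false;
    ParitySrc[I % 2] = Src;
  }
  if (ParitySrc[0] < 0 || ParitySrc[1] < 0 || ParitySrc[0] == ParitySrc[1])
    return false;
  Op0Even = ParitySrc[0] == 0;
  return true;
}

int main() {
  bool Op0Even;
  assert(isAddSubOrSubAddMaskSketch({0, 5, 2, 7}, Op0Even) && Op0Even);
  assert(isAddSubOrSubAddMaskSketch({4, 1, 6, 3}, Op0Even) && !Op0Even);
  assert(!isAddSubOrSubAddMaskSketch({0, 1, 2, 3}, Op0Even));
  return 0;
}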
41572
41573 /// Returns true iff the shuffle node \p N can be replaced with an ADDSUB(SUBADD)
41574 /// operation. If true is returned then the operands of the ADDSUB(SUBADD) operation
41575 /// are written to the parameters \p Opnd0 and \p Opnd1.
41576///
41577 /// We combine shuffles to ADDSUB(SUBADD) directly on the abstract vector shuffle
41578 /// nodes so it is easier to match generically. We also insert dummy vector shuffle
41579 /// nodes for the operands which explicitly discard the lanes which are unused
41580 /// by this operation, so the fact that they're unused can flow through the
41581 /// rest of the combiner.
41582static bool isAddSubOrSubAdd(SDNode *N, const X86Subtarget &Subtarget,
41583 SelectionDAG &DAG, SDValue &Opnd0, SDValue &Opnd1,
41584 bool &IsSubAdd) {
41585
41586 EVT VT = N->getValueType(0);
41587 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
41588 if (!Subtarget.hasSSE3() || !TLI.isTypeLegal(VT) ||
41589 !VT.getSimpleVT().isFloatingPoint())
41590 return false;
41591
41592 // We only handle target-independent shuffles.
41593 // FIXME: It would be easy and harmless to use the target shuffle mask
41594 // extraction tool to support more.
41595 if (N->getOpcode() != ISD::VECTOR_SHUFFLE)
41596 return false;
41597
41598 SDValue V1 = N->getOperand(0);
41599 SDValue V2 = N->getOperand(1);
41600
41601 // Make sure we have an FADD and an FSUB.
41602 if ((V1.getOpcode() != ISD::FADD && V1.getOpcode() != ISD::FSUB) ||
41603 (V2.getOpcode() != ISD::FADD && V2.getOpcode() != ISD::FSUB) ||
41604 V1.getOpcode() == V2.getOpcode())
41605 return false;
41606
41607 // If there are other uses of these operations we can't fold them.
41608 if (!V1->hasOneUse() || !V2->hasOneUse())
41609 return false;
41610
41611 // Ensure that both operations have the same operands. Note that we can
41612 // commute the FADD operands.
41613 SDValue LHS, RHS;
41614 if (V1.getOpcode() == ISD::FSUB) {
41615 LHS = V1->getOperand(0); RHS = V1->getOperand(1);
41616 if ((V2->getOperand(0) != LHS || V2->getOperand(1) != RHS) &&
41617 (V2->getOperand(0) != RHS || V2->getOperand(1) != LHS))
41618 return false;
41619 } else {
41620 assert(V2.getOpcode() == ISD::FSUB && "Unexpected opcode");
41621 LHS = V2->getOperand(0); RHS = V2->getOperand(1);
41622 if ((V1->getOperand(0) != LHS || V1->getOperand(1) != RHS) &&
41623 (V1->getOperand(0) != RHS || V1->getOperand(1) != LHS))
41624 return false;
41625 }
41626
41627 ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(N)->getMask();
41628 bool Op0Even;
41629 if (!isAddSubOrSubAddMask(Mask, Op0Even))
41630 return false;
41631
41632 // It's a subadd if the vector in the even parity is an FADD.
41633 IsSubAdd = Op0Even ? V1->getOpcode() == ISD::FADD
41634 : V2->getOpcode() == ISD::FADD;
41635
41636 Opnd0 = LHS;
41637 Opnd1 = RHS;
41638 return true;
41639}
41640
41641/// Combine shuffle of two fma nodes into FMAddSub or FMSubAdd.
41642static SDValue combineShuffleToFMAddSub(SDNode *N,
41643 const X86Subtarget &Subtarget,
41644 SelectionDAG &DAG) {
41645 // We only handle target-independent shuffles.
41646 // FIXME: It would be easy and harmless to use the target shuffle mask
41647 // extraction tool to support more.
41648 if (N->getOpcode() != ISD::VECTOR_SHUFFLE)
41649 return SDValue();
41650
41651 MVT VT = N->getSimpleValueType(0);
41652 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
41653 if (!Subtarget.hasAnyFMA() || !TLI.isTypeLegal(VT))
41654 return SDValue();
41655
41656 // We're trying to match shuffle(fma(a, b, c), X86ISD::FMSUB(a, b, c)).
41657 SDValue Op0 = N->getOperand(0);
41658 SDValue Op1 = N->getOperand(1);
41659 SDValue FMAdd = Op0, FMSub = Op1;
41660 if (FMSub.getOpcode() != X86ISD::FMSUB)
41661 std::swap(FMAdd, FMSub);
41662
41663 if (FMAdd.getOpcode() != ISD::FMA || FMSub.getOpcode() != X86ISD::FMSUB ||
41664 FMAdd.getOperand(0) != FMSub.getOperand(0) || !FMAdd.hasOneUse() ||
41665 FMAdd.getOperand(1) != FMSub.getOperand(1) || !FMSub.hasOneUse() ||
41666 FMAdd.getOperand(2) != FMSub.getOperand(2))
41667 return SDValue();
41668
41669 // Check for correct shuffle mask.
41670 ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(N)->getMask();
41671 bool Op0Even;
41672 if (!isAddSubOrSubAddMask(Mask, Op0Even))
41673 return SDValue();
41674
41675 // FMAddSub takes zeroth operand from FMSub node.
41676 SDLoc DL(N);
41677 bool IsSubAdd = Op0Even ? Op0 == FMAdd : Op1 == FMAdd;
41678 unsigned Opcode = IsSubAdd ? X86ISD::FMSUBADD : X86ISD::FMADDSUB;
41679 return DAG.getNode(Opcode, DL, VT, FMAdd.getOperand(0), FMAdd.getOperand(1),
41680 FMAdd.getOperand(2));
41681}
41682
41683/// Try to combine a shuffle into a target-specific add-sub or
41684/// mul-add-sub node.
41685static SDValue combineShuffleToAddSubOrFMAddSub(SDNode *N,
41686 const X86Subtarget &Subtarget,
41687 SelectionDAG &DAG) {
41688 if (SDValue V = combineShuffleToFMAddSub(N, Subtarget, DAG))
41689 return V;
41690
41691 SDValue Opnd0, Opnd1;
41692 bool IsSubAdd;
41693 if (!isAddSubOrSubAdd(N, Subtarget, DAG, Opnd0, Opnd1, IsSubAdd))
41694 return SDValue();
41695
41696 MVT VT = N->getSimpleValueType(0);
41697 SDLoc DL(N);
41698
41699 // Try to generate X86ISD::FMADDSUB node here.
41700 SDValue Opnd2;
41701 if (isFMAddSubOrFMSubAdd(Subtarget, DAG, Opnd0, Opnd1, Opnd2, 2)) {
41702 unsigned Opc = IsSubAdd ? X86ISD::FMSUBADD : X86ISD::FMADDSUB;
41703 return DAG.getNode(Opc, DL, VT, Opnd0, Opnd1, Opnd2);
41704 }
41705
41706 if (IsSubAdd)
41707 return SDValue();
41708
41709 // Do not generate X86ISD::ADDSUB node for 512-bit types even though
41710 // the ADDSUB idiom has been successfully recognized. There are no known
41711 // X86 targets with 512-bit ADDSUB instructions!
41712 if (VT.is512BitVector())
41713 return SDValue();
41714
41715 // Do not generate X86ISD::ADDSUB node for FP16's vector types even though
41716 // the ADDSUB idiom has been successfully recognized. There are no known
41717 // X86 targets with FP16 ADDSUB instructions!
41718 if (VT.getVectorElementType() == MVT::f16)
41719 return SDValue();
41720
41721 return DAG.getNode(X86ISD::ADDSUB, DL, VT, Opnd0, Opnd1);
41722}
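// Illustrative sketch (editorial, not part of the analyzed source): the lane
// behaviour the combine above is targeting. X86 ADDSUB subtracts in the even
// lanes and adds in the odd lanes; the reversed SUBADD form is only emitted
// through FMSUBADD in the function above.
#include <array>
#include <cassert>

int main() {
  std::array<float, 4> A{1, 2, 3, 4}, B{10, 20, 30, 40}, R{};
  for (int I = 0; I != 4; ++I)
    R[I] = (I % 2 == 0) ? A[I] - B[I] : A[I] + B[I]; // ADDSUB lane pattern
  assert((R == std::array<float, 4>{-9, 22, -27, 44}));
  return 0;
}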
41723
41724// We are looking for a shuffle where both sources are concatenated with undef
41725// and have a width that is half of the output's width. AVX2 has VPERMD/Q, so
41726// if we can express this as a single-source shuffle, that's preferable.
41727static SDValue combineShuffleOfConcatUndef(SDNode *N, SelectionDAG &DAG,
41728 const X86Subtarget &Subtarget) {
41729 if (!Subtarget.hasAVX2() || !isa<ShuffleVectorSDNode>(N))
41730 return SDValue();
41731
41732 EVT VT = N->getValueType(0);
41733
41734 // We only care about shuffles of 128/256-bit vectors of 32/64-bit values.
41735 if (!VT.is128BitVector() && !VT.is256BitVector())
41736 return SDValue();
41737
41738 if (VT.getVectorElementType() != MVT::i32 &&
41739 VT.getVectorElementType() != MVT::i64 &&
41740 VT.getVectorElementType() != MVT::f32 &&
41741 VT.getVectorElementType() != MVT::f64)
41742 return SDValue();
41743
41744 SDValue N0 = N->getOperand(0);
41745 SDValue N1 = N->getOperand(1);
41746
41747 // Check that both sources are concats with undef.
41748 if (N0.getOpcode() != ISD::CONCAT_VECTORS ||
41749 N1.getOpcode() != ISD::CONCAT_VECTORS || N0.getNumOperands() != 2 ||
41750 N1.getNumOperands() != 2 || !N0.getOperand(1).isUndef() ||
41751 !N1.getOperand(1).isUndef())
41752 return SDValue();
41753
41754 // Construct the new shuffle mask. Elements from the first source retain their
41755 // index, but elements from the second source no longer need to skip an undef.
41756 SmallVector<int, 8> Mask;
41757 int NumElts = VT.getVectorNumElements();
41758
41759 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
41760 for (int Elt : SVOp->getMask())
41761 Mask.push_back(Elt < NumElts ? Elt : (Elt - NumElts / 2));
41762
41763 SDLoc DL(N);
41764 SDValue Concat = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, N0.getOperand(0),
41765 N1.getOperand(0));
41766 return DAG.getVectorShuffle(VT, DL, Concat, DAG.getUNDEF(VT), Mask);
41767}
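// Illustrative sketch (editorial, not part of the analyzed source): the mask
// remapping above for a 4-element output where t1 and t2 each supply 2
// elements. Indices that pointed into concat(t2, undef) (>= NumElts) are
// rebased onto the second half of the new concat(t1, t2) source by
// subtracting NumElts / 2.
#include <cassert>
#include <vector>

int main() {
  const int NumElts = 4;
  std::vector<int> OldMask = {0, 4, 1, 5}; // interleave t1 and t2
  std::vector<int> NewMask;
  for (int Elt : OldMask)
    NewMask.push_back(Elt < NumElts ? Elt : Elt - NumElts / 2);
  assert((NewMask == std::vector<int>{0, 2, 1, 3}));
  return 0;
}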
41768
41769/// If we have a shuffle of AVX/AVX512 (256/512 bit) vectors that only uses the
41770/// low half of each source vector and does not set any high half elements in
41771/// the destination vector, narrow the shuffle to half its original size.
41772static SDValue narrowShuffle(ShuffleVectorSDNode *Shuf, SelectionDAG &DAG) {
41773 if (!Shuf->getValueType(0).isSimple())
41774 return SDValue();
41775 MVT VT = Shuf->getSimpleValueType(0);
41776 if (!VT.is256BitVector() && !VT.is512BitVector())
41777 return SDValue();
41778
41779 // See if we can ignore all of the high elements of the shuffle.
41780 ArrayRef<int> Mask = Shuf->getMask();
41781 if (!isUndefUpperHalf(Mask))
41782 return SDValue();
41783
41784 // Check if the shuffle mask accesses only the low half of each input vector
41785 // (half-index output is 0 or 2).
41786 int HalfIdx1, HalfIdx2;
41787 SmallVector<int, 8> HalfMask(Mask.size() / 2);
41788 if (!getHalfShuffleMask(Mask, HalfMask, HalfIdx1, HalfIdx2) ||
41789 (HalfIdx1 % 2 == 1) || (HalfIdx2 % 2 == 1))
41790 return SDValue();
41791
41792 // Create a half-width shuffle to replace the unnecessarily wide shuffle.
41793 // The trick is knowing that all of the insert/extract are actually free
41794 // subregister (zmm<->ymm or ymm<->xmm) ops. That leaves us with a shuffle
41795 // of narrow inputs into a narrow output, and that is always cheaper than
41796 // the wide shuffle that we started with.
41797 return getShuffleHalfVectors(SDLoc(Shuf), Shuf->getOperand(0),
41798 Shuf->getOperand(1), HalfMask, HalfIdx1,
41799 HalfIdx2, false, DAG, /*UseConcat*/true);
41800}
41801
41802static SDValue combineShuffle(SDNode *N, SelectionDAG &DAG,
41803 TargetLowering::DAGCombinerInfo &DCI,
41804 const X86Subtarget &Subtarget) {
41805 if (auto *Shuf = dyn_cast<ShuffleVectorSDNode>(N))
41806 if (SDValue V = narrowShuffle(Shuf, DAG))
41807 return V;
41808
41809 // If we have legalized the vector types, look for blends of FADD and FSUB
41810 // nodes that we can fuse into an ADDSUB, FMADDSUB, or FMSUBADD node.
41811 SDLoc dl(N);
41812 EVT VT = N->getValueType(0);
41813 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
41814 if (TLI.isTypeLegal(VT))
41815 if (SDValue AddSub = combineShuffleToAddSubOrFMAddSub(N, Subtarget, DAG))
41816 return AddSub;
41817
41818 // Attempt to combine into a vector load/broadcast.
41819 if (SDValue LD = combineToConsecutiveLoads(
41820 VT, SDValue(N, 0), dl, DAG, Subtarget, /*IsAfterLegalize*/ true))
41821 return LD;
41822
41823 // For AVX2, we sometimes want to combine
41824 // (vector_shuffle <mask> (concat_vectors t1, undef)
41825 // (concat_vectors t2, undef))
41826 // Into:
41827 // (vector_shuffle <mask> (concat_vectors t1, t2), undef)
41828 // Since the latter can be efficiently lowered with VPERMD/VPERMQ
41829 if (SDValue ShufConcat = combineShuffleOfConcatUndef(N, DAG, Subtarget))
41830 return ShufConcat;
41831
41832 if (isTargetShuffle(N->getOpcode())) {
41833 SDValue Op(N, 0);
41834 if (SDValue Shuffle = combineTargetShuffle(Op, DAG, DCI, Subtarget))
41835 return Shuffle;
41836
41837 // Try recursively combining arbitrary sequences of x86 shuffle
41838 // instructions into higher-order shuffles. We do this after combining
41839 // specific PSHUF instruction sequences into their minimal form so that we
41840 // can evaluate how many specialized shuffle instructions are involved in
41841 // a particular chain.
41842 if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
41843 return Res;
41844
41845 // Simplify source operands based on shuffle mask.
41846 // TODO - merge this into combineX86ShufflesRecursively.
41847 APInt DemandedElts = APInt::getAllOnes(VT.getVectorNumElements());
41848 if (TLI.SimplifyDemandedVectorElts(Op, DemandedElts, DCI))
41849 return SDValue(N, 0);
41850
41851 // Canonicalize SHUFFLE(BINOP(X,Y)) -> BINOP(SHUFFLE(X),SHUFFLE(Y)).
41852 // Perform this after other shuffle combines to allow inner shuffles to be
41853 // combined away first.
41854 if (SDValue BinOp = canonicalizeShuffleWithBinOps(Op, DAG, dl))
41855 return BinOp;
41856 }
41857
41858 return SDValue();
41859}
41860
41861// Simplify variable target shuffle masks based on the demanded elements.
41862// TODO: Handle DemandedBits in mask indices as well?
41863bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetShuffle(
41864 SDValue Op, const APInt &DemandedElts, unsigned MaskIndex,
41865 TargetLowering::TargetLoweringOpt &TLO, unsigned Depth) const {
41866 // If we're demanding all elements don't bother trying to simplify the mask.
41867 unsigned NumElts = DemandedElts.getBitWidth();
41868 if (DemandedElts.isAllOnes())
41869 return false;
41870
41871 SDValue Mask = Op.getOperand(MaskIndex);
41872 if (!Mask.hasOneUse())
41873 return false;
41874
41875 // Attempt to generically simplify the variable shuffle mask.
41876 APInt MaskUndef, MaskZero;
41877 if (SimplifyDemandedVectorElts(Mask, DemandedElts, MaskUndef, MaskZero, TLO,
41878 Depth + 1))
41879 return true;
41880
41881 // Attempt to extract+simplify a (constant pool load) shuffle mask.
41882 // TODO: Support other types from getTargetShuffleMaskIndices?
41883 SDValue BC = peekThroughOneUseBitcasts(Mask);
41884 EVT BCVT = BC.getValueType();
41885 auto *Load = dyn_cast<LoadSDNode>(BC);
41886 if (!Load)
41887 return false;
41888
41889 const Constant *C = getTargetConstantFromNode(Load);
41890 if (!C)
41891 return false;
41892
41893 Type *CTy = C->getType();
41894 if (!CTy->isVectorTy() ||
41895 CTy->getPrimitiveSizeInBits() != Mask.getValueSizeInBits())
41896 return false;
41897
41898 // Handle scaling for i64 elements on 32-bit targets.
41899 unsigned NumCstElts = cast<FixedVectorType>(CTy)->getNumElements();
41900 if (NumCstElts != NumElts && NumCstElts != (NumElts * 2))
41901 return false;
41902 unsigned Scale = NumCstElts / NumElts;
41903
41904 // Simplify mask if we have an undemanded element that is not undef.
41905 bool Simplified = false;
41906 SmallVector<Constant *, 32> ConstVecOps;
41907 for (unsigned i = 0; i != NumCstElts; ++i) {
41908 Constant *Elt = C->getAggregateElement(i);
41909 if (!DemandedElts[i / Scale] && !isa<UndefValue>(Elt)) {
41910 ConstVecOps.push_back(UndefValue::get(Elt->getType()));
41911 Simplified = true;
41912 continue;
41913 }
41914 ConstVecOps.push_back(Elt);
41915 }
41916 if (!Simplified)
41917 return false;
41918
41919 // Generate new constant pool entry + legalize immediately for the load.
41920 SDLoc DL(Op);
41921 SDValue CV = TLO.DAG.getConstantPool(ConstantVector::get(ConstVecOps), BCVT);
41922 SDValue LegalCV = LowerConstantPool(CV, TLO.DAG);
41923 SDValue NewMask = TLO.DAG.getLoad(
41924 BCVT, DL, TLO.DAG.getEntryNode(), LegalCV,
41925 MachinePointerInfo::getConstantPool(TLO.DAG.getMachineFunction()),
41926 Load->getAlign());
41927 return TLO.CombineTo(Mask, TLO.DAG.getBitcast(Mask.getValueType(), NewMask));
41928}
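// Illustrative sketch (editorial, not part of the analyzed source): the Scale
// handling above for a v2i64 shuffle whose constant mask was emitted as
// 4 x i32 on a 32-bit target. Each demanded 64-bit element covers Scale
// consecutive constant entries, so entry i is kept iff element i / Scale is
// demanded and is otherwise rewritten to undef.
#include <cstdio>

int main() {
  const unsigned NumElts = 2, NumCstElts = 4;
  const unsigned Scale = NumCstElts / NumElts;  // 2
  const bool Demanded[NumElts] = {true, false}; // only element 0 is demanded
  for (unsigned I = 0; I != NumCstElts; ++I)
    std::printf("constant entry %u -> %s\n", I,
                Demanded[I / Scale] ? "kept" : "replaced with undef");
  return 0;
}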
41929
41930bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetNode(
41931 SDValue Op, const APInt &DemandedElts, APInt &KnownUndef, APInt &KnownZero,
41932 TargetLoweringOpt &TLO, unsigned Depth) const {
41933 int NumElts = DemandedElts.getBitWidth();
41934 unsigned Opc = Op.getOpcode();
41935 EVT VT = Op.getValueType();
41936
41937 // Handle special case opcodes.
41938 switch (Opc) {
41939 case X86ISD::PMULDQ:
41940 case X86ISD::PMULUDQ: {
41941 APInt LHSUndef, LHSZero;
41942 APInt RHSUndef, RHSZero;
41943 SDValue LHS = Op.getOperand(0);
41944 SDValue RHS = Op.getOperand(1);
41945 if (SimplifyDemandedVectorElts(LHS, DemandedElts, LHSUndef, LHSZero, TLO,
41946 Depth + 1))
41947 return true;
41948 if (SimplifyDemandedVectorElts(RHS, DemandedElts, RHSUndef, RHSZero, TLO,
41949 Depth + 1))
41950 return true;
41951 // Multiply by zero.
41952 KnownZero = LHSZero | RHSZero;
41953 break;
41954 }
41955 case X86ISD::VPMADDWD: {
41956 APInt LHSUndef, LHSZero;
41957 APInt RHSUndef, RHSZero;
41958 SDValue LHS = Op.getOperand(0);
41959 SDValue RHS = Op.getOperand(1);
41960 APInt DemandedSrcElts = APIntOps::ScaleBitMask(DemandedElts, 2 * NumElts);
41961
41962 if (SimplifyDemandedVectorElts(LHS, DemandedSrcElts, LHSUndef, LHSZero, TLO,
41963 Depth + 1))
41964 return true;
41965 if (SimplifyDemandedVectorElts(RHS, DemandedSrcElts, RHSUndef, RHSZero, TLO,
41966 Depth + 1))
41967 return true;
41968
41969 // TODO: Multiply by zero.
41970
41971 // If RHS/LHS elements are known zero then we don't need the LHS/RHS equivalent.
41972 APInt DemandedLHSElts = DemandedSrcElts & ~RHSZero;
41973 if (SimplifyDemandedVectorElts(LHS, DemandedLHSElts, LHSUndef, LHSZero, TLO,
41974 Depth + 1))
41975 return true;
41976 APInt DemandedRHSElts = DemandedSrcElts & ~LHSZero;
41977 if (SimplifyDemandedVectorElts(RHS, DemandedRHSElts, RHSUndef, RHSZero, TLO,
41978 Depth + 1))
41979 return true;
41980 break;
41981 }
41982 case X86ISD::PSADBW: {
41983 SDValue LHS = Op.getOperand(0);
41984 SDValue RHS = Op.getOperand(1);
41985 assert(VT.getScalarType() == MVT::i64 &&
41986 LHS.getValueType() == RHS.getValueType() &&
41987 LHS.getValueType().getScalarType() == MVT::i8 &&
41988 "Unexpected PSADBW types");
41989
41990 // Aggressively peek through ops to get at the demanded elts.
41991 if (!DemandedElts.isAllOnes()) {
41992 unsigned NumSrcElts = LHS.getValueType().getVectorNumElements();
41993 APInt DemandedSrcElts = APIntOps::ScaleBitMask(DemandedElts, NumSrcElts);
41994 SDValue NewLHS = SimplifyMultipleUseDemandedVectorElts(
41995 LHS, DemandedSrcElts, TLO.DAG, Depth + 1);
41996 SDValue NewRHS = SimplifyMultipleUseDemandedVectorElts(
41997 RHS, DemandedSrcElts, TLO.DAG, Depth + 1);
41998 if (NewLHS || NewRHS) {
41999 NewLHS = NewLHS ? NewLHS : LHS;
42000 NewRHS = NewRHS ? NewRHS : RHS;
42001 return TLO.CombineTo(
42002 Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, NewLHS, NewRHS));
42003 }
42004 }
42005 break;
42006 }
42007 case X86ISD::VSHL:
42008 case X86ISD::VSRL:
42009 case X86ISD::VSRA: {
42010 // We only need the bottom 64-bits of the (128-bit) shift amount.
42011 SDValue Amt = Op.getOperand(1);
42012 MVT AmtVT = Amt.getSimpleValueType();
42013 assert(AmtVT.is128BitVector() && "Unexpected value type");
42014
42015 // If the shift amount is only ever reused as an SSE shift amount then we know
42016 // that only the bottom 64-bits are ever used.
42017 bool AssumeSingleUse = llvm::all_of(Amt->uses(), [&Amt](SDNode *Use) {
42018 unsigned UseOpc = Use->getOpcode();
42019 return (UseOpc == X86ISD::VSHL || UseOpc == X86ISD::VSRL ||
42020 UseOpc == X86ISD::VSRA) &&
42021 Use->getOperand(0) != Amt;
42022 });
42023
42024 APInt AmtUndef, AmtZero;
42025 unsigned NumAmtElts = AmtVT.getVectorNumElements();
42026 APInt AmtElts = APInt::getLowBitsSet(NumAmtElts, NumAmtElts / 2);
42027 if (SimplifyDemandedVectorElts(Amt, AmtElts, AmtUndef, AmtZero, TLO,
42028 Depth + 1, AssumeSingleUse))
42029 return true;
42030 [[fallthrough]];
42031 }
42032 case X86ISD::VSHLI:
42033 case X86ISD::VSRLI:
42034 case X86ISD::VSRAI: {
42035 SDValue Src = Op.getOperand(0);
42036 APInt SrcUndef;
42037 if (SimplifyDemandedVectorElts(Src, DemandedElts, SrcUndef, KnownZero, TLO,
42038 Depth + 1))
42039 return true;
42040
42041 // Fold shift(0,x) -> 0
42042 if (DemandedElts.isSubsetOf(KnownZero))
42043 return TLO.CombineTo(
42044 Op, getZeroVector(VT.getSimpleVT(), Subtarget, TLO.DAG, SDLoc(Op)));
42045
42046 // Aggressively peek through ops to get at the demanded elts.
42047 if (!DemandedElts.isAllOnes())
42048 if (SDValue NewSrc = SimplifyMultipleUseDemandedVectorElts(
42049 Src, DemandedElts, TLO.DAG, Depth + 1))
42050 return TLO.CombineTo(
42051 Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, NewSrc, Op.getOperand(1)));
42052 break;
42053 }
42054 case X86ISD::VPSHA:
42055 case X86ISD::VPSHL:
42056 case X86ISD::VSHLV:
42057 case X86ISD::VSRLV:
42058 case X86ISD::VSRAV: {
42059 APInt LHSUndef, LHSZero;
42060 APInt RHSUndef, RHSZero;
42061 SDValue LHS = Op.getOperand(0);
42062 SDValue RHS = Op.getOperand(1);
42063 if (SimplifyDemandedVectorElts(LHS, DemandedElts, LHSUndef, LHSZero, TLO,
42064 Depth + 1))
42065 return true;
42066
42067 // Fold shift(0,x) -> 0
42068 if (DemandedElts.isSubsetOf(LHSZero))
42069 return TLO.CombineTo(
42070 Op, getZeroVector(VT.getSimpleVT(), Subtarget, TLO.DAG, SDLoc(Op)));
42071
42072 if (SimplifyDemandedVectorElts(RHS, DemandedElts, RHSUndef, RHSZero, TLO,
42073 Depth + 1))
42074 return true;
42075
42076 KnownZero = LHSZero;
42077 break;
42078 }
42079 case X86ISD::KSHIFTL: {
42080 SDValue Src = Op.getOperand(0);
42081 auto *Amt = cast<ConstantSDNode>(Op.getOperand(1));
42082 assert(Amt->getAPIntValue().ult(NumElts) && "Out of range shift amount");
42083 unsigned ShiftAmt = Amt->getZExtValue();
42084
42085 if (ShiftAmt == 0)
42086 return TLO.CombineTo(Op, Src);
42087
42088 // If this is ((X >>u C1) << ShAmt), see if we can simplify this into a
42089 // single shift. We can do this if the bottom bits (which are shifted
42090 // out) are never demanded.
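        // For example, kshiftl(kshiftr(X, 2), 5) with the low 5 elements
        // undemanded becomes kshiftl(X, 3); if C1 > ShAmt the merged node is a
        // kshiftr by the difference instead.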
42091 if (Src.getOpcode() == X86ISD::KSHIFTR) {
42092 if (!DemandedElts.intersects(APInt::getLowBitsSet(NumElts, ShiftAmt))) {
42093 unsigned C1 = Src.getConstantOperandVal(1);
42094 unsigned NewOpc = X86ISD::KSHIFTL;
42095 int Diff = ShiftAmt - C1;
42096 if (Diff < 0) {
42097 Diff = -Diff;
42098 NewOpc = X86ISD::KSHIFTR;
42099 }
42100
42101 SDLoc dl(Op);
42102 SDValue NewSA = TLO.DAG.getTargetConstant(Diff, dl, MVT::i8);
42103 return TLO.CombineTo(
42104 Op, TLO.DAG.getNode(NewOpc, dl, VT, Src.getOperand(0), NewSA));
42105 }
42106 }
42107
42108 APInt DemandedSrc = DemandedElts.lshr(ShiftAmt);
42109 if (SimplifyDemandedVectorElts(Src, DemandedSrc, KnownUndef, KnownZero, TLO,
42110 Depth + 1))
42111 return true;
42112
42113 KnownUndef <<= ShiftAmt;
42114 KnownZero <<= ShiftAmt;
42115 KnownZero.setLowBits(ShiftAmt);
42116 break;
42117 }
42118 case X86ISD::KSHIFTR: {
42119 SDValue Src = Op.getOperand(0);
42120 auto *Amt = cast<ConstantSDNode>(Op.getOperand(1));
42121 assert(Amt->getAPIntValue().ult(NumElts) && "Out of range shift amount");
42122 unsigned ShiftAmt = Amt->getZExtValue();
42123
42124 if (ShiftAmt == 0)
42125 return TLO.CombineTo(Op, Src);
42126
42127 // If this is ((X << C1) >>u ShAmt), see if we can simplify this into a
42128 // single shift. We can do this if the top bits (which are shifted
42129 // out) are never demanded.
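        // For example, kshiftr(kshiftl(X, 2), 5) with the top 5 elements
        // undemanded becomes kshiftr(X, 3).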
42130 if (Src.getOpcode() == X86ISD::KSHIFTL) {
42131 if (!DemandedElts.intersects(APInt::getHighBitsSet(NumElts, ShiftAmt))) {
42132 unsigned C1 = Src.getConstantOperandVal(1);
42133 unsigned NewOpc = X86ISD::KSHIFTR;
42134 int Diff = ShiftAmt - C1;
42135 if (Diff < 0) {
42136 Diff = -Diff;
42137 NewOpc = X86ISD::KSHIFTL;
42138 }
42139
42140 SDLoc dl(Op);
42141 SDValue NewSA = TLO.DAG.getTargetConstant(Diff, dl, MVT::i8);
42142 return TLO.CombineTo(
42143 Op, TLO.DAG.getNode(NewOpc, dl, VT, Src.getOperand(0), NewSA));
42144 }
42145 }
42146
42147 APInt DemandedSrc = DemandedElts.shl(ShiftAmt);
42148 if (SimplifyDemandedVectorElts(Src, DemandedSrc, KnownUndef, KnownZero, TLO,
42149 Depth + 1))
42150 return true;
42151
42152 KnownUndef.lshrInPlace(ShiftAmt);
42153 KnownZero.lshrInPlace(ShiftAmt);
42154 KnownZero.setHighBits(ShiftAmt);
42155 break;
42156 }
42157 case X86ISD::ANDNP: {
42158 // ANDNP = (~LHS & RHS);
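        // If one operand is constant, only the bits/elements that can survive
        // the AND are demanded from the other operand: a zero RHS element (or
        // an all-ones LHS element, since the LHS is inverted) means the
        // matching element of the other operand isn't demanded at all.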
42159 SDValue LHS = Op.getOperand(0);
42160 SDValue RHS = Op.getOperand(1);
42161
42162 auto GetDemandedMasks = [&](SDValue Op, bool Invert = false) {
42163 APInt UndefElts;
42164 SmallVector<APInt> EltBits;
42165 int NumElts = VT.getVectorNumElements();
42166 int EltSizeInBits = VT.getScalarSizeInBits();
42167 APInt OpBits = APInt::getAllOnes(EltSizeInBits);
42168 APInt OpElts = DemandedElts;
42169 if (getTargetConstantBitsFromNode(Op, EltSizeInBits, UndefElts,
42170 EltBits)) {
42171 OpBits.clearAllBits();
42172 OpElts.clearAllBits();
42173 for (int I = 0; I != NumElts; ++I) {
42174 if (!DemandedElts[I])
42175 continue;
42176 if (UndefElts[I]) {
42177 // We can't assume an undef src element gives an undef dst - the
42178 // other src might be zero.
42179 OpBits.setAllBits();
42180 OpElts.setBit(I);
42181 } else if ((Invert && !EltBits[I].isAllOnes()) ||
42182 (!Invert && !EltBits[I].isZero())) {
42183 OpBits |= Invert ? ~EltBits[I] : EltBits[I];
42184 OpElts.setBit(I);
42185 }
42186 }
42187 }
42188 return std::make_pair(OpBits, OpElts);
42189 };
42190 APInt BitsLHS, EltsLHS;
42191 APInt BitsRHS, EltsRHS;
42192 std::tie(BitsLHS, EltsLHS) = GetDemandedMasks(RHS);
42193 std::tie(BitsRHS, EltsRHS) = GetDemandedMasks(LHS, true);
42194
42195 APInt LHSUndef, LHSZero;
42196 APInt RHSUndef, RHSZero;
42197 if (SimplifyDemandedVectorElts(LHS, EltsLHS, LHSUndef, LHSZero, TLO,
42198 Depth + 1))
42199 return true;
42200 if (SimplifyDemandedVectorElts(RHS, EltsRHS, RHSUndef, RHSZero, TLO,
42201 Depth + 1))
42202 return true;
42203
42204 if (!DemandedElts.isAllOnes()) {
42205 SDValue NewLHS = SimplifyMultipleUseDemandedBits(LHS, BitsLHS, EltsLHS,
42206 TLO.DAG, Depth + 1);
42207 SDValue NewRHS = SimplifyMultipleUseDemandedBits(RHS, BitsRHS, EltsRHS,
42208 TLO.DAG, Depth + 1);
42209 if (NewLHS || NewRHS) {
42210 NewLHS = NewLHS ? NewLHS : LHS;
42211 NewRHS = NewRHS ? NewRHS : RHS;
42212 return TLO.CombineTo(
42213 Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, NewLHS, NewRHS));
42214 }
42215 }
42216 break;
42217 }
42218 case X86ISD::CVTSI2P:
42219 case X86ISD::CVTUI2P: {
42220 SDValue Src = Op.getOperand(0);
42221 MVT SrcVT = Src.getSimpleValueType();
42222 APInt SrcUndef, SrcZero;
42223 APInt SrcElts = DemandedElts.zextOrTrunc(SrcVT.getVectorNumElements());
42224 if (SimplifyDemandedVectorElts(Src, SrcElts, SrcUndef, SrcZero, TLO,
42225 Depth + 1))
42226 return true;
42227 break;
42228 }
42229 case X86ISD::PACKSS:
42230 case X86ISD::PACKUS: {
42231 SDValue N0 = Op.getOperand(0);
42232 SDValue N1 = Op.getOperand(1);
42233
42234 APInt DemandedLHS, DemandedRHS;
42235 getPackDemandedElts(VT, DemandedElts, DemandedLHS, DemandedRHS);
42236
42237 APInt LHSUndef, LHSZero;
42238 if (SimplifyDemandedVectorElts(N0, DemandedLHS, LHSUndef, LHSZero, TLO,
42239 Depth + 1))
42240 return true;
42241 APInt RHSUndef, RHSZero;
42242 if (SimplifyDemandedVectorElts(N1, DemandedRHS, RHSUndef, RHSZero, TLO,
42243 Depth + 1))
42244 return true;
42245
42246 // TODO - pass on known zero/undef.
42247
42248 // Aggressively peek through ops to get at the demanded elts.
42249 // TODO - we should do this for all target/faux shuffles ops.
42250 if (!DemandedElts.isAllOnes()) {
42251 SDValue NewN0 = SimplifyMultipleUseDemandedVectorElts(N0, DemandedLHS,
42252 TLO.DAG, Depth + 1);
42253 SDValue NewN1 = SimplifyMultipleUseDemandedVectorElts(N1, DemandedRHS,
42254 TLO.DAG, Depth + 1);
42255 if (NewN0 || NewN1) {
42256 NewN0 = NewN0 ? NewN0 : N0;
42257 NewN1 = NewN1 ? NewN1 : N1;
42258 return TLO.CombineTo(Op,
42259 TLO.DAG.getNode(Opc, SDLoc(Op), VT, NewN0, NewN1));
42260 }
42261 }
42262 break;
42263 }
42264 case X86ISD::HADD:
42265 case X86ISD::HSUB:
42266 case X86ISD::FHADD:
42267 case X86ISD::FHSUB: {
42268 SDValue N0 = Op.getOperand(0);
42269 SDValue N1 = Op.getOperand(1);
42270
42271 APInt DemandedLHS, DemandedRHS;
42272 getHorizDemandedElts(VT, DemandedElts, DemandedLHS, DemandedRHS);
42273
42274 APInt LHSUndef, LHSZero;
42275 if (SimplifyDemandedVectorElts(N0, DemandedLHS, LHSUndef, LHSZero, TLO,
42276 Depth + 1))
42277 return true;
42278 APInt RHSUndef, RHSZero;
42279 if (SimplifyDemandedVectorElts(N1, DemandedRHS, RHSUndef, RHSZero, TLO,
42280 Depth + 1))
42281 return true;
42282
42283 // TODO - pass on known zero/undef.
42284
42285 // Aggressively peek through ops to get at the demanded elts.
42286 // TODO: Handle repeated operands.
42287 if (N0 != N1 && !DemandedElts.isAllOnes()) {
42288 SDValue NewN0 = SimplifyMultipleUseDemandedVectorElts(N0, DemandedLHS,
42289 TLO.DAG, Depth + 1);
42290 SDValue NewN1 = SimplifyMultipleUseDemandedVectorElts(N1, DemandedRHS,
42291 TLO.DAG, Depth + 1);
42292 if (NewN0 || NewN1) {
42293 NewN0 = NewN0 ? NewN0 : N0;
42294 NewN1 = NewN1 ? NewN1 : N1;
42295 return TLO.CombineTo(Op,
42296 TLO.DAG.getNode(Opc, SDLoc(Op), VT, NewN0, NewN1));
42297 }
42298 }
42299 break;
42300 }
42301 case X86ISD::VTRUNC:
42302 case X86ISD::VTRUNCS:
42303 case X86ISD::VTRUNCUS: {
42304 SDValue Src = Op.getOperand(0);
42305 MVT SrcVT = Src.getSimpleValueType();
42306 APInt DemandedSrc = DemandedElts.zextOrTrunc(SrcVT.getVectorNumElements());
42307 APInt SrcUndef, SrcZero;
42308 if (SimplifyDemandedVectorElts(Src, DemandedSrc, SrcUndef, SrcZero, TLO,
42309 Depth + 1))
42310 return true;
42311 KnownZero = SrcZero.zextOrTrunc(NumElts);
42312 KnownUndef = SrcUndef.zextOrTrunc(NumElts);
42313 break;
42314 }
42315 case X86ISD::BLENDV: {
42316 APInt SelUndef, SelZero;
42317 if (SimplifyDemandedVectorElts(Op.getOperand(0), DemandedElts, SelUndef,
42318 SelZero, TLO, Depth + 1))
42319 return true;
42320
42321 // TODO: Use SelZero to adjust LHS/RHS DemandedElts.
42322 APInt LHSUndef, LHSZero;
42323 if (SimplifyDemandedVectorElts(Op.getOperand(1), DemandedElts, LHSUndef,
42324 LHSZero, TLO, Depth + 1))
42325 return true;
42326
42327 APInt RHSUndef, RHSZero;
42328 if (SimplifyDemandedVectorElts(Op.getOperand(2), DemandedElts, RHSUndef,
42329 RHSZero, TLO, Depth + 1))
42330 return true;
42331
42332 KnownZero = LHSZero & RHSZero;
42333 KnownUndef = LHSUndef & RHSUndef;
42334 break;
42335 }
42336 case X86ISD::VZEXT_MOVL: {
42337 // If upper demanded elements are already zero then we have nothing to do.
42338 SDValue Src = Op.getOperand(0);
42339 APInt DemandedUpperElts = DemandedElts;
42340 DemandedUpperElts.clearLowBits(1);
42341 if (TLO.DAG.MaskedVectorIsZero(Src, DemandedUpperElts, Depth + 1))
42342 return TLO.CombineTo(Op, Src);
42343 break;
42344 }
42345 case X86ISD::VBROADCAST: {
42346 SDValue Src = Op.getOperand(0);
42347 MVT SrcVT = Src.getSimpleValueType();
42348 if (!SrcVT.isVector())
42349 break;
42350 // Don't bother broadcasting if we just need the 0'th element.
42351 if (DemandedElts == 1) {
42352 if (Src.getValueType() != VT)
42353 Src = widenSubVector(VT.getSimpleVT(), Src, false, Subtarget, TLO.DAG,
42354 SDLoc(Op));
42355 return TLO.CombineTo(Op, Src);
42356 }
42357 APInt SrcUndef, SrcZero;
42358 APInt SrcElts = APInt::getOneBitSet(SrcVT.getVectorNumElements(), 0);
42359 if (SimplifyDemandedVectorElts(Src, SrcElts, SrcUndef, SrcZero, TLO,
42360 Depth + 1))
42361 return true;
42362 // Aggressively peek through src to get at the demanded elt.
42363 // TODO - we should do this for all target/faux shuffles ops.
42364 if (SDValue NewSrc = SimplifyMultipleUseDemandedVectorElts(
42365 Src, SrcElts, TLO.DAG, Depth + 1))
42366 return TLO.CombineTo(Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, NewSrc));
42367 break;
42368 }
42369 case X86ISD::VPERMV:
42370 if (SimplifyDemandedVectorEltsForTargetShuffle(Op, DemandedElts, 0, TLO,
42371 Depth))
42372 return true;
42373 break;
42374 case X86ISD::PSHUFB:
42375 case X86ISD::VPERMV3:
42376 case X86ISD::VPERMILPV:
42377 if (SimplifyDemandedVectorEltsForTargetShuffle(Op, DemandedElts, 1, TLO,
42378 Depth))
42379 return true;
42380 break;
42381 case X86ISD::VPPERM:
42382 case X86ISD::VPERMIL2:
42383 if (SimplifyDemandedVectorEltsForTargetShuffle(Op, DemandedElts, 2, TLO,
42384 Depth))
42385 return true;
42386 break;
42387 }
42388
42389 // For 256/512-bit ops that are 128/256-bit ops glued together, if we do not
42390 // demand any of the high elements, then narrow the op to 128/256-bits: e.g.
42391 // (op ymm0, ymm1) --> insert undef, (op xmm0, xmm1), 0
42392 if ((VT.is256BitVector() || VT.is512BitVector()) &&
42393 DemandedElts.lshr(NumElts / 2) == 0) {
42394 unsigned SizeInBits = VT.getSizeInBits();
42395 unsigned ExtSizeInBits = SizeInBits / 2;
42396
42397 // See if 512-bit ops only use the bottom 128-bits.
42398 if (VT.is512BitVector() && DemandedElts.lshr(NumElts / 4) == 0)
42399 ExtSizeInBits = SizeInBits / 4;
42400
42401 switch (Opc) {
42402 // Scalar broadcast.
42403 case X86ISD::VBROADCAST: {
42404 SDLoc DL(Op);
42405 SDValue Src = Op.getOperand(0);
42406 if (Src.getValueSizeInBits() > ExtSizeInBits)
42407 Src = extractSubVector(Src, 0, TLO.DAG, DL, ExtSizeInBits);
42408 EVT BcstVT = EVT::getVectorVT(*TLO.DAG.getContext(), VT.getScalarType(),
42409 ExtSizeInBits / VT.getScalarSizeInBits());
42410 SDValue Bcst = TLO.DAG.getNode(X86ISD::VBROADCAST, DL, BcstVT, Src);
42411 return TLO.CombineTo(Op, insertSubVector(TLO.DAG.getUNDEF(VT), Bcst, 0,
42412 TLO.DAG, DL, ExtSizeInBits));
42413 }
42414 case X86ISD::VBROADCAST_LOAD: {
42415 SDLoc DL(Op);
42416 auto *MemIntr = cast<MemIntrinsicSDNode>(Op);
42417 EVT BcstVT = EVT::getVectorVT(*TLO.DAG.getContext(), VT.getScalarType(),
42418 ExtSizeInBits / VT.getScalarSizeInBits());
42419 SDVTList Tys = TLO.DAG.getVTList(BcstVT, MVT::Other);
42420 SDValue Ops[] = {MemIntr->getOperand(0), MemIntr->getOperand(1)};
42421 SDValue Bcst = TLO.DAG.getMemIntrinsicNode(
42422 X86ISD::VBROADCAST_LOAD, DL, Tys, Ops, MemIntr->getMemoryVT(),
42423 MemIntr->getMemOperand());
42424 TLO.DAG.makeEquivalentMemoryOrdering(SDValue(MemIntr, 1),
42425 Bcst.getValue(1));
42426 return TLO.CombineTo(Op, insertSubVector(TLO.DAG.getUNDEF(VT), Bcst, 0,
42427 TLO.DAG, DL, ExtSizeInBits));
42428 }
42429 // Subvector broadcast.
42430 case X86ISD::SUBV_BROADCAST_LOAD: {
42431 auto *MemIntr = cast<MemIntrinsicSDNode>(Op);
42432 EVT MemVT = MemIntr->getMemoryVT();
42433 if (ExtSizeInBits == MemVT.getStoreSizeInBits()) {
42434 SDLoc DL(Op);
42435 SDValue Ld =
42436 TLO.DAG.getLoad(MemVT, DL, MemIntr->getChain(),
42437 MemIntr->getBasePtr(), MemIntr->getMemOperand());
42438 TLO.DAG.makeEquivalentMemoryOrdering(SDValue(MemIntr, 1),
42439 Ld.getValue(1));
42440 return TLO.CombineTo(Op, insertSubVector(TLO.DAG.getUNDEF(VT), Ld, 0,
42441 TLO.DAG, DL, ExtSizeInBits));
42442 } else if ((ExtSizeInBits % MemVT.getStoreSizeInBits()) == 0) {
42443 SDLoc DL(Op);
42444 EVT BcstVT = EVT::getVectorVT(*TLO.DAG.getContext(), VT.getScalarType(),
42445 ExtSizeInBits / VT.getScalarSizeInBits());
42446 if (SDValue BcstLd =
42447 getBROADCAST_LOAD(Opc, DL, BcstVT, MemVT, MemIntr, 0, TLO.DAG))
42448 return TLO.CombineTo(Op,
42449 insertSubVector(TLO.DAG.getUNDEF(VT), BcstLd, 0,
42450 TLO.DAG, DL, ExtSizeInBits));
42451 }
42452 break;
42453 }
42454 // Byte shifts by immediate.
42455 case X86ISD::VSHLDQ:
42456 case X86ISD::VSRLDQ:
42457 // Shift by uniform.
42458 case X86ISD::VSHL:
42459 case X86ISD::VSRL:
42460 case X86ISD::VSRA:
42461 // Shift by immediate.
42462 case X86ISD::VSHLI:
42463 case X86ISD::VSRLI:
42464 case X86ISD::VSRAI: {
42465 SDLoc DL(Op);
42466 SDValue Ext0 =
42467 extractSubVector(Op.getOperand(0), 0, TLO.DAG, DL, ExtSizeInBits);
42468 SDValue ExtOp =
42469 TLO.DAG.getNode(Opc, DL, Ext0.getValueType(), Ext0, Op.getOperand(1));
42470 SDValue UndefVec = TLO.DAG.getUNDEF(VT);
42471 SDValue Insert =
42472 insertSubVector(UndefVec, ExtOp, 0, TLO.DAG, DL, ExtSizeInBits);
42473 return TLO.CombineTo(Op, Insert);
42474 }
42475 case X86ISD::VPERMI: {
42476 // Simplify PERMPD/PERMQ to extract_subvector.
42477 // TODO: This should be done in shuffle combining.
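        // For example, a v4i64 VPERMQ with immediate 0xEE (mask <2,3,2,3>)
        // whose upper elements aren't demanded becomes an extract of the high
        // 128-bit subvector.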
42478 if (VT == MVT::v4f64 || VT == MVT::v4i64) {
42479 SmallVector<int, 4> Mask;
42480 DecodeVPERMMask(NumElts, Op.getConstantOperandVal(1), Mask);
42481 if (isUndefOrEqual(Mask[0], 2) && isUndefOrEqual(Mask[1], 3)) {
42482 SDLoc DL(Op);
42483 SDValue Ext = extractSubVector(Op.getOperand(0), 2, TLO.DAG, DL, 128);
42484 SDValue UndefVec = TLO.DAG.getUNDEF(VT);
42485 SDValue Insert = insertSubVector(UndefVec, Ext, 0, TLO.DAG, DL, 128);
42486 return TLO.CombineTo(Op, Insert);
42487 }
42488 }
42489 break;
42490 }
42491 case X86ISD::VPERM2X128: {
42492 // Simplify VPERM2F128/VPERM2I128 to extract_subvector.
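        // Only the low 4 immediate bits matter here: bit 3 zeroes the low
        // result lane, bit 1 selects the source operand, and bit 0 selects its
        // upper or lower 128-bit half.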
42493 SDLoc DL(Op);
42494 unsigned LoMask = Op.getConstantOperandVal(2) & 0xF;
42495 if (LoMask & 0x8)
42496 return TLO.CombineTo(
42497 Op, getZeroVector(VT.getSimpleVT(), Subtarget, TLO.DAG, DL));
42498 unsigned EltIdx = (LoMask & 0x1) * (NumElts / 2);
42499 unsigned SrcIdx = (LoMask & 0x2) >> 1;
42500 SDValue ExtOp =
42501 extractSubVector(Op.getOperand(SrcIdx), EltIdx, TLO.DAG, DL, 128);
42502 SDValue UndefVec = TLO.DAG.getUNDEF(VT);
42503 SDValue Insert =
42504 insertSubVector(UndefVec, ExtOp, 0, TLO.DAG, DL, ExtSizeInBits);
42505 return TLO.CombineTo(Op, Insert);
42506 }
42507 // Zero upper elements.
42508 case X86ISD::VZEXT_MOVL:
42509 // Target unary shuffles by immediate:
42510 case X86ISD::PSHUFD:
42511 case X86ISD::PSHUFLW:
42512 case X86ISD::PSHUFHW:
42513 case X86ISD::VPERMILPI:
42514 // (Non-Lane Crossing) Target Shuffles.
42515 case X86ISD::VPERMILPV:
42516 case X86ISD::VPERMIL2:
42517 case X86ISD::PSHUFB:
42518 case X86ISD::UNPCKL:
42519 case X86ISD::UNPCKH:
42520 case X86ISD::BLENDI:
42521 // Integer ops.
42522 case X86ISD::PACKSS:
42523 case X86ISD::PACKUS:
42524 // Horizontal Ops.
42525 case X86ISD::HADD:
42526 case X86ISD::HSUB:
42527 case X86ISD::FHADD:
42528 case X86ISD::FHSUB: {
42529 SDLoc DL(Op);
42530 SmallVector<SDValue, 4> Ops;
42531 for (unsigned i = 0, e = Op.getNumOperands(); i != e; ++i) {
42532 SDValue SrcOp = Op.getOperand(i);
42533 EVT SrcVT = SrcOp.getValueType();
42534 assert((!SrcVT.isVector() || SrcVT.getSizeInBits() == SizeInBits) &&
42535 "Unsupported vector size");
42536 Ops.push_back(SrcVT.isVector() ? extractSubVector(SrcOp, 0, TLO.DAG, DL,
42537 ExtSizeInBits)
42538 : SrcOp);
42539 }
42540 MVT ExtVT = VT.getSimpleVT();
42541 ExtVT = MVT::getVectorVT(ExtVT.getScalarType(),
42542 ExtSizeInBits / ExtVT.getScalarSizeInBits());
42543 SDValue ExtOp = TLO.DAG.getNode(Opc, DL, ExtVT, Ops);
42544 SDValue UndefVec = TLO.DAG.getUNDEF(VT);
42545 SDValue Insert =
42546 insertSubVector(UndefVec, ExtOp, 0, TLO.DAG, DL, ExtSizeInBits);
42547 return TLO.CombineTo(Op, Insert);
42548 }
42549 }
42550 }
42551
42552 // For splats, unless we *only* demand the 0'th element,
42553 // stop attempts at simplification here; we aren't going to improve things,
42554 // and this is better than any potential shuffle.
42555 if (!DemandedElts.isOne() && TLO.DAG.isSplatValue(Op, /*AllowUndefs*/false))
42556 return false;
42557
42558 // Get target/faux shuffle mask.
42559 APInt OpUndef, OpZero;
42560 SmallVector<int, 64> OpMask;
42561 SmallVector<SDValue, 2> OpInputs;
42562 if (!getTargetShuffleInputs(Op, DemandedElts, OpInputs, OpMask, OpUndef,
42563 OpZero, TLO.DAG, Depth, false))
42564 return false;
42565
42566 // Shuffle inputs must be the same size as the result.
42567 if (OpMask.size() != (unsigned)NumElts ||
42568 llvm::any_of(OpInputs, [VT](SDValue V) {
42569 return VT.getSizeInBits() != V.getValueSizeInBits() ||
42570 !V.getValueType().isVector();
42571 }))
42572 return false;
42573
42574 KnownZero = OpZero;
42575 KnownUndef = OpUndef;
42576
42577 // Check if shuffle mask can be simplified to undef/zero/identity.
42578 int NumSrcs = OpInputs.size();
42579 for (int i = 0; i != NumElts; ++i)
42580 if (!DemandedElts[i])
42581 OpMask[i] = SM_SentinelUndef;
42582
42583 if (isUndefInRange(OpMask, 0, NumElts)) {
42584 KnownUndef.setAllBits();
42585 return TLO.CombineTo(Op, TLO.DAG.getUNDEF(VT));
42586 }
42587 if (isUndefOrZeroInRange(OpMask, 0, NumElts)) {
42588 KnownZero.setAllBits();
42589 return TLO.CombineTo(
42590 Op, getZeroVector(VT.getSimpleVT(), Subtarget, TLO.DAG, SDLoc(Op)));
42591 }
42592 for (int Src = 0; Src != NumSrcs; ++Src)
42593 if (isSequentialOrUndefInRange(OpMask, 0, NumElts, Src * NumElts))
42594 return TLO.CombineTo(Op, TLO.DAG.getBitcast(VT, OpInputs[Src]));
42595
42596 // Attempt to simplify inputs.
42597 for (int Src = 0; Src != NumSrcs; ++Src) {
42598 // TODO: Support inputs of different types.
42599 if (OpInputs[Src].getValueType() != VT)
42600 continue;
42601
42602 int Lo = Src * NumElts;
42603 APInt SrcElts = APInt::getZero(NumElts);
42604 for (int i = 0; i != NumElts; ++i)
42605 if (DemandedElts[i]) {
42606 int M = OpMask[i] - Lo;
42607 if (0 <= M && M < NumElts)
42608 SrcElts.setBit(M);
42609 }
42610
42611 // TODO - Propagate input undef/zero elts.
42612 APInt SrcUndef, SrcZero;
42613 if (SimplifyDemandedVectorElts(OpInputs[Src], SrcElts, SrcUndef, SrcZero,
42614 TLO, Depth + 1))
42615 return true;
42616 }
42617
42618 // If we don't demand all elements, then attempt to combine to a simpler
42619 // shuffle.
42620 // We need to convert the depth to something combineX86ShufflesRecursively
42621 // can handle - so pretend it's Depth == 0 again, and reduce the max depth
42622 // to match. This prevents combineX86ShuffleChain from returning a
42623 // combined shuffle that's the same as the original root, causing an
42624 // infinite loop.
42625 if (!DemandedElts.isAllOnes()) {
42626 assert(Depth < X86::MaxShuffleCombineDepth && "Depth out of range");
42627
42628 SmallVector<int, 64> DemandedMask(NumElts, SM_SentinelUndef);
42629 for (int i = 0; i != NumElts; ++i)
42630 if (DemandedElts[i])
42631 DemandedMask[i] = i;
42632
42633 SDValue NewShuffle = combineX86ShufflesRecursively(
42634 {Op}, 0, Op, DemandedMask, {}, 0, X86::MaxShuffleCombineDepth - Depth,
42635 /*HasVarMask*/ false,
42636 /*AllowCrossLaneVarMask*/ true, /*AllowPerLaneVarMask*/ true, TLO.DAG,
42637 Subtarget);
42638 if (NewShuffle)
42639 return TLO.CombineTo(Op, NewShuffle);
42640 }
42641
42642 return false;
42643}
42644
42645bool X86TargetLowering::SimplifyDemandedBitsForTargetNode(
42646 SDValue Op, const APInt &OriginalDemandedBits,
42647 const APInt &OriginalDemandedElts, KnownBits &Known, TargetLoweringOpt &TLO,
42648 unsigned Depth) const {
42649 EVT VT = Op.getValueType();
42650 unsigned BitWidth = OriginalDemandedBits.getBitWidth();
42651 unsigned Opc = Op.getOpcode();
42652 switch(Opc) {
42653 case X86ISD::VTRUNC: {
42654 KnownBits KnownOp;
42655 SDValue Src = Op.getOperand(0);
42656 MVT SrcVT = Src.getSimpleValueType();
42657
42658 // Simplify the input, using demanded bit information.
42659 APInt TruncMask = OriginalDemandedBits.zext(SrcVT.getScalarSizeInBits());
42660 APInt DemandedElts = OriginalDemandedElts.trunc(SrcVT.getVectorNumElements());
42661 if (SimplifyDemandedBits(Src, TruncMask, DemandedElts, KnownOp, TLO, Depth + 1))
42662 return true;
42663 break;
42664 }
42665 case X86ISD::PMULDQ:
42666 case X86ISD::PMULUDQ: {
42667 // PMULDQ/PMULUDQ only uses lower 32 bits from each vector element.
42668 KnownBits KnownLHS, KnownRHS;
42669 SDValue LHS = Op.getOperand(0);
42670 SDValue RHS = Op.getOperand(1);
42671
42672 // Don't mask bits on 32-bit AVX512 targets which might lose a broadcast.
42673 // FIXME: Can we bound this better?
42674 APInt DemandedMask = APInt::getLowBitsSet(64, 32);
42675 APInt DemandedMaskLHS = APInt::getAllOnes(64);
42676 APInt DemandedMaskRHS = APInt::getAllOnes(64);
42677
42678 bool Is32BitAVX512 = !Subtarget.is64Bit() && Subtarget.hasAVX512();
42679 if (!Is32BitAVX512 || !TLO.DAG.isSplatValue(LHS))
42680 DemandedMaskLHS = DemandedMask;
42681 if (!Is32BitAVX512 || !TLO.DAG.isSplatValue(RHS))
42682 DemandedMaskRHS = DemandedMask;
42683
42684 if (SimplifyDemandedBits(LHS, DemandedMaskLHS, OriginalDemandedElts,
42685 KnownLHS, TLO, Depth + 1))
42686 return true;
42687 if (SimplifyDemandedBits(RHS, DemandedMaskRHS, OriginalDemandedElts,
42688 KnownRHS, TLO, Depth + 1))
42689 return true;
42690
42691 // PMULUDQ(X,1) -> AND(X,(1<<32)-1) 'getZeroExtendInReg'.
42692 KnownRHS = KnownRHS.trunc(32);
42693 if (Opc == X86ISD::PMULUDQ && KnownRHS.isConstant() &&
42694 KnownRHS.getConstant().isOne()) {
42695 SDLoc DL(Op);
42696 SDValue Mask = TLO.DAG.getConstant(DemandedMask, DL, VT);
42697 return TLO.CombineTo(Op, TLO.DAG.getNode(ISD::AND, DL, VT, LHS, Mask));
42698 }
42699
42700 // Aggressively peek through ops to get at the demanded low bits.
42701 SDValue DemandedLHS = SimplifyMultipleUseDemandedBits(
42702 LHS, DemandedMaskLHS, OriginalDemandedElts, TLO.DAG, Depth + 1);
42703 SDValue DemandedRHS = SimplifyMultipleUseDemandedBits(
42704 RHS, DemandedMaskRHS, OriginalDemandedElts, TLO.DAG, Depth + 1);
42705 if (DemandedLHS || DemandedRHS) {
42706 DemandedLHS = DemandedLHS ? DemandedLHS : LHS;
42707 DemandedRHS = DemandedRHS ? DemandedRHS : RHS;
42708 return TLO.CombineTo(
42709 Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, DemandedLHS, DemandedRHS));
42710 }
42711 break;
42712 }
42713 case X86ISD::VSHLI: {
42714 SDValue Op0 = Op.getOperand(0);
42715
42716 unsigned ShAmt = Op.getConstantOperandVal(1);
42717 if (ShAmt >= BitWidth)
42718 break;
42719
42720 APInt DemandedMask = OriginalDemandedBits.lshr(ShAmt);
42721
42722 // If this is ((X >>u C1) << ShAmt), see if we can simplify this into a
42723 // single shift. We can do this if the bottom bits (which are shifted
42724 // out) are never demanded.
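       // For example, vshli(vsrli(X, 3), 5) where the low 5 bits aren't
       // demanded simplifies to vshli(X, 2); equal shift amounts cancel.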
42725 if (Op0.getOpcode() == X86ISD::VSRLI &&
42726 OriginalDemandedBits.countTrailingZeros() >= ShAmt) {
42727 unsigned Shift2Amt = Op0.getConstantOperandVal(1);
42728 if (Shift2Amt < BitWidth) {
42729 int Diff = ShAmt - Shift2Amt;
42730 if (Diff == 0)
42731 return TLO.CombineTo(Op, Op0.getOperand(0));
42732
42733 unsigned NewOpc = Diff < 0 ? X86ISD::VSRLI : X86ISD::VSHLI;
42734 SDValue NewShift = TLO.DAG.getNode(
42735 NewOpc, SDLoc(Op), VT, Op0.getOperand(0),
42736 TLO.DAG.getTargetConstant(std::abs(Diff), SDLoc(Op), MVT::i8));
42737 return TLO.CombineTo(Op, NewShift);
42738 }
42739 }
42740
42741 // If we are only demanding sign bits then we can use the shift source directly.
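       // (e.g. if Op0 has 20 sign bits and ShAmt is 4, the top 16 bits of both
       // Op0 and the shifted value are copies of the sign bit, so Op0 can be
       // used when only those top bits are demanded.)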
42742 unsigned NumSignBits =
42743 TLO.DAG.ComputeNumSignBits(Op0, OriginalDemandedElts, Depth + 1);
42744 unsigned UpperDemandedBits =
42745 BitWidth - OriginalDemandedBits.countTrailingZeros();
42746 if (NumSignBits > ShAmt && (NumSignBits - ShAmt) >= UpperDemandedBits)
42747 return TLO.CombineTo(Op, Op0);
42748
42749 if (SimplifyDemandedBits(Op0, DemandedMask, OriginalDemandedElts, Known,
42750 TLO, Depth + 1))
42751 return true;
42752
42753 assert(!Known.hasConflict() && "Bits known to be one AND zero?");
42754 Known.Zero <<= ShAmt;
42755 Known.One <<= ShAmt;
42756
42757 // Low bits known zero.
42758 Known.Zero.setLowBits(ShAmt);
42759 return false;
42760 }
42761 case X86ISD::VSRLI: {
42762 unsigned ShAmt = Op.getConstantOperandVal(1);
42763 if (ShAmt >= BitWidth)
42764 break;
42765
42766 APInt DemandedMask = OriginalDemandedBits << ShAmt;
42767
42768 if (SimplifyDemandedBits(Op.getOperand(0), DemandedMask,
42769 OriginalDemandedElts, Known, TLO, Depth + 1))
42770 return true;
42771
42772 assert(!Known.hasConflict() && "Bits known to be one AND zero?");
42773 Known.Zero.lshrInPlace(ShAmt);
42774 Known.One.lshrInPlace(ShAmt);
42775
42776 // High bits known zero.
42777 Known.Zero.setHighBits(ShAmt);
42778 return false;
42779 }
42780 case X86ISD::VSRAI: {
42781 SDValue Op0 = Op.getOperand(0);
42782 SDValue Op1 = Op.getOperand(1);
42783
42784 unsigned ShAmt = cast<ConstantSDNode>(Op1)->getZExtValue();
42785 if (ShAmt >= BitWidth)
42786 break;
42787
42788 APInt DemandedMask = OriginalDemandedBits << ShAmt;
42789
42790 // If we just want the sign bit then we don't need to shift it.
42791 if (OriginalDemandedBits.isSignMask())
42792 return TLO.CombineTo(Op, Op0);
42793
42794 // fold (VSRAI (VSHLI X, C1), C1) --> X iff NumSignBits(X) > C1
42795 if (Op0.getOpcode() == X86ISD::VSHLI &&
42796 Op.getOperand(1) == Op0.getOperand(1)) {
42797 SDValue Op00 = Op0.getOperand(0);
42798 unsigned NumSignBits =
42799 TLO.DAG.ComputeNumSignBits(Op00, OriginalDemandedElts);
42800 if (ShAmt < NumSignBits)
42801 return TLO.CombineTo(Op, Op00);
42802 }
42803
42804 // If any of the demanded bits are produced by the sign extension, we also
42805 // demand the input sign bit.
42806 if (OriginalDemandedBits.countLeadingZeros() < ShAmt)
42807 DemandedMask.setSignBit();
42808
42809 if (SimplifyDemandedBits(Op0, DemandedMask, OriginalDemandedElts, Known,
42810 TLO, Depth + 1))
42811 return true;
42812
42813 assert(!Known.hasConflict() && "Bits known to be one AND zero?");
42814 Known.Zero.lshrInPlace(ShAmt);
42815 Known.One.lshrInPlace(ShAmt);
42816
42817 // If the input sign bit is known to be zero, or if none of the top bits
42818 // are demanded, turn this into an unsigned shift right.
42819 if (Known.Zero[BitWidth - ShAmt - 1] ||
42820 OriginalDemandedBits.countLeadingZeros() >= ShAmt)
42821 return TLO.CombineTo(
42822 Op, TLO.DAG.getNode(X86ISD::VSRLI, SDLoc(Op), VT, Op0, Op1));
42823
42824 // High bits are known one.
42825 if (Known.One[BitWidth - ShAmt - 1])
42826 Known.One.setHighBits(ShAmt);
42827 return false;
42828 }
42829 case X86ISD::BLENDV: {
42830 SDValue Sel = Op.getOperand(0);
42831 SDValue LHS = Op.getOperand(1);
42832 SDValue RHS = Op.getOperand(2);
42833
42834 APInt SignMask = APInt::getSignMask(BitWidth);
42835 SDValue NewSel = SimplifyMultipleUseDemandedBits(
42836 Sel, SignMask, OriginalDemandedElts, TLO.DAG, Depth + 1);
42837 SDValue NewLHS = SimplifyMultipleUseDemandedBits(
42838 LHS, OriginalDemandedBits, OriginalDemandedElts, TLO.DAG, Depth + 1);
42839 SDValue NewRHS = SimplifyMultipleUseDemandedBits(
42840 RHS, OriginalDemandedBits, OriginalDemandedElts, TLO.DAG, Depth + 1);
42841
42842 if (NewSel || NewLHS || NewRHS) {
42843 NewSel = NewSel ? NewSel : Sel;
42844 NewLHS = NewLHS ? NewLHS : LHS;
42845 NewRHS = NewRHS ? NewRHS : RHS;
42846 return TLO.CombineTo(Op, TLO.DAG.getNode(X86ISD::BLENDV, SDLoc(Op), VT,
42847 NewSel, NewLHS, NewRHS));
42848 }
42849 break;
42850 }
42851 case X86ISD::PEXTRB:
42852 case X86ISD::PEXTRW: {
42853 SDValue Vec = Op.getOperand(0);
42854 auto *CIdx = dyn_cast<ConstantSDNode>(Op.getOperand(1));
42855 MVT VecVT = Vec.getSimpleValueType();
42856 unsigned NumVecElts = VecVT.getVectorNumElements();
42857
42858 if (CIdx && CIdx->getAPIntValue().ult(NumVecElts)) {
42859 unsigned Idx = CIdx->getZExtValue();
42860 unsigned VecBitWidth = VecVT.getScalarSizeInBits();
42861
42862 // If we demand no bits from the vector then we must have demanded
42863 // bits from the implicit zext - simplify to zero.
42864 APInt DemandedVecBits = OriginalDemandedBits.trunc(VecBitWidth);
42865 if (DemandedVecBits == 0)
42866 return TLO.CombineTo(Op, TLO.DAG.getConstant(0, SDLoc(Op), VT));
42867
42868 APInt KnownUndef, KnownZero;
42869 APInt DemandedVecElts = APInt::getOneBitSet(NumVecElts, Idx);
42870 if (SimplifyDemandedVectorElts(Vec, DemandedVecElts, KnownUndef,
42871 KnownZero, TLO, Depth + 1))
42872 return true;
42873
42874 KnownBits KnownVec;
42875 if (SimplifyDemandedBits(Vec, DemandedVecBits, DemandedVecElts,
42876 KnownVec, TLO, Depth + 1))
42877 return true;
42878
42879 if (SDValue V = SimplifyMultipleUseDemandedBits(
42880 Vec, DemandedVecBits, DemandedVecElts, TLO.DAG, Depth + 1))
42881 return TLO.CombineTo(
42882 Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, V, Op.getOperand(1)));
42883
42884 Known = KnownVec.zext(BitWidth);
42885 return false;
42886 }
42887 break;
42888 }
42889 case X86ISD::PINSRB:
42890 case X86ISD::PINSRW: {
42891 SDValue Vec = Op.getOperand(0);
42892 SDValue Scl = Op.getOperand(1);
42893 auto *CIdx = dyn_cast<ConstantSDNode>(Op.getOperand(2));
42894 MVT VecVT = Vec.getSimpleValueType();
42895
42896 if (CIdx && CIdx->getAPIntValue().ult(VecVT.getVectorNumElements())) {
42897 unsigned Idx = CIdx->getZExtValue();
42898 if (!OriginalDemandedElts[Idx])
42899 return TLO.CombineTo(Op, Vec);
42900
42901 KnownBits KnownVec;
42902 APInt DemandedVecElts(OriginalDemandedElts);
42903 DemandedVecElts.clearBit(Idx);
42904 if (SimplifyDemandedBits(Vec, OriginalDemandedBits, DemandedVecElts,
42905 KnownVec, TLO, Depth + 1))
42906 return true;
42907
42908 KnownBits KnownScl;
42909 unsigned NumSclBits = Scl.getScalarValueSizeInBits();
42910 APInt DemandedSclBits = OriginalDemandedBits.zext(NumSclBits);
42911 if (SimplifyDemandedBits(Scl, DemandedSclBits, KnownScl, TLO, Depth + 1))
42912 return true;
42913
42914 KnownScl = KnownScl.trunc(VecVT.getScalarSizeInBits());
42915 Known = KnownBits::commonBits(KnownVec, KnownScl);
42916 return false;
42917 }
42918 break;
42919 }
42920 case X86ISD::PACKSS:
42921 // PACKSS saturates to MIN/MAX integer values. So if we only want the
42922 // sign bit then we can just ask for the source operands' sign bits.
42923 // TODO - add known bits handling.
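     // (e.g. for PACKSSWB the sign bit of each i8 result equals the sign bit
     // of its i16 source element, since signed saturation preserves the sign.)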
42924 if (OriginalDemandedBits.isSignMask()) {
42925 APInt DemandedLHS, DemandedRHS;
42926 getPackDemandedElts(VT, OriginalDemandedElts, DemandedLHS, DemandedRHS);
42927
42928 KnownBits KnownLHS, KnownRHS;
42929 APInt SignMask = APInt::getSignMask(BitWidth * 2);
42930 if (SimplifyDemandedBits(Op.getOperand(0), SignMask, DemandedLHS,
42931 KnownLHS, TLO, Depth + 1))
42932 return true;
42933 if (SimplifyDemandedBits(Op.getOperand(1), SignMask, DemandedRHS,
42934 KnownRHS, TLO, Depth + 1))
42935 return true;
42936
42937 // Attempt to avoid multi-use ops if we don't need anything from them.
42938 SDValue DemandedOp0 = SimplifyMultipleUseDemandedBits(
42939 Op.getOperand(0), SignMask, DemandedLHS, TLO.DAG, Depth + 1);
42940 SDValue DemandedOp1 = SimplifyMultipleUseDemandedBits(
42941 Op.getOperand(1), SignMask, DemandedRHS, TLO.DAG, Depth + 1);
42942 if (DemandedOp0 || DemandedOp1) {
42943 SDValue Op0 = DemandedOp0 ? DemandedOp0 : Op.getOperand(0);
42944 SDValue Op1 = DemandedOp1 ? DemandedOp1 : Op.getOperand(1);
42945 return TLO.CombineTo(Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, Op0, Op1));
42946 }
42947 }
42948 // TODO - add general PACKSS/PACKUS SimplifyDemandedBits support.
42949 break;
42950 case X86ISD::VBROADCAST: {
42951 SDValue Src = Op.getOperand(0);
42952 MVT SrcVT = Src.getSimpleValueType();
42953 APInt DemandedElts = APInt::getOneBitSet(
42954 SrcVT.isVector() ? SrcVT.getVectorNumElements() : 1, 0);
42955 if (SimplifyDemandedBits(Src, OriginalDemandedBits, DemandedElts, Known,
42956 TLO, Depth + 1))
42957 return true;
42958 // If we don't need the upper bits, attempt to narrow the broadcast source.
42959 // Don't attempt this on AVX512 as it might affect broadcast folding.
42960 // TODO: Should we attempt this for i32/i16 splats? They tend to be slower.
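     // (e.g. a broadcast of an i64 whose top 32 bits aren't demanded becomes a
     // broadcast of the truncated i32 value, bitcast back to the original type.)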
42961 if ((BitWidth == 64) && SrcVT.isScalarInteger() && !Subtarget.hasAVX512() &&
42962 OriginalDemandedBits.countLeadingZeros() >= (BitWidth / 2) &&
42963 Src->hasOneUse()) {
42964 MVT NewSrcVT = MVT::getIntegerVT(BitWidth / 2);
42965 SDValue NewSrc =
42966 TLO.DAG.getNode(ISD::TRUNCATE, SDLoc(Src), NewSrcVT, Src);
42967 MVT NewVT = MVT::getVectorVT(NewSrcVT, VT.getVectorNumElements() * 2);
42968 SDValue NewBcst =
42969 TLO.DAG.getNode(X86ISD::VBROADCAST, SDLoc(Op), NewVT, NewSrc);
42970 return TLO.CombineTo(Op, TLO.DAG.getBitcast(VT, NewBcst));
42971 }
42972 break;
42973 }
42974 case X86ISD::PCMPGT:
42975 // icmp sgt(0, R) == ashr(R, BitWidth-1).
42976 // iff we only need the sign bit then we can use R directly.
42977 if (OriginalDemandedBits.isSignMask() &&
42978 ISD::isBuildVectorAllZeros(Op.getOperand(0).getNode()))
42979 return TLO.CombineTo(Op, Op.getOperand(1));
42980 break;
42981 case X86ISD::MOVMSK: {
42982 SDValue Src = Op.getOperand(0);
42983 MVT SrcVT = Src.getSimpleValueType();
42984 unsigned SrcBits = SrcVT.getScalarSizeInBits();
42985 unsigned NumElts = SrcVT.getVectorNumElements();
42986
42987 // If we don't need the sign bits at all just return zero.
42988 if (OriginalDemandedBits.countTrailingZeros() >= NumElts)
42989 return TLO.CombineTo(Op, TLO.DAG.getConstant(0, SDLoc(Op), VT));
42990
42991 // See if we only demand bits from the lower 128-bit vector.
42992 if (SrcVT.is256BitVector() &&
42993 OriginalDemandedBits.getActiveBits() <= (NumElts / 2)) {
42994 SDValue NewSrc = extract128BitVector(Src, 0, TLO.DAG, SDLoc(Src));
42995 return TLO.CombineTo(Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, NewSrc));
42996 }
42997
42998 // Only demand the vector elements of the sign bits we need.
42999 APInt KnownUndef, KnownZero;
43000 APInt DemandedElts = OriginalDemandedBits.zextOrTrunc(NumElts);
43001 if (SimplifyDemandedVectorElts(Src, DemandedElts, KnownUndef, KnownZero,
43002 TLO, Depth + 1))
43003 return true;
43004
43005 Known.Zero = KnownZero.zext(BitWidth);
43006 Known.Zero.setHighBits(BitWidth - NumElts);
43007
43008 // MOVMSK only uses the MSB from each vector element.
43009 KnownBits KnownSrc;
43010 APInt DemandedSrcBits = APInt::getSignMask(SrcBits);
43011 if (SimplifyDemandedBits(Src, DemandedSrcBits, DemandedElts, KnownSrc, TLO,
43012 Depth + 1))
43013 return true;
43014
43015 if (KnownSrc.One[SrcBits - 1])
43016 Known.One.setLowBits(NumElts);
43017 else if (KnownSrc.Zero[SrcBits - 1])
43018 Known.Zero.setLowBits(NumElts);
43019
43020 // Attempt to avoid multi-use ops if we don't need anything from them.
43021 if (SDValue NewSrc = SimplifyMultipleUseDemandedBits(
43022 Src, DemandedSrcBits, DemandedElts, TLO.DAG, Depth + 1))
43023 return TLO.CombineTo(Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, NewSrc));
43024 return false;
43025 }
43026 case X86ISD::BEXTR:
43027 case X86ISD::BEXTRI: {
43028 SDValue Op0 = Op.getOperand(0);
43029 SDValue Op1 = Op.getOperand(1);
43030
43031 // Only bottom 16-bits of the control bits are required.
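     // (The control word encodes the extract start position in bits [7:0] and
     // the extract length in bits [15:8], as decoded below.)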
43032 if (auto *Cst1 = dyn_cast<ConstantSDNode>(Op1)) {
43033 // NOTE: SimplifyDemandedBits won't do this for constants.
43034 uint64_t Val1 = Cst1->getZExtValue();
43035 uint64_t MaskedVal1 = Val1 & 0xFFFF;
43036 if (Opc == X86ISD::BEXTR && MaskedVal1 != Val1) {
43037 SDLoc DL(Op);
43038 return TLO.CombineTo(
43039 Op, TLO.DAG.getNode(X86ISD::BEXTR, DL, VT, Op0,
43040 TLO.DAG.getConstant(MaskedVal1, DL, VT)));
43041 }
43042
43043 unsigned Shift = Cst1->getAPIntValue().extractBitsAsZExtValue(8, 0);
43044 unsigned Length = Cst1->getAPIntValue().extractBitsAsZExtValue(8, 8);
43045
43046 // If the length is 0, the result is 0.
43047 if (Length == 0) {
43048 Known.setAllZero();
43049 return false;
43050 }
43051
43052 if ((Shift + Length) <= BitWidth) {
43053 APInt DemandedMask = APInt::getBitsSet(BitWidth, Shift, Shift + Length);
43054 if (SimplifyDemandedBits(Op0, DemandedMask, Known, TLO, Depth + 1))
43055 return true;
43056
43057 Known = Known.extractBits(Length, Shift);
43058 Known = Known.zextOrTrunc(BitWidth);
43059 return false;
43060 }
43061 } else {
43062 assert(Opc == X86ISD::BEXTR && "Unexpected opcode!");
43063 KnownBits Known1;
43064 APInt DemandedMask(APInt::getLowBitsSet(BitWidth, 16));
43065 if (SimplifyDemandedBits(Op1, DemandedMask, Known1, TLO, Depth + 1))
43066 return true;
43067
43068 // If the length is 0, replace with 0.
43069 KnownBits LengthBits = Known1.extractBits(8, 8);
43070 if (LengthBits.isZero())
43071 return TLO.CombineTo(Op, TLO.DAG.getConstant(0, SDLoc(Op), VT));
43072 }
43073
43074 break;
43075 }
43076 case X86ISD::PDEP: {
43077 SDValue Op0 = Op.getOperand(0);
43078 SDValue Op1 = Op.getOperand(1);
43079
43080 unsigned DemandedBitsLZ = OriginalDemandedBits.countLeadingZeros();
43081 APInt LoMask = APInt::getLowBitsSet(BitWidth, BitWidth - DemandedBitsLZ);
43082
43083 // If the demanded bits have leading zeroes, we don't demand those from the
43084 // mask.
43085 if (SimplifyDemandedBits(Op1, LoMask, Known, TLO, Depth + 1))
43086 return true;
43087
43088 // The number of possible 1s in the mask determines the number of LSBs of
43089 // operand 0 used. Undemanded bits from the mask don't matter so filter
43090 // them before counting.
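     // (e.g. a mask with at most 3 possible set bits means only the low 3 bits
     // of operand 0 can ever be deposited into the result.)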
43091 KnownBits Known2;
43092 uint64_t Count = (~Known.Zero & LoMask).countPopulation();
43093 APInt DemandedMask(APInt::getLowBitsSet(BitWidth, Count));
43094 if (SimplifyDemandedBits(Op0, DemandedMask, Known2, TLO, Depth + 1))
43095 return true;
43096
43097 // Zeroes are retained from the mask, but not ones.
43098 Known.One.clearAllBits();
43099 // The result will have at least as many trailing zeros as the non-mask
43100 // operand since bits can only map to the same or higher bit position.
43101 Known.Zero.setLowBits(Known2.countMinTrailingZeros());
43102 return false;
43103 }
43104 }
43105
43106 return TargetLowering::SimplifyDemandedBitsForTargetNode(
43107 Op, OriginalDemandedBits, OriginalDemandedElts, Known, TLO, Depth);
43108}
43109
43110SDValue X86TargetLowering::SimplifyMultipleUseDemandedBitsForTargetNode(
43111 SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts,
43112 SelectionDAG &DAG, unsigned Depth) const {
43113 int NumElts = DemandedElts.getBitWidth();
43114 unsigned Opc = Op.getOpcode();
43115 EVT VT = Op.getValueType();
43116
43117 switch (Opc) {
43118 case X86ISD::PINSRB:
43119 case X86ISD::PINSRW: {
43120 // If we don't demand the inserted element, return the base vector.
43121 SDValue Vec = Op.getOperand(0);
43122 auto *CIdx = dyn_cast<ConstantSDNode>(Op.getOperand(2));
43123 MVT VecVT = Vec.getSimpleValueType();
43124 if (CIdx && CIdx->getAPIntValue().ult(VecVT.getVectorNumElements()) &&
43125 !DemandedElts[CIdx->getZExtValue()])
43126 return Vec;
43127 break;
43128 }
43129 case X86ISD::VSHLI: {
43130 // If we are only demanding sign bits then we can use the shift source
43131 // directly.
43132 SDValue Op0 = Op.getOperand(0);
43133 unsigned ShAmt = Op.getConstantOperandVal(1);
43134 unsigned BitWidth = DemandedBits.getBitWidth();
43135 unsigned NumSignBits = DAG.ComputeNumSignBits(Op0, DemandedElts, Depth + 1);
43136 unsigned UpperDemandedBits = BitWidth - DemandedBits.countTrailingZeros();
43137 if (NumSignBits > ShAmt && (NumSignBits - ShAmt) >= UpperDemandedBits)
43138 return Op0;
43139 break;
43140 }
43141 case X86ISD::VSRAI:
43142 // iff we only need the sign bit then we can use the source directly.
43143 // TODO: generalize where we only demand extended signbits.
43144 if (DemandedBits.isSignMask())
43145 return Op.getOperand(0);
43146 break;
43147 case X86ISD::PCMPGT:
43148 // icmp sgt(0, R) == ashr(R, BitWidth-1).
43149 // iff we only need the sign bit then we can use R directly.
43150 if (DemandedBits.isSignMask() &&
43151 ISD::isBuildVectorAllZeros(Op.getOperand(0).getNode()))
43152 return Op.getOperand(1);
43153 break;
43154 case X86ISD::ANDNP: {
43155 // ANDNP = (~LHS & RHS);
43156 SDValue LHS = Op.getOperand(0);
43157 SDValue RHS = Op.getOperand(1);
43158
43159 KnownBits LHSKnown = DAG.computeKnownBits(LHS, DemandedElts, Depth + 1);
43160 KnownBits RHSKnown = DAG.computeKnownBits(RHS, DemandedElts, Depth + 1);
43161
43162 // If every demanded bit is known 0 in either the LHS or the RHS, then the
43163 // (inverted) LHS bits cannot contribute to the result of the 'andn' in
43164 // this context, so return RHS.
43165 if (DemandedBits.isSubsetOf(RHSKnown.Zero | LHSKnown.Zero))
43166 return RHS;
43167 break;
43168 }
43169 }
43170
43171 APInt ShuffleUndef, ShuffleZero;
43172 SmallVector<int, 16> ShuffleMask;
43173 SmallVector<SDValue, 2> ShuffleOps;
43174 if (getTargetShuffleInputs(Op, DemandedElts, ShuffleOps, ShuffleMask,
43175 ShuffleUndef, ShuffleZero, DAG, Depth, false)) {
43176 // If all the demanded elts are from one operand and are inline,
43177 // then we can use the operand directly.
43178 int NumOps = ShuffleOps.size();
43179 if (ShuffleMask.size() == (unsigned)NumElts &&
43180 llvm::all_of(ShuffleOps, [VT](SDValue V) {
43181 return VT.getSizeInBits() == V.getValueSizeInBits();
43182 })) {
43183
43184 if (DemandedElts.isSubsetOf(ShuffleUndef))
43185 return DAG.getUNDEF(VT);
43186 if (DemandedElts.isSubsetOf(ShuffleUndef | ShuffleZero))
43187 return getZeroVector(VT.getSimpleVT(), Subtarget, DAG, SDLoc(Op));
43188
43189 // Bitmask that indicates which ops have only been accessed 'inline'.
43190 APInt IdentityOp = APInt::getAllOnes(NumOps);
43191 for (int i = 0; i != NumElts; ++i) {
43192 int M = ShuffleMask[i];
43193 if (!DemandedElts[i] || ShuffleUndef[i])
43194 continue;
43195 int OpIdx = M / NumElts;
43196 int EltIdx = M % NumElts;
43197 if (M < 0 || EltIdx != i) {
43198 IdentityOp.clearAllBits();
43199 break;
43200 }
43201 IdentityOp &= APInt::getOneBitSet(NumOps, OpIdx);
43202 if (IdentityOp == 0)
43203 break;
43204 }
43205 assert((IdentityOp == 0 || IdentityOp.countPopulation() == 1) &&
43206 "Multiple identity shuffles detected");
43207
43208 if (IdentityOp != 0)
43209 return DAG.getBitcast(VT, ShuffleOps[IdentityOp.countTrailingZeros()]);
43210 }
43211 }
43212
43213 return TargetLowering::SimplifyMultipleUseDemandedBitsForTargetNode(
43214 Op, DemandedBits, DemandedElts, DAG, Depth);
43215}
43216
43217bool X86TargetLowering::isGuaranteedNotToBeUndefOrPoisonForTargetNode(
43218 SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG,
43219 bool PoisonOnly, unsigned Depth) const {
43220 unsigned EltsBits = Op.getScalarValueSizeInBits();
43221 unsigned NumElts = DemandedElts.getBitWidth();
43222
43223 // TODO: Add more target shuffles.
43224 switch (Op.getOpcode()) {
43225 case X86ISD::PSHUFD:
43226 case X86ISD::VPERMILPI: {
43227 SmallVector<int, 8> Mask;
43228 DecodePSHUFMask(NumElts, EltsBits, Op.getConstantOperandVal(1), Mask);
43229
43230 APInt DemandedSrcElts = APInt::getZero(NumElts);
43231 for (unsigned I = 0; I != NumElts; ++I)
43232 if (DemandedElts[I])
43233 DemandedSrcElts.setBit(Mask[I]);
43234
43235 return DAG.isGuaranteedNotToBeUndefOrPoison(
43236 Op.getOperand(0), DemandedSrcElts, PoisonOnly, Depth + 1);
43237 }
43238 }
43239 return TargetLowering::isGuaranteedNotToBeUndefOrPoisonForTargetNode(
43240 Op, DemandedElts, DAG, PoisonOnly, Depth);
43241}
43242
43243bool X86TargetLowering::canCreateUndefOrPoisonForTargetNode(
43244 SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG,
43245 bool PoisonOnly, bool ConsiderFlags, unsigned Depth) const {
43246
43247 // TODO: Add more target shuffles.
43248 switch (Op.getOpcode()) {
43249 case X86ISD::PSHUFD:
43250 case X86ISD::VPERMILPI:
43251 return false;
43252 }
43253 return TargetLowering::canCreateUndefOrPoisonForTargetNode(
43254 Op, DemandedElts, DAG, PoisonOnly, ConsiderFlags, Depth);
43255}
43256
43257bool X86TargetLowering::isSplatValueForTargetNode(SDValue Op,
43258 const APInt &DemandedElts,
43259 APInt &UndefElts,
43260 const SelectionDAG &DAG,
43261 unsigned Depth) const {
43262 unsigned NumElts = DemandedElts.getBitWidth();
43263 unsigned Opc = Op.getOpcode();
43264
43265 switch (Opc) {
43266 case X86ISD::VBROADCAST:
43267 case X86ISD::VBROADCAST_LOAD:
43268 UndefElts = APInt::getNullValue(NumElts);
43269 return true;
43270 }
43271
43272 return TargetLowering::isSplatValueForTargetNode(Op, DemandedElts, UndefElts,
43273 DAG, Depth);
43274}
43275
43276// Helper to peek through bitops/trunc/setcc to determine size of source vector.
43277// Allows combineBitcastvxi1 to determine what size vector generated a <X x i1>.
43278static bool checkBitcastSrcVectorSize(SDValue Src, unsigned Size,
43279 bool AllowTruncate) {
43280 switch (Src.getOpcode()) {
43281 case ISD::TRUNCATE:
43282 if (!AllowTruncate)
43283 return false;
43284 [[fallthrough]];
43285 case ISD::SETCC:
43286 return Src.getOperand(0).getValueSizeInBits() == Size;
43287 case ISD::AND:
43288 case ISD::XOR:
43289 case ISD::OR:
43290 return checkBitcastSrcVectorSize(Src.getOperand(0), Size, AllowTruncate) &&
43291 checkBitcastSrcVectorSize(Src.getOperand(1), Size, AllowTruncate);
43292 case ISD::VSELECT:
43293 return Src.getOperand(0).getScalarValueSizeInBits() == 1 &&
43294 checkBitcastSrcVectorSize(Src.getOperand(1), Size, AllowTruncate) &&
43295 checkBitcastSrcVectorSize(Src.getOperand(2), Size, AllowTruncate);
43296 case ISD::BUILD_VECTOR:
43297 return ISD::isBuildVectorAllZeros(Src.getNode());
43298
43299 }
43300 return false;
43301}
43302
43303// Helper to flip between AND/OR/XOR opcodes and their X86ISD FP equivalents.
43304static unsigned getAltBitOpcode(unsigned Opcode) {
43305 switch(Opcode) {
43306 case ISD::AND: return X86ISD::FAND;
43307 case ISD::OR: return X86ISD::FOR;
43308 case ISD::XOR: return X86ISD::FXOR;
43309 case X86ISD::ANDNP: return X86ISD::FANDN;
43310 }
43311 llvm_unreachable("Unknown bitwise opcode")::llvm::llvm_unreachable_internal("Unknown bitwise opcode", "llvm/lib/Target/X86/X86ISelLowering.cpp"
, 43311)
;
43312}
43313
43314// Helper to adjust v4i32 MOVMSK expansion to work with SSE1-only targets.
43315static SDValue adjustBitcastSrcVectorSSE1(SelectionDAG &DAG, SDValue Src,
43316 const SDLoc &DL) {
43317 EVT SrcVT = Src.getValueType();
43318 if (SrcVT != MVT::v4i1)
43319 return SDValue();
43320
43321 switch (Src.getOpcode()) {
43322 case ISD::SETCC:
43323 if (Src.getOperand(0).getValueType() == MVT::v4i32 &&
43324 ISD::isBuildVectorAllZeros(Src.getOperand(1).getNode()) &&
43325 cast<CondCodeSDNode>(Src.getOperand(2))->get() == ISD::SETLT) {
43326 SDValue Op0 = Src.getOperand(0);
43327 if (ISD::isNormalLoad(Op0.getNode()))
43328 return DAG.getBitcast(MVT::v4f32, Op0);
43329 if (Op0.getOpcode() == ISD::BITCAST &&
43330 Op0.getOperand(0).getValueType() == MVT::v4f32)
43331 return Op0.getOperand(0);
43332 }
43333 break;
43334 case ISD::AND:
43335 case ISD::XOR:
43336 case ISD::OR: {
43337 SDValue Op0 = adjustBitcastSrcVectorSSE1(DAG, Src.getOperand(0), DL);
43338 SDValue Op1 = adjustBitcastSrcVectorSSE1(DAG, Src.getOperand(1), DL);
43339 if (Op0 && Op1)
43340 return DAG.getNode(getAltBitOpcode(Src.getOpcode()), DL, MVT::v4f32, Op0,
43341 Op1);
43342 break;
43343 }
43344 }
43345 return SDValue();
43346}
43347
43348// Helper to push sign extension of vXi1 SETCC result through bitops.
43349static SDValue signExtendBitcastSrcVector(SelectionDAG &DAG, EVT SExtVT,
43350 SDValue Src, const SDLoc &DL) {
43351 switch (Src.getOpcode()) {
43352 case ISD::SETCC:
43353 case ISD::TRUNCATE:
43354 case ISD::BUILD_VECTOR:
43355 return DAG.getNode(ISD::SIGN_EXTEND, DL, SExtVT, Src);
43356 case ISD::AND:
43357 case ISD::XOR:
43358 case ISD::OR:
43359 return DAG.getNode(
43360 Src.getOpcode(), DL, SExtVT,
43361 signExtendBitcastSrcVector(DAG, SExtVT, Src.getOperand(0), DL),
43362 signExtendBitcastSrcVector(DAG, SExtVT, Src.getOperand(1), DL));
43363 case ISD::VSELECT:
43364 return DAG.getSelect(
43365 DL, SExtVT, Src.getOperand(0),
43366 signExtendBitcastSrcVector(DAG, SExtVT, Src.getOperand(1), DL),
43367 signExtendBitcastSrcVector(DAG, SExtVT, Src.getOperand(2), DL));
43368 }
43369 llvm_unreachable("Unexpected node type for vXi1 sign extension")::llvm::llvm_unreachable_internal("Unexpected node type for vXi1 sign extension"
, "llvm/lib/Target/X86/X86ISelLowering.cpp", 43369)
;
43370}
43371
43372// Try to match patterns such as
43373// (i16 bitcast (v16i1 x))
43374// ->
43375// (i16 movmsk (16i8 sext (v16i1 x)))
43376// before the illegal vector is scalarized on subtargets that don't have legal
43377// vxi1 types.
43378static SDValue combineBitcastvxi1(SelectionDAG &DAG, EVT VT, SDValue Src,
43379 const SDLoc &DL,
43380 const X86Subtarget &Subtarget) {
43381 EVT SrcVT = Src.getValueType();
43382 if (!SrcVT.isSimple() || SrcVT.getScalarType() != MVT::i1)
43383 return SDValue();
43384
43385 // Recognize the IR pattern for the movmsk intrinsic under SSE1 before type
43386 // legalization destroys the v4i32 type.
43387 if (Subtarget.hasSSE1() && !Subtarget.hasSSE2()) {
43388 if (SDValue V = adjustBitcastSrcVectorSSE1(DAG, Src, DL)) {
43389 V = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32,
43390 DAG.getBitcast(MVT::v4f32, V));
43391 return DAG.getZExtOrTrunc(V, DL, VT);
43392 }
43393 }
43394
43395 // If the input is a truncate from v16i8, v32i8 or v64i8 go ahead and use a
43396 // movmskb even with avx512. This will be better than truncating to vXi1 and
43397 // using a kmov. This can especially help KNL if the input is a v16i8/v32i8
43398 // vpcmpeqb/vpcmpgtb.
43399 bool PreferMovMsk = Src.getOpcode() == ISD::TRUNCATE && Src.hasOneUse() &&
43400 (Src.getOperand(0).getValueType() == MVT::v16i8 ||
43401 Src.getOperand(0).getValueType() == MVT::v32i8 ||
43402 Src.getOperand(0).getValueType() == MVT::v64i8);
43403
43404 // Prefer movmsk for AVX512 for (bitcast (setlt X, 0)) which can be handled
43405 // directly with vpmovmskb/vmovmskps/vmovmskpd.
43406 if (Src.getOpcode() == ISD::SETCC && Src.hasOneUse() &&
43407 cast<CondCodeSDNode>(Src.getOperand(2))->get() == ISD::SETLT &&
43408 ISD::isBuildVectorAllZeros(Src.getOperand(1).getNode())) {
43409 EVT CmpVT = Src.getOperand(0).getValueType();
43410 EVT EltVT = CmpVT.getVectorElementType();
43411 if (CmpVT.getSizeInBits() <= 256 &&
43412 (EltVT == MVT::i8 || EltVT == MVT::i32 || EltVT == MVT::i64))
43413 PreferMovMsk = true;
43414 }
43415
43416 // With AVX512 vxi1 types are legal and we prefer using k-regs.
43417 // MOVMSK is supported in SSE2 or later.
43418 if (!Subtarget.hasSSE2() || (Subtarget.hasAVX512() && !PreferMovMsk))
43419 return SDValue();
43420
43421 // There are MOVMSK flavors for types v16i8, v32i8, v4f32, v8f32, v2f64 and
43422 // v4f64. So all legal 128-bit and 256-bit vectors are covered except for
43423 // v8i16 and v16i16.
43424 // For these two cases, we can shuffle the upper element bytes to a
43425 // consecutive sequence at the start of the vector and treat the results as
43426 // v16i8 or v32i8, and for v16i8 this is the preferable solution. However,
43427 // for v16i16 this is not the case, because the shuffle is expensive, so we
43428 // avoid sign-extending to this type entirely.
43429 // For example, t0 := (v8i16 sext(v8i1 x)) needs to be shuffled as:
43430 // (v16i8 shuffle <0,2,4,6,8,10,12,14,u,u,...,u> (v16i8 bitcast t0), undef)
43431 MVT SExtVT;
43432 bool PropagateSExt = false;
43433 switch (SrcVT.getSimpleVT().SimpleTy) {
43434 default:
43435 return SDValue();
43436 case MVT::v2i1:
43437 SExtVT = MVT::v2i64;
43438 break;
43439 case MVT::v4i1:
43440 SExtVT = MVT::v4i32;
43441 // For cases such as (i4 bitcast (v4i1 setcc v4i64 v1, v2))
43442 // sign-extend to a 256-bit operation to avoid truncation.
43443 if (Subtarget.hasAVX() &&
43444 checkBitcastSrcVectorSize(Src, 256, Subtarget.hasAVX2())) {
43445 SExtVT = MVT::v4i64;
43446 PropagateSExt = true;
43447 }
43448 break;
43449 case MVT::v8i1:
43450 SExtVT = MVT::v8i16;
43451 // For cases such as (i8 bitcast (v8i1 setcc v8i32 v1, v2)),
43452 // sign-extend to a 256-bit operation to match the compare.
43453 // If the setcc operand is 128-bit, prefer sign-extending to 128-bit over
43454 // 256-bit because the shuffle is cheaper than sign extending the result of
43455 // the compare.
43456 if (Subtarget.hasAVX() && (checkBitcastSrcVectorSize(Src, 256, true) ||
43457 checkBitcastSrcVectorSize(Src, 512, true))) {
43458 SExtVT = MVT::v8i32;
43459 PropagateSExt = true;
43460 }
43461 break;
43462 case MVT::v16i1:
43463 SExtVT = MVT::v16i8;
43464 // For the case (i16 bitcast (v16i1 setcc v16i16 v1, v2)),
43465 // it is not profitable to sign-extend to 256-bit because this will
43466 // require an extra cross-lane shuffle which is more expensive than
43467 // truncating the result of the compare to 128-bits.
43468 break;
43469 case MVT::v32i1:
43470 SExtVT = MVT::v32i8;
43471 break;
43472 case MVT::v64i1:
43473    // If we have AVX512F but not AVX512BW, the input must be the truncate from
43474    // v64i8 checked earlier; split the input and emit two PMOVMSKBs.
43475 if (Subtarget.hasAVX512()) {
43476 if (Subtarget.hasBWI())
43477 return SDValue();
43478 SExtVT = MVT::v64i8;
43479 break;
43480 }
43481 // Split if this is a <64 x i8> comparison result.
43482 if (checkBitcastSrcVectorSize(Src, 512, false)) {
43483 SExtVT = MVT::v64i8;
43484 break;
43485 }
43486 return SDValue();
43487 };
43488
43489 SDValue V = PropagateSExt ? signExtendBitcastSrcVector(DAG, SExtVT, Src, DL)
43490 : DAG.getNode(ISD::SIGN_EXTEND, DL, SExtVT, Src);
43491
43492 if (SExtVT == MVT::v16i8 || SExtVT == MVT::v32i8 || SExtVT == MVT::v64i8) {
43493 V = getPMOVMSKB(DL, V, DAG, Subtarget);
43494 } else {
43495 if (SExtVT == MVT::v8i16)
43496 V = DAG.getNode(X86ISD::PACKSS, DL, MVT::v16i8, V,
43497 DAG.getUNDEF(MVT::v8i16));
43498 V = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, V);
43499 }
43500
43501 EVT IntVT =
43502 EVT::getIntegerVT(*DAG.getContext(), SrcVT.getVectorNumElements());
43503 V = DAG.getZExtOrTrunc(V, DL, IntVT);
43504 return DAG.getBitcast(VT, V);
43505}
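// Illustrative sketch of the combine above, assuming AVX and a 256-bit compare:
//   (i8 bitcast (v8i1 setcc (v8i32 a), (v8i32 b), setlt))
// is rewritten to roughly
//   (i8 trunc (i32 X86ISD::MOVMSK (v8i32 sign-extended compare)))
// which should select to a single vmovmskps-style instruction instead of
// scalarizing the v8i1 value.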
43506
43507// Convert a vXi1 constant build vector to the same width scalar integer.
43508static SDValue combinevXi1ConstantToInteger(SDValue Op, SelectionDAG &DAG) {
43509 EVT SrcVT = Op.getValueType();
43510  assert(SrcVT.getVectorElementType() == MVT::i1 &&
43511         "Expected a vXi1 vector");
43512  assert(ISD::isBuildVectorOfConstantSDNodes(Op.getNode()) &&
43513         "Expected a constant build vector");
43514
43515 APInt Imm(SrcVT.getVectorNumElements(), 0);
43516 for (unsigned Idx = 0, e = Op.getNumOperands(); Idx < e; ++Idx) {
43517 SDValue In = Op.getOperand(Idx);
43518 if (!In.isUndef() && (cast<ConstantSDNode>(In)->getZExtValue() & 0x1))
43519 Imm.setBit(Idx);
43520 }
43521 EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), Imm.getBitWidth());
43522 return DAG.getConstant(Imm, SDLoc(Op), IntVT);
43523}
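// Sketch: (v4i1 build_vector 1, 0, 1, undef) becomes the i4 constant 0b0101,
// with undef elements treated as 0.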
43524
43525static SDValue combineCastedMaskArithmetic(SDNode *N, SelectionDAG &DAG,
43526 TargetLowering::DAGCombinerInfo &DCI,
43527 const X86Subtarget &Subtarget) {
43528  assert(N->getOpcode() == ISD::BITCAST && "Expected a bitcast");
43529
43530 if (!DCI.isBeforeLegalizeOps())
43531 return SDValue();
43532
43533 // Only do this if we have k-registers.
43534 if (!Subtarget.hasAVX512())
43535 return SDValue();
43536
43537 EVT DstVT = N->getValueType(0);
43538 SDValue Op = N->getOperand(0);
43539 EVT SrcVT = Op.getValueType();
43540
43541 if (!Op.hasOneUse())
43542 return SDValue();
43543
43544 // Look for logic ops.
43545 if (Op.getOpcode() != ISD::AND &&
43546 Op.getOpcode() != ISD::OR &&
43547 Op.getOpcode() != ISD::XOR)
43548 return SDValue();
43549
43550 // Make sure we have a bitcast between mask registers and a scalar type.
43551 if (!(SrcVT.isVector() && SrcVT.getVectorElementType() == MVT::i1 &&
43552 DstVT.isScalarInteger()) &&
43553 !(DstVT.isVector() && DstVT.getVectorElementType() == MVT::i1 &&
43554 SrcVT.isScalarInteger()))
43555 return SDValue();
43556
43557 SDValue LHS = Op.getOperand(0);
43558 SDValue RHS = Op.getOperand(1);
43559
43560 if (LHS.hasOneUse() && LHS.getOpcode() == ISD::BITCAST &&
43561 LHS.getOperand(0).getValueType() == DstVT)
43562 return DAG.getNode(Op.getOpcode(), SDLoc(N), DstVT, LHS.getOperand(0),
43563 DAG.getBitcast(DstVT, RHS));
43564
43565 if (RHS.hasOneUse() && RHS.getOpcode() == ISD::BITCAST &&
43566 RHS.getOperand(0).getValueType() == DstVT)
43567 return DAG.getNode(Op.getOpcode(), SDLoc(N), DstVT,
43568 DAG.getBitcast(DstVT, LHS), RHS.getOperand(0));
43569
43570 // If the RHS is a vXi1 build vector, this is a good reason to flip too.
43571 // Most of these have to move a constant from the scalar domain anyway.
43572 if (ISD::isBuildVectorOfConstantSDNodes(RHS.getNode())) {
43573 RHS = combinevXi1ConstantToInteger(RHS, DAG);
43574 return DAG.getNode(Op.getOpcode(), SDLoc(N), DstVT,
43575 DAG.getBitcast(DstVT, LHS), RHS);
43576 }
43577
43578 return SDValue();
43579}
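// Illustrative sketch of the combine above (AVX512 assumed):
//   (i16 bitcast (and (v16i1 bitcast (i16 X)), (v16i1 Y)))
// is rewritten to
//   (and (i16 X), (i16 bitcast (v16i1 Y)))
// so the logic op stays in the scalar domain and one GPR<->k-register
// crossing is removed.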
43580
43581static SDValue createMMXBuildVector(BuildVectorSDNode *BV, SelectionDAG &DAG,
43582 const X86Subtarget &Subtarget) {
43583 SDLoc DL(BV);
43584 unsigned NumElts = BV->getNumOperands();
43585 SDValue Splat = BV->getSplatValue();
43586
43587 // Build MMX element from integer GPR or SSE float values.
43588 auto CreateMMXElement = [&](SDValue V) {
43589 if (V.isUndef())
43590 return DAG.getUNDEF(MVT::x86mmx);
43591 if (V.getValueType().isFloatingPoint()) {
43592 if (Subtarget.hasSSE1() && !isa<ConstantFPSDNode>(V)) {
43593 V = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v4f32, V);
43594 V = DAG.getBitcast(MVT::v2i64, V);
43595 return DAG.getNode(X86ISD::MOVDQ2Q, DL, MVT::x86mmx, V);
43596 }
43597 V = DAG.getBitcast(MVT::i32, V);
43598 } else {
43599 V = DAG.getAnyExtOrTrunc(V, DL, MVT::i32);
43600 }
43601 return DAG.getNode(X86ISD::MMX_MOVW2D, DL, MVT::x86mmx, V);
43602 };
43603
43604 // Convert build vector ops to MMX data in the bottom elements.
43605 SmallVector<SDValue, 8> Ops;
43606
43607 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
43608
43609 // Broadcast - use (PUNPCKL+)PSHUFW to broadcast single element.
43610 if (Splat) {
43611 if (Splat.isUndef())
43612 return DAG.getUNDEF(MVT::x86mmx);
43613
43614 Splat = CreateMMXElement(Splat);
43615
43616 if (Subtarget.hasSSE1()) {
43617 // Unpack v8i8 to splat i8 elements to lowest 16-bits.
43618 if (NumElts == 8)
43619 Splat = DAG.getNode(
43620 ISD::INTRINSIC_WO_CHAIN, DL, MVT::x86mmx,
43621 DAG.getTargetConstant(Intrinsic::x86_mmx_punpcklbw, DL,
43622 TLI.getPointerTy(DAG.getDataLayout())),
43623 Splat, Splat);
43624
43625 // Use PSHUFW to repeat 16-bit elements.
43626 unsigned ShufMask = (NumElts > 2 ? 0 : 0x44);
43627 return DAG.getNode(
43628 ISD::INTRINSIC_WO_CHAIN, DL, MVT::x86mmx,
43629 DAG.getTargetConstant(Intrinsic::x86_sse_pshuf_w, DL,
43630 TLI.getPointerTy(DAG.getDataLayout())),
43631 Splat, DAG.getTargetConstant(ShufMask, DL, MVT::i8));
43632 }
43633 Ops.append(NumElts, Splat);
43634 } else {
43635 for (unsigned i = 0; i != NumElts; ++i)
43636 Ops.push_back(CreateMMXElement(BV->getOperand(i)));
43637 }
43638
43639 // Use tree of PUNPCKLs to build up general MMX vector.
43640 while (Ops.size() > 1) {
43641 unsigned NumOps = Ops.size();
43642 unsigned IntrinOp =
43643 (NumOps == 2 ? Intrinsic::x86_mmx_punpckldq
43644 : (NumOps == 4 ? Intrinsic::x86_mmx_punpcklwd
43645 : Intrinsic::x86_mmx_punpcklbw));
43646 SDValue Intrin = DAG.getTargetConstant(
43647 IntrinOp, DL, TLI.getPointerTy(DAG.getDataLayout()));
43648 for (unsigned i = 0; i != NumOps; i += 2)
43649 Ops[i / 2] = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, MVT::x86mmx, Intrin,
43650 Ops[i], Ops[i + 1]);
43651 Ops.resize(NumOps / 2);
43652 }
43653
43654 return Ops[0];
43655}
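// Illustrative sketch of the splat path above for a v4i16 build vector with
// SSE1 available:
//   mm = MMX_MOVW2D(anyext of the splat value to i32)
//   mm = pshufw mm, 0x00        ; broadcast the low 16-bit element
// Non-splat inputs instead go through the PUNPCKL tree at the end.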
43656
43657// Recursive function that attempts to find if a bool vector node was originally
43658// a vector/float/double that got truncated/extended/bitcast to/from a scalar
43659// integer. If so, replace the scalar ops with bool vector equivalents back down
43660// the chain.
43661static SDValue combineBitcastToBoolVector(EVT VT, SDValue V, const SDLoc &DL,
43662 SelectionDAG &DAG,
43663 const X86Subtarget &Subtarget) {
43664 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
43665 unsigned Opc = V.getOpcode();
43666 switch (Opc) {
43667 case ISD::BITCAST: {
43668 // Bitcast from a vector/float/double, we can cheaply bitcast to VT.
43669 SDValue Src = V.getOperand(0);
43670 EVT SrcVT = Src.getValueType();
43671 if (SrcVT.isVector() || SrcVT.isFloatingPoint())
43672 return DAG.getBitcast(VT, Src);
43673 break;
43674 }
43675 case ISD::TRUNCATE: {
43676 // If we find a suitable source, a truncated scalar becomes a subvector.
43677 SDValue Src = V.getOperand(0);
43678 EVT NewSrcVT =
43679 EVT::getVectorVT(*DAG.getContext(), MVT::i1, Src.getValueSizeInBits());
43680 if (TLI.isTypeLegal(NewSrcVT))
43681 if (SDValue N0 =
43682 combineBitcastToBoolVector(NewSrcVT, Src, DL, DAG, Subtarget))
43683 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, N0,
43684 DAG.getIntPtrConstant(0, DL));
43685 break;
43686 }
43687 case ISD::ANY_EXTEND:
43688 case ISD::ZERO_EXTEND: {
43689 // If we find a suitable source, an extended scalar becomes a subvector.
43690 SDValue Src = V.getOperand(0);
43691 EVT NewSrcVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
43692 Src.getScalarValueSizeInBits());
43693 if (TLI.isTypeLegal(NewSrcVT))
43694 if (SDValue N0 =
43695 combineBitcastToBoolVector(NewSrcVT, Src, DL, DAG, Subtarget))
43696 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT,
43697 Opc == ISD::ANY_EXTEND ? DAG.getUNDEF(VT)
43698 : DAG.getConstant(0, DL, VT),
43699 N0, DAG.getIntPtrConstant(0, DL));
43700 break;
43701 }
43702 case ISD::OR: {
43703 // If we find suitable sources, we can just move an OR to the vector domain.
43704 SDValue Src0 = V.getOperand(0);
43705 SDValue Src1 = V.getOperand(1);
43706 if (SDValue N0 = combineBitcastToBoolVector(VT, Src0, DL, DAG, Subtarget))
43707 if (SDValue N1 = combineBitcastToBoolVector(VT, Src1, DL, DAG, Subtarget))
43708 return DAG.getNode(Opc, DL, VT, N0, N1);
43709 break;
43710 }
43711 case ISD::SHL: {
43712 // If we find a suitable source, a SHL becomes a KSHIFTL.
43713 SDValue Src0 = V.getOperand(0);
43714 if ((VT == MVT::v8i1 && !Subtarget.hasDQI()) ||
43715 ((VT == MVT::v32i1 || VT == MVT::v64i1) && !Subtarget.hasBWI()))
43716 break;
43717
43718 if (auto *Amt = dyn_cast<ConstantSDNode>(V.getOperand(1)))
43719 if (SDValue N0 = combineBitcastToBoolVector(VT, Src0, DL, DAG, Subtarget))
43720 return DAG.getNode(
43721 X86ISD::KSHIFTL, DL, VT, N0,
43722 DAG.getTargetConstant(Amt->getZExtValue(), DL, MVT::i8));
43723 break;
43724 }
43725 }
43726 return SDValue();
43727}
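// Sketch: with a legal v16i1 type (AVX512),
//   (v16i1 bitcast (i16 or (i16 bitcast (v16i1 X)), (i16 bitcast (v16i1 Y))))
// can be rebuilt as (v16i1 or X, Y), keeping the value in a k-register rather
// than bouncing through a GPR.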
43728
43729static SDValue combineBitcast(SDNode *N, SelectionDAG &DAG,
43730 TargetLowering::DAGCombinerInfo &DCI,
43731 const X86Subtarget &Subtarget) {
43732 SDValue N0 = N->getOperand(0);
43733 EVT VT = N->getValueType(0);
43734 EVT SrcVT = N0.getValueType();
43735 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
43736
43737 // Try to match patterns such as
43738 // (i16 bitcast (v16i1 x))
43739 // ->
43740 // (i16 movmsk (16i8 sext (v16i1 x)))
43741 // before the setcc result is scalarized on subtargets that don't have legal
43742 // vxi1 types.
43743 if (DCI.isBeforeLegalize()) {
43744 SDLoc dl(N);
43745 if (SDValue V = combineBitcastvxi1(DAG, VT, N0, dl, Subtarget))
43746 return V;
43747
43748 // If this is a bitcast between a MVT::v4i1/v2i1 and an illegal integer
43749 // type, widen both sides to avoid a trip through memory.
43750 if ((VT == MVT::v4i1 || VT == MVT::v2i1) && SrcVT.isScalarInteger() &&
43751 Subtarget.hasAVX512()) {
43752 N0 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i8, N0);
43753 N0 = DAG.getBitcast(MVT::v8i1, N0);
43754 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, N0,
43755 DAG.getIntPtrConstant(0, dl));
43756 }
43757
43758 // If this is a bitcast between a MVT::v4i1/v2i1 and an illegal integer
43759 // type, widen both sides to avoid a trip through memory.
43760 if ((SrcVT == MVT::v4i1 || SrcVT == MVT::v2i1) && VT.isScalarInteger() &&
43761 Subtarget.hasAVX512()) {
43762 // Use zeros for the widening if we already have some zeroes. This can
43763 // allow SimplifyDemandedBits to remove scalar ANDs that may be down
43764 // stream of this.
43765 // FIXME: It might make sense to detect a concat_vectors with a mix of
43766 // zeroes and undef and turn it into insert_subvector for i1 vectors as
43767 // a separate combine. What we can't do is canonicalize the operands of
43768 // such a concat or we'll get into a loop with SimplifyDemandedBits.
43769 if (N0.getOpcode() == ISD::CONCAT_VECTORS) {
43770 SDValue LastOp = N0.getOperand(N0.getNumOperands() - 1);
43771 if (ISD::isBuildVectorAllZeros(LastOp.getNode())) {
43772 SrcVT = LastOp.getValueType();
43773 unsigned NumConcats = 8 / SrcVT.getVectorNumElements();
43774 SmallVector<SDValue, 4> Ops(N0->op_begin(), N0->op_end());
43775 Ops.resize(NumConcats, DAG.getConstant(0, dl, SrcVT));
43776 N0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i1, Ops);
43777 N0 = DAG.getBitcast(MVT::i8, N0);
43778 return DAG.getNode(ISD::TRUNCATE, dl, VT, N0);
43779 }
43780 }
43781
43782 unsigned NumConcats = 8 / SrcVT.getVectorNumElements();
43783 SmallVector<SDValue, 4> Ops(NumConcats, DAG.getUNDEF(SrcVT));
43784 Ops[0] = N0;
43785 N0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i1, Ops);
43786 N0 = DAG.getBitcast(MVT::i8, N0);
43787 return DAG.getNode(ISD::TRUNCATE, dl, VT, N0);
43788 }
43789 } else {
43790 // If we're bitcasting from iX to vXi1, see if the integer originally
43791 // began as a vXi1 and whether we can remove the bitcast entirely.
43792 if (VT.isVector() && VT.getScalarType() == MVT::i1 &&
43793 SrcVT.isScalarInteger() && TLI.isTypeLegal(VT)) {
43794 if (SDValue V =
43795 combineBitcastToBoolVector(VT, N0, SDLoc(N), DAG, Subtarget))
43796 return V;
43797 }
43798 }
43799
43800 // Look for (i8 (bitcast (v8i1 (extract_subvector (v16i1 X), 0)))) and
43801 // replace with (i8 (trunc (i16 (bitcast (v16i1 X))))). This can occur
43802 // due to insert_subvector legalization on KNL. By promoting the copy to i16
43803 // we can help with known bits propagation from the vXi1 domain to the
43804 // scalar domain.
43805 if (VT == MVT::i8 && SrcVT == MVT::v8i1 && Subtarget.hasAVX512() &&
43806 !Subtarget.hasDQI() && N0.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
43807 N0.getOperand(0).getValueType() == MVT::v16i1 &&
43808 isNullConstant(N0.getOperand(1)))
43809 return DAG.getNode(ISD::TRUNCATE, SDLoc(N), VT,
43810 DAG.getBitcast(MVT::i16, N0.getOperand(0)));
43811
43812 // Canonicalize (bitcast (vbroadcast_load)) so that the output of the bitcast
43813 // and the vbroadcast_load are both integer or both fp. In some cases this
43814 // will remove the bitcast entirely.
43815 if (N0.getOpcode() == X86ISD::VBROADCAST_LOAD && N0.hasOneUse() &&
43816 VT.isFloatingPoint() != SrcVT.isFloatingPoint() && VT.isVector()) {
43817 auto *BCast = cast<MemIntrinsicSDNode>(N0);
43818 unsigned SrcVTSize = SrcVT.getScalarSizeInBits();
43819 unsigned MemSize = BCast->getMemoryVT().getScalarSizeInBits();
43820    // Don't swap i8/i16 since we don't have fp types of that size.
43821 if (MemSize >= 32) {
43822 MVT MemVT = VT.isFloatingPoint() ? MVT::getFloatingPointVT(MemSize)
43823 : MVT::getIntegerVT(MemSize);
43824 MVT LoadVT = VT.isFloatingPoint() ? MVT::getFloatingPointVT(SrcVTSize)
43825 : MVT::getIntegerVT(SrcVTSize);
43826 LoadVT = MVT::getVectorVT(LoadVT, SrcVT.getVectorNumElements());
43827
43828 SDVTList Tys = DAG.getVTList(LoadVT, MVT::Other);
43829 SDValue Ops[] = { BCast->getChain(), BCast->getBasePtr() };
43830 SDValue ResNode =
43831 DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, SDLoc(N), Tys, Ops,
43832 MemVT, BCast->getMemOperand());
43833 DAG.ReplaceAllUsesOfValueWith(SDValue(BCast, 1), ResNode.getValue(1));
43834 return DAG.getBitcast(VT, ResNode);
43835 }
43836 }
43837
43838 // Since MMX types are special and don't usually play with other vector types,
43839 // it's better to handle them early to be sure we emit efficient code by
43840 // avoiding store-load conversions.
43841 if (VT == MVT::x86mmx) {
43842 // Detect MMX constant vectors.
43843 APInt UndefElts;
43844 SmallVector<APInt, 1> EltBits;
43845 if (getTargetConstantBitsFromNode(N0, 64, UndefElts, EltBits)) {
43846 SDLoc DL(N0);
43847 // Handle zero-extension of i32 with MOVD.
43848 if (EltBits[0].countLeadingZeros() >= 32)
43849 return DAG.getNode(X86ISD::MMX_MOVW2D, DL, VT,
43850 DAG.getConstant(EltBits[0].trunc(32), DL, MVT::i32));
43851 // Else, bitcast to a double.
43852 // TODO - investigate supporting sext 32-bit immediates on x86_64.
43853 APFloat F64(APFloat::IEEEdouble(), EltBits[0]);
43854 return DAG.getBitcast(VT, DAG.getConstantFP(F64, DL, MVT::f64));
43855 }
43856
43857 // Detect bitcasts to x86mmx low word.
43858 if (N0.getOpcode() == ISD::BUILD_VECTOR &&
43859 (SrcVT == MVT::v2i32 || SrcVT == MVT::v4i16 || SrcVT == MVT::v8i8) &&
43860 N0.getOperand(0).getValueType() == SrcVT.getScalarType()) {
43861 bool LowUndef = true, AllUndefOrZero = true;
43862 for (unsigned i = 1, e = SrcVT.getVectorNumElements(); i != e; ++i) {
43863 SDValue Op = N0.getOperand(i);
43864 LowUndef &= Op.isUndef() || (i >= e/2);
43865 AllUndefOrZero &= (Op.isUndef() || isNullConstant(Op));
43866 }
43867 if (AllUndefOrZero) {
43868 SDValue N00 = N0.getOperand(0);
43869 SDLoc dl(N00);
43870 N00 = LowUndef ? DAG.getAnyExtOrTrunc(N00, dl, MVT::i32)
43871 : DAG.getZExtOrTrunc(N00, dl, MVT::i32);
43872 return DAG.getNode(X86ISD::MMX_MOVW2D, dl, VT, N00);
43873 }
43874 }
43875
43876 // Detect bitcasts of 64-bit build vectors and convert to a
43877 // MMX UNPCK/PSHUFW which takes MMX type inputs with the value in the
43878 // lowest element.
43879 if (N0.getOpcode() == ISD::BUILD_VECTOR &&
43880 (SrcVT == MVT::v2f32 || SrcVT == MVT::v2i32 || SrcVT == MVT::v4i16 ||
43881 SrcVT == MVT::v8i8))
43882 return createMMXBuildVector(cast<BuildVectorSDNode>(N0), DAG, Subtarget);
43883
43884 // Detect bitcasts between element or subvector extraction to x86mmx.
43885 if ((N0.getOpcode() == ISD::EXTRACT_VECTOR_ELT ||
43886 N0.getOpcode() == ISD::EXTRACT_SUBVECTOR) &&
43887 isNullConstant(N0.getOperand(1))) {
43888 SDValue N00 = N0.getOperand(0);
43889 if (N00.getValueType().is128BitVector())
43890 return DAG.getNode(X86ISD::MOVDQ2Q, SDLoc(N00), VT,
43891 DAG.getBitcast(MVT::v2i64, N00));
43892 }
43893
43894 // Detect bitcasts from FP_TO_SINT to x86mmx.
43895 if (SrcVT == MVT::v2i32 && N0.getOpcode() == ISD::FP_TO_SINT) {
43896 SDLoc DL(N0);
43897 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4i32, N0,
43898 DAG.getUNDEF(MVT::v2i32));
43899 return DAG.getNode(X86ISD::MOVDQ2Q, DL, VT,
43900 DAG.getBitcast(MVT::v2i64, Res));
43901 }
43902 }
43903
43904 // Try to remove a bitcast of constant vXi1 vector. We have to legalize
43905 // most of these to scalar anyway.
43906 if (Subtarget.hasAVX512() && VT.isScalarInteger() &&
43907 SrcVT.isVector() && SrcVT.getVectorElementType() == MVT::i1 &&
43908 ISD::isBuildVectorOfConstantSDNodes(N0.getNode())) {
43909 return combinevXi1ConstantToInteger(N0, DAG);
43910 }
43911
43912 if (Subtarget.hasAVX512() && SrcVT.isScalarInteger() &&
43913 VT.isVector() && VT.getVectorElementType() == MVT::i1 &&
43914 isa<ConstantSDNode>(N0)) {
43915 auto *C = cast<ConstantSDNode>(N0);
43916 if (C->isAllOnes())
43917 return DAG.getConstant(1, SDLoc(N0), VT);
43918 if (C->isZero())
43919 return DAG.getConstant(0, SDLoc(N0), VT);
43920 }
43921
43922 // Look for MOVMSK that is maybe truncated and then bitcasted to vXi1.
43923 // Turn it into a sign bit compare that produces a k-register. This avoids
43924 // a trip through a GPR.
43925 if (Subtarget.hasAVX512() && SrcVT.isScalarInteger() &&
43926 VT.isVector() && VT.getVectorElementType() == MVT::i1 &&
43927 isPowerOf2_32(VT.getVectorNumElements())) {
43928 unsigned NumElts = VT.getVectorNumElements();
43929 SDValue Src = N0;
43930
43931 // Peek through truncate.
43932 if (N0.getOpcode() == ISD::TRUNCATE && N0.hasOneUse())
43933 Src = N0.getOperand(0);
43934
43935 if (Src.getOpcode() == X86ISD::MOVMSK && Src.hasOneUse()) {
43936 SDValue MovmskIn = Src.getOperand(0);
43937 MVT MovmskVT = MovmskIn.getSimpleValueType();
43938 unsigned MovMskElts = MovmskVT.getVectorNumElements();
43939
43940 // We allow extra bits of the movmsk to be used since they are known zero.
43941 // We can't convert a VPMOVMSKB without avx512bw.
43942 if (MovMskElts <= NumElts &&
43943 (Subtarget.hasBWI() || MovmskVT.getVectorElementType() != MVT::i8)) {
43944 EVT IntVT = EVT(MovmskVT).changeVectorElementTypeToInteger();
43945 MovmskIn = DAG.getBitcast(IntVT, MovmskIn);
43946 SDLoc dl(N);
43947 MVT CmpVT = MVT::getVectorVT(MVT::i1, MovMskElts);
43948 SDValue Cmp = DAG.getSetCC(dl, CmpVT, MovmskIn,
43949 DAG.getConstant(0, dl, IntVT), ISD::SETLT);
43950 if (EVT(CmpVT) == VT)
43951 return Cmp;
43952
43953 // Pad with zeroes up to original VT to replace the zeroes that were
43954 // being used from the MOVMSK.
43955 unsigned NumConcats = NumElts / MovMskElts;
43956 SmallVector<SDValue, 4> Ops(NumConcats, DAG.getConstant(0, dl, CmpVT));
43957 Ops[0] = Cmp;
43958 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Ops);
43959 }
43960 }
43961 }
43962
43963 // Try to remove bitcasts from input and output of mask arithmetic to
43964 // remove GPR<->K-register crossings.
43965 if (SDValue V = combineCastedMaskArithmetic(N, DAG, DCI, Subtarget))
43966 return V;
43967
43968 // Convert a bitcasted integer logic operation that has one bitcasted
43969 // floating-point operand into a floating-point logic operation. This may
43970 // create a load of a constant, but that is cheaper than materializing the
43971 // constant in an integer register and transferring it to an SSE register or
43972 // transferring the SSE operand to integer register and back.
43973 unsigned FPOpcode;
43974 switch (N0.getOpcode()) {
43975 case ISD::AND: FPOpcode = X86ISD::FAND; break;
43976 case ISD::OR: FPOpcode = X86ISD::FOR; break;
43977 case ISD::XOR: FPOpcode = X86ISD::FXOR; break;
43978 default: return SDValue();
43979 }
43980
43981 // Check if we have a bitcast from another integer type as well.
43982 if (!((Subtarget.hasSSE1() && VT == MVT::f32) ||
43983 (Subtarget.hasSSE2() && VT == MVT::f64) ||
43984 (Subtarget.hasFP16() && VT == MVT::f16) ||
43985 (Subtarget.hasSSE2() && VT.isInteger() && VT.isVector() &&
43986 TLI.isTypeLegal(VT))))
43987 return SDValue();
43988
43989 SDValue LogicOp0 = N0.getOperand(0);
43990 SDValue LogicOp1 = N0.getOperand(1);
43991 SDLoc DL0(N0);
43992
43993 // bitcast(logic(bitcast(X), Y)) --> logic'(X, bitcast(Y))
43994 if (N0.hasOneUse() && LogicOp0.getOpcode() == ISD::BITCAST &&
43995 LogicOp0.hasOneUse() && LogicOp0.getOperand(0).hasOneUse() &&
43996 LogicOp0.getOperand(0).getValueType() == VT &&
43997 !isa<ConstantSDNode>(LogicOp0.getOperand(0))) {
43998 SDValue CastedOp1 = DAG.getBitcast(VT, LogicOp1);
43999 unsigned Opcode = VT.isFloatingPoint() ? FPOpcode : N0.getOpcode();
44000 return DAG.getNode(Opcode, DL0, VT, LogicOp0.getOperand(0), CastedOp1);
44001 }
44002 // bitcast(logic(X, bitcast(Y))) --> logic'(bitcast(X), Y)
44003 if (N0.hasOneUse() && LogicOp1.getOpcode() == ISD::BITCAST &&
44004 LogicOp1.hasOneUse() && LogicOp1.getOperand(0).hasOneUse() &&
44005 LogicOp1.getOperand(0).getValueType() == VT &&
44006 !isa<ConstantSDNode>(LogicOp1.getOperand(0))) {
44007 SDValue CastedOp0 = DAG.getBitcast(VT, LogicOp0);
44008 unsigned Opcode = VT.isFloatingPoint() ? FPOpcode : N0.getOpcode();
44009 return DAG.getNode(Opcode, DL0, VT, LogicOp1.getOperand(0), CastedOp0);
44010 }
44011
44012 return SDValue();
44013}
44014
44015// (mul (zext a), (sext b))
44016static bool detectExtMul(SelectionDAG &DAG, const SDValue &Mul, SDValue &Op0,
44017 SDValue &Op1) {
44018 Op0 = Mul.getOperand(0);
44019 Op1 = Mul.getOperand(1);
44020
44021  // Canonicalize so that Op1 is the sign-extended operand.
44022 if (Op0.getOpcode() == ISD::SIGN_EXTEND)
44023 std::swap(Op0, Op1);
44024
44025 auto IsFreeTruncation = [](SDValue &Op) -> bool {
44026 if ((Op.getOpcode() == ISD::ZERO_EXTEND ||
44027 Op.getOpcode() == ISD::SIGN_EXTEND) &&
44028 Op.getOperand(0).getScalarValueSizeInBits() <= 8)
44029 return true;
44030
44031 auto *BV = dyn_cast<BuildVectorSDNode>(Op);
44032 return (BV && BV->isConstant());
44033 };
44034
44035  // (dpbusd (zext a), (sext b)). The first operand must be an unsigned value,
44036  // so check that Op0 is a zero-extended value; Op1 must be a signed value,
44037  // so it is enough to bound its number of significant (sign) bits.
44038 if ((IsFreeTruncation(Op0) &&
44039 DAG.computeKnownBits(Op0).countMaxActiveBits() <= 8) &&
44040 (IsFreeTruncation(Op1) && DAG.ComputeMaxSignificantBits(Op1) <= 8))
44041 return true;
44042
44043 return false;
44044}
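// Sketch: this matches e.g. (mul (zext v16i8 a to v16i32), (sext v16i8 b to
// v16i32)), returning the zero-extended (unsigned) multiplicand in Op0 and the
// sign-extended one in Op1, which is the operand order VPDPBUSD expects.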
44045
44046// Given a ABS node, detect the following pattern:
44047// (ABS (SUB (ZERO_EXTEND a), (ZERO_EXTEND b))).
44048// This is useful as it is the input into a SAD pattern.
44049static bool detectZextAbsDiff(const SDValue &Abs, SDValue &Op0, SDValue &Op1) {
44050 SDValue AbsOp1 = Abs->getOperand(0);
44051 if (AbsOp1.getOpcode() != ISD::SUB)
44052 return false;
44053
44054 Op0 = AbsOp1.getOperand(0);
44055 Op1 = AbsOp1.getOperand(1);
44056
44057 // Check if the operands of the sub are zero-extended from vectors of i8.
44058 if (Op0.getOpcode() != ISD::ZERO_EXTEND ||
44059 Op0.getOperand(0).getValueType().getVectorElementType() != MVT::i8 ||
44060 Op1.getOpcode() != ISD::ZERO_EXTEND ||
44061 Op1.getOperand(0).getValueType().getVectorElementType() != MVT::i8)
44062 return false;
44063
44064 return true;
44065}
44066
44067static SDValue createVPDPBUSD(SelectionDAG &DAG, SDValue LHS, SDValue RHS,
44068 unsigned &LogBias, const SDLoc &DL,
44069 const X86Subtarget &Subtarget) {
44070 // Extend or truncate to MVT::i8 first.
44071 MVT Vi8VT =
44072 MVT::getVectorVT(MVT::i8, LHS.getValueType().getVectorElementCount());
44073 LHS = DAG.getZExtOrTrunc(LHS, DL, Vi8VT);
44074 RHS = DAG.getSExtOrTrunc(RHS, DL, Vi8VT);
44075
44076  // VPDPBUSD(<16 x i32>C, <16 x i8>A, <16 x i8>B). For each dst element i:
44077  //   C[i] += A[4i]*B[4i] + A[4i+1]*B[4i+1] + A[4i+2]*B[4i+2] + A[4i+3]*B[4i+3].
44078  // The src A, B element type is i8, but the dst C element type is i32.
44079  // When we build the reduction stages we use the vXi8 source vector type,
44080  // so we need a log-bias of 2 to avoid two extra shuffle+add stages.
44081 LogBias = 2;
44082
44083 unsigned RegSize = std::max(128u, (unsigned)Vi8VT.getSizeInBits());
44084 if (Subtarget.hasVNNI() && !Subtarget.hasVLX())
44085 RegSize = std::max(512u, RegSize);
44086
44087 // "Zero-extend" the i8 vectors. This is not a per-element zext, rather we
44088 // fill in the missing vector elements with 0.
44089 unsigned NumConcat = RegSize / Vi8VT.getSizeInBits();
44090 SmallVector<SDValue, 16> Ops(NumConcat, DAG.getConstant(0, DL, Vi8VT));
44091 Ops[0] = LHS;
44092 MVT ExtendedVT = MVT::getVectorVT(MVT::i8, RegSize / 8);
44093 SDValue DpOp0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, ExtendedVT, Ops);
44094 Ops[0] = RHS;
44095 SDValue DpOp1 = DAG.getNode(ISD::CONCAT_VECTORS, DL, ExtendedVT, Ops);
44096
44097 // Actually build the DotProduct, split as 256/512 bits for
44098 // AVXVNNI/AVX512VNNI.
44099 auto DpBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
44100 ArrayRef<SDValue> Ops) {
44101 MVT VT = MVT::getVectorVT(MVT::i32, Ops[0].getValueSizeInBits() / 32);
44102 return DAG.getNode(X86ISD::VPDPBUSD, DL, VT, Ops);
44103 };
44104 MVT DpVT = MVT::getVectorVT(MVT::i32, RegSize / 32);
44105 SDValue Zero = DAG.getConstant(0, DL, DpVT);
44106
44107 return SplitOpsAndApply(DAG, Subtarget, DL, DpVT, {Zero, DpOp0, DpOp1},
44108 DpBuilder, false);
44109}
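// Illustrative sketch, assuming AVX512VNNI without VLX and v16i8 inputs: the
// operands are widened to v64i8 by concatenating zero vectors and the node
// built is roughly
//   (v16i32 X86ISD::VPDPBUSD (v16i32 zero), (v64i8 lhs), (v64i8 rhs))
// Each i32 lane accumulates four i8*i8 products, which is where LogBias = 2
// comes from: the caller's reduction tree needs two fewer shuffle+add stages.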
44110
44111// Given two zexts of <k x i8> to <k x i32>, create a PSADBW of the inputs
44112// to these zexts.
44113static SDValue createPSADBW(SelectionDAG &DAG, const SDValue &Zext0,
44114 const SDValue &Zext1, const SDLoc &DL,
44115 const X86Subtarget &Subtarget) {
44116 // Find the appropriate width for the PSADBW.
44117 EVT InVT = Zext0.getOperand(0).getValueType();
44118 unsigned RegSize = std::max(128u, (unsigned)InVT.getSizeInBits());
44119
44120 // "Zero-extend" the i8 vectors. This is not a per-element zext, rather we
44121 // fill in the missing vector elements with 0.
44122 unsigned NumConcat = RegSize / InVT.getSizeInBits();
44123 SmallVector<SDValue, 16> Ops(NumConcat, DAG.getConstant(0, DL, InVT));
44124 Ops[0] = Zext0.getOperand(0);
44125 MVT ExtendedVT = MVT::getVectorVT(MVT::i8, RegSize / 8);
44126 SDValue SadOp0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, ExtendedVT, Ops);
44127 Ops[0] = Zext1.getOperand(0);
44128 SDValue SadOp1 = DAG.getNode(ISD::CONCAT_VECTORS, DL, ExtendedVT, Ops);
44129
44130 // Actually build the SAD, split as 128/256/512 bits for SSE/AVX2/AVX512BW.
44131 auto PSADBWBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
44132 ArrayRef<SDValue> Ops) {
44133 MVT VT = MVT::getVectorVT(MVT::i64, Ops[0].getValueSizeInBits() / 64);
44134 return DAG.getNode(X86ISD::PSADBW, DL, VT, Ops);
44135 };
44136 MVT SadVT = MVT::getVectorVT(MVT::i64, RegSize / 64);
44137 return SplitOpsAndApply(DAG, Subtarget, DL, SadVT, { SadOp0, SadOp1 },
44138 PSADBWBuilder);
44139}
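// Sketch: for two (v16i32 zext (v16i8 x)) inputs this builds
//   (v2i64 X86ISD::PSADBW (v16i8 a), (v16i8 b))
// where each i64 lane holds the sum of absolute differences of one group of
// eight byte pairs.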
44140
44141// Attempt to replace a min/max v8i16/v16i8 horizontal reduction with
44142// PHMINPOSUW.
44143static SDValue combineMinMaxReduction(SDNode *Extract, SelectionDAG &DAG,
44144 const X86Subtarget &Subtarget) {
44145 // Bail without SSE41.
44146 if (!Subtarget.hasSSE41())
44147 return SDValue();
44148
44149 EVT ExtractVT = Extract->getValueType(0);
44150 if (ExtractVT != MVT::i16 && ExtractVT != MVT::i8)
44151 return SDValue();
44152
44153 // Check for SMAX/SMIN/UMAX/UMIN horizontal reduction patterns.
44154 ISD::NodeType BinOp;
44155 SDValue Src = DAG.matchBinOpReduction(
44156 Extract, BinOp, {ISD::SMAX, ISD::SMIN, ISD::UMAX, ISD::UMIN}, true);
44157 if (!Src)
44158 return SDValue();
44159
44160 EVT SrcVT = Src.getValueType();
44161 EVT SrcSVT = SrcVT.getScalarType();
44162 if (SrcSVT != ExtractVT || (SrcVT.getSizeInBits() % 128) != 0)
44163 return SDValue();
44164
44165 SDLoc DL(Extract);
44166 SDValue MinPos = Src;
44167
44168 // First, reduce the source down to 128-bit, applying BinOp to lo/hi.
44169 while (SrcVT.getSizeInBits() > 128) {
44170 SDValue Lo, Hi;
44171 std::tie(Lo, Hi) = splitVector(MinPos, DAG, DL);
44172 SrcVT = Lo.getValueType();
44173 MinPos = DAG.getNode(BinOp, DL, SrcVT, Lo, Hi);
44174 }
44175  assert(((SrcVT == MVT::v8i16 && ExtractVT == MVT::i16) ||
44176          (SrcVT == MVT::v16i8 && ExtractVT == MVT::i8)) &&
44177         "Unexpected value type");
44178
44179 // PHMINPOSUW applies to UMIN(v8i16), for SMIN/SMAX/UMAX we must apply a mask
44180 // to flip the value accordingly.
44181 SDValue Mask;
44182 unsigned MaskEltsBits = ExtractVT.getSizeInBits();
44183 if (BinOp == ISD::SMAX)
44184 Mask = DAG.getConstant(APInt::getSignedMaxValue(MaskEltsBits), DL, SrcVT);
44185 else if (BinOp == ISD::SMIN)
44186 Mask = DAG.getConstant(APInt::getSignedMinValue(MaskEltsBits), DL, SrcVT);
44187 else if (BinOp == ISD::UMAX)
44188 Mask = DAG.getAllOnesConstant(DL, SrcVT);
44189
44190 if (Mask)
44191 MinPos = DAG.getNode(ISD::XOR, DL, SrcVT, Mask, MinPos);
44192
44193 // For v16i8 cases we need to perform UMIN on pairs of byte elements,
44194 // shuffling each upper element down and insert zeros. This means that the
44195 // v16i8 UMIN will leave the upper element as zero, performing zero-extension
44196 // ready for the PHMINPOS.
44197 if (ExtractVT == MVT::i8) {
44198 SDValue Upper = DAG.getVectorShuffle(
44199 SrcVT, DL, MinPos, DAG.getConstant(0, DL, MVT::v16i8),
44200 {1, 16, 3, 16, 5, 16, 7, 16, 9, 16, 11, 16, 13, 16, 15, 16});
44201 MinPos = DAG.getNode(ISD::UMIN, DL, SrcVT, MinPos, Upper);
44202 }
44203
44204  // Perform the PHMINPOS on a v8i16 vector.
44205 MinPos = DAG.getBitcast(MVT::v8i16, MinPos);
44206 MinPos = DAG.getNode(X86ISD::PHMINPOS, DL, MVT::v8i16, MinPos);
44207 MinPos = DAG.getBitcast(SrcVT, MinPos);
44208
44209 if (Mask)
44210 MinPos = DAG.getNode(ISD::XOR, DL, SrcVT, Mask, MinPos);
44211
44212 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ExtractVT, MinPos,
44213 DAG.getIntPtrConstant(0, DL));
44214}
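// Illustrative sketch of the smax case above for a v8i16 reduction:
//   t0 = xor x, <8 x i16 0x7fff>      ; maps signed-max onto unsigned-min
//   t1 = phminposuw t0
//   t2 = xor t1, <8 x i16 0x7fff>
//   result = extract_vector_elt t2, 0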
44215
44216// Attempt to replace an all_of/any_of/parity style horizontal reduction with a MOVMSK.
44217static SDValue combinePredicateReduction(SDNode *Extract, SelectionDAG &DAG,
44218 const X86Subtarget &Subtarget) {
44219 // Bail without SSE2.
44220 if (!Subtarget.hasSSE2())
44221 return SDValue();
44222
44223 EVT ExtractVT = Extract->getValueType(0);
44224 unsigned BitWidth = ExtractVT.getSizeInBits();
44225 if (ExtractVT != MVT::i64 && ExtractVT != MVT::i32 && ExtractVT != MVT::i16 &&
44226 ExtractVT != MVT::i8 && ExtractVT != MVT::i1)
44227 return SDValue();
44228
44229 // Check for OR(any_of)/AND(all_of)/XOR(parity) horizontal reduction patterns.
44230 ISD::NodeType BinOp;
44231 SDValue Match = DAG.matchBinOpReduction(Extract, BinOp, {ISD::OR, ISD::AND});
44232 if (!Match && ExtractVT == MVT::i1)
44233 Match = DAG.matchBinOpReduction(Extract, BinOp, {ISD::XOR});
44234 if (!Match)
44235 return SDValue();
44236
44237 // EXTRACT_VECTOR_ELT can require implicit extension of the vector element
44238 // which we can't support here for now.
44239 if (Match.getScalarValueSizeInBits() != BitWidth)
44240 return SDValue();
44241
44242 SDValue Movmsk;
44243 SDLoc DL(Extract);
44244 EVT MatchVT = Match.getValueType();
44245 unsigned NumElts = MatchVT.getVectorNumElements();
44246 unsigned MaxElts = Subtarget.hasInt256() ? 32 : 16;
44247 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
44248
44249 if (ExtractVT == MVT::i1) {
44250 // Special case for (pre-legalization) vXi1 reductions.
44251 if (NumElts > 64 || !isPowerOf2_32(NumElts))
44252 return SDValue();
44253 if (TLI.isTypeLegal(MatchVT)) {
44254 // If this is a legal AVX512 predicate type then we can just bitcast.
44255 EVT MovmskVT = EVT::getIntegerVT(*DAG.getContext(), NumElts);
44256 Movmsk = DAG.getBitcast(MovmskVT, Match);
44257 } else {
44258 // For all_of(setcc(x,y,eq)) - use PMOVMSKB(PCMPEQB()).
44259 if (BinOp == ISD::AND && Match.getOpcode() == ISD::SETCC &&
44260 cast<CondCodeSDNode>(Match.getOperand(2))->get() ==
44261 ISD::CondCode::SETEQ) {
44262 EVT VecSVT = Match.getOperand(0).getValueType().getScalarType();
44263 if (VecSVT != MVT::i8 && (VecSVT.getSizeInBits() % 8) == 0) {
44264 NumElts *= VecSVT.getSizeInBits() / 8;
44265 EVT CmpVT = EVT::getVectorVT(*DAG.getContext(), MVT::i8, NumElts);
44266 MatchVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1, NumElts);
44267 Match = DAG.getSetCC(
44268 DL, MatchVT, DAG.getBitcast(CmpVT, Match.getOperand(0)),
44269 DAG.getBitcast(CmpVT, Match.getOperand(1)), ISD::CondCode::SETEQ);
44270 }
44271 }
44272
44273 // Use combineBitcastvxi1 to create the MOVMSK.
44274 while (NumElts > MaxElts) {
44275 SDValue Lo, Hi;
44276 std::tie(Lo, Hi) = DAG.SplitVector(Match, DL);
44277 Match = DAG.getNode(BinOp, DL, Lo.getValueType(), Lo, Hi);
44278 NumElts /= 2;
44279 }
44280 EVT MovmskVT = EVT::getIntegerVT(*DAG.getContext(), NumElts);
44281 Movmsk = combineBitcastvxi1(DAG, MovmskVT, Match, DL, Subtarget);
44282 }
44283 if (!Movmsk)
44284 return SDValue();
44285 Movmsk = DAG.getZExtOrTrunc(Movmsk, DL, NumElts > 32 ? MVT::i64 : MVT::i32);
44286 } else {
44287 // FIXME: Better handling of k-registers or 512-bit vectors?
44288 unsigned MatchSizeInBits = Match.getValueSizeInBits();
44289 if (!(MatchSizeInBits == 128 ||
44290 (MatchSizeInBits == 256 && Subtarget.hasAVX())))
44291 return SDValue();
44292
44293 // Make sure this isn't a vector of 1 element. The perf win from using
44294    // MOVMSK diminishes with fewer elements in the reduction, but it is
44295 // generally better to get the comparison over to the GPRs as soon as
44296 // possible to reduce the number of vector ops.
44297 if (Match.getValueType().getVectorNumElements() < 2)
44298 return SDValue();
44299
44300 // Check that we are extracting a reduction of all sign bits.
44301 if (DAG.ComputeNumSignBits(Match) != BitWidth)
44302 return SDValue();
44303
44304 if (MatchSizeInBits == 256 && BitWidth < 32 && !Subtarget.hasInt256()) {
44305 SDValue Lo, Hi;
44306 std::tie(Lo, Hi) = DAG.SplitVector(Match, DL);
44307 Match = DAG.getNode(BinOp, DL, Lo.getValueType(), Lo, Hi);
44308 MatchSizeInBits = Match.getValueSizeInBits();
44309 }
44310
44311 // For 32/64 bit comparisons use MOVMSKPS/MOVMSKPD, else PMOVMSKB.
44312 MVT MaskSrcVT;
44313 if (64 == BitWidth || 32 == BitWidth)
44314 MaskSrcVT = MVT::getVectorVT(MVT::getFloatingPointVT(BitWidth),
44315 MatchSizeInBits / BitWidth);
44316 else
44317 MaskSrcVT = MVT::getVectorVT(MVT::i8, MatchSizeInBits / 8);
44318
44319 SDValue BitcastLogicOp = DAG.getBitcast(MaskSrcVT, Match);
44320 Movmsk = getPMOVMSKB(DL, BitcastLogicOp, DAG, Subtarget);
44321 NumElts = MaskSrcVT.getVectorNumElements();
44322 }
44323  assert((NumElts <= 32 || NumElts == 64) &&
44324         "Not expecting more than 64 elements");
44325
44326 MVT CmpVT = NumElts == 64 ? MVT::i64 : MVT::i32;
44327 if (BinOp == ISD::XOR) {
44328 // parity -> (PARITY(MOVMSK X))
44329 SDValue Result = DAG.getNode(ISD::PARITY, DL, CmpVT, Movmsk);
44330 return DAG.getZExtOrTrunc(Result, DL, ExtractVT);
44331 }
44332
44333 SDValue CmpC;
44334 ISD::CondCode CondCode;
44335 if (BinOp == ISD::OR) {
44336 // any_of -> MOVMSK != 0
44337 CmpC = DAG.getConstant(0, DL, CmpVT);
44338 CondCode = ISD::CondCode::SETNE;
44339 } else {
44340 // all_of -> MOVMSK == ((1 << NumElts) - 1)
44341 CmpC = DAG.getConstant(APInt::getLowBitsSet(CmpVT.getSizeInBits(), NumElts),
44342 DL, CmpVT);
44343 CondCode = ISD::CondCode::SETEQ;
44344 }
44345
44346 // The setcc produces an i8 of 0/1, so extend that to the result width and
44347 // negate to get the final 0/-1 mask value.
44348 EVT SetccVT =
44349 TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), CmpVT);
44350 SDValue Setcc = DAG.getSetCC(DL, SetccVT, Movmsk, CmpC, CondCode);
44351 SDValue Zext = DAG.getZExtOrTrunc(Setcc, DL, ExtractVT);
44352 SDValue Zero = DAG.getConstant(0, DL, ExtractVT);
44353 return DAG.getNode(ISD::SUB, DL, ExtractVT, Zero, Zext);
44354}
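// Illustrative sketch of the non-i1 path above: an AND (all_of) reduction over
// a v4i32 compare mask (each lane 0 or -1) becomes roughly
//   t0 = movmskps mask                ; one bit per lane
//   t1 = (t0 == 0xf) ? 1 : 0
//   result = 0 - (i32 zext t1)        ; 0 or -1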
44355
44356static SDValue combineVPDPBUSDPattern(SDNode *Extract, SelectionDAG &DAG,
44357 const X86Subtarget &Subtarget) {
44358 if (!Subtarget.hasVNNI() && !Subtarget.hasAVXVNNI())
44359 return SDValue();
44360
44361 EVT ExtractVT = Extract->getValueType(0);
44362 // Verify the type we're extracting is i32, as the output element type of
44363 // vpdpbusd is i32.
44364 if (ExtractVT != MVT::i32)
44365 return SDValue();
44366
44367 EVT VT = Extract->getOperand(0).getValueType();
44368 if (!isPowerOf2_32(VT.getVectorNumElements()))
44369 return SDValue();
44370
44371 // Match shuffle + add pyramid.
44372 ISD::NodeType BinOp;
44373 SDValue Root = DAG.matchBinOpReduction(Extract, BinOp, {ISD::ADD});
44374
44375 // We can't combine to vpdpbusd for zext, because each of the 4 multiplies
44376 // done by vpdpbusd compute a signed 16-bit product that will be sign extended
44377 // before adding into the accumulator.
44378 // TODO:
44379 // We also need to verify that the multiply has at least 2x the number of bits
44380 // of the input. We shouldn't match
44381 // (sign_extend (mul (vXi9 (zext (vXi8 X))), (vXi9 (zext (vXi8 Y)))).
44382 // if (Root && (Root.getOpcode() == ISD::SIGN_EXTEND))
44383 // Root = Root.getOperand(0);
44384
44385 // If there was a match, we want Root to be a mul.
44386 if (!Root || Root.getOpcode() != ISD::MUL)
44387 return SDValue();
44388
44389 // Check whether we have an extend and mul pattern
44390 SDValue LHS, RHS;
44391 if (!detectExtMul(DAG, Root, LHS, RHS))
44392 return SDValue();
44393
44394 // Create the dot product instruction.
44395 SDLoc DL(Extract);
44396 unsigned StageBias;
44397 SDValue DP = createVPDPBUSD(DAG, LHS, RHS, StageBias, DL, Subtarget);
44398
44399 // If the original vector was wider than 4 elements, sum over the results
44400 // in the DP vector.
44401 unsigned Stages = Log2_32(VT.getVectorNumElements());
44402 EVT DpVT = DP.getValueType();
44403
44404 if (Stages > StageBias) {
44405 unsigned DpElems = DpVT.getVectorNumElements();
44406
44407 for (unsigned i = Stages - StageBias; i > 0; --i) {
44408 SmallVector<int, 16> Mask(DpElems, -1);
44409 for (unsigned j = 0, MaskEnd = 1 << (i - 1); j < MaskEnd; ++j)
44410 Mask[j] = MaskEnd + j;
44411
44412 SDValue Shuffle =
44413 DAG.getVectorShuffle(DpVT, DL, DP, DAG.getUNDEF(DpVT), Mask);
44414 DP = DAG.getNode(ISD::ADD, DL, DpVT, DP, Shuffle);
44415 }
44416 }
44417
44418 // Return the lowest ExtractSizeInBits bits.
44419 EVT ResVT =
44420 EVT::getVectorVT(*DAG.getContext(), ExtractVT,
44421 DpVT.getSizeInBits() / ExtractVT.getSizeInBits());
44422 DP = DAG.getBitcast(ResVT, DP);
44423 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ExtractVT, DP,
44424 Extract->getOperand(1));
44425}
44426
44427static SDValue combineBasicSADPattern(SDNode *Extract, SelectionDAG &DAG,
44428 const X86Subtarget &Subtarget) {
44429 // PSADBW is only supported on SSE2 and up.
44430 if (!Subtarget.hasSSE2())
44431 return SDValue();
44432
44433 EVT ExtractVT = Extract->getValueType(0);
44434 // Verify the type we're extracting is either i32 or i64.
44435 // FIXME: Could support other types, but this is what we have coverage for.
44436 if (ExtractVT != MVT::i32 && ExtractVT != MVT::i64)
44437 return SDValue();
44438
44439 EVT VT = Extract->getOperand(0).getValueType();
44440 if (!isPowerOf2_32(VT.getVectorNumElements()))
44441 return SDValue();
44442
44443 // Match shuffle + add pyramid.
44444 ISD::NodeType BinOp;
44445 SDValue Root = DAG.matchBinOpReduction(Extract, BinOp, {ISD::ADD});
44446
44447 // The operand is expected to be zero extended from i8
44448 // (verified in detectZextAbsDiff).
44449 // In order to convert to i64 and above, additional any/zero/sign
44450 // extend is expected.
44451 // The zero extend from 32 bit has no mathematical effect on the result.
44452 // Also the sign extend is basically zero extend
44453 // (extends the sign bit which is zero).
44454 // So it is correct to skip the sign/zero extend instruction.
44455 if (Root && (Root.getOpcode() == ISD::SIGN_EXTEND ||
44456 Root.getOpcode() == ISD::ZERO_EXTEND ||
44457 Root.getOpcode() == ISD::ANY_EXTEND))
44458 Root = Root.getOperand(0);
44459
44460 // If there was a match, we want Root to be a select that is the root of an
44461 // abs-diff pattern.
44462 if (!Root || Root.getOpcode() != ISD::ABS)
44463 return SDValue();
44464
44465 // Check whether we have an abs-diff pattern feeding into the select.
44466 SDValue Zext0, Zext1;
44467 if (!detectZextAbsDiff(Root, Zext0, Zext1))
44468 return SDValue();
44469
44470 // Create the SAD instruction.
44471 SDLoc DL(Extract);
44472 SDValue SAD = createPSADBW(DAG, Zext0, Zext1, DL, Subtarget);
44473
44474 // If the original vector was wider than 8 elements, sum over the results
44475 // in the SAD vector.
44476 unsigned Stages = Log2_32(VT.getVectorNumElements());
44477 EVT SadVT = SAD.getValueType();
44478 if (Stages > 3) {
44479 unsigned SadElems = SadVT.getVectorNumElements();
44480
44481 for(unsigned i = Stages - 3; i > 0; --i) {
44482 SmallVector<int, 16> Mask(SadElems, -1);
44483 for(unsigned j = 0, MaskEnd = 1 << (i - 1); j < MaskEnd; ++j)
44484 Mask[j] = MaskEnd + j;
44485
44486 SDValue Shuffle =
44487 DAG.getVectorShuffle(SadVT, DL, SAD, DAG.getUNDEF(SadVT), Mask);
44488 SAD = DAG.getNode(ISD::ADD, DL, SadVT, SAD, Shuffle);
44489 }
44490 }
44491
44492 unsigned ExtractSizeInBits = ExtractVT.getSizeInBits();
44493 // Return the lowest ExtractSizeInBits bits.
44494 EVT ResVT = EVT::getVectorVT(*DAG.getContext(), ExtractVT,
44495 SadVT.getSizeInBits() / ExtractSizeInBits);
44496 SAD = DAG.getBitcast(ResVT, SAD);
44497 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ExtractVT, SAD,
44498 Extract->getOperand(1));
44499}
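// Sketch: for a v16i32 add-reduction of abs(zext(a[i]) - zext(b[i])) with
// <16 x i8> a and b, the code above emits one PSADBW (two partial sums in
// v2i64), one shuffle+add stage to combine them, and an extract of lane 0.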
44500
44501// Attempt to peek through a target shuffle and extract the scalar from the
44502// source.
44503static SDValue combineExtractWithShuffle(SDNode *N, SelectionDAG &DAG,
44504 TargetLowering::DAGCombinerInfo &DCI,
44505 const X86Subtarget &Subtarget) {
44506 if (DCI.isBeforeLegalizeOps())
44507 return SDValue();
44508
44509 SDLoc dl(N);
44510 SDValue Src = N->getOperand(0);
44511 SDValue Idx = N->getOperand(1);
44512
44513 EVT VT = N->getValueType(0);
44514 EVT SrcVT = Src.getValueType();
44515 EVT SrcSVT = SrcVT.getVectorElementType();
44516 unsigned SrcEltBits = SrcSVT.getSizeInBits();
44517 unsigned NumSrcElts = SrcVT.getVectorNumElements();
44518
44519 // Don't attempt this for boolean mask vectors or unknown extraction indices.
44520 if (SrcSVT == MVT::i1 || !isa<ConstantSDNode>(Idx))
44521 return SDValue();
44522
44523 const APInt &IdxC = N->getConstantOperandAPInt(1);
44524 if (IdxC.uge(NumSrcElts))
44525 return SDValue();
44526
44527 SDValue SrcBC = peekThroughBitcasts(Src);
44528
44529 // Handle extract(bitcast(broadcast(scalar_value))).
44530 if (X86ISD::VBROADCAST == SrcBC.getOpcode()) {
44531 SDValue SrcOp = SrcBC.getOperand(0);
44532 EVT SrcOpVT = SrcOp.getValueType();
44533 if (SrcOpVT.isScalarInteger() && VT.isInteger() &&
44534 (SrcOpVT.getSizeInBits() % SrcEltBits) == 0) {
44535 unsigned Scale = SrcOpVT.getSizeInBits() / SrcEltBits;
44536 unsigned Offset = IdxC.urem(Scale) * SrcEltBits;
44537 // TODO support non-zero offsets.
44538 if (Offset == 0) {
44539 SrcOp = DAG.getZExtOrTrunc(SrcOp, dl, SrcVT.getScalarType());
44540 SrcOp = DAG.getZExtOrTrunc(SrcOp, dl, VT);
44541 return SrcOp;
44542 }
44543 }
44544 }
44545
44546 // If we're extracting a single element from a broadcast load and there are
44547 // no other users, just create a single load.
44548 if (SrcBC.getOpcode() == X86ISD::VBROADCAST_LOAD && SrcBC.hasOneUse()) {
44549 auto *MemIntr = cast<MemIntrinsicSDNode>(SrcBC);
44550 unsigned SrcBCWidth = SrcBC.getScalarValueSizeInBits();
44551 if (MemIntr->getMemoryVT().getSizeInBits() == SrcBCWidth &&
44552 VT.getSizeInBits() == SrcBCWidth && SrcEltBits == SrcBCWidth) {
44553 SDValue Load = DAG.getLoad(VT, dl, MemIntr->getChain(),
44554 MemIntr->getBasePtr(),
44555 MemIntr->getPointerInfo(),
44556 MemIntr->getOriginalAlign(),
44557 MemIntr->getMemOperand()->getFlags());
44558 DAG.ReplaceAllUsesOfValueWith(SDValue(MemIntr, 1), Load.getValue(1));
44559 return Load;
44560 }
44561 }
44562
44563 // Handle extract(bitcast(scalar_to_vector(scalar_value))) for integers.
44564 // TODO: Move to DAGCombine?
44565 if (SrcBC.getOpcode() == ISD::SCALAR_TO_VECTOR && VT.isInteger() &&
44566 SrcBC.getValueType().isInteger() &&
44567 (SrcBC.getScalarValueSizeInBits() % SrcEltBits) == 0 &&
44568 SrcBC.getScalarValueSizeInBits() ==
44569 SrcBC.getOperand(0).getValueSizeInBits()) {
44570 unsigned Scale = SrcBC.getScalarValueSizeInBits() / SrcEltBits;
44571 if (IdxC.ult(Scale)) {
44572 unsigned Offset = IdxC.getZExtValue() * SrcVT.getScalarSizeInBits();
44573 SDValue Scl = SrcBC.getOperand(0);
44574 EVT SclVT = Scl.getValueType();
44575 if (Offset) {
44576 Scl = DAG.getNode(ISD::SRL, dl, SclVT, Scl,
44577 DAG.getShiftAmountConstant(Offset, SclVT, dl));
44578 }
44579 Scl = DAG.getZExtOrTrunc(Scl, dl, SrcVT.getScalarType());
44580 Scl = DAG.getZExtOrTrunc(Scl, dl, VT);
44581 return Scl;
44582 }
44583 }
44584
44585 // Handle extract(truncate(x)) for 0'th index.
44586 // TODO: Treat this as a faux shuffle?
44587 // TODO: When can we use this for general indices?
44588 if (ISD::TRUNCATE == Src.getOpcode() && IdxC == 0 &&
44589 (SrcVT.getSizeInBits() % 128) == 0) {
44590 Src = extract128BitVector(Src.getOperand(0), 0, DAG, dl);
44591 MVT ExtractVT = MVT::getVectorVT(SrcSVT.getSimpleVT(), 128 / SrcEltBits);
44592 return DAG.getNode(N->getOpcode(), dl, VT, DAG.getBitcast(ExtractVT, Src),
44593 Idx);
44594 }
44595
44596 // We can only legally extract other elements from 128-bit vectors and in
44597 // certain circumstances, depending on SSE-level.
44598 // TODO: Investigate float/double extraction if it will be just stored.
44599 auto GetLegalExtract = [&Subtarget, &DAG, &dl](SDValue Vec, EVT VecVT,
44600 unsigned Idx) {
44601 EVT VecSVT = VecVT.getScalarType();
44602 if ((VecVT.is256BitVector() || VecVT.is512BitVector()) &&
44603 (VecSVT == MVT::i8 || VecSVT == MVT::i16 || VecSVT == MVT::i32 ||
44604 VecSVT == MVT::i64)) {
44605 unsigned EltSizeInBits = VecSVT.getSizeInBits();
44606 unsigned NumEltsPerLane = 128 / EltSizeInBits;
44607 unsigned LaneOffset = (Idx & ~(NumEltsPerLane - 1)) * EltSizeInBits;
44608 unsigned LaneIdx = LaneOffset / Vec.getScalarValueSizeInBits();
44609 VecVT = EVT::getVectorVT(*DAG.getContext(), VecSVT, NumEltsPerLane);
44610 Vec = extract128BitVector(Vec, LaneIdx, DAG, dl);
44611 Idx &= (NumEltsPerLane - 1);
44612 }
44613 if ((VecVT == MVT::v4i32 || VecVT == MVT::v2i64) &&
44614 ((Idx == 0 && Subtarget.hasSSE2()) || Subtarget.hasSSE41())) {
44615 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VecVT.getScalarType(),
44616 DAG.getBitcast(VecVT, Vec),
44617 DAG.getIntPtrConstant(Idx, dl));
44618 }
44619 if ((VecVT == MVT::v8i16 && Subtarget.hasSSE2()) ||
44620 (VecVT == MVT::v16i8 && Subtarget.hasSSE41())) {
44621 unsigned OpCode = (VecVT == MVT::v8i16 ? X86ISD::PEXTRW : X86ISD::PEXTRB);
44622 return DAG.getNode(OpCode, dl, MVT::i32, DAG.getBitcast(VecVT, Vec),
44623 DAG.getTargetConstant(Idx, dl, MVT::i8));
44624 }
44625 return SDValue();
44626 };
44627
44628 // Resolve the target shuffle inputs and mask.
44629 SmallVector<int, 16> Mask;
44630 SmallVector<SDValue, 2> Ops;
44631 if (!getTargetShuffleInputs(SrcBC, Ops, Mask, DAG))
44632 return SDValue();
44633
44634 // Shuffle inputs must be the same size as the result.
44635 if (llvm::any_of(Ops, [SrcVT](SDValue Op) {
44636 return SrcVT.getSizeInBits() != Op.getValueSizeInBits();
44637 }))
44638 return SDValue();
44639
44640 // Attempt to narrow/widen the shuffle mask to the correct size.
44641 if (Mask.size() != NumSrcElts) {
44642 if ((NumSrcElts % Mask.size()) == 0) {
44643 SmallVector<int, 16> ScaledMask;
44644 int Scale = NumSrcElts / Mask.size();
44645 narrowShuffleMaskElts(Scale, Mask, ScaledMask);
44646 Mask = std::move(ScaledMask);
44647 } else if ((Mask.size() % NumSrcElts) == 0) {
44648 // Simplify Mask based on demanded element.
44649 int ExtractIdx = (int)IdxC.getZExtValue();
44650 int Scale = Mask.size() / NumSrcElts;
44651 int Lo = Scale * ExtractIdx;
44652 int Hi = Scale * (ExtractIdx + 1);
44653 for (int i = 0, e = (int)Mask.size(); i != e; ++i)
44654 if (i < Lo || Hi <= i)
44655 Mask[i] = SM_SentinelUndef;
44656
44657 SmallVector<int, 16> WidenedMask;
44658 while (Mask.size() > NumSrcElts &&
44659 canWidenShuffleElements(Mask, WidenedMask))
44660 Mask = std::move(WidenedMask);
44661 }
44662 }
44663
44664 // If narrowing/widening failed, see if we can extract+zero-extend.
44665 int ExtractIdx;
44666 EVT ExtractVT;
44667 if (Mask.size() == NumSrcElts) {
44668 ExtractIdx = Mask[IdxC.getZExtValue()];
44669 ExtractVT = SrcVT;
44670 } else {
44671 unsigned Scale = Mask.size() / NumSrcElts;
44672 if ((Mask.size() % NumSrcElts) != 0 || SrcVT.isFloatingPoint())
44673 return SDValue();
44674 unsigned ScaledIdx = Scale * IdxC.getZExtValue();
44675 if (!isUndefOrZeroInRange(Mask, ScaledIdx + 1, Scale - 1))
44676 return SDValue();
44677 ExtractIdx = Mask[ScaledIdx];
44678 EVT ExtractSVT = EVT::getIntegerVT(*DAG.getContext(), SrcEltBits / Scale);
44679 ExtractVT = EVT::getVectorVT(*DAG.getContext(), ExtractSVT, Mask.size());
44680    assert(SrcVT.getSizeInBits() == ExtractVT.getSizeInBits() &&
44681           "Failed to widen vector type");
44682 }
44683
44684 // If the shuffle source element is undef/zero then we can just accept it.
44685 if (ExtractIdx == SM_SentinelUndef)
44686 return DAG.getUNDEF(VT);
44687
44688 if (ExtractIdx == SM_SentinelZero)
44689 return VT.isFloatingPoint() ? DAG.getConstantFP(0.0, dl, VT)
44690 : DAG.getConstant(0, dl, VT);
44691
44692 SDValue SrcOp = Ops[ExtractIdx / Mask.size()];
44693 ExtractIdx = ExtractIdx % Mask.size();
44694 if (SDValue V = GetLegalExtract(SrcOp, ExtractVT, ExtractIdx))
44695 return DAG.getZExtOrTrunc(V, dl, VT);
44696
44697 return SDValue();
44698}
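// Sketch, assuming SSE4.1:
//   (i32 extract_vector_elt (v4i32 X86ISD::PSHUFD X, 0x1b), 0)
// resolves through the shuffle mask to
//   (i32 extract_vector_elt X, 3)
// i.e. a single pextrd from the original source.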
44699
44700/// Extracting a scalar FP value from vector element 0 is free, so extract each
44701/// operand first, then perform the math as a scalar op.
44702static SDValue scalarizeExtEltFP(SDNode *ExtElt, SelectionDAG &DAG,
44703 const X86Subtarget &Subtarget) {
44704  assert(ExtElt->getOpcode() == ISD::EXTRACT_VECTOR_ELT && "Expected extract");
44705 SDValue Vec = ExtElt->getOperand(0);
44706 SDValue Index = ExtElt->getOperand(1);
44707 EVT VT = ExtElt->getValueType(0);
44708 EVT VecVT = Vec.getValueType();
44709
44710 // TODO: If this is a unary/expensive/expand op, allow extraction from a
44711 // non-zero element because the shuffle+scalar op will be cheaper?
44712 if (!Vec.hasOneUse() || !isNullConstant(Index) || VecVT.getScalarType() != VT)
44713 return SDValue();
44714
44715 // Vector FP compares don't fit the pattern of FP math ops (propagate, not
44716 // extract, the condition code), so deal with those as a special-case.
44717 if (Vec.getOpcode() == ISD::SETCC && VT == MVT::i1) {
44718 EVT OpVT = Vec.getOperand(0).getValueType().getScalarType();
44719 if (OpVT != MVT::f32 && OpVT != MVT::f64)
44720 return SDValue();
44721
44722 // extract (setcc X, Y, CC), 0 --> setcc (extract X, 0), (extract Y, 0), CC
44723 SDLoc DL(ExtElt);
44724 SDValue Ext0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, OpVT,
44725 Vec.getOperand(0), Index);
44726 SDValue Ext1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, OpVT,
44727 Vec.getOperand(1), Index);
44728 return DAG.getNode(Vec.getOpcode(), DL, VT, Ext0, Ext1, Vec.getOperand(2));
44729 }
44730
44731 if (!(VT == MVT::f16 && Subtarget.hasFP16()) && VT != MVT::f32 &&
44732 VT != MVT::f64)
44733 return SDValue();
44734
44735 // Vector FP selects don't fit the pattern of FP math ops (because the
44736 // condition has a different type and we have to change the opcode), so deal
44737 // with those here.
44738 // FIXME: This is restricted to pre type legalization by ensuring the setcc
44739 // has i1 elements. If we loosen this we need to convert vector bool to a
44740 // scalar bool.
44741 if (Vec.getOpcode() == ISD::VSELECT &&
44742 Vec.getOperand(0).getOpcode() == ISD::SETCC &&
44743 Vec.getOperand(0).getValueType().getScalarType() == MVT::i1 &&
44744 Vec.getOperand(0).getOperand(0).getValueType() == VecVT) {
44745 // ext (sel Cond, X, Y), 0 --> sel (ext Cond, 0), (ext X, 0), (ext Y, 0)
44746 SDLoc DL(ExtElt);
44747 SDValue Ext0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL,
44748 Vec.getOperand(0).getValueType().getScalarType(),
44749 Vec.getOperand(0), Index);
44750 SDValue Ext1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT,
44751 Vec.getOperand(1), Index);
44752 SDValue Ext2 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT,
44753 Vec.getOperand(2), Index);
44754 return DAG.getNode(ISD::SELECT, DL, VT, Ext0, Ext1, Ext2);
44755 }
44756
44757 // TODO: This switch could include FNEG and the x86-specific FP logic ops
44758 // (FAND, FANDN, FOR, FXOR). But that may require enhancements to avoid
44759 // missed load folding and fma+fneg combining.
44760 switch (Vec.getOpcode()) {
44761 case ISD::FMA: // Begin 3 operands
44762 case ISD::FMAD:
44763 case ISD::FADD: // Begin 2 operands
44764 case ISD::FSUB:
44765 case ISD::FMUL:
44766 case ISD::FDIV:
44767 case ISD::FREM:
44768 case ISD::FCOPYSIGN:
44769 case ISD::FMINNUM:
44770 case ISD::FMAXNUM:
44771 case ISD::FMINNUM_IEEE:
44772 case ISD::FMAXNUM_IEEE:
44773 case ISD::FMAXIMUM:
44774 case ISD::FMINIMUM:
44775 case X86ISD::FMAX:
44776 case X86ISD::FMIN:
44777 case ISD::FABS: // Begin 1 operand
44778 case ISD::FSQRT:
44779 case ISD::FRINT:
44780 case ISD::FCEIL:
44781 case ISD::FTRUNC:
44782 case ISD::FNEARBYINT:
44783 case ISD::FROUND:
44784 case ISD::FFLOOR:
44785 case X86ISD::FRCP:
44786 case X86ISD::FRSQRT: {
44787 // extract (fp X, Y, ...), 0 --> fp (extract X, 0), (extract Y, 0), ...
44788 SDLoc DL(ExtElt);
44789 SmallVector<SDValue, 4> ExtOps;
44790 for (SDValue Op : Vec->ops())
44791 ExtOps.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Op, Index));
44792 return DAG.getNode(Vec.getOpcode(), DL, VT, ExtOps);
44793 }
44794 default:
44795 return SDValue();
44796 }
44797  llvm_unreachable("All opcodes should return within switch");
44798}
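// Illustrative example of the transform above (DAG notation; v4f32 assumed):
//   f32 r = extract_vector_elt (fadd v4f32:X, v4f32:Y), 0
// becomes
//   f32 r = fadd (extract_vector_elt X, 0), (extract_vector_elt Y, 0)
// since extracting lane 0 of an FP vector is free on x86.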
44799
44800/// Try to convert a vector reduction sequence composed of binops and shuffles
44801/// into horizontal ops.
44802static SDValue combineArithReduction(SDNode *ExtElt, SelectionDAG &DAG,
44803 const X86Subtarget &Subtarget) {
44804  assert(ExtElt->getOpcode() == ISD::EXTRACT_VECTOR_ELT && "Unexpected caller");
44805
44806  // We need at least SSE2 to do anything here.
44807 if (!Subtarget.hasSSE2())
44808 return SDValue();
44809
44810 ISD::NodeType Opc;
44811 SDValue Rdx = DAG.matchBinOpReduction(ExtElt, Opc,
44812 {ISD::ADD, ISD::MUL, ISD::FADD}, true);
44813 if (!Rdx)
44814 return SDValue();
44815
44816 SDValue Index = ExtElt->getOperand(1);
44817  assert(isNullConstant(Index) &&
44818         "Reduction doesn't end in an extract from index 0");
44819
44820 EVT VT = ExtElt->getValueType(0);
44821 EVT VecVT = Rdx.getValueType();
44822 if (VecVT.getScalarType() != VT)
44823 return SDValue();
44824
44825 SDLoc DL(ExtElt);
44826 unsigned NumElts = VecVT.getVectorNumElements();
44827 unsigned EltSizeInBits = VecVT.getScalarSizeInBits();
44828
44829 // Extend v4i8/v8i8 vector to v16i8, with undef upper 64-bits.
44830 auto WidenToV16I8 = [&](SDValue V, bool ZeroExtend) {
44831 if (V.getValueType() == MVT::v4i8) {
44832 if (ZeroExtend && Subtarget.hasSSE41()) {
44833 V = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, MVT::v4i32,
44834 DAG.getConstant(0, DL, MVT::v4i32),
44835 DAG.getBitcast(MVT::i32, V),
44836 DAG.getIntPtrConstant(0, DL));
44837 return DAG.getBitcast(MVT::v16i8, V);
44838 }
44839 V = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v8i8, V,
44840 ZeroExtend ? DAG.getConstant(0, DL, MVT::v4i8)
44841 : DAG.getUNDEF(MVT::v4i8));
44842 }
44843 return DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v16i8, V,
44844 DAG.getUNDEF(MVT::v8i8));
44845 };
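  // Sketch of the lambda above on a non-SSE4.1 path (v4i8 input assumed):
  //   WidenToV16I8(v4i8:V, /*ZeroExtend=*/false)
  //     -> concat_vectors(concat_vectors(V, undef:v4i8), undef:v8i8) : v16i8
  // so V occupies the low 32 bits and the remaining lanes are undef (bits
  // 32-63 become zero instead when ZeroExtend is set).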
44846
44847 // vXi8 mul reduction - promote to vXi16 mul reduction.
44848 if (Opc == ISD::MUL) {
44849 if (VT != MVT::i8 || NumElts < 4 || !isPowerOf2_32(NumElts))
44850 return SDValue();
44851 if (VecVT.getSizeInBits() >= 128) {
44852 EVT WideVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16, NumElts / 2);
44853 SDValue Lo = getUnpackl(DAG, DL, VecVT, Rdx, DAG.getUNDEF(VecVT));
44854 SDValue Hi = getUnpackh(DAG, DL, VecVT, Rdx, DAG.getUNDEF(VecVT));
44855 Lo = DAG.getBitcast(WideVT, Lo);
44856 Hi = DAG.getBitcast(WideVT, Hi);
44857 Rdx = DAG.getNode(Opc, DL, WideVT, Lo, Hi);
44858 while (Rdx.getValueSizeInBits() > 128) {
44859 std::tie(Lo, Hi) = splitVector(Rdx, DAG, DL);
44860 Rdx = DAG.getNode(Opc, DL, Lo.getValueType(), Lo, Hi);
44861 }
44862 } else {
44863 Rdx = WidenToV16I8(Rdx, false);
44864 Rdx = getUnpackl(DAG, DL, MVT::v16i8, Rdx, DAG.getUNDEF(MVT::v16i8));
44865 Rdx = DAG.getBitcast(MVT::v8i16, Rdx);
44866 }
44867 if (NumElts >= 8)
44868 Rdx = DAG.getNode(Opc, DL, MVT::v8i16, Rdx,
44869 DAG.getVectorShuffle(MVT::v8i16, DL, Rdx, Rdx,
44870 {4, 5, 6, 7, -1, -1, -1, -1}));
44871 Rdx = DAG.getNode(Opc, DL, MVT::v8i16, Rdx,
44872 DAG.getVectorShuffle(MVT::v8i16, DL, Rdx, Rdx,
44873 {2, 3, -1, -1, -1, -1, -1, -1}));
44874 Rdx = DAG.getNode(Opc, DL, MVT::v8i16, Rdx,
44875 DAG.getVectorShuffle(MVT::v8i16, DL, Rdx, Rdx,
44876 {1, -1, -1, -1, -1, -1, -1, -1}));
44877 Rdx = DAG.getBitcast(MVT::v16i8, Rdx);
44878 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Rdx, Index);
44879 }
44880
44881  // vXi8 add reduction - sub-128-bit vector.
44882 if (VecVT == MVT::v4i8 || VecVT == MVT::v8i8) {
44883 Rdx = WidenToV16I8(Rdx, true);
44884 Rdx = DAG.getNode(X86ISD::PSADBW, DL, MVT::v2i64, Rdx,
44885 DAG.getConstant(0, DL, MVT::v16i8));
44886 Rdx = DAG.getBitcast(MVT::v16i8, Rdx);
44887 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Rdx, Index);
44888 }
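  // Sketch of why PSADBW works here: psadbw against an all-zero vector sums
  // each group of eight byte lanes into its 64-bit lane, so the sum of the
  // low eight bytes (the original v4i8/v8i8 values, zero-padded as needed)
  // lands in the low i64 lane; bitcasting to v16i8 and extracting lane 0
  // yields the i8 add-reduction result (modulo 256).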
44889
44890 // Must be a >=128-bit vector with pow2 elements.
44891 if ((VecVT.getSizeInBits() % 128) != 0 || !isPowerOf2_32(NumElts))
44892 return SDValue();
44893
44894 // vXi8 add reduction - sum lo/hi halves then use PSADBW.
44895 if (VT == MVT::i8) {
44896 while (Rdx.getValueSizeInBits() > 128) {
44897 SDValue Lo, Hi;
44898 std::tie(Lo, Hi) = splitVector(Rdx, DAG, DL);
44899 VecVT = Lo.getValueType();
44900 Rdx = DAG.getNode(ISD::ADD, DL, VecVT, Lo, Hi);
44901 }
44902    assert(VecVT == MVT::v16i8 && "v16i8 reduction expected");
44903
44904 SDValue Hi = DAG.getVectorShuffle(
44905 MVT::v16i8, DL, Rdx, Rdx,
44906 {8, 9, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1});
44907 Rdx = DAG.getNode(ISD::ADD, DL, MVT::v16i8, Rdx, Hi);
44908 Rdx = DAG.getNode(X86ISD::PSADBW, DL, MVT::v2i64, Rdx,
44909 getZeroVector(MVT::v16i8, Subtarget, DAG, DL));
44910 Rdx = DAG.getBitcast(MVT::v16i8, Rdx);
44911 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Rdx, Index);
44912 }
44913
44914 // See if we can use vXi8 PSADBW add reduction for larger zext types.
44915 // If the source vector values are 0-255, then we can use PSADBW to
44916 // sum+zext v8i8 subvectors to vXi64, then perform the reduction.
44917  // TODO: See if it's worth avoiding vXi16/i32 truncations?
44918 if (Opc == ISD::ADD && NumElts >= 4 && EltSizeInBits >= 16 &&
44919 DAG.computeKnownBits(Rdx).getMaxValue().ule(255) &&
44920 (EltSizeInBits == 16 || Rdx.getOpcode() == ISD::ZERO_EXTEND ||
44921 Subtarget.hasAVX512())) {
44922 EVT ByteVT = VecVT.changeVectorElementType(MVT::i8);
44923 Rdx = DAG.getNode(ISD::TRUNCATE, DL, ByteVT, Rdx);
44924 if (ByteVT.getSizeInBits() < 128)
44925 Rdx = WidenToV16I8(Rdx, true);
44926
44927 // Build the PSADBW, split as 128/256/512 bits for SSE/AVX2/AVX512BW.
44928 auto PSADBWBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
44929 ArrayRef<SDValue> Ops) {
44930 MVT VT = MVT::getVectorVT(MVT::i64, Ops[0].getValueSizeInBits() / 64);
44931 SDValue Zero = DAG.getConstant(0, DL, Ops[0].getValueType());
44932 return DAG.getNode(X86ISD::PSADBW, DL, VT, Ops[0], Zero);
44933 };
44934 MVT SadVT = MVT::getVectorVT(MVT::i64, Rdx.getValueSizeInBits() / 64);
44935 Rdx = SplitOpsAndApply(DAG, Subtarget, DL, SadVT, {Rdx}, PSADBWBuilder);
44936
44937 // TODO: We could truncate to vXi16/vXi32 before performing the reduction.
44938 while (Rdx.getValueSizeInBits() > 128) {
44939 SDValue Lo, Hi;
44940 std::tie(Lo, Hi) = splitVector(Rdx, DAG, DL);
44941 VecVT = Lo.getValueType();
44942 Rdx = DAG.getNode(ISD::ADD, DL, VecVT, Lo, Hi);
44943 }
44944    assert(Rdx.getValueType() == MVT::v2i64 && "v2i64 reduction expected");
44945
44946 if (NumElts > 8) {
44947 SDValue RdxHi = DAG.getVectorShuffle(MVT::v2i64, DL, Rdx, Rdx, {1, -1});
44948 Rdx = DAG.getNode(ISD::ADD, DL, MVT::v2i64, Rdx, RdxHi);
44949 }
44950
44951 VecVT = MVT::getVectorVT(VT.getSimpleVT(), 128 / VT.getSizeInBits());
44952 Rdx = DAG.getBitcast(VecVT, Rdx);
44953 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Rdx, Index);
44954 }
44955
44956  // Only use (F)HADD opcodes if they aren't microcoded or we're minimizing codesize.
44957 if (!shouldUseHorizontalOp(true, DAG, Subtarget))
44958 return SDValue();
44959
44960 unsigned HorizOpcode = Opc == ISD::ADD ? X86ISD::HADD : X86ISD::FHADD;
44961
44962 // 256-bit horizontal instructions operate on 128-bit chunks rather than
44963 // across the whole vector, so we need an extract + hop preliminary stage.
44964 // This is the only step where the operands of the hop are not the same value.
44965 // TODO: We could extend this to handle 512-bit or even longer vectors.
44966 if (((VecVT == MVT::v16i16 || VecVT == MVT::v8i32) && Subtarget.hasSSSE3()) ||
44967 ((VecVT == MVT::v8f32 || VecVT == MVT::v4f64) && Subtarget.hasSSE3())) {
44968 unsigned NumElts = VecVT.getVectorNumElements();
44969 SDValue Hi = extract128BitVector(Rdx, NumElts / 2, DAG, DL);
44970 SDValue Lo = extract128BitVector(Rdx, 0, DAG, DL);
44971 Rdx = DAG.getNode(HorizOpcode, DL, Lo.getValueType(), Hi, Lo);
44972 VecVT = Rdx.getValueType();
44973 }
44974 if (!((VecVT == MVT::v8i16 || VecVT == MVT::v4i32) && Subtarget.hasSSSE3()) &&
44975 !((VecVT == MVT::v4f32 || VecVT == MVT::v2f64) && Subtarget.hasSSE3()))
44976 return SDValue();
44977
44978 // extract (add (shuf X), X), 0 --> extract (hadd X, X), 0
44979 unsigned ReductionSteps = Log2_32(VecVT.getVectorNumElements());
44980 for (unsigned i = 0; i != ReductionSteps; ++i)
44981 Rdx = DAG.getNode(HorizOpcode, DL, VecVT, Rdx, Rdx);
44982
44983 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Rdx, Index);
44984}
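// Illustrative example of the HADD path above (v4i32 with SSSE3 assumed):
//   t1 = PHADDD X, X    ; lanes = {x0+x1, x2+x3, x0+x1, x2+x3}
//   t2 = PHADDD t1, t1  ; lane 0 = x0+x1+x2+x3
//   i32 r = extract_vector_elt t2, 0
// i.e. Log2(4) == 2 hop steps replace the shuffle+add reduction tree.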
44985
44986/// Detect vector gather/scatter index generation and convert it from being a
44987/// bunch of shuffles and extracts into a somewhat faster sequence.
44988/// For i686, the best sequence is apparently storing the value and loading
44989/// scalars back, while for x64 we should use 64-bit extracts and shifts.
44990static SDValue combineExtractVectorElt(SDNode *N, SelectionDAG &DAG,
44991 TargetLowering::DAGCombinerInfo &DCI,
44992 const X86Subtarget &Subtarget) {
44993 if (SDValue NewOp = combineExtractWithShuffle(N, DAG, DCI, Subtarget))
44994 return NewOp;
44995
44996 SDValue InputVector = N->getOperand(0);
44997 SDValue EltIdx = N->getOperand(1);
44998 auto *CIdx = dyn_cast<ConstantSDNode>(EltIdx);
44999
45000 EVT SrcVT = InputVector.getValueType();
45001 EVT VT = N->getValueType(0);
45002 SDLoc dl(InputVector);
45003 bool IsPextr = N->getOpcode() != ISD::EXTRACT_VECTOR_ELT;
45004 unsigned NumSrcElts = SrcVT.getVectorNumElements();
45005 unsigned NumEltBits = VT.getScalarSizeInBits();
45006 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
45007
45008 if (CIdx && CIdx->getAPIntValue().uge(NumSrcElts))
45009 return IsPextr ? DAG.getConstant(0, dl, VT) : DAG.getUNDEF(VT);
45010
45011 // Integer Constant Folding.
45012 if (CIdx && VT.isInteger()) {
45013 APInt UndefVecElts;
45014 SmallVector<APInt, 16> EltBits;
45015 unsigned VecEltBitWidth = SrcVT.getScalarSizeInBits();
45016 if (getTargetConstantBitsFromNode(InputVector, VecEltBitWidth, UndefVecElts,
45017 EltBits, true, false)) {
45018 uint64_t Idx = CIdx->getZExtValue();
45019 if (UndefVecElts[Idx])
45020 return IsPextr ? DAG.getConstant(0, dl, VT) : DAG.getUNDEF(VT);
45021 return DAG.getConstant(EltBits[Idx].zext(NumEltBits), dl, VT);
45022 }
45023
45024    // Convert extract_element(bitcast(<X x i1>)) -> bitcast(extract_subvector()).
45025    // Improves lowering of bool masks in Rust, which splits them into a byte array.
45026 if (InputVector.getOpcode() == ISD::BITCAST && (NumEltBits % 8) == 0) {
45027 SDValue Src = peekThroughBitcasts(InputVector);
45028 if (Src.getValueType().getScalarType() == MVT::i1 &&
45029 TLI.isTypeLegal(Src.getValueType())) {
45030 MVT SubVT = MVT::getVectorVT(MVT::i1, NumEltBits);
45031 SDValue Sub = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, SubVT, Src,
45032 DAG.getIntPtrConstant(CIdx->getZExtValue() * NumEltBits, dl));
45033 return DAG.getBitcast(VT, Sub);
45034 }
45035 }
45036 }
45037
45038 if (IsPextr) {
45039 if (TLI.SimplifyDemandedBits(SDValue(N, 0), APInt::getAllOnes(NumEltBits),
45040 DCI))
45041 return SDValue(N, 0);
45042
45043 // PEXTR*(PINSR*(v, s, c), c) -> s (with implicit zext handling).
45044 if ((InputVector.getOpcode() == X86ISD::PINSRB ||
45045 InputVector.getOpcode() == X86ISD::PINSRW) &&
45046 InputVector.getOperand(2) == EltIdx) {
45047      assert(SrcVT == InputVector.getOperand(0).getValueType() &&
45048             "Vector type mismatch");
45049 SDValue Scl = InputVector.getOperand(1);
45050 Scl = DAG.getNode(ISD::TRUNCATE, dl, SrcVT.getScalarType(), Scl);
45051 return DAG.getZExtOrTrunc(Scl, dl, VT);
45052 }
45053
45054 // TODO - Remove this once we can handle the implicit zero-extension of
45055 // X86ISD::PEXTRW/X86ISD::PEXTRB in combinePredicateReduction and
45056 // combineBasicSADPattern.
45057 return SDValue();
45058 }
45059
45060  // Detect mmx extraction of all bits as an i64. It works better as a bitcast.
45061 if (InputVector.getOpcode() == ISD::BITCAST && InputVector.hasOneUse() &&
45062 VT == MVT::i64 && SrcVT == MVT::v1i64 && isNullConstant(EltIdx)) {
45063 SDValue MMXSrc = InputVector.getOperand(0);
45064
45065 // The bitcast source is a direct mmx result.
45066 if (MMXSrc.getValueType() == MVT::x86mmx)
45067 return DAG.getBitcast(VT, InputVector);
45068 }
45069
45070 // Detect mmx to i32 conversion through a v2i32 elt extract.
45071 if (InputVector.getOpcode() == ISD::BITCAST && InputVector.hasOneUse() &&
45072 VT == MVT::i32 && SrcVT == MVT::v2i32 && isNullConstant(EltIdx)) {
45073 SDValue MMXSrc = InputVector.getOperand(0);
45074
45075 // The bitcast source is a direct mmx result.
45076 if (MMXSrc.getValueType() == MVT::x86mmx)
45077 return DAG.getNode(X86ISD::MMX_MOVD2W, dl, MVT::i32, MMXSrc);
45078 }
45079
45080 // Check whether this extract is the root of a sum of absolute differences
45081 // pattern. This has to be done here because we really want it to happen
45083  // pre-legalization.
45083 if (SDValue SAD = combineBasicSADPattern(N, DAG, Subtarget))
45084 return SAD;
45085
45086 if (SDValue VPDPBUSD = combineVPDPBUSDPattern(N, DAG, Subtarget))
45087 return VPDPBUSD;
45088
45089 // Attempt to replace an all_of/any_of horizontal reduction with a MOVMSK.
45090 if (SDValue Cmp = combinePredicateReduction(N, DAG, Subtarget))
45091 return Cmp;
45092
45093 // Attempt to replace min/max v8i16/v16i8 reductions with PHMINPOSUW.
45094 if (SDValue MinMax = combineMinMaxReduction(N, DAG, Subtarget))
45095 return MinMax;
45096
45097 // Attempt to optimize ADD/FADD/MUL reductions with HADD, promotion etc..
45098 if (SDValue V = combineArithReduction(N, DAG, Subtarget))
45099 return V;
45100
45101 if (SDValue V = scalarizeExtEltFP(N, DAG, Subtarget))
45102 return V;
45103
45104 // Attempt to extract a i1 element by using MOVMSK to extract the signbits
45105 // and then testing the relevant element.
45106 //
45107 // Note that we only combine extracts on the *same* result number, i.e.
45108 // t0 = merge_values a0, a1, a2, a3
45109 // i1 = extract_vector_elt t0, Constant:i64<2>
45110 // i1 = extract_vector_elt t0, Constant:i64<3>
45111 // but not
45112 // i1 = extract_vector_elt t0:1, Constant:i64<2>
45113 // since the latter would need its own MOVMSK.
45114 if (SrcVT.getScalarType() == MVT::i1) {
45115 bool IsVar = !CIdx;
45116 SmallVector<SDNode *, 16> BoolExtracts;
45117 unsigned ResNo = InputVector.getResNo();
45118 auto IsBoolExtract = [&BoolExtracts, &ResNo, &IsVar](SDNode *Use) {
45119 if (Use->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
45120 Use->getOperand(0).getResNo() == ResNo &&
45121 Use->getValueType(0) == MVT::i1) {
45122 BoolExtracts.push_back(Use);
45123 IsVar |= !isa<ConstantSDNode>(Use->getOperand(1));
45124 return true;
45125 }
45126 return false;
45127 };
45128 // TODO: Can we drop the oneuse check for constant extracts?
45129 if (all_of(InputVector->uses(), IsBoolExtract) &&
45130 (IsVar || BoolExtracts.size() > 1)) {
45131 EVT BCVT = EVT::getIntegerVT(*DAG.getContext(), NumSrcElts);
45132 if (SDValue BC =
45133 combineBitcastvxi1(DAG, BCVT, InputVector, dl, Subtarget)) {
45134 for (SDNode *Use : BoolExtracts) {
45135 // extractelement vXi1 X, MaskIdx --> ((movmsk X) & Mask) == Mask
45136 // Mask = 1 << MaskIdx
45137 SDValue MaskIdx = DAG.getZExtOrTrunc(Use->getOperand(1), dl, MVT::i8);
45138 SDValue MaskBit = DAG.getConstant(1, dl, BCVT);
45139 SDValue Mask = DAG.getNode(ISD::SHL, dl, BCVT, MaskBit, MaskIdx);
45140 SDValue Res = DAG.getNode(ISD::AND, dl, BCVT, BC, Mask);
45141 Res = DAG.getSetCC(dl, MVT::i1, Res, Mask, ISD::SETEQ);
45142 DCI.CombineTo(Use, Res);
45143 }
45144 return SDValue(N, 0);
45145 }
45146 }
45147 }
45148
45149 // If this extract is from a loaded vector value and will be used as an
45150 // integer, that requires a potentially expensive XMM -> GPR transfer.
45151 // Additionally, if we can convert to a scalar integer load, that will likely
45152 // be folded into a subsequent integer op.
45153 // Note: Unlike the related fold for this in DAGCombiner, this is not limited
45154 // to a single-use of the loaded vector. For the reasons above, we
45155 // expect this to be profitable even if it creates an extra load.
45156 bool LikelyUsedAsVector = any_of(N->uses(), [](SDNode *Use) {
45157 return Use->getOpcode() == ISD::STORE ||
45158 Use->getOpcode() == ISD::INSERT_VECTOR_ELT ||
45159 Use->getOpcode() == ISD::SCALAR_TO_VECTOR;
45160 });
45161 auto *LoadVec = dyn_cast<LoadSDNode>(InputVector);
45162 if (LoadVec && CIdx && ISD::isNormalLoad(LoadVec) && VT.isInteger() &&
45163 SrcVT.getVectorElementType() == VT && DCI.isAfterLegalizeDAG() &&
45164 !LikelyUsedAsVector && LoadVec->isSimple()) {
45165 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
45166 SDValue NewPtr =
45167 TLI.getVectorElementPointer(DAG, LoadVec->getBasePtr(), SrcVT, EltIdx);
45168 unsigned PtrOff = VT.getSizeInBits() * CIdx->getZExtValue() / 8;
45169 MachinePointerInfo MPI = LoadVec->getPointerInfo().getWithOffset(PtrOff);
45170 Align Alignment = commonAlignment(LoadVec->getAlign(), PtrOff);
45171 SDValue Load =
45172 DAG.getLoad(VT, dl, LoadVec->getChain(), NewPtr, MPI, Alignment,
45173 LoadVec->getMemOperand()->getFlags(), LoadVec->getAAInfo());
45174 DAG.makeEquivalentMemoryOrdering(LoadVec, Load);
45175 return Load;
45176 }
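  // Illustrative example of the fold above (v4i32 load, constant index 2
  // assumed): instead of loading the whole vector and doing an XMM->GPR
  // extract, emit a scalar i32 load from BasePtr + 8 (PtrOff = 32 * 2 / 8)
  // on the same chain, with the alignment reduced via commonAlignment().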
45177
45178 return SDValue();
45179}
45180
45181// Convert (vXiY *ext(vXi1 bitcast(iX))) to extend_in_reg(broadcast(iX)).
45182// This is more or less the reverse of combineBitcastvxi1.
45183static SDValue combineToExtendBoolVectorInReg(
45184 unsigned Opcode, const SDLoc &DL, EVT VT, SDValue N0, SelectionDAG &DAG,
45185 TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget) {
45186 if (Opcode != ISD::SIGN_EXTEND && Opcode != ISD::ZERO_EXTEND &&
45187 Opcode != ISD::ANY_EXTEND)
45188 return SDValue();
45189 if (!DCI.isBeforeLegalizeOps())
45190 return SDValue();
45191 if (!Subtarget.hasSSE2() || Subtarget.hasAVX512())
45192 return SDValue();
45193
45194 EVT SVT = VT.getScalarType();
45195 EVT InSVT = N0.getValueType().getScalarType();
45196 unsigned EltSizeInBits = SVT.getSizeInBits();
45197
45198 // Input type must be extending a bool vector (bit-casted from a scalar
45199 // integer) to legal integer types.
45200 if (!VT.isVector())
45201 return SDValue();
45202 if (SVT != MVT::i64 && SVT != MVT::i32 && SVT != MVT::i16 && SVT != MVT::i8)
45203 return SDValue();
45204 if (InSVT != MVT::i1 || N0.getOpcode() != ISD::BITCAST)
45205 return SDValue();
45206
45207 SDValue N00 = N0.getOperand(0);
45208 EVT SclVT = N00.getValueType();
45209 if (!SclVT.isScalarInteger())
45210 return SDValue();
45211
45212 SDValue Vec;
45213 SmallVector<int> ShuffleMask;
45214 unsigned NumElts = VT.getVectorNumElements();
45215  assert(NumElts == SclVT.getSizeInBits() && "Unexpected bool vector size");
45216
45217 // Broadcast the scalar integer to the vector elements.
45218 if (NumElts > EltSizeInBits) {
45219 // If the scalar integer is greater than the vector element size, then we
45220 // must split it down into sub-sections for broadcasting. For example:
45221 // i16 -> v16i8 (i16 -> v8i16 -> v16i8) with 2 sub-sections.
45222 // i32 -> v32i8 (i32 -> v8i32 -> v32i8) with 4 sub-sections.
45223    assert((NumElts % EltSizeInBits) == 0 && "Unexpected integer scale");
45224 unsigned Scale = NumElts / EltSizeInBits;
45225 EVT BroadcastVT = EVT::getVectorVT(*DAG.getContext(), SclVT, EltSizeInBits);
45226 Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, BroadcastVT, N00);
45227 Vec = DAG.getBitcast(VT, Vec);
45228
45229 for (unsigned i = 0; i != Scale; ++i)
45230 ShuffleMask.append(EltSizeInBits, i);
45231 Vec = DAG.getVectorShuffle(VT, DL, Vec, Vec, ShuffleMask);
45232 } else if (Subtarget.hasAVX2() && NumElts < EltSizeInBits &&
45233 (SclVT == MVT::i8 || SclVT == MVT::i16 || SclVT == MVT::i32)) {
45234 // If we have register broadcast instructions, use the scalar size as the
45235 // element type for the shuffle. Then cast to the wider element type. The
45236 // widened bits won't be used, and this might allow the use of a broadcast
45237 // load.
45238    assert((EltSizeInBits % NumElts) == 0 && "Unexpected integer scale");
45239 unsigned Scale = EltSizeInBits / NumElts;
45240 EVT BroadcastVT =
45241 EVT::getVectorVT(*DAG.getContext(), SclVT, NumElts * Scale);
45242 Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, BroadcastVT, N00);
45243 ShuffleMask.append(NumElts * Scale, 0);
45244 Vec = DAG.getVectorShuffle(BroadcastVT, DL, Vec, Vec, ShuffleMask);
45245 Vec = DAG.getBitcast(VT, Vec);
45246 } else {
45247    // For a smaller scalar integer, we can simply any-extend it to the vector
45248    // element size (we don't care about the upper bits) and broadcast it to
45249    // all elements.
45250 SDValue Scl = DAG.getAnyExtOrTrunc(N00, DL, SVT);
45251 Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VT, Scl);
45252 ShuffleMask.append(NumElts, 0);
45253 Vec = DAG.getVectorShuffle(VT, DL, Vec, Vec, ShuffleMask);
45254 }
45255
45256 // Now, mask the relevant bit in each element.
45257 SmallVector<SDValue, 32> Bits;
45258 for (unsigned i = 0; i != NumElts; ++i) {
45259 int BitIdx = (i % EltSizeInBits);
45260 APInt Bit = APInt::getBitsSet(EltSizeInBits, BitIdx, BitIdx + 1);
45261 Bits.push_back(DAG.getConstant(Bit, DL, SVT));
45262 }
45263 SDValue BitMask = DAG.getBuildVector(VT, DL, Bits);
45264 Vec = DAG.getNode(ISD::AND, DL, VT, Vec, BitMask);
45265
45266 // Compare against the bitmask and extend the result.
45267 EVT CCVT = VT.changeVectorElementType(MVT::i1);
45268 Vec = DAG.getSetCC(DL, CCVT, Vec, BitMask, ISD::SETEQ);
45269 Vec = DAG.getSExtOrTrunc(Vec, DL, VT);
45270
45271 // For SEXT, this is now done, otherwise shift the result down for
45272 // zero-extension.
45273 if (Opcode == ISD::SIGN_EXTEND)
45274 return Vec;
45275 return DAG.getNode(ISD::SRL, DL, VT, Vec,
45276 DAG.getConstant(EltSizeInBits - 1, DL, VT));
45277}
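// Illustrative example of the combine above (i8 mask, pre-AVX512 assumed):
//   v8i16 r = sign_extend (v8i1 (bitcast i8:M))
// becomes roughly: broadcast M to every v8i16 lane, AND lane i with (1 << i),
// SETEQ the result against that same bit mask (all-ones where the bit was
// set), then sign-extend; a zero_extend additionally shifts right by 15.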
45278
45279/// If a vector select has an operand that is -1 or 0, try to simplify the
45280/// select to a bitwise logic operation.
45281/// TODO: Move to DAGCombiner, possibly using TargetLowering::hasAndNot()?
45282static SDValue
45283combineVSelectWithAllOnesOrZeros(SDNode *N, SelectionDAG &DAG,
45284 TargetLowering::DAGCombinerInfo &DCI,
45285 const X86Subtarget &Subtarget) {
45286 SDValue Cond = N->getOperand(0);
45287 SDValue LHS = N->getOperand(1);
45288 SDValue RHS = N->getOperand(2);
45289 EVT VT = LHS.getValueType();
45290 EVT CondVT = Cond.getValueType();
45291 SDLoc DL(N);
45292 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
45293
45294 if (N->getOpcode() != ISD::VSELECT)
45295 return SDValue();
45296
45297  assert(CondVT.isVector() && "Vector select expects a vector selector!");
45298
45299 // TODO: Use isNullOrNullSplat() to distinguish constants with undefs?
45300 // TODO: Can we assert that both operands are not zeros (because that should
45301 // get simplified at node creation time)?
45302 bool TValIsAllZeros = ISD::isBuildVectorAllZeros(LHS.getNode());
45303 bool FValIsAllZeros = ISD::isBuildVectorAllZeros(RHS.getNode());
45304
45305 // If both inputs are 0/undef, create a complete zero vector.
45306 // FIXME: As noted above this should be handled by DAGCombiner/getNode.
45307 if (TValIsAllZeros && FValIsAllZeros) {
45308 if (VT.isFloatingPoint())
45309 return DAG.getConstantFP(0.0, DL, VT);
45310 return DAG.getConstant(0, DL, VT);
45311 }
45312
45313 // To use the condition operand as a bitwise mask, it must have elements that
45314  // are the same size as the select elements, i.e., the condition operand must
45315 // have already been promoted from the IR select condition type <N x i1>.
45316 // Don't check if the types themselves are equal because that excludes
45317 // vector floating-point selects.
45318 if (CondVT.getScalarSizeInBits() != VT.getScalarSizeInBits())
45319 return SDValue();
45320
45321 // Try to invert the condition if true value is not all 1s and false value is
45322 // not all 0s. Only do this if the condition has one use.
45323 bool TValIsAllOnes = ISD::isBuildVectorAllOnes(LHS.getNode());
45324 if (!TValIsAllOnes && !FValIsAllZeros && Cond.hasOneUse() &&
45325 // Check if the selector will be produced by CMPP*/PCMP*.
45326 Cond.getOpcode() == ISD::SETCC &&
45327 // Check if SETCC has already been promoted.
45328 TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT) ==
45329 CondVT) {
45330 bool FValIsAllOnes = ISD::isBuildVectorAllOnes(RHS.getNode());
45331
45332 if (TValIsAllZeros || FValIsAllOnes) {
45333 SDValue CC = Cond.getOperand(2);
45334 ISD::CondCode NewCC = ISD::getSetCCInverse(
45335 cast<CondCodeSDNode>(CC)->get(), Cond.getOperand(0).getValueType());
45336 Cond = DAG.getSetCC(DL, CondVT, Cond.getOperand(0), Cond.getOperand(1),
45337 NewCC);
45338 std::swap(LHS, RHS);
45339 TValIsAllOnes = FValIsAllOnes;
45340 FValIsAllZeros = TValIsAllZeros;
45341 }
45342 }
45343
45344 // Cond value must be 'sign splat' to be converted to a logical op.
45345 if (DAG.ComputeNumSignBits(Cond) != CondVT.getScalarSizeInBits())
45346 return SDValue();
45347
45348 // vselect Cond, 111..., 000... -> Cond
45349 if (TValIsAllOnes && FValIsAllZeros)
45350 return DAG.getBitcast(VT, Cond);
45351
45352 if (!TLI.isTypeLegal(CondVT))
45353 return SDValue();
45354
45355 // vselect Cond, 111..., X -> or Cond, X
45356 if (TValIsAllOnes) {
45357 SDValue CastRHS = DAG.getBitcast(CondVT, RHS);
45358 SDValue Or = DAG.getNode(ISD::OR, DL, CondVT, Cond, CastRHS);
45359 return DAG.getBitcast(VT, Or);
45360 }
45361
45362 // vselect Cond, X, 000... -> and Cond, X
45363 if (FValIsAllZeros) {
45364 SDValue CastLHS = DAG.getBitcast(CondVT, LHS);
45365 SDValue And = DAG.getNode(ISD::AND, DL, CondVT, Cond, CastLHS);
45366 return DAG.getBitcast(VT, And);
45367 }
45368
45369 // vselect Cond, 000..., X -> andn Cond, X
45370 if (TValIsAllZeros) {
45371 SDValue CastRHS = DAG.getBitcast(CondVT, RHS);
45372 SDValue AndN;
45373 // The canonical form differs for i1 vectors - x86andnp is not used
45374 if (CondVT.getScalarType() == MVT::i1)
45375 AndN = DAG.getNode(ISD::AND, DL, CondVT, DAG.getNOT(DL, Cond, CondVT),
45376 CastRHS);
45377 else
45378 AndN = DAG.getNode(X86ISD::ANDNP, DL, CondVT, Cond, CastRHS);
45379 return DAG.getBitcast(VT, AndN);
45380 }
45381
45382 return SDValue();
45383}
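// Illustrative example of the folds above (v4i32 assumed, Cond known to be
// all-ones/zero per lane): vselect Cond, X, zeroinitializer becomes
// (and Cond, X), since an all-ones lane keeps X and an all-zeros lane
// produces zero, matching the select semantics bit-for-bit.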
45384
45385/// If both arms of a vector select are concatenated vectors, split the select,
45386/// and concatenate the result to eliminate a wide (256-bit) vector instruction:
45387/// vselect Cond, (concat T0, T1), (concat F0, F1) -->
45388/// concat (vselect (split Cond), T0, F0), (vselect (split Cond), T1, F1)
45389static SDValue narrowVectorSelect(SDNode *N, SelectionDAG &DAG,
45390 const X86Subtarget &Subtarget) {
45391 unsigned Opcode = N->getOpcode();
45392 if (Opcode != X86ISD::BLENDV && Opcode != ISD::VSELECT)
45393 return SDValue();
45394
45395 // TODO: Split 512-bit vectors too?
45396 EVT VT = N->getValueType(0);
45397 if (!VT.is256BitVector())
45398 return SDValue();
45399
45400 // TODO: Split as long as any 2 of the 3 operands are concatenated?
45401 SDValue Cond = N->getOperand(0);
45402 SDValue TVal = N->getOperand(1);
45403 SDValue FVal = N->getOperand(2);
45404 SmallVector<SDValue, 4> CatOpsT, CatOpsF;
45405 if (!TVal.hasOneUse() || !FVal.hasOneUse() ||
45406 !collectConcatOps(TVal.getNode(), CatOpsT, DAG) ||
45407 !collectConcatOps(FVal.getNode(), CatOpsF, DAG))
45408 return SDValue();
45409
45410 auto makeBlend = [Opcode](SelectionDAG &DAG, const SDLoc &DL,
45411 ArrayRef<SDValue> Ops) {
45412 return DAG.getNode(Opcode, DL, Ops[1].getValueType(), Ops);
45413 };
45414 return SplitOpsAndApply(DAG, Subtarget, SDLoc(N), VT, { Cond, TVal, FVal },
45415 makeBlend, /*CheckBWI*/ false);
45416}
45417
45418static SDValue combineSelectOfTwoConstants(SDNode *N, SelectionDAG &DAG) {
45419 SDValue Cond = N->getOperand(0);
45420 SDValue LHS = N->getOperand(1);
45421 SDValue RHS = N->getOperand(2);
45422 SDLoc DL(N);
45423
45424 auto *TrueC = dyn_cast<ConstantSDNode>(LHS);
45425 auto *FalseC = dyn_cast<ConstantSDNode>(RHS);
45426 if (!TrueC || !FalseC)
45427 return SDValue();
45428
45429 // Don't do this for crazy integer types.
45430 EVT VT = N->getValueType(0);
45431 if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
45432 return SDValue();
45433
45434 // We're going to use the condition bit in math or logic ops. We could allow
45435 // this with a wider condition value (post-legalization it becomes an i8),
45436 // but if nothing is creating selects that late, it doesn't matter.
45437 if (Cond.getValueType() != MVT::i1)
45438 return SDValue();
45439
45440 // A power-of-2 multiply is just a shift. LEA also cheaply handles multiply by
45441 // 3, 5, or 9 with i32/i64, so those get transformed too.
45442 // TODO: For constants that overflow or do not differ by power-of-2 or small
45443 // multiplier, convert to 'and' + 'add'.
45444 const APInt &TrueVal = TrueC->getAPIntValue();
45445 const APInt &FalseVal = FalseC->getAPIntValue();
45446
45447 // We have a more efficient lowering for "(X == 0) ? Y : -1" using SBB.
45448 if ((TrueVal.isAllOnes() || FalseVal.isAllOnes()) &&
45449 Cond.getOpcode() == ISD::SETCC && isNullConstant(Cond.getOperand(1))) {
45450 ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
45451 if (CC == ISD::SETEQ || CC == ISD::SETNE)
45452 return SDValue();
45453 }
45454
45455 bool OV;
45456 APInt Diff = TrueVal.ssub_ov(FalseVal, OV);
45457 if (OV)
45458 return SDValue();
45459
45460 APInt AbsDiff = Diff.abs();
45461 if (AbsDiff.isPowerOf2() ||
45462 ((VT == MVT::i32 || VT == MVT::i64) &&
45463 (AbsDiff == 3 || AbsDiff == 5 || AbsDiff == 9))) {
45464
45465 // We need a positive multiplier constant for shift/LEA codegen. The 'not'
45466 // of the condition can usually be folded into a compare predicate, but even
45467 // without that, the sequence should be cheaper than a CMOV alternative.
45468 if (TrueVal.slt(FalseVal)) {
45469 Cond = DAG.getNOT(DL, Cond, MVT::i1);
45470 std::swap(TrueC, FalseC);
45471 }
45472
45473 // select Cond, TC, FC --> (zext(Cond) * (TC - FC)) + FC
45474 SDValue R = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, Cond);
45475
45476 // Multiply condition by the difference if non-one.
45477 if (!AbsDiff.isOne())
45478 R = DAG.getNode(ISD::MUL, DL, VT, R, DAG.getConstant(AbsDiff, DL, VT));
45479
45480 // Add the base if non-zero.
45481 if (!FalseC->isZero())
45482 R = DAG.getNode(ISD::ADD, DL, VT, R, SDValue(FalseC, 0));
45483
45484 return R;
45485 }
45486
45487 return SDValue();
45488}
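// Illustrative example of the transform above (i32 constants assumed):
//   select i1 %c, i32 7, i32 3
// has AbsDiff = 4 (a power of two), so it becomes (zext %c) * 4 + 3, i.e. a
// shift plus an add instead of a CMOV; %c == 1 yields 7 and %c == 0 yields 3.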
45489
45490/// If this is a *dynamic* select (non-constant condition) and we can match
45491/// this node with one of the variable blend instructions, restructure the
45492/// condition so that blends can use the high (sign) bit of each element.
45493/// This function will also call SimplifyDemandedBits on already created
45494/// BLENDV to perform additional simplifications.
45495static SDValue combineVSelectToBLENDV(SDNode *N, SelectionDAG &DAG,
45496 TargetLowering::DAGCombinerInfo &DCI,
45497 const X86Subtarget &Subtarget) {
45498 SDValue Cond = N->getOperand(0);
45499 if ((N->getOpcode() != ISD::VSELECT &&
45500 N->getOpcode() != X86ISD::BLENDV) ||
45501 ISD::isBuildVectorOfConstantSDNodes(Cond.getNode()))
45502 return SDValue();
45503
45504 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
45505 unsigned BitWidth = Cond.getScalarValueSizeInBits();
45506 EVT VT = N->getValueType(0);
45507
45508 // We can only handle the cases where VSELECT is directly legal on the
45509 // subtarget. We custom lower VSELECT nodes with constant conditions and
45510 // this makes it hard to see whether a dynamic VSELECT will correctly
45511 // lower, so we both check the operation's status and explicitly handle the
45512 // cases where a *dynamic* blend will fail even though a constant-condition
45513 // blend could be custom lowered.
45514 // FIXME: We should find a better way to handle this class of problems.
45515 // Potentially, we should combine constant-condition vselect nodes
45516 // pre-legalization into shuffles and not mark as many types as custom
45517 // lowered.
45518 if (!TLI.isOperationLegalOrCustom(ISD::VSELECT, VT))
45519 return SDValue();
45520 // FIXME: We don't support i16-element blends currently. We could and
45521 // should support them by making *all* the bits in the condition be set
45522 // rather than just the high bit and using an i8-element blend.
45523 if (VT.getVectorElementType() == MVT::i16)
45524 return SDValue();
45525 // Dynamic blending was only available from SSE4.1 onward.
45526 if (VT.is128BitVector() && !Subtarget.hasSSE41())
45527 return SDValue();
45528 // Byte blends are only available in AVX2
45529 if (VT == MVT::v32i8 && !Subtarget.hasAVX2())
45530 return SDValue();
45531 // There are no 512-bit blend instructions that use sign bits.
45532 if (VT.is512BitVector())
45533 return SDValue();
45534
45535 // Don't optimize before the condition has been transformed to a legal type
45536 // and don't ever optimize vector selects that map to AVX512 mask-registers.
45537 if (BitWidth < 8 || BitWidth > 64)
45538 return SDValue();
45539
45540 auto OnlyUsedAsSelectCond = [](SDValue Cond) {
45541 for (SDNode::use_iterator UI = Cond->use_begin(), UE = Cond->use_end();
45542 UI != UE; ++UI)
45543 if ((UI->getOpcode() != ISD::VSELECT &&
45544 UI->getOpcode() != X86ISD::BLENDV) ||
45545 UI.getOperandNo() != 0)
45546 return false;
45547
45548 return true;
45549 };
45550
45551 APInt DemandedBits(APInt::getSignMask(BitWidth));
45552
45553 if (OnlyUsedAsSelectCond(Cond)) {
45554 KnownBits Known;
45555 TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
45556 !DCI.isBeforeLegalizeOps());
45557 if (!TLI.SimplifyDemandedBits(Cond, DemandedBits, Known, TLO, 0, true))
45558 return SDValue();
45559
45560 // If we changed the computation somewhere in the DAG, this change will
45561 // affect all users of Cond. Update all the nodes so that we do not use
45562 // the generic VSELECT anymore. Otherwise, we may perform wrong
45563 // optimizations as we messed with the actual expectation for the vector
45564 // boolean values.
45565 for (SDNode *U : Cond->uses()) {
45566 if (U->getOpcode() == X86ISD::BLENDV)
45567 continue;
45568
45569 SDValue SB = DAG.getNode(X86ISD::BLENDV, SDLoc(U), U->getValueType(0),
45570 Cond, U->getOperand(1), U->getOperand(2));
45571 DAG.ReplaceAllUsesOfValueWith(SDValue(U, 0), SB);
45572 DCI.AddToWorklist(U);
45573 }
45574 DCI.CommitTargetLoweringOpt(TLO);
45575 return SDValue(N, 0);
45576 }
45577
45578 // Otherwise we can still at least try to simplify multiple use bits.
45579 if (SDValue V = TLI.SimplifyMultipleUseDemandedBits(Cond, DemandedBits, DAG))
45580 return DAG.getNode(X86ISD::BLENDV, SDLoc(N), N->getValueType(0), V,
45581 N->getOperand(1), N->getOperand(2));
45582
45583 return SDValue();
45584}
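// Sketch of the demanded-bits step above: only the sign bit of each condition
// element is demanded, so e.g. a condition of the form (sra X, 31) that merely
// replicates X's sign bit can typically be simplified back to X (an assumed,
// common case), and every VSELECT user of the condition is then rewritten to
// X86ISD::BLENDV.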
45585
45586// Try to match:
45587// (or (and (M, (sub 0, X)), (pandn M, X)))
45588// which is a special case of:
45589// (select M, (sub 0, X), X)
45590// Per:
45591// http://graphics.stanford.edu/~seander/bithacks.html#ConditionalNegate
45592// We know that, if fNegate is 0 or 1:
45593// (fNegate ? -v : v) == ((v ^ -fNegate) + fNegate)
45594//
45595// Here, we have a mask, M (all 1s or 0), and, similarly, we know that:
45596// ((M & 1) ? -X : X) == ((X ^ -(M & 1)) + (M & 1))
45597// ( M ? -X : X) == ((X ^ M ) + (M & 1))
45598// This lets us transform our vselect to:
45599// (add (xor X, M), (and M, 1))
45600// And further to:
45601// (sub (xor X, M), M)
45602static SDValue combineLogicBlendIntoConditionalNegate(
45603 EVT VT, SDValue Mask, SDValue X, SDValue Y, const SDLoc &DL,
45604 SelectionDAG &DAG, const X86Subtarget &Subtarget) {
45605 EVT MaskVT = Mask.getValueType();
45606  assert(MaskVT.isInteger() &&
45607         DAG.ComputeNumSignBits(Mask) == MaskVT.getScalarSizeInBits() &&
45608         "Mask must be zero/all-bits");
45609
45610 if (X.getValueType() != MaskVT || Y.getValueType() != MaskVT)
45611 return SDValue();
45612 if (!DAG.getTargetLoweringInfo().isOperationLegal(ISD::SUB, MaskVT))
45613 return SDValue();
45614
45615 auto IsNegV = [](SDNode *N, SDValue V) {
45616 return N->getOpcode() == ISD::SUB && N->getOperand(1) == V &&
45617 ISD::isBuildVectorAllZeros(N->getOperand(0).getNode());
45618 };
45619
45620 SDValue V;
45621 if (IsNegV(Y.getNode(), X))
45622 V = X;
45623 else if (IsNegV(X.getNode(), Y))
45624 V = Y;
45625 else
45626 return SDValue();
45627
45628 SDValue SubOp1 = DAG.getNode(ISD::XOR, DL, MaskVT, V, Mask);
45629 SDValue SubOp2 = Mask;
45630
45631 // If the negate was on the false side of the select, then
45632 // the operands of the SUB need to be swapped. PR 27251.
45633 // This is because the pattern being matched above is
45634 // (vselect M, (sub (0, X), X) -> (sub (xor X, M), M)
45635 // but if the pattern matched was
45636 // (vselect M, X, (sub (0, X))), that is really negation of the pattern
45637 // above, -(vselect M, (sub 0, X), X), and therefore the replacement
45638 // pattern also needs to be a negation of the replacement pattern above.
45639 // And -(sub X, Y) is just sub (Y, X), so swapping the operands of the
45640 // sub accomplishes the negation of the replacement pattern.
45641 if (V == Y)
45642 std::swap(SubOp1, SubOp2);
45643
45644 SDValue Res = DAG.getNode(ISD::SUB, DL, MaskVT, SubOp1, SubOp2);
45645 return DAG.getBitcast(VT, Res);
45646}
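// Illustrative check of the identity used above (all-ones/zero mask M assumed):
//   vselect M, (sub 0, X), X  -->  sub (xor X, M), M
// M == -1: (X ^ -1) - (-1) = ~X + 1 = -X;  M == 0: (X ^ 0) - 0 = X.
// When the negate is on the false arm, the SUB operands are swapped as noted.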
45647
45648/// Do target-specific dag combines on SELECT and VSELECT nodes.
45649static SDValue combineSelect(SDNode *N, SelectionDAG &DAG,
45650 TargetLowering::DAGCombinerInfo &DCI,
45651 const X86Subtarget &Subtarget) {
45652 SDLoc DL(N);
45653 SDValue Cond = N->getOperand(0);
45654 SDValue LHS = N->getOperand(1);
45655 SDValue RHS = N->getOperand(2);
45656
45657 // Try simplification again because we use this function to optimize
45658 // BLENDV nodes that are not handled by the generic combiner.
45659 if (SDValue V = DAG.simplifySelect(Cond, LHS, RHS))
45660 return V;
45661
45662 EVT VT = LHS.getValueType();
45663 EVT CondVT = Cond.getValueType();
45664 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
45665 bool CondConstantVector = ISD::isBuildVectorOfConstantSDNodes(Cond.getNode());
45666
45667 // Attempt to combine (select M, (sub 0, X), X) -> (sub (xor X, M), M).
45668 // Limit this to cases of non-constant masks that createShuffleMaskFromVSELECT
45669 // can't catch, plus vXi8 cases where we'd likely end up with BLENDV.
45670 if (CondVT.isVector() && CondVT.isInteger() &&
45671 CondVT.getScalarSizeInBits() == VT.getScalarSizeInBits() &&
45672 (!CondConstantVector || CondVT.getScalarType() == MVT::i8) &&
45673 DAG.ComputeNumSignBits(Cond) == CondVT.getScalarSizeInBits())
45674 if (SDValue V = combineLogicBlendIntoConditionalNegate(VT, Cond, RHS, LHS,
45675 DL, DAG, Subtarget))
45676 return V;
45677
45678 // Convert vselects with constant condition into shuffles.
45679 if (CondConstantVector && DCI.isBeforeLegalizeOps() &&
45680 (N->getOpcode() == ISD::VSELECT || N->getOpcode() == X86ISD::BLENDV)) {
45681 SmallVector<int, 64> Mask;
45682 if (createShuffleMaskFromVSELECT(Mask, Cond,
45683 N->getOpcode() == X86ISD::BLENDV))
45684 return DAG.getVectorShuffle(VT, DL, LHS, RHS, Mask);
45685 }
45686
45687 // fold vselect(cond, pshufb(x), pshufb(y)) -> or (pshufb(x), pshufb(y))
45688 // by forcing the unselected elements to zero.
45689 // TODO: Can we handle more shuffles with this?
45690 if (N->getOpcode() == ISD::VSELECT && CondVT.isVector() &&
45691 LHS.getOpcode() == X86ISD::PSHUFB && RHS.getOpcode() == X86ISD::PSHUFB &&
45692 LHS.hasOneUse() && RHS.hasOneUse()) {
45693 MVT SimpleVT = VT.getSimpleVT();
45694 SmallVector<SDValue, 1> LHSOps, RHSOps;
45695 SmallVector<int, 64> LHSMask, RHSMask, CondMask;
45696 if (createShuffleMaskFromVSELECT(CondMask, Cond) &&
45697 getTargetShuffleMask(LHS.getNode(), SimpleVT, true, LHSOps, LHSMask) &&
45698 getTargetShuffleMask(RHS.getNode(), SimpleVT, true, RHSOps, RHSMask)) {
45699 int NumElts = VT.getVectorNumElements();
45700 for (int i = 0; i != NumElts; ++i) {
45701 // getConstVector sets negative shuffle mask values as undef, so ensure
45702 // we hardcode SM_SentinelZero values to zero (0x80).
45703 if (CondMask[i] < NumElts) {
45704 LHSMask[i] = isUndefOrZero(LHSMask[i]) ? 0x80 : LHSMask[i];
45705 RHSMask[i] = 0x80;
45706 } else {
45707 LHSMask[i] = 0x80;
45708 RHSMask[i] = isUndefOrZero(RHSMask[i]) ? 0x80 : RHSMask[i];
45709 }
45710 }
45711 LHS = DAG.getNode(X86ISD::PSHUFB, DL, VT, LHS.getOperand(0),
45712 getConstVector(LHSMask, SimpleVT, DAG, DL, true));
45713 RHS = DAG.getNode(X86ISD::PSHUFB, DL, VT, RHS.getOperand(0),
45714 getConstVector(RHSMask, SimpleVT, DAG, DL, true));
45715 return DAG.getNode(ISD::OR, DL, VT, LHS, RHS);
45716 }
45717 }
45718
45719 // If we have SSE[12] support, try to form min/max nodes. SSE min/max
45720 // instructions match the semantics of the common C idiom x<y?x:y but not
45721 // x<=y?x:y, because of how they handle negative zero (which can be
45722 // ignored in unsafe-math mode).
45723 // We also try to create v2f32 min/max nodes, which we later widen to v4f32.
45724 if (Cond.getOpcode() == ISD::SETCC && VT.isFloatingPoint() &&
45725 VT != MVT::f80 && VT != MVT::f128 && !isSoftFP16(VT, Subtarget) &&
45726 (TLI.isTypeLegal(VT) || VT == MVT::v2f32) &&
45727 (Subtarget.hasSSE2() ||
45728 (Subtarget.hasSSE1() && VT.getScalarType() == MVT::f32))) {
45729 ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
45730
45731 unsigned Opcode = 0;
45732 // Check for x CC y ? x : y.
45733 if (DAG.isEqualTo(LHS, Cond.getOperand(0)) &&
45734 DAG.isEqualTo(RHS, Cond.getOperand(1))) {
45735 switch (CC) {
45736 default: break;
45737 case ISD::SETULT:
45738 // Converting this to a min would handle NaNs incorrectly, and swapping
45739 // the operands would cause it to handle comparisons between positive
45740 // and negative zero incorrectly.
45741 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) {
45742 if (!DAG.getTarget().Options.NoSignedZerosFPMath &&
45743 !(DAG.isKnownNeverZeroFloat(LHS) ||
45744 DAG.isKnownNeverZeroFloat(RHS)))
45745 break;
45746 std::swap(LHS, RHS);
45747 }
45748 Opcode = X86ISD::FMIN;
45749 break;
45750 case ISD::SETOLE:
45751 // Converting this to a min would handle comparisons between positive
45752 // and negative zero incorrectly.
45753 if (!DAG.getTarget().Options.NoSignedZerosFPMath &&
45754 !DAG.isKnownNeverZeroFloat(LHS) && !DAG.isKnownNeverZeroFloat(RHS))
45755 break;
45756 Opcode = X86ISD::FMIN;
45757 break;
45758 case ISD::SETULE:
45759 // Converting this to a min would handle both negative zeros and NaNs
45760 // incorrectly, but we can swap the operands to fix both.
45761 std::swap(LHS, RHS);
45762 [[fallthrough]];
45763 case ISD::SETOLT:
45764 case ISD::SETLT:
45765 case ISD::SETLE:
45766 Opcode = X86ISD::FMIN;
45767 break;
45768
45769 case ISD::SETOGE:
45770 // Converting this to a max would handle comparisons between positive
45771 // and negative zero incorrectly.
45772 if (!DAG.getTarget().Options.NoSignedZerosFPMath &&
45773 !DAG.isKnownNeverZeroFloat(LHS) && !DAG.isKnownNeverZeroFloat(RHS))
45774 break;
45775 Opcode = X86ISD::FMAX;
45776 break;
45777 case ISD::SETUGT:
45778 // Converting this to a max would handle NaNs incorrectly, and swapping
45779 // the operands would cause it to handle comparisons between positive
45780 // and negative zero incorrectly.
45781 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) {
45782 if (!DAG.getTarget().Options.NoSignedZerosFPMath &&
45783 !(DAG.isKnownNeverZeroFloat(LHS) ||
45784 DAG.isKnownNeverZeroFloat(RHS)))
45785 break;
45786 std::swap(LHS, RHS);
45787 }
45788 Opcode = X86ISD::FMAX;
45789 break;
45790 case ISD::SETUGE:
45791 // Converting this to a max would handle both negative zeros and NaNs
45792 // incorrectly, but we can swap the operands to fix both.
45793 std::swap(LHS, RHS);
45794 [[fallthrough]];
45795 case ISD::SETOGT:
45796 case ISD::SETGT:
45797 case ISD::SETGE:
45798 Opcode = X86ISD::FMAX;
45799 break;
45800 }
45801 // Check for x CC y ? y : x -- a min/max with reversed arms.
45802 } else if (DAG.isEqualTo(LHS, Cond.getOperand(1)) &&
45803 DAG.isEqualTo(RHS, Cond.getOperand(0))) {
45804 switch (CC) {
45805 default: break;
45806 case ISD::SETOGE:
45807 // Converting this to a min would handle comparisons between positive
45808 // and negative zero incorrectly, and swapping the operands would
45809 // cause it to handle NaNs incorrectly.
45810 if (!DAG.getTarget().Options.NoSignedZerosFPMath &&
45811 !(DAG.isKnownNeverZeroFloat(LHS) ||
45812 DAG.isKnownNeverZeroFloat(RHS))) {
45813 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
45814 break;
45815 std::swap(LHS, RHS);
45816 }
45817 Opcode = X86ISD::FMIN;
45818 break;
45819 case ISD::SETUGT:
45820 // Converting this to a min would handle NaNs incorrectly.
45821 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
45822 break;
45823 Opcode = X86ISD::FMIN;
45824 break;
45825 case ISD::SETUGE:
45826 // Converting this to a min would handle both negative zeros and NaNs
45827 // incorrectly, but we can swap the operands to fix both.
45828 std::swap(LHS, RHS);
45829 [[fallthrough]];
45830 case ISD::SETOGT:
45831 case ISD::SETGT:
45832 case ISD::SETGE:
45833 Opcode = X86ISD::FMIN;
45834 break;
45835
45836 case ISD::SETULT:
45837 // Converting this to a max would handle NaNs incorrectly.
45838 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
45839 break;
45840 Opcode = X86ISD::FMAX;
45841 break;
45842 case ISD::SETOLE:
45843 // Converting this to a max would handle comparisons between positive
45844 // and negative zero incorrectly, and swapping the operands would
45845 // cause it to handle NaNs incorrectly.
45846 if (!DAG.getTarget().Options.NoSignedZerosFPMath &&
45847 !DAG.isKnownNeverZeroFloat(LHS) &&
45848 !DAG.isKnownNeverZeroFloat(RHS)) {
45849 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
45850 break;
45851 std::swap(LHS, RHS);
45852 }
45853 Opcode = X86ISD::FMAX;
45854 break;
45855 case ISD::SETULE:
45856 // Converting this to a max would handle both negative zeros and NaNs
45857 // incorrectly, but we can swap the operands to fix both.
45858 std::swap(LHS, RHS);
45859 [[fallthrough]];
45860 case ISD::SETOLT:
45861 case ISD::SETLT:
45862 case ISD::SETLE:
45863 Opcode = X86ISD::FMAX;
45864 break;
45865 }
45866 }
45867
45868 if (Opcode)
45869 return DAG.getNode(Opcode, DL, N->getValueType(0), LHS, RHS);
45870 }
45871
45872 // Some mask scalar intrinsics rely on checking if only one bit is set
45873 // and implement it in C code like this:
45874 // A[0] = (U & 1) ? A[0] : W[0];
45875 // This creates some redundant instructions that break pattern matching.
45876  // fold (select (setcc (and X, 1), 0, seteq), Y, Z) -> (select (and X, 1), Z, Y)
45877 if (Subtarget.hasAVX512() && N->getOpcode() == ISD::SELECT &&
45878 Cond.getOpcode() == ISD::SETCC && (VT == MVT::f32 || VT == MVT::f64)) {
45879 ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
45880 SDValue AndNode = Cond.getOperand(0);
45881 if (AndNode.getOpcode() == ISD::AND && CC == ISD::SETEQ &&
45882 isNullConstant(Cond.getOperand(1)) &&
45883 isOneConstant(AndNode.getOperand(1))) {
45884      // LHS and RHS are swapped because the setcc outputs 1 when the AND
45885      // result is 0, and vice versa.
45886 AndNode = DAG.getZExtOrTrunc(AndNode, DL, MVT::i8);
45887 return DAG.getNode(ISD::SELECT, DL, VT, AndNode, RHS, LHS);
45888 }
45889 }
45890
45891 // v16i8 (select v16i1, v16i8, v16i8) does not have a proper
45892 // lowering on KNL. In this case we convert it to
45893  // v16i8 (select v16i8, v16i8, v16i8) and use an AVX instruction.
45894  // The same applies to all vectors of i8 and i16 elements without BWI.
45895 // Make sure we extend these even before type legalization gets a chance to
45896 // split wide vectors.
45897 // Since SKX these selects have a proper lowering.
45898 if (Subtarget.hasAVX512() && !Subtarget.hasBWI() && CondVT.isVector() &&
45899 CondVT.getVectorElementType() == MVT::i1 &&
45900 (VT.getVectorElementType() == MVT::i8 ||
45901 VT.getVectorElementType() == MVT::i16)) {
45902 Cond = DAG.getNode(ISD::SIGN_EXTEND, DL, VT, Cond);
45903 return DAG.getNode(N->getOpcode(), DL, VT, Cond, LHS, RHS);
45904 }
45905
45906 // AVX512 - Extend select with zero to merge with target shuffle.
45907 // select(mask, extract_subvector(shuffle(x)), zero) -->
45908 // extract_subvector(select(insert_subvector(mask), shuffle(x), zero))
45909  // TODO - support non-target shuffles as well.
45910 if (Subtarget.hasAVX512() && CondVT.isVector() &&
45911 CondVT.getVectorElementType() == MVT::i1) {
45912 auto SelectableOp = [&TLI](SDValue Op) {
45913 return Op.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
45914 isTargetShuffle(Op.getOperand(0).getOpcode()) &&
45915 isNullConstant(Op.getOperand(1)) &&
45916 TLI.isTypeLegal(Op.getOperand(0).getValueType()) &&
45917 Op.hasOneUse() && Op.getOperand(0).hasOneUse();
45918 };
45919
45920 bool SelectableLHS = SelectableOp(LHS);
45921 bool SelectableRHS = SelectableOp(RHS);
45922 bool ZeroLHS = ISD::isBuildVectorAllZeros(LHS.getNode());
45923 bool ZeroRHS = ISD::isBuildVectorAllZeros(RHS.getNode());
45924
45925 if ((SelectableLHS && ZeroRHS) || (SelectableRHS && ZeroLHS)) {
45926 EVT SrcVT = SelectableLHS ? LHS.getOperand(0).getValueType()
45927 : RHS.getOperand(0).getValueType();
45928 EVT SrcCondVT = SrcVT.changeVectorElementType(MVT::i1);
45929 LHS = insertSubVector(DAG.getUNDEF(SrcVT), LHS, 0, DAG, DL,
45930 VT.getSizeInBits());
45931 RHS = insertSubVector(DAG.getUNDEF(SrcVT), RHS, 0, DAG, DL,
45932 VT.getSizeInBits());
45933 Cond = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, SrcCondVT,
45934 DAG.getUNDEF(SrcCondVT), Cond,
45935 DAG.getIntPtrConstant(0, DL));
45936 SDValue Res = DAG.getSelect(DL, SrcVT, Cond, LHS, RHS);
45937 return extractSubVector(Res, 0, DAG, DL, VT.getSizeInBits());
45938 }
45939 }
45940
45941 if (SDValue V = combineSelectOfTwoConstants(N, DAG))
45942 return V;
45943
45944 if (N->getOpcode() == ISD::SELECT && Cond.getOpcode() == ISD::SETCC &&
45945 Cond.hasOneUse()) {
45946 EVT CondVT = Cond.getValueType();
45947 SDValue Cond0 = Cond.getOperand(0);
45948 SDValue Cond1 = Cond.getOperand(1);
45949 ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
45950
45951 // Canonicalize min/max:
45952 // (x > 0) ? x : 0 -> (x >= 0) ? x : 0
45953 // (x < -1) ? x : -1 -> (x <= -1) ? x : -1
45954 // This allows use of COND_S / COND_NS (see TranslateX86CC) which eliminates
45955 // the need for an extra compare against zero. e.g.
45956 // ((a - b) > 0) ? (a - b) : 0 -> ((a - b) >= 0) ? (a - b) : 0
45957 // subl %esi, %edi
45958 // testl %edi, %edi
45959 // movl $0, %eax
45960 // cmovgl %edi, %eax
45961 // =>
45962 // xorl %eax, %eax
45963 // subl %esi, %edi
45964 // cmovsl %eax, %edi
45965 //
45966 // We can also canonicalize
45967 // (x s> 1) ? x : 1 -> (x s>= 1) ? x : 1 -> (x s> 0) ? x : 1
45968 // (x u> 1) ? x : 1 -> (x u>= 1) ? x : 1 -> (x != 0) ? x : 1
45969 // This allows the use of a test instruction for the compare.
45970 if (LHS == Cond0 && RHS == Cond1) {
45971 if ((CC == ISD::SETGT && (isNullConstant(RHS) || isOneConstant(RHS))) ||
45972 (CC == ISD::SETLT && isAllOnesConstant(RHS))) {
45973 ISD::CondCode NewCC = CC == ISD::SETGT ? ISD::SETGE : ISD::SETLE;
45974 Cond = DAG.getSetCC(SDLoc(Cond), CondVT, Cond0, Cond1, NewCC);
45975 return DAG.getSelect(DL, VT, Cond, LHS, RHS);
45976 }
45977 if (CC == ISD::SETUGT && isOneConstant(RHS)) {
45978 ISD::CondCode NewCC = ISD::SETUGE;
45979 Cond = DAG.getSetCC(SDLoc(Cond), CondVT, Cond0, Cond1, NewCC);
45980 return DAG.getSelect(DL, VT, Cond, LHS, RHS);
45981 }
45982 }
45983
45984 // Similar to DAGCombine's select(or(CC0,CC1),X,Y) fold but for legal types.
45985 // fold eq + gt/lt nested selects into ge/le selects
45986 // select (cmpeq Cond0, Cond1), LHS, (select (cmpugt Cond0, Cond1), LHS, Y)
45987 // --> (select (cmpuge Cond0, Cond1), LHS, Y)
45988 // select (cmpslt Cond0, Cond1), LHS, (select (cmpeq Cond0, Cond1), LHS, Y)
45989 // --> (select (cmpsle Cond0, Cond1), LHS, Y)
45990 // .. etc ..
45991 if (RHS.getOpcode() == ISD::SELECT && RHS.getOperand(1) == LHS &&
45992 RHS.getOperand(0).getOpcode() == ISD::SETCC) {
45993 SDValue InnerSetCC = RHS.getOperand(0);
45994 ISD::CondCode InnerCC =
45995 cast<CondCodeSDNode>(InnerSetCC.getOperand(2))->get();
45996 if ((CC == ISD::SETEQ || InnerCC == ISD::SETEQ) &&
45997 Cond0 == InnerSetCC.getOperand(0) &&
45998 Cond1 == InnerSetCC.getOperand(1)) {
45999 ISD::CondCode NewCC;
46000 switch (CC == ISD::SETEQ ? InnerCC : CC) {
46001 case ISD::SETGT: NewCC = ISD::SETGE; break;
46002 case ISD::SETLT: NewCC = ISD::SETLE; break;
46003 case ISD::SETUGT: NewCC = ISD::SETUGE; break;
46004 case ISD::SETULT: NewCC = ISD::SETULE; break;
46005 default: NewCC = ISD::SETCC_INVALID; break;
46006 }
46007 if (NewCC != ISD::SETCC_INVALID) {
46008 Cond = DAG.getSetCC(DL, CondVT, Cond0, Cond1, NewCC);
46009 return DAG.getSelect(DL, VT, Cond, LHS, RHS.getOperand(2));
46010 }
46011 }
46012 }
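// Illustrative aside (not from the original file): a quick case check of the
// eq + ugt fold, with Cond0 = a and Cond1 = b:
//   a == b : outer eq selects LHS;        a u>= b also selects LHS.
//   a u> b : eq fails, inner ugt -> LHS;  a u>= b also selects LHS.
//   a u< b : eq and ugt both fail -> Y;   a u>= b also selects Y.
// So the nested selects collapse to a single select on the uge compare.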
46013 }
46014
46015 // Check if the first operand is all zeros and Cond type is vXi1.
46016 // If this is an AVX512 target we can improve the use of zero masking by
46017 // swapping the operands and inverting the condition.
46018 if (N->getOpcode() == ISD::VSELECT && Cond.hasOneUse() &&
46019 Subtarget.hasAVX512() && CondVT.getVectorElementType() == MVT::i1 &&
46020 ISD::isBuildVectorAllZeros(LHS.getNode()) &&
46021 !ISD::isBuildVectorAllZeros(RHS.getNode())) {
46022 // Invert the cond to not(cond) : xor(op,allones)=not(op)
46023 SDValue CondNew = DAG.getNOT(DL, Cond, CondVT);
46024 // Vselect cond, op1, op2 = Vselect not(cond), op2, op1
46025 return DAG.getSelect(DL, VT, CondNew, RHS, LHS);
46026 }
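// Illustrative aside (not from the original file): with a vXi1 mask,
//   vselect cond, zero, X  ==  vselect not(cond), X, zero
// and the right-hand form maps onto AVX-512 zero-masking, e.g. a move such as
//   vmovdqa32 %zmm1, %zmm0 {%k1} {z}
// which keeps the lanes where the (inverted) mask is set and zeroes the rest.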
46027
46028 // Attempt to convert a (vXi1 bitcast(iX Cond)) selection mask before it might
46029 // get split by legalization.
46030 if (N->getOpcode() == ISD::VSELECT && Cond.getOpcode() == ISD::BITCAST &&
46031 CondVT.getVectorElementType() == MVT::i1 && Cond.hasOneUse() &&
46032 TLI.isTypeLegal(VT.getScalarType())) {
46033 EVT ExtCondVT = VT.changeVectorElementTypeToInteger();
46034 if (SDValue ExtCond = combineToExtendBoolVectorInReg(
46035 ISD::SIGN_EXTEND, DL, ExtCondVT, Cond, DAG, DCI, Subtarget)) {
46036 ExtCond = DAG.getNode(ISD::TRUNCATE, DL, CondVT, ExtCond);
46037 return DAG.getSelect(DL, VT, ExtCond, LHS, RHS);
46038 }
46039 }
46040
46041 // Early exit check
46042 if (!TLI.isTypeLegal(VT) || isSoftFP16(VT, Subtarget))
46043 return SDValue();
46044
46045 if (SDValue V = combineVSelectWithAllOnesOrZeros(N, DAG, DCI, Subtarget))
46046 return V;
46047
46048 if (SDValue V = combineVSelectToBLENDV(N, DAG, DCI, Subtarget))
46049 return V;
46050
46051 if (SDValue V = narrowVectorSelect(N, DAG, Subtarget))
46052 return V;
46053
46054 // select(~Cond, X, Y) -> select(Cond, Y, X)
46055 if (CondVT.getScalarType() != MVT::i1) {
46056 if (SDValue CondNot = IsNOT(Cond, DAG))
46057 return DAG.getNode(N->getOpcode(), DL, VT,
46058 DAG.getBitcast(CondVT, CondNot), RHS, LHS);
46059
46060 if (Cond.getOpcode() == X86ISD::PCMPGT && Cond.hasOneUse()) {
46061 // pcmpgt(X, -1) -> pcmpgt(0, X) to help select/blendv just use the
46062 // signbit.
46063 if (ISD::isBuildVectorAllOnes(Cond.getOperand(1).getNode())) {
46064 Cond = DAG.getNode(X86ISD::PCMPGT, DL, CondVT,
46065 DAG.getConstant(0, DL, CondVT), Cond.getOperand(0));
46066 return DAG.getNode(N->getOpcode(), DL, VT, Cond, RHS, LHS);
46067 }
46068
46069 // smin(LHS, RHS) : select(pcmpgt(RHS, LHS), LHS, RHS)
46070 // -> select(pcmpgt(LHS, RHS), RHS, LHS)
46071 // iff the commuted pcmpgt() already exists.
46072 // TODO: Could DAGCombiner::combine do a CSE search for SETCC nodes, like it
46073 // does for commutative binops?
46074 if (Cond.getOperand(0) == RHS && Cond.getOperand(1) == LHS) {
46075 if (SDNode *FlipCond =
46076 DAG.getNodeIfExists(X86ISD::PCMPGT, DAG.getVTList(CondVT),
46077 {Cond.getOperand(1), Cond.getOperand(0)})) {
46078 return DAG.getNode(N->getOpcode(), DL, VT, SDValue(FlipCond, 0), RHS,
46079 LHS);
46080 }
46081 }
46082 }
46083 }
46084
46085 // Try to optimize vXi1 selects if both operands are either all constants or
46086 // bitcasts from scalar integer type. In that case we can convert the operands
46087 // to integer and use an integer select which will be converted to a CMOV.
46088 // We need to take a little bit of care to avoid creating an i64 type after
46089 // type legalization.
46090 if (N->getOpcode() == ISD::SELECT && VT.isVector() &&
46091 VT.getVectorElementType() == MVT::i1 &&
46092 (DCI.isBeforeLegalize() || (VT != MVT::v64i1 || Subtarget.is64Bit()))) {
46093 EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), VT.getVectorNumElements());
46094 bool LHSIsConst = ISD::isBuildVectorOfConstantSDNodes(LHS.getNode());
46095 bool RHSIsConst = ISD::isBuildVectorOfConstantSDNodes(RHS.getNode());
46096
46097 if ((LHSIsConst ||
46098 (LHS.getOpcode() == ISD::BITCAST &&
46099 LHS.getOperand(0).getValueType() == IntVT)) &&
46100 (RHSIsConst ||
46101 (RHS.getOpcode() == ISD::BITCAST &&
46102 RHS.getOperand(0).getValueType() == IntVT))) {
46103 if (LHSIsConst)
46104 LHS = combinevXi1ConstantToInteger(LHS, DAG);
46105 else
46106 LHS = LHS.getOperand(0);
46107
46108 if (RHSIsConst)
46109 RHS = combinevXi1ConstantToInteger(RHS, DAG);
46110 else
46111 RHS = RHS.getOperand(0);
46112
46113 SDValue Select = DAG.getSelect(DL, IntVT, Cond, LHS, RHS);
46114 return DAG.getBitcast(VT, Select);
46115 }
46116 }
46117
46118 // If this is "((X & C) == 0) ? Y : Z" and C is a constant mask vector of
46119 // single bits, then invert the predicate and swap the select operands.
46120 // This can lower using a vector shift bit-hack rather than mask and compare.
46121 if (DCI.isBeforeLegalize() && !Subtarget.hasAVX512() &&
46122 N->getOpcode() == ISD::VSELECT && Cond.getOpcode() == ISD::SETCC &&
46123 Cond.hasOneUse() && CondVT.getVectorElementType() == MVT::i1 &&
46124 Cond.getOperand(0).getOpcode() == ISD::AND &&
46125 isNullOrNullSplat(Cond.getOperand(1)) &&
46126 cast<CondCodeSDNode>(Cond.getOperand(2))->get() == ISD::SETEQ &&
46127 Cond.getOperand(0).getValueType() == VT) {
46128 // The 'and' mask must be composed of power-of-2 constants.
46129 SDValue And = Cond.getOperand(0);
46130 auto *C = isConstOrConstSplat(And.getOperand(1));
46131 if (C && C->getAPIntValue().isPowerOf2()) {
46132 // vselect (X & C == 0), LHS, RHS --> vselect (X & C != 0), RHS, LHS
46133 SDValue NotCond =
46134 DAG.getSetCC(DL, CondVT, And, Cond.getOperand(1), ISD::SETNE);
46135 return DAG.getSelect(DL, VT, NotCond, RHS, LHS);
46136 }
46137
46138 // If we have a non-splat but still powers-of-2 mask, AVX1 can use pmulld
46139 // and AVX2 can use vpsllv{dq}. 8-bit lacks a proper shift or multiply.
46140 // 16-bit lacks a proper blendv.
46141 unsigned EltBitWidth = VT.getScalarSizeInBits();
46142 bool CanShiftBlend =
46143 TLI.isTypeLegal(VT) && ((Subtarget.hasAVX() && EltBitWidth == 32) ||
46144 (Subtarget.hasAVX2() && EltBitWidth == 64) ||
46145 (Subtarget.hasXOP()));
46146 if (CanShiftBlend &&
46147 ISD::matchUnaryPredicate(And.getOperand(1), [](ConstantSDNode *C) {
46148 return C->getAPIntValue().isPowerOf2();
46149 })) {
46150 // Create a left-shift constant to get the mask bits over to the sign-bit.
46151 SDValue Mask = And.getOperand(1);
46152 SmallVector<int, 32> ShlVals;
46153 for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; ++i) {
46154 auto *MaskVal = cast<ConstantSDNode>(Mask.getOperand(i));
46155 ShlVals.push_back(EltBitWidth - 1 -
46156 MaskVal->getAPIntValue().exactLogBase2());
46157 }
46158 // vsel ((X & C) == 0), LHS, RHS --> vsel ((shl X, C') < 0), RHS, LHS
46159 SDValue ShlAmt = getConstVector(ShlVals, VT.getSimpleVT(), DAG, DL);
46160 SDValue Shl = DAG.getNode(ISD::SHL, DL, VT, And.getOperand(0), ShlAmt);
46161 SDValue NewCond =
46162 DAG.getSetCC(DL, CondVT, Shl, Cond.getOperand(1), ISD::SETLT);
46163 return DAG.getSelect(DL, VT, NewCond, RHS, LHS);
46164 }
46165 }
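// Illustrative aside (not from the original file): for 32-bit elements and a
// mask element C = 0x10 (bit 4), the shift amount is 31 - log2(0x10) = 27, so
//   (x & 0x10) == 0 ? L : R   becomes   (x << 27) < 0 ? R : L
// because shifting bit 4 into the sign bit lets the signed compare read that bit.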
46166
46167 return SDValue();
46168}
46169
46170/// Combine:
46171/// (brcond/cmov/setcc .., (cmp (atomic_load_add x, 1), 0), COND_S)
46172/// to:
46173/// (brcond/cmov/setcc .., (LADD x, 1), COND_LE)
46174/// i.e., reusing the EFLAGS produced by the LOCKed instruction.
46175/// Note that this is only legal for some op/cc combinations.
46176static SDValue combineSetCCAtomicArith(SDValue Cmp, X86::CondCode &CC,
46177 SelectionDAG &DAG,
46178 const X86Subtarget &Subtarget) {
46179 // This combine only operates on CMP-like nodes.
46180 if (!(Cmp.getOpcode() == X86ISD::CMP ||
46181 (Cmp.getOpcode() == X86ISD::SUB && !Cmp->hasAnyUseOfValue(0))))
46182 return SDValue();
46183
46184 // Can't replace the cmp if it has more uses than the one we're looking at.
46185 // FIXME: We would like to be able to handle this, but would need to make sure
46186 // all uses were updated.
46187 if (!Cmp.hasOneUse())
46188 return SDValue();
46189
46190 // This only applies to variations of the common case:
46191 // (icmp slt x, 0) -> (icmp sle (add x, 1), 0)
46192 // (icmp sge x, 0) -> (icmp sgt (add x, 1), 0)
46193 // (icmp sle x, 0) -> (icmp slt (sub x, 1), 0)
46194 // (icmp sgt x, 0) -> (icmp sge (sub x, 1), 0)
46195 // Using the proper condcodes (see below), overflow is checked for.
46196
46197 // FIXME: We can generalize both constraints:
46198 // - XOR/OR/AND (if they were made to survive AtomicExpand)
46199 // - LHS != 1
46200 // if the result is compared.
46201
46202 SDValue CmpLHS = Cmp.getOperand(0);
46203 SDValue CmpRHS = Cmp.getOperand(1);
46204 EVT CmpVT = CmpLHS.getValueType();
46205
46206 if (!CmpLHS.hasOneUse())
46207 return SDValue();
46208
46209 unsigned Opc = CmpLHS.getOpcode();
46210 if (Opc != ISD::ATOMIC_LOAD_ADD && Opc != ISD::ATOMIC_LOAD_SUB)
46211 return SDValue();
46212
46213 SDValue OpRHS = CmpLHS.getOperand(2);
46214 auto *OpRHSC = dyn_cast<ConstantSDNode>(OpRHS);
46215 if (!OpRHSC)
46216 return SDValue();
46217
46218 APInt Addend = OpRHSC->getAPIntValue();
46219 if (Opc == ISD::ATOMIC_LOAD_SUB)
46220 Addend = -Addend;
46221
46222 auto *CmpRHSC = dyn_cast<ConstantSDNode>(CmpRHS);
46223 if (!CmpRHSC)
46224 return SDValue();
46225
46226 APInt Comparison = CmpRHSC->getAPIntValue();
46227 APInt NegAddend = -Addend;
46228
46229 // See if we can adjust the CC to make the comparison match the negated
46230 // addend.
46231 if (Comparison != NegAddend) {
46232 APInt IncComparison = Comparison + 1;
46233 if (IncComparison == NegAddend) {
46234 if (CC == X86::COND_A && !Comparison.isMaxValue()) {
46235 Comparison = IncComparison;
46236 CC = X86::COND_AE;
46237 } else if (CC == X86::COND_LE && !Comparison.isMaxSignedValue()) {
46238 Comparison = IncComparison;
46239 CC = X86::COND_L;
46240 }
46241 }
46242 APInt DecComparison = Comparison - 1;
46243 if (DecComparison == NegAddend) {
46244 if (CC == X86::COND_AE && !Comparison.isMinValue()) {
46245 Comparison = DecComparison;
46246 CC = X86::COND_A;
46247 } else if (CC == X86::COND_L && !Comparison.isMinSignedValue()) {
46248 Comparison = DecComparison;
46249 CC = X86::COND_LE;
46250 }
46251 }
46252 }
46253
46254 // If the addend is the negation of the comparison value, then we can do
46255 // a full comparison by emitting the atomic arithmetic as a locked sub.
46256 if (Comparison == NegAddend) {
46257 // The CC is fine, but we need to rewrite the LHS of the comparison as an
46258 // atomic sub.
46259 auto *AN = cast<AtomicSDNode>(CmpLHS.getNode());
46260 auto AtomicSub = DAG.getAtomic(
46261 ISD::ATOMIC_LOAD_SUB, SDLoc(CmpLHS), CmpVT,
46262 /*Chain*/ CmpLHS.getOperand(0), /*LHS*/ CmpLHS.getOperand(1),
46263 /*RHS*/ DAG.getConstant(NegAddend, SDLoc(CmpRHS), CmpVT),
46264 AN->getMemOperand());
46265 auto LockOp = lowerAtomicArithWithLOCK(AtomicSub, DAG, Subtarget);
46266 DAG.ReplaceAllUsesOfValueWith(CmpLHS.getValue(0), DAG.getUNDEF(CmpVT));
46267 DAG.ReplaceAllUsesOfValueWith(CmpLHS.getValue(1), LockOp.getValue(1));
46268 return LockOp;
46269 }
46270
46271 // We can handle comparisons with zero in a number of cases by manipulating
46272 // the CC used.
46273 if (!Comparison.isZero())
46274 return SDValue();
46275
46276 if (CC == X86::COND_S && Addend == 1)
46277 CC = X86::COND_LE;
46278 else if (CC == X86::COND_NS && Addend == 1)
46279 CC = X86::COND_G;
46280 else if (CC == X86::COND_G && Addend == -1)
46281 CC = X86::COND_GE;
46282 else if (CC == X86::COND_LE && Addend == -1)
46283 CC = X86::COND_L;
46284 else
46285 return SDValue();
46286
46287 SDValue LockOp = lowerAtomicArithWithLOCK(CmpLHS, DAG, Subtarget);
46288 DAG.ReplaceAllUsesOfValueWith(CmpLHS.getValue(0), DAG.getUNDEF(CmpVT));
46289 DAG.ReplaceAllUsesOfValueWith(CmpLHS.getValue(1), LockOp.getValue(1));
46290 return LockOp;
46291}
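// Illustrative aside (not from the original file): for the COND_S / Addend == 1
// case, a source pattern like
//   if (atomic_fetch_add(&v, 1) < 0) ...
// tests the *old* value, and "old < 0" is exactly "old <= -1", which is what JLE
// reads from the flags of "lock addl $1, (v)" (ZF: old == -1, SF^OF: old < -1),
// so the separate CMP against zero can be dropped and the LOCKed add's EFLAGS reused.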
46292
46293// Check whether a boolean test is testing a boolean value generated by
46294// X86ISD::SETCC. If so, return the operand of that SETCC and proper condition
46295// code.
46296//
46297// Simplify the following patterns:
46298// (Op (CMP (SETCC Cond EFLAGS) 1) EQ) or
46299// (Op (CMP (SETCC Cond EFLAGS) 0) NEQ)
46300// to (Op EFLAGS Cond)
46301//
46302// (Op (CMP (SETCC Cond EFLAGS) 0) EQ) or
46303// (Op (CMP (SETCC Cond EFLAGS) 1) NEQ)
46304// to (Op EFLAGS !Cond)
46305//
46306// where Op could be BRCOND or CMOV.
46307//
46308static SDValue checkBoolTestSetCCCombine(SDValue Cmp, X86::CondCode &CC) {
46309 // This combine only operates on CMP-like nodes.
46310 if (!(Cmp.getOpcode() == X86ISD::CMP ||
46311 (Cmp.getOpcode() == X86ISD::SUB && !Cmp->hasAnyUseOfValue(0))))
46312 return SDValue();
46313
46314 // Quit if not used as a boolean value.
46315 if (CC != X86::COND_E && CC != X86::COND_NE)
46316 return SDValue();
46317
46318 // Check CMP operands. One of them should be 0 or 1 and the other should be
46319 // a SetCC or a value extended from it.
46320 SDValue Op1 = Cmp.getOperand(0);
46321 SDValue Op2 = Cmp.getOperand(1);
46322
46323 SDValue SetCC;
46324 const ConstantSDNode* C = nullptr;
46325 bool needOppositeCond = (CC == X86::COND_E);
46326 bool checkAgainstTrue = false; // Is it a comparison against 1?
46327
46328 if ((C = dyn_cast<ConstantSDNode>(Op1)))
46329 SetCC = Op2;
46330 else if ((C = dyn_cast<ConstantSDNode>(Op2)))
46331 SetCC = Op1;
46332 else // Quit if neither operand is a constant.
46333 return SDValue();
46334
46335 if (C->getZExtValue() == 1) {
46336 needOppositeCond = !needOppositeCond;
46337 checkAgainstTrue = true;
46338 } else if (C->getZExtValue() != 0)
46339 // Quit if the constant is neither 0 nor 1.
46340 return SDValue();
46341
46342 bool truncatedToBoolWithAnd = false;
46343 // Skip (zext $x), (trunc $x), or (and $x, 1) node.
46344 while (SetCC.getOpcode() == ISD::ZERO_EXTEND ||
46345 SetCC.getOpcode() == ISD::TRUNCATE ||
46346 SetCC.getOpcode() == ISD::AND) {
46347 if (SetCC.getOpcode() == ISD::AND) {
46348 int OpIdx = -1;
46349 if (isOneConstant(SetCC.getOperand(0)))
46350 OpIdx = 1;
46351 if (isOneConstant(SetCC.getOperand(1)))
46352 OpIdx = 0;
46353 if (OpIdx < 0)
46354 break;
46355 SetCC = SetCC.getOperand(OpIdx);
46356 truncatedToBoolWithAnd = true;
46357 } else
46358 SetCC = SetCC.getOperand(0);
46359 }
46360
46361 switch (SetCC.getOpcode()) {
46362 case X86ISD::SETCC_CARRY:
46363 // Since SETCC_CARRY gives output based on R = CF ? ~0 : 0, it's unsafe to
46364 // simplify it if the result of SETCC_CARRY is not canonicalized to 0 or 1,
46365 // i.e. it's a comparison against true but the result of SETCC_CARRY is not
46366 // truncated to i1 using 'and'.
46367 if (checkAgainstTrue && !truncatedToBoolWithAnd)
46368 break;
46369     assert(X86::CondCode(SetCC.getConstantOperandVal(0)) == X86::COND_B &&
46370            "Invalid use of SETCC_CARRY!");
46371 [[fallthrough]];
46372 case X86ISD::SETCC:
46373 // Set the condition code or opposite one if necessary.
46374 CC = X86::CondCode(SetCC.getConstantOperandVal(0));
46375 if (needOppositeCond)
46376 CC = X86::GetOppositeBranchCondition(CC);
46377 return SetCC.getOperand(1);
46378 case X86ISD::CMOV: {
46379 // Check whether false/true value has canonical one, i.e. 0 or 1.
46380 ConstantSDNode *FVal = dyn_cast<ConstantSDNode>(SetCC.getOperand(0));
46381 ConstantSDNode *TVal = dyn_cast<ConstantSDNode>(SetCC.getOperand(1));
46382 // Quit if true value is not a constant.
46383 if (!TVal)
46384 return SDValue();
46385 // Quit if false value is not a constant.
46386 if (!FVal) {
46387 SDValue Op = SetCC.getOperand(0);
46388 // Skip 'zext' or 'trunc' node.
46389 if (Op.getOpcode() == ISD::ZERO_EXTEND ||
46390 Op.getOpcode() == ISD::TRUNCATE)
46391 Op = Op.getOperand(0);
46392 // A special case for rdrand/rdseed, where 0 is set if false cond is
46393 // found.
46394 if ((Op.getOpcode() != X86ISD::RDRAND &&
46395 Op.getOpcode() != X86ISD::RDSEED) || Op.getResNo() != 0)
46396 return SDValue();
46397 }
46398 // Quit if false value is not the constant 0 or 1.
46399 bool FValIsFalse = true;
46400 if (FVal && FVal->getZExtValue() != 0) {
46401 if (FVal->getZExtValue() != 1)
46402 return SDValue();
46403 // If FVal is 1, opposite cond is needed.
46404 needOppositeCond = !needOppositeCond;
46405 FValIsFalse = false;
46406 }
46407 // Quit if TVal is not the constant opposite of FVal.
46408 if (FValIsFalse && TVal->getZExtValue() != 1)
46409 return SDValue();
46410 if (!FValIsFalse && TVal->getZExtValue() != 0)
46411 return SDValue();
46412 CC = X86::CondCode(SetCC.getConstantOperandVal(2));
46413 if (needOppositeCond)
46414 CC = X86::GetOppositeBranchCondition(CC);
46415 return SetCC.getOperand(3);
46416 }
46417 }
46418
46419 return SDValue();
46420}
46421
46422/// Check whether Cond is an AND/OR of SETCCs off of the same EFLAGS.
46423/// Match:
46424/// (X86or (X86setcc) (X86setcc))
46425/// (X86cmp (and (X86setcc) (X86setcc)), 0)
46426static bool checkBoolTestAndOrSetCCCombine(SDValue Cond, X86::CondCode &CC0,
46427 X86::CondCode &CC1, SDValue &Flags,
46428 bool &isAnd) {
46429 if (Cond->getOpcode() == X86ISD::CMP) {
46430 if (!isNullConstant(Cond->getOperand(1)))
46431 return false;
46432
46433 Cond = Cond->getOperand(0);
46434 }
46435
46436 isAnd = false;
46437
46438 SDValue SetCC0, SetCC1;
46439 switch (Cond->getOpcode()) {
46440 default: return false;
46441 case ISD::AND:
46442 case X86ISD::AND:
46443 isAnd = true;
46444 [[fallthrough]];
46445 case ISD::OR:
46446 case X86ISD::OR:
46447 SetCC0 = Cond->getOperand(0);
46448 SetCC1 = Cond->getOperand(1);
46449 break;
46450 };
46451
46452 // Make sure we have SETCC nodes, using the same flags value.
46453 if (SetCC0.getOpcode() != X86ISD::SETCC ||
46454 SetCC1.getOpcode() != X86ISD::SETCC ||
46455 SetCC0->getOperand(1) != SetCC1->getOperand(1))
46456 return false;
46457
46458 CC0 = (X86::CondCode)SetCC0->getConstantOperandVal(0);
46459 CC1 = (X86::CondCode)SetCC1->getConstantOperandVal(0);
46460 Flags = SetCC0->getOperand(1);
46461 return true;
46462}
46463
46464// When legalizing carry, we create carries via add X, -1
46465// If that comes from an actual carry, via setcc, we use the
46466// carry directly.
46467static SDValue combineCarryThroughADD(SDValue EFLAGS, SelectionDAG &DAG) {
46468 if (EFLAGS.getOpcode() == X86ISD::ADD) {
46469 if (isAllOnesConstant(EFLAGS.getOperand(1))) {
46470 bool FoundAndLSB = false;
46471 SDValue Carry = EFLAGS.getOperand(0);
46472 while (Carry.getOpcode() == ISD::TRUNCATE ||
46473 Carry.getOpcode() == ISD::ZERO_EXTEND ||
46474 (Carry.getOpcode() == ISD::AND &&
46475 isOneConstant(Carry.getOperand(1)))) {
46476 FoundAndLSB |= Carry.getOpcode() == ISD::AND;
46477 Carry = Carry.getOperand(0);
46478 }
46479 if (Carry.getOpcode() == X86ISD::SETCC ||
46480 Carry.getOpcode() == X86ISD::SETCC_CARRY) {
46481 // TODO: Merge this code with equivalent in combineAddOrSubToADCOrSBB?
46482 uint64_t CarryCC = Carry.getConstantOperandVal(0);
46483 SDValue CarryOp1 = Carry.getOperand(1);
46484 if (CarryCC == X86::COND_B)
46485 return CarryOp1;
46486 if (CarryCC == X86::COND_A) {
46487 // Try to convert COND_A into COND_B in an attempt to facilitate
46488 // materializing "setb reg".
46489 //
46490 // Do not flip "e > c", where "c" is a constant, because Cmp
46491 // instruction cannot take an immediate as its first operand.
46492 //
46493 if (CarryOp1.getOpcode() == X86ISD::SUB &&
46494 CarryOp1.getNode()->hasOneUse() &&
46495 CarryOp1.getValueType().isInteger() &&
46496 !isa<ConstantSDNode>(CarryOp1.getOperand(1))) {
46497 SDValue SubCommute =
46498 DAG.getNode(X86ISD::SUB, SDLoc(CarryOp1), CarryOp1->getVTList(),
46499 CarryOp1.getOperand(1), CarryOp1.getOperand(0));
46500 return SDValue(SubCommute.getNode(), CarryOp1.getResNo());
46501 }
46502 }
46503 // If this is a check of the z flag of an add with 1, switch to the
46504 // C flag.
46505 if (CarryCC == X86::COND_E &&
46506 CarryOp1.getOpcode() == X86ISD::ADD &&
46507 isOneConstant(CarryOp1.getOperand(1)))
46508 return CarryOp1;
46509 } else if (FoundAndLSB) {
46510 SDLoc DL(Carry);
46511 SDValue BitNo = DAG.getConstant(0, DL, Carry.getValueType());
46512 if (Carry.getOpcode() == ISD::SRL) {
46513 BitNo = Carry.getOperand(1);
46514 Carry = Carry.getOperand(0);
46515 }
46516 return getBT(Carry, BitNo, DL, DAG);
46517 }
46518 }
46519 }
46520
46521 return SDValue();
46522}
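// Illustrative aside (not from the original file): why "add X, -1" exposes a carry.
// Adding the all-ones value produces a carry-out exactly when X != 0, and when X is
// a setcc result (0 or 1) the carry after the ADD equals that boolean, so a later
// COND_B user can consume the original carry producer directly.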
46523
46524 /// If we are inverting a PTEST/TESTP operand, attempt to adjust the CC
46525/// to avoid the inversion.
46526static SDValue combinePTESTCC(SDValue EFLAGS, X86::CondCode &CC,
46527 SelectionDAG &DAG,
46528 const X86Subtarget &Subtarget) {
46529 // TODO: Handle X86ISD::KTEST/X86ISD::KORTEST.
46530 if (EFLAGS.getOpcode() != X86ISD::PTEST &&
46531 EFLAGS.getOpcode() != X86ISD::TESTP)
46532 return SDValue();
46533
46534 // PTEST/TESTP sets EFLAGS as:
46535 // TESTZ: ZF = (Op0 & Op1) == 0
46536 // TESTC: CF = (~Op0 & Op1) == 0
46537 // TESTNZC: ZF == 0 && CF == 0
46538 EVT VT = EFLAGS.getValueType();
46539 SDValue Op0 = EFLAGS.getOperand(0);
46540 SDValue Op1 = EFLAGS.getOperand(1);
46541 EVT OpVT = Op0.getValueType();
46542
46543 // TEST*(~X,Y) == TEST*(X,Y)
46544 if (SDValue NotOp0 = IsNOT(Op0, DAG)) {
46545 X86::CondCode InvCC;
46546 switch (CC) {
46547 case X86::COND_B:
46548 // testc -> testz.
46549 InvCC = X86::COND_E;
46550 break;
46551 case X86::COND_AE:
46552 // !testc -> !testz.
46553 InvCC = X86::COND_NE;
46554 break;
46555 case X86::COND_E:
46556 // testz -> testc.
46557 InvCC = X86::COND_B;
46558 break;
46559 case X86::COND_NE:
46560 // !testz -> !testc.
46561 InvCC = X86::COND_AE;
46562 break;
46563 case X86::COND_A:
46564 case X86::COND_BE:
46565 // testnzc -> testnzc (no change).
46566 InvCC = CC;
46567 break;
46568 default:
46569 InvCC = X86::COND_INVALID;
46570 break;
46571 }
46572
46573 if (InvCC != X86::COND_INVALID) {
46574 CC = InvCC;
46575 return DAG.getNode(EFLAGS.getOpcode(), SDLoc(EFLAGS), VT,
46576 DAG.getBitcast(OpVT, NotOp0), Op1);
46577 }
46578 }
46579
46580 if (CC == X86::COND_E || CC == X86::COND_NE) {
46581 // TESTZ(X,~Y) == TESTC(Y,X)
46582 if (SDValue NotOp1 = IsNOT(Op1, DAG)) {
46583 CC = (CC == X86::COND_E ? X86::COND_B : X86::COND_AE);
46584 return DAG.getNode(EFLAGS.getOpcode(), SDLoc(EFLAGS), VT,
46585 DAG.getBitcast(OpVT, NotOp1), Op0);
46586 }
46587
46588 if (Op0 == Op1) {
46589 SDValue BC = peekThroughBitcasts(Op0);
46590 EVT BCVT = BC.getValueType();
46591       assert(BCVT.isVector() && DAG.getTargetLoweringInfo().isTypeLegal(BCVT) &&
46592              "Unexpected vector type");
46593
46594 // TESTZ(AND(X,Y),AND(X,Y)) == TESTZ(X,Y)
46595 if (BC.getOpcode() == ISD::AND || BC.getOpcode() == X86ISD::FAND) {
46596 return DAG.getNode(EFLAGS.getOpcode(), SDLoc(EFLAGS), VT,
46597 DAG.getBitcast(OpVT, BC.getOperand(0)),
46598 DAG.getBitcast(OpVT, BC.getOperand(1)));
46599 }
46600
46601 // TESTZ(AND(~X,Y),AND(~X,Y)) == TESTC(X,Y)
46602 if (BC.getOpcode() == X86ISD::ANDNP || BC.getOpcode() == X86ISD::FANDN) {
46603 CC = (CC == X86::COND_E ? X86::COND_B : X86::COND_AE);
46604 return DAG.getNode(EFLAGS.getOpcode(), SDLoc(EFLAGS), VT,
46605 DAG.getBitcast(OpVT, BC.getOperand(0)),
46606 DAG.getBitcast(OpVT, BC.getOperand(1)));
46607 }
46608
46609 // If every element is an all-sign value, see if we can use MOVMSK to
46610 // more efficiently extract the sign bits and compare that.
46611 // TODO: Handle TESTC with comparison inversion.
46612 // TODO: Can we remove SimplifyMultipleUseDemandedBits and rely on
46613 // MOVMSK combines to make sure it's never worse than PTEST?
46614 unsigned EltBits = BCVT.getScalarSizeInBits();
46615 if (DAG.ComputeNumSignBits(BC) == EltBits) {
46616       assert(VT == MVT::i32 && "Expected i32 EFLAGS comparison result");
46617 APInt SignMask = APInt::getSignMask(EltBits);
46618 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
46619 if (SDValue Res =
46620 TLI.SimplifyMultipleUseDemandedBits(BC, SignMask, DAG)) {
46621 // For vXi16 cases we need to use pmovmskb and extract every other
46622 // sign bit.
46623 SDLoc DL(EFLAGS);
46624 if (EltBits == 16) {
46625 MVT MovmskVT = BCVT.is128BitVector() ? MVT::v16i8 : MVT::v32i8;
46626 Res = DAG.getBitcast(MovmskVT, Res);
46627 Res = getPMOVMSKB(DL, Res, DAG, Subtarget);
46628 Res = DAG.getNode(ISD::AND, DL, MVT::i32, Res,
46629 DAG.getConstant(0xAAAAAAAA, DL, MVT::i32));
46630 } else {
46631 Res = getPMOVMSKB(DL, Res, DAG, Subtarget);
46632 }
46633 return DAG.getNode(X86ISD::CMP, DL, MVT::i32, Res,
46634 DAG.getConstant(0, DL, MVT::i32));
46635 }
46636 }
46637 }
46638
46639 // TESTZ(-1,X) == TESTZ(X,X)
46640 if (ISD::isBuildVectorAllOnes(Op0.getNode()))
46641 return DAG.getNode(EFLAGS.getOpcode(), SDLoc(EFLAGS), VT, Op1, Op1);
46642
46643 // TESTZ(X,-1) == TESTZ(X,X)
46644 if (ISD::isBuildVectorAllOnes(Op1.getNode()))
46645 return DAG.getNode(EFLAGS.getOpcode(), SDLoc(EFLAGS), VT, Op0, Op0);
46646
46647 // TESTZ(OR(LO(X),HI(X)),OR(LO(Y),HI(Y))) -> TESTZ(X,Y)
46648 // TODO: Add COND_NE handling?
46649 if (CC == X86::COND_E && OpVT.is128BitVector() && Subtarget.hasAVX()) {
46650 SDValue Src0 = peekThroughBitcasts(Op0);
46651 SDValue Src1 = peekThroughBitcasts(Op1);
46652 if (Src0.getOpcode() == ISD::OR && Src1.getOpcode() == ISD::OR) {
46653 Src0 = getSplitVectorSrc(peekThroughBitcasts(Src0.getOperand(0)),
46654 peekThroughBitcasts(Src0.getOperand(1)), true);
46655 Src1 = getSplitVectorSrc(peekThroughBitcasts(Src1.getOperand(0)),
46656 peekThroughBitcasts(Src1.getOperand(1)), true);
46657 if (Src0 && Src1)
46658 return DAG.getNode(EFLAGS.getOpcode(), SDLoc(EFLAGS), VT,
46659 DAG.getBitcast(MVT::v4i64, Src0),
46660 DAG.getBitcast(MVT::v4i64, Src1));
46661 }
46662 }
46663 }
46664
46665 return SDValue();
46666}
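// Illustrative aside (not from the original file): PTEST defines
//   ZF = ((Op0 & Op1) == 0)  and  CF = ((~Op0 & Op1) == 0),
// so ptest(~X, Y) computes ZF = ((~X & Y) == 0), which is the CF of ptest(X, Y).
// E.g. X = 0b1100, Y = 0b0100: ptest(X, Y) has ZF=0, CF=1; ptest(~X, Y) has ZF=1.
// That is why an E/NE (ZF) test on the inverted operand becomes a B/AE (CF) test above.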
46667
46668// Attempt to simplify the MOVMSK input based on the comparison type.
46669static SDValue combineSetCCMOVMSK(SDValue EFLAGS, X86::CondCode &CC,
46670 SelectionDAG &DAG,
46671 const X86Subtarget &Subtarget) {
46672 // Handle eq/ne against zero (any_of).
46673 // Handle eq/ne against -1 (all_of).
46674 if (!(CC == X86::COND_E || CC == X86::COND_NE))
46675 return SDValue();
46676 if (EFLAGS.getValueType() != MVT::i32)
46677 return SDValue();
46678 unsigned CmpOpcode = EFLAGS.getOpcode();
46679 if (CmpOpcode != X86ISD::CMP && CmpOpcode != X86ISD::SUB)
46680 return SDValue();
46681 auto *CmpConstant = dyn_cast<ConstantSDNode>(EFLAGS.getOperand(1));
46682 if (!CmpConstant)
46683 return SDValue();
46684 const APInt &CmpVal = CmpConstant->getAPIntValue();
46685
46686 SDValue CmpOp = EFLAGS.getOperand(0);
46687 unsigned CmpBits = CmpOp.getValueSizeInBits();
46688   assert(CmpBits == CmpVal.getBitWidth() && "Value size mismatch");
46689
46690 // Peek through any truncate.
46691 if (CmpOp.getOpcode() == ISD::TRUNCATE)
46692 CmpOp = CmpOp.getOperand(0);
46693
46694 // Bail if we don't find a MOVMSK.
46695 if (CmpOp.getOpcode() != X86ISD::MOVMSK)
46696 return SDValue();
46697
46698 SDValue Vec = CmpOp.getOperand(0);
46699 MVT VecVT = Vec.getSimpleValueType();
46700   assert((VecVT.is128BitVector() || VecVT.is256BitVector()) &&
46701          "Unexpected MOVMSK operand");
46702 unsigned NumElts = VecVT.getVectorNumElements();
46703 unsigned NumEltBits = VecVT.getScalarSizeInBits();
46704
46705 bool IsAnyOf = CmpOpcode == X86ISD::CMP && CmpVal.isZero();
46706 bool IsAllOf = (CmpOpcode == X86ISD::SUB || CmpOpcode == X86ISD::CMP) &&
46707 NumElts <= CmpBits && CmpVal.isMask(NumElts);
46708 if (!IsAnyOf && !IsAllOf)
46709 return SDValue();
46710
46711 // TODO: Check more combining cases.
46712 // Here we check the number of uses of the cmp to decide whether to combine.
46713 // Currently only the "MOVMSK(CONCAT(..))" and "MOVMSK(PCMPEQ(..))"
46714 // combines are known to benefit from this one-use constraint.
46715 bool IsOneUse = CmpOp.getNode()->hasOneUse();
46716
46717 // See if we can peek through to a vector with a wider element type, if the
46718 // signbits extend down to all the sub-elements as well.
46719 // Calling MOVMSK with the wider type, avoiding the bitcast, helps expose
46720 // potential SimplifyDemandedBits/Elts cases.
46721 // If we looked through a truncate that discards bits, we can't do this
46722 // transform.
46723 // FIXME: We could do this transform for truncates that discarded bits by
46724 // inserting an AND mask between the new MOVMSK and the CMP.
46725 if (Vec.getOpcode() == ISD::BITCAST && NumElts <= CmpBits) {
46726 SDValue BC = peekThroughBitcasts(Vec);
46727 MVT BCVT = BC.getSimpleValueType();
46728 unsigned BCNumElts = BCVT.getVectorNumElements();
46729 unsigned BCNumEltBits = BCVT.getScalarSizeInBits();
46730 if ((BCNumEltBits == 32 || BCNumEltBits == 64) &&
46731 BCNumEltBits > NumEltBits &&
46732 DAG.ComputeNumSignBits(BC) > (BCNumEltBits - NumEltBits)) {
46733 SDLoc DL(EFLAGS);
46734 APInt CmpMask = APInt::getLowBitsSet(32, IsAnyOf ? 0 : BCNumElts);
46735 return DAG.getNode(X86ISD::CMP, DL, MVT::i32,
46736 DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, BC),
46737 DAG.getConstant(CmpMask, DL, MVT::i32));
46738 }
46739 }
46740
46741 // MOVMSK(CONCAT(X,Y)) == 0 -> MOVMSK(OR(X,Y)).
46742 // MOVMSK(CONCAT(X,Y)) != 0 -> MOVMSK(OR(X,Y)).
46743 // MOVMSK(CONCAT(X,Y)) == -1 -> MOVMSK(AND(X,Y)).
46744 // MOVMSK(CONCAT(X,Y)) != -1 -> MOVMSK(AND(X,Y)).
46745 if (VecVT.is256BitVector() && NumElts <= CmpBits && IsOneUse) {
46746 SmallVector<SDValue> Ops;
46747 if (collectConcatOps(peekThroughBitcasts(Vec).getNode(), Ops, DAG) &&
46748 Ops.size() == 2) {
46749 SDLoc DL(EFLAGS);
46750 EVT SubVT = Ops[0].getValueType().changeTypeToInteger();
46751 APInt CmpMask = APInt::getLowBitsSet(32, IsAnyOf ? 0 : NumElts / 2);
46752 SDValue V = DAG.getNode(IsAnyOf ? ISD::OR : ISD::AND, DL, SubVT,
46753 DAG.getBitcast(SubVT, Ops[0]),
46754 DAG.getBitcast(SubVT, Ops[1]));
46755 V = DAG.getBitcast(VecVT.getHalfNumVectorElementsVT(), V);
46756 return DAG.getNode(X86ISD::CMP, DL, MVT::i32,
46757 DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, V),
46758 DAG.getConstant(CmpMask, DL, MVT::i32));
46759 }
46760 }
46761
46762 // MOVMSK(PCMPEQ(X,0)) == -1 -> PTESTZ(X,X).
46763 // MOVMSK(PCMPEQ(X,0)) != -1 -> !PTESTZ(X,X).
46764 // MOVMSK(PCMPEQ(X,Y)) == -1 -> PTESTZ(SUB(X,Y),SUB(X,Y)).
46765 // MOVMSK(PCMPEQ(X,Y)) != -1 -> !PTESTZ(SUB(X,Y),SUB(X,Y)).
46766 if (IsAllOf && Subtarget.hasSSE41() && IsOneUse) {
46767 MVT TestVT = VecVT.is128BitVector() ? MVT::v2i64 : MVT::v4i64;
46768 SDValue BC = peekThroughBitcasts(Vec);
46769 // Ensure MOVMSK was testing every signbit of BC.
46770 if (BC.getValueType().getVectorNumElements() <= NumElts) {
46771 if (BC.getOpcode() == X86ISD::PCMPEQ) {
46772 SDValue V = DAG.getNode(ISD::SUB, SDLoc(BC), BC.getValueType(),
46773 BC.getOperand(0), BC.getOperand(1));
46774 V = DAG.getBitcast(TestVT, V);
46775 return DAG.getNode(X86ISD::PTEST, SDLoc(EFLAGS), MVT::i32, V, V);
46776 }
46777 // Check for 256-bit split vector cases.
46778 if (BC.getOpcode() == ISD::AND &&
46779 BC.getOperand(0).getOpcode() == X86ISD::PCMPEQ &&
46780 BC.getOperand(1).getOpcode() == X86ISD::PCMPEQ) {
46781 SDValue LHS = BC.getOperand(0);
46782 SDValue RHS = BC.getOperand(1);
46783 LHS = DAG.getNode(ISD::SUB, SDLoc(LHS), LHS.getValueType(),
46784 LHS.getOperand(0), LHS.getOperand(1));
46785 RHS = DAG.getNode(ISD::SUB, SDLoc(RHS), RHS.getValueType(),
46786 RHS.getOperand(0), RHS.getOperand(1));
46787 LHS = DAG.getBitcast(TestVT, LHS);
46788 RHS = DAG.getBitcast(TestVT, RHS);
46789 SDValue V = DAG.getNode(ISD::OR, SDLoc(EFLAGS), TestVT, LHS, RHS);
46790 return DAG.getNode(X86ISD::PTEST, SDLoc(EFLAGS), MVT::i32, V, V);
46791 }
46792 }
46793 }
46794
46795 // See if we can avoid a PACKSS by calling MOVMSK on the sources.
46796 // For vXi16 cases we can use a v2Xi8 PMOVMSKB. We must mask out
46797 // sign bits prior to the comparison with zero unless we know that
46798 // the vXi16 splats the sign bit down to the lower i8 half.
46799 // TODO: Handle all_of patterns.
46800 if (Vec.getOpcode() == X86ISD::PACKSS && VecVT == MVT::v16i8) {
46801 SDValue VecOp0 = Vec.getOperand(0);
46802 SDValue VecOp1 = Vec.getOperand(1);
46803 bool SignExt0 = DAG.ComputeNumSignBits(VecOp0) > 8;
46804 bool SignExt1 = DAG.ComputeNumSignBits(VecOp1) > 8;
46805 // PMOVMSKB(PACKSSBW(X, undef)) -> PMOVMSKB(BITCAST_v16i8(X)) & 0xAAAA.
46806 if (IsAnyOf && CmpBits == 8 && VecOp1.isUndef()) {
46807 SDLoc DL(EFLAGS);
46808 SDValue Result = DAG.getBitcast(MVT::v16i8, VecOp0);
46809 Result = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Result);
46810 Result = DAG.getZExtOrTrunc(Result, DL, MVT::i16);
46811 if (!SignExt0) {
46812 Result = DAG.getNode(ISD::AND, DL, MVT::i16, Result,
46813 DAG.getConstant(0xAAAA, DL, MVT::i16));
46814 }
46815 return DAG.getNode(X86ISD::CMP, DL, MVT::i32, Result,
46816 DAG.getConstant(0, DL, MVT::i16));
46817 }
46818 // PMOVMSKB(PACKSSBW(LO(X), HI(X)))
46819 // -> PMOVMSKB(BITCAST_v32i8(X)) & 0xAAAAAAAA.
46820 if (CmpBits >= 16 && Subtarget.hasInt256() &&
46821 (IsAnyOf || (SignExt0 && SignExt1))) {
46822 if (SDValue Src = getSplitVectorSrc(VecOp0, VecOp1, true)) {
46823 SDLoc DL(EFLAGS);
46824 SDValue Result = peekThroughBitcasts(Src);
46825 if (IsAllOf && Result.getOpcode() == X86ISD::PCMPEQ &&
46826 Result.getValueType().getVectorNumElements() <= NumElts) {
46827 SDValue V = DAG.getNode(ISD::SUB, DL, Result.getValueType(),
46828 Result.getOperand(0), Result.getOperand(1));
46829 V = DAG.getBitcast(MVT::v4i64, V);
46830 return DAG.getNode(X86ISD::PTEST, SDLoc(EFLAGS), MVT::i32, V, V);
46831 }
46832 Result = DAG.getBitcast(MVT::v32i8, Result);
46833 Result = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Result);
46834 unsigned CmpMask = IsAnyOf ? 0 : 0xFFFFFFFF;
46835 if (!SignExt0 || !SignExt1) {
46836           assert(IsAnyOf &&
46837                  "Only perform v16i16 signmasks for any_of patterns");
46838 Result = DAG.getNode(ISD::AND, DL, MVT::i32, Result,
46839 DAG.getConstant(0xAAAAAAAA, DL, MVT::i32));
46840 }
46841 return DAG.getNode(X86ISD::CMP, DL, MVT::i32, Result,
46842 DAG.getConstant(CmpMask, DL, MVT::i32));
46843 }
46844 }
46845 }
46846
46847 // MOVMSK(SHUFFLE(X,u)) -> MOVMSK(X) iff every element is referenced.
46848 SmallVector<int, 32> ShuffleMask;
46849 SmallVector<SDValue, 2> ShuffleInputs;
46850 if (NumElts <= CmpBits &&
46851 getTargetShuffleInputs(peekThroughBitcasts(Vec), ShuffleInputs,
46852 ShuffleMask, DAG) &&
46853 ShuffleInputs.size() == 1 && !isAnyZeroOrUndef(ShuffleMask) &&
46854 ShuffleInputs[0].getValueSizeInBits() == VecVT.getSizeInBits()) {
46855 unsigned NumShuffleElts = ShuffleMask.size();
46856 APInt DemandedElts = APInt::getZero(NumShuffleElts);
46857 for (int M : ShuffleMask) {
46858       assert(0 <= M && M < (int)NumShuffleElts && "Bad unary shuffle index");
46859 DemandedElts.setBit(M);
46860 }
46861 if (DemandedElts.isAllOnes()) {
46862 SDLoc DL(EFLAGS);
46863 SDValue Result = DAG.getBitcast(VecVT, ShuffleInputs[0]);
46864 Result = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Result);
46865 Result =
46866 DAG.getZExtOrTrunc(Result, DL, EFLAGS.getOperand(0).getValueType());
46867 return DAG.getNode(X86ISD::CMP, DL, MVT::i32, Result,
46868 EFLAGS.getOperand(1));
46869 }
46870 }
46871
46872 return SDValue();
46873}
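// Illustrative aside (not from the original file): the all_of cases above rely on
//   MOVMSK(PCMPEQ(X, Y)) == -1   <=>   X == Y in every lane
//                                <=>   (X - Y) is the zero vector
//                                <=>   PTEST(X - Y, X - Y) sets ZF,
// so a "compare-mask-then-compare-against-all-ones" sequence becomes one PTEST.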
46874
46875/// Optimize an EFLAGS definition used according to the condition code \p CC
46876/// into a simpler EFLAGS value, potentially returning a new \p CC and replacing
46877/// uses of chain values.
46878static SDValue combineSetCCEFLAGS(SDValue EFLAGS, X86::CondCode &CC,
46879 SelectionDAG &DAG,
46880 const X86Subtarget &Subtarget) {
46881 if (CC == X86::COND_B)
46882 if (SDValue Flags = combineCarryThroughADD(EFLAGS, DAG))
46883 return Flags;
46884
46885 if (SDValue R = checkBoolTestSetCCCombine(EFLAGS, CC))
46886 return R;
46887
46888 if (SDValue R = combinePTESTCC(EFLAGS, CC, DAG, Subtarget))
46889 return R;
46890
46891 if (SDValue R = combineSetCCMOVMSK(EFLAGS, CC, DAG, Subtarget))
46892 return R;
46893
46894 return combineSetCCAtomicArith(EFLAGS, CC, DAG, Subtarget);
46895}
46896
46897/// Optimize X86ISD::CMOV [LHS, RHS, CONDCODE (e.g. X86::COND_NE), CONDVAL]
46898static SDValue combineCMov(SDNode *N, SelectionDAG &DAG,
46899 TargetLowering::DAGCombinerInfo &DCI,
46900 const X86Subtarget &Subtarget) {
46901 SDLoc DL(N);
46902
46903 SDValue FalseOp = N->getOperand(0);
46904 SDValue TrueOp = N->getOperand(1);
46905 X86::CondCode CC = (X86::CondCode)N->getConstantOperandVal(2);
46906 SDValue Cond = N->getOperand(3);
46907
46908 // cmov X, X, ?, ? --> X
46909 if (TrueOp == FalseOp)
46910 return TrueOp;
46911
46912 // Try to simplify the EFLAGS and condition code operands.
46913 // We can't always do this as FCMOV only supports a subset of X86 cond.
46914 if (SDValue Flags = combineSetCCEFLAGS(Cond, CC, DAG, Subtarget)) {
46915 if (!(FalseOp.getValueType() == MVT::f80 ||
46916 (FalseOp.getValueType() == MVT::f64 && !Subtarget.hasSSE2()) ||
46917 (FalseOp.getValueType() == MVT::f32 && !Subtarget.hasSSE1())) ||
46918 !Subtarget.canUseCMOV() || hasFPCMov(CC)) {
46919 SDValue Ops[] = {FalseOp, TrueOp, DAG.getTargetConstant(CC, DL, MVT::i8),
46920 Flags};
46921 return DAG.getNode(X86ISD::CMOV, DL, N->getValueType(0), Ops);
46922 }
46923 }
46924
46925 // If this is a select between two integer constants, try to do some
46926 // optimizations. Note that the operands are ordered the opposite of SELECT
46927 // operands.
46928 if (ConstantSDNode *TrueC = dyn_cast<ConstantSDNode>(TrueOp)) {
46929 if (ConstantSDNode *FalseC = dyn_cast<ConstantSDNode>(FalseOp)) {
46930 // Canonicalize the TrueC/FalseC values so that TrueC (the true value) is
46931 // larger than FalseC (the false value).
46932 if (TrueC->getAPIntValue().ult(FalseC->getAPIntValue())) {
46933 CC = X86::GetOppositeBranchCondition(CC);
46934 std::swap(TrueC, FalseC);
46935 std::swap(TrueOp, FalseOp);
46936 }
46937
46938 // Optimize C ? 8 : 0 -> zext(setcc(C)) << 3. Likewise for any pow2/0.
46939 // This is efficient for any integer data type (including i8/i16) and
46940 // shift amount.
46941 if (FalseC->getAPIntValue() == 0 && TrueC->getAPIntValue().isPowerOf2()) {
46942 Cond = getSETCC(CC, Cond, DL, DAG);
46943
46944 // Zero extend the condition if needed.
46945 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, TrueC->getValueType(0), Cond);
46946
46947 unsigned ShAmt = TrueC->getAPIntValue().logBase2();
46948 Cond = DAG.getNode(ISD::SHL, DL, Cond.getValueType(), Cond,
46949 DAG.getConstant(ShAmt, DL, MVT::i8));
46950 return Cond;
46951 }
46952
46953 // Optimize Cond ? cst+1 : cst -> zext(setcc(C)+cst. This is efficient
46954 // for any integer data type, including i8/i16.
46955 if (FalseC->getAPIntValue()+1 == TrueC->getAPIntValue()) {
46956 Cond = getSETCC(CC, Cond, DL, DAG);
46957
46958 // Zero extend the condition if needed.
46959 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL,
46960 FalseC->getValueType(0), Cond);
46961 Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
46962 SDValue(FalseC, 0));
46963 return Cond;
46964 }
46965
46966 // Optimize cases that will turn into an LEA instruction. This requires
46967 // an i32 or i64 and an efficient multiplier (1, 2, 3, 4, 5, 8, 9).
46968 if (N->getValueType(0) == MVT::i32 || N->getValueType(0) == MVT::i64) {
46969 APInt Diff = TrueC->getAPIntValue() - FalseC->getAPIntValue();
46970       assert(Diff.getBitWidth() == N->getValueType(0).getSizeInBits() &&
46971              "Implicit constant truncation");
46972
46973 bool isFastMultiplier = false;
46974 if (Diff.ult(10)) {
46975 switch (Diff.getZExtValue()) {
46976 default: break;
46977 case 1: // result = add base, cond
46978 case 2: // result = lea base( , cond*2)
46979 case 3: // result = lea base(cond, cond*2)
46980 case 4: // result = lea base( , cond*4)
46981 case 5: // result = lea base(cond, cond*4)
46982 case 8: // result = lea base( , cond*8)
46983 case 9: // result = lea base(cond, cond*8)
46984 isFastMultiplier = true;
46985 break;
46986 }
46987 }
46988
46989 if (isFastMultiplier) {
46990 Cond = getSETCC(CC, Cond, DL, DAG);
46991 // Zero extend the condition if needed.
46992 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, FalseC->getValueType(0),
46993 Cond);
46994 // Scale the condition by the difference.
46995 if (Diff != 1)
46996 Cond = DAG.getNode(ISD::MUL, DL, Cond.getValueType(), Cond,
46997 DAG.getConstant(Diff, DL, Cond.getValueType()));
46998
46999 // Add the base if non-zero.
47000 if (FalseC->getAPIntValue() != 0)
47001 Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
47002 SDValue(FalseC, 0));
47003 return Cond;
47004 }
47005 }
47006 }
47007 }
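// Illustrative aside (not from the original file): for the LEA path above, e.g.
// "cond ? 5 : 2" has Diff = 3, so the cmov can become
//   setcc + zext            ; cond in {0,1}
//   lea (%rax,%rax,2), %rax ; 3*cond
//   add $2, %rax            ; plus the false value
// giving 2 when cond == 0 and 5 when cond == 1, with no conditional move.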
47008
47009 // Handle these cases:
47010 // (select (x != c), e, c) -> (select (x != c), e, x)
47011 // (select (x == c), c, e) -> (select (x == c), x, e)
47012 // where the c is an integer constant, and the "select" is the combination
47013 // of CMOV and CMP.
47014 //
47015 // The rationale for this change is that the conditional-move from a constant
47016 // needs two instructions, however, conditional-move from a register needs
47017 // only one instruction.
47018 //
47019 // CAVEAT: By replacing a constant with a symbolic value, it may obscure
47020 // some instruction-combining opportunities. This opt needs to be
47021 // postponed as late as possible.
47022 //
47023 if (!DCI.isBeforeLegalize() && !DCI.isBeforeLegalizeOps()) {
47024 // the DCI.xxxx conditions are provided to postpone the optimization as
47025 // late as possible.
47026
47027 ConstantSDNode *CmpAgainst = nullptr;
47028 if ((Cond.getOpcode() == X86ISD::CMP || Cond.getOpcode() == X86ISD::SUB) &&
47029 (CmpAgainst = dyn_cast<ConstantSDNode>(Cond.getOperand(1))) &&
47030 !isa<ConstantSDNode>(Cond.getOperand(0))) {
47031
47032 if (CC == X86::COND_NE &&
47033 CmpAgainst == dyn_cast<ConstantSDNode>(FalseOp)) {
47034 CC = X86::GetOppositeBranchCondition(CC);
47035 std::swap(TrueOp, FalseOp);
47036 }
47037
47038 if (CC == X86::COND_E &&
47039 CmpAgainst == dyn_cast<ConstantSDNode>(TrueOp)) {
47040 SDValue Ops[] = {FalseOp, Cond.getOperand(0),
47041 DAG.getTargetConstant(CC, DL, MVT::i8), Cond};
47042 return DAG.getNode(X86ISD::CMOV, DL, N->getValueType(0), Ops);
47043 }
47044 }
47045 }
47046
47047 // Fold and/or of setcc's to double CMOV:
47048 // (CMOV F, T, ((cc1 | cc2) != 0)) -> (CMOV (CMOV F, T, cc1), T, cc2)
47049 // (CMOV F, T, ((cc1 & cc2) != 0)) -> (CMOV (CMOV T, F, !cc1), F, !cc2)
47050 //
47051 // This combine lets us generate:
47052 // cmovcc1 (jcc1 if we don't have CMOV)
47053 // cmovcc2 (same)
47054 // instead of:
47055 // setcc1
47056 // setcc2
47057 // and/or
47058 // cmovne (jne if we don't have CMOV)
47059 // When we can't use the CMOV instruction, it might increase branch
47060 // mispredicts.
47061 // When we can use CMOV, or when there is no mispredict, this improves
47062 // throughput and reduces register pressure.
47063 //
47064 if (CC == X86::COND_NE) {
47065 SDValue Flags;
47066 X86::CondCode CC0, CC1;
47067 bool isAndSetCC;
47068 if (checkBoolTestAndOrSetCCCombine(Cond, CC0, CC1, Flags, isAndSetCC)) {
47069 if (isAndSetCC) {
47070 std::swap(FalseOp, TrueOp);
47071 CC0 = X86::GetOppositeBranchCondition(CC0);
47072 CC1 = X86::GetOppositeBranchCondition(CC1);
47073 }
47074
47075 SDValue LOps[] = {FalseOp, TrueOp,
47076 DAG.getTargetConstant(CC0, DL, MVT::i8), Flags};
47077 SDValue LCMOV = DAG.getNode(X86ISD::CMOV, DL, N->getValueType(0), LOps);
47078 SDValue Ops[] = {LCMOV, TrueOp, DAG.getTargetConstant(CC1, DL, MVT::i8),
47079 Flags};
47080 SDValue CMOV = DAG.getNode(X86ISD::CMOV, DL, N->getValueType(0), Ops);
47081 return CMOV;
47082 }
47083 }
47084
47085 // Fold (CMOV C1, (ADD (CTTZ X), C2), (X != 0)) ->
47086 // (ADD (CMOV C1-C2, (CTTZ X), (X != 0)), C2)
47087 // Or (CMOV (ADD (CTTZ X), C2), C1, (X == 0)) ->
47088 // (ADD (CMOV (CTTZ X), C1-C2, (X == 0)), C2)
47089 if ((CC == X86::COND_NE || CC == X86::COND_E) &&
47090 Cond.getOpcode() == X86ISD::CMP && isNullConstant(Cond.getOperand(1))) {
47091 SDValue Add = TrueOp;
47092 SDValue Const = FalseOp;
47093 // Canonicalize the condition code for easier matching and output.
47094 if (CC == X86::COND_E)
47095 std::swap(Add, Const);
47096
47097 // We might have replaced the constant in the cmov with the LHS of the
47098 // compare. If so change it to the RHS of the compare.
47099 if (Const == Cond.getOperand(0))
47100 Const = Cond.getOperand(1);
47101
47102 // Ok, now make sure that Add is (add (cttz X), C2) and Const is a constant.
47103 if (isa<ConstantSDNode>(Const) && Add.getOpcode() == ISD::ADD &&
47104 Add.hasOneUse() && isa<ConstantSDNode>(Add.getOperand(1)) &&
47105 (Add.getOperand(0).getOpcode() == ISD::CTTZ_ZERO_UNDEF ||
47106 Add.getOperand(0).getOpcode() == ISD::CTTZ) &&
47107 Add.getOperand(0).getOperand(0) == Cond.getOperand(0)) {
47108 EVT VT = N->getValueType(0);
47109 // This should constant fold.
47110 SDValue Diff = DAG.getNode(ISD::SUB, DL, VT, Const, Add.getOperand(1));
47111 SDValue CMov =
47112 DAG.getNode(X86ISD::CMOV, DL, VT, Diff, Add.getOperand(0),
47113 DAG.getTargetConstant(X86::COND_NE, DL, MVT::i8), Cond);
47114 return DAG.getNode(ISD::ADD, DL, VT, CMov, Add.getOperand(1));
47115 }
47116 }
47117
47118 return SDValue();
47119}
47120
47121/// Different mul shrinking modes.
47122enum class ShrinkMode { MULS8, MULU8, MULS16, MULU16 };
47123
47124static bool canReduceVMulWidth(SDNode *N, SelectionDAG &DAG, ShrinkMode &Mode) {
47125 EVT VT = N->getOperand(0).getValueType();
47126 if (VT.getScalarSizeInBits() != 32)
47127 return false;
47128
47129   assert(N->getNumOperands() == 2 && "NumOperands of Mul are 2");
47130 unsigned SignBits[2] = {1, 1};
47131 bool IsPositive[2] = {false, false};
47132 for (unsigned i = 0; i < 2; i++) {
47133 SDValue Opd = N->getOperand(i);
47134
47135 SignBits[i] = DAG.ComputeNumSignBits(Opd);
47136 IsPositive[i] = DAG.SignBitIsZero(Opd);
47137 }
47138
47139 bool AllPositive = IsPositive[0] && IsPositive[1];
47140 unsigned MinSignBits = std::min(SignBits[0], SignBits[1]);
47141 // When ranges are from -128 ~ 127, use MULS8 mode.
47142 if (MinSignBits >= 25)
47143 Mode = ShrinkMode::MULS8;
47144 // When ranges are from 0 ~ 255, use MULU8 mode.
47145 else if (AllPositive && MinSignBits >= 24)
47146 Mode = ShrinkMode::MULU8;
47147 // When ranges are from -32768 ~ 32767, use MULS16 mode.
47148 else if (MinSignBits >= 17)
47149 Mode = ShrinkMode::MULS16;
47150 // When ranges are from 0 ~ 65535, use MULU16 mode.
47151 else if (AllPositive && MinSignBits >= 16)
47152 Mode = ShrinkMode::MULU16;
47153 else
47154 return false;
47155 return true;
47156}
47157
47158/// When the operands of vector mul are extended from smaller size values,
47159 /// like i8 and i16, the type of mul may be shrunk to generate more
47160/// efficient code. Two typical patterns are handled:
47161/// Pattern1:
47162/// %2 = sext/zext <N x i8> %1 to <N x i32>
47163/// %4 = sext/zext <N x i8> %3 to <N x i32>
47164 /// or %4 = build_vector <N x i32> %C1, ..., %CN (%C1..%CN are constants)
47165/// %5 = mul <N x i32> %2, %4
47166///
47167/// Pattern2:
47168/// %2 = zext/sext <N x i16> %1 to <N x i32>
47169/// %4 = zext/sext <N x i16> %3 to <N x i32>
47170/// or %4 = build_vector <N x i32> %C1, ..., %CN (%C1..%CN are constants)
47171/// %5 = mul <N x i32> %2, %4
47172///
47173/// There are four mul shrinking modes:
47174/// If %2 == sext32(trunc8(%2)), i.e., the scalar value range of %2 is
47175 /// -128 to 127, and the scalar value range of %4 is also -128 to 127,
47176/// generate pmullw+sext32 for it (MULS8 mode).
47177/// If %2 == zext32(trunc8(%2)), i.e., the scalar value range of %2 is
47178/// 0 to 255, and the scalar value range of %4 is also 0 to 255,
47179/// generate pmullw+zext32 for it (MULU8 mode).
47180/// If %2 == sext32(trunc16(%2)), i.e., the scalar value range of %2 is
47181/// -32768 to 32767, and the scalar value range of %4 is also -32768 to 32767,
47182/// generate pmullw+pmulhw for it (MULS16 mode).
47183/// If %2 == zext32(trunc16(%2)), i.e., the scalar value range of %2 is
47184/// 0 to 65535, and the scalar value range of %4 is also 0 to 65535,
47185/// generate pmullw+pmulhuw for it (MULU16 mode).
47186static SDValue reduceVMULWidth(SDNode *N, SelectionDAG &DAG,
47187 const X86Subtarget &Subtarget) {
47188 // Check for legality
47189 // pmullw/pmulhw require SSE2; they are not available with plain SSE.
47190 if (!Subtarget.hasSSE2())
47191 return SDValue();
47192
47193 // Check for profitability
47194 // pmulld is supported since SSE41. It is better to use pmulld
47195 // instead of pmullw+pmulhw, except for subtargets where pmulld is slower than
47196 // the expansion.
47197 bool OptForMinSize = DAG.getMachineFunction().getFunction().hasMinSize();
47198 if (Subtarget.hasSSE41() && (OptForMinSize || !Subtarget.isPMULLDSlow()))
47199 return SDValue();
47200
47201 ShrinkMode Mode;
47202 if (!canReduceVMulWidth(N, DAG, Mode))
47203 return SDValue();
47204
47205 SDLoc DL(N);
47206 SDValue N0 = N->getOperand(0);
47207 SDValue N1 = N->getOperand(1);
47208 EVT VT = N->getOperand(0).getValueType();
47209 unsigned NumElts = VT.getVectorNumElements();
47210 if ((NumElts % 2) != 0)
47211 return SDValue();
47212
47213 EVT ReducedVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16, NumElts);
47214
47215 // Shrink the operands of mul.
47216 SDValue NewN0 = DAG.getNode(ISD::TRUNCATE, DL, ReducedVT, N0);
47217 SDValue NewN1 = DAG.getNode(ISD::TRUNCATE, DL, ReducedVT, N1);
47218
47219 // Generate the lower part of mul: pmullw. For MULU8/MULS8, only the
47220 // lower part is needed.
47221 SDValue MulLo = DAG.getNode(ISD::MUL, DL, ReducedVT, NewN0, NewN1);
47222 if (Mode == ShrinkMode::MULU8 || Mode == ShrinkMode::MULS8)
47223 return DAG.getNode((Mode == ShrinkMode::MULU8) ? ISD::ZERO_EXTEND
47224 : ISD::SIGN_EXTEND,
47225 DL, VT, MulLo);
47226
47227 EVT ResVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32, NumElts / 2);
47228 // Generate the higher part of mul: pmulhw/pmulhuw. For MULU16/MULS16,
47229 // the higher part is also needed.
47230 SDValue MulHi =
47231 DAG.getNode(Mode == ShrinkMode::MULS16 ? ISD::MULHS : ISD::MULHU, DL,
47232 ReducedVT, NewN0, NewN1);
47233
47234 // Repack the lower part and higher part result of mul into a wider
47235 // result.
47236 // Generate shuffle functioning as punpcklwd.
47237 SmallVector<int, 16> ShuffleMask(NumElts);
47238 for (unsigned i = 0, e = NumElts / 2; i < e; i++) {
47239 ShuffleMask[2 * i] = i;
47240 ShuffleMask[2 * i + 1] = i + NumElts;
47241 }
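  // For example, with NumElts == 8 this builds the interleave mask
  // <0,8,1,9,2,10,3,11>, pairing each low 16-bit product with its high half so
  // that the bitcast to v4i32 reassembles the full 32-bit products.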
47242 SDValue ResLo =
47243 DAG.getVectorShuffle(ReducedVT, DL, MulLo, MulHi, ShuffleMask);
47244 ResLo = DAG.getBitcast(ResVT, ResLo);
47245 // Generate shuffle functioning as punpckhwd.
47246 for (unsigned i = 0, e = NumElts / 2; i < e; i++) {
47247 ShuffleMask[2 * i] = i + NumElts / 2;
47248 ShuffleMask[2 * i + 1] = i + NumElts * 3 / 2;
47249 }
47250 SDValue ResHi =
47251 DAG.getVectorShuffle(ReducedVT, DL, MulLo, MulHi, ShuffleMask);
47252 ResHi = DAG.getBitcast(ResVT, ResHi);
47253 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, ResLo, ResHi);
47254}
47255
47256static SDValue combineMulSpecial(uint64_t MulAmt, SDNode *N, SelectionDAG &DAG,
47257 EVT VT, const SDLoc &DL) {
47258
47259 auto combineMulShlAddOrSub = [&](int Mult, int Shift, bool isAdd) {
47260 SDValue Result = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0),
47261 DAG.getConstant(Mult, DL, VT));
47262 Result = DAG.getNode(ISD::SHL, DL, VT, Result,
47263 DAG.getConstant(Shift, DL, MVT::i8));
47264 Result = DAG.getNode(isAdd ? ISD::ADD : ISD::SUB, DL, VT, Result,
47265 N->getOperand(0));
47266 return Result;
47267 };
47268
47269 auto combineMulMulAddOrSub = [&](int Mul1, int Mul2, bool isAdd) {
47270 SDValue Result = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0),
47271 DAG.getConstant(Mul1, DL, VT));
47272 Result = DAG.getNode(X86ISD::MUL_IMM, DL, VT, Result,
47273 DAG.getConstant(Mul2, DL, VT));
47274 Result = DAG.getNode(isAdd ? ISD::ADD : ISD::SUB, DL, VT, Result,
47275 N->getOperand(0));
47276 return Result;
47277 };
47278
47279 switch (MulAmt) {
47280 default:
47281 break;
47282 case 11:
47283 // mul x, 11 => add ((shl (mul x, 5), 1), x)
47284 return combineMulShlAddOrSub(5, 1, /*isAdd*/ true);
47285 case 21:
47286 // mul x, 21 => add ((shl (mul x, 5), 2), x)
47287 return combineMulShlAddOrSub(5, 2, /*isAdd*/ true);
47288 case 41:
47289 // mul x, 41 => add ((shl (mul x, 5), 3), x)
47290 return combineMulShlAddOrSub(5, 3, /*isAdd*/ true);
47291 case 22:
47292 // mul x, 22 => add (add ((shl (mul x, 5), 2), x), x)
47293 return DAG.getNode(ISD::ADD, DL, VT, N->getOperand(0),
47294 combineMulShlAddOrSub(5, 2, /*isAdd*/ true));
47295 case 19:
47296 // mul x, 19 => add ((shl (mul x, 9), 1), x)
47297 return combineMulShlAddOrSub(9, 1, /*isAdd*/ true);
47298 case 37:
47299 // mul x, 37 => add ((shl (mul x, 9), 2), x)
47300 return combineMulShlAddOrSub(9, 2, /*isAdd*/ true);
47301 case 73:
47302 // mul x, 73 => add ((shl (mul x, 9), 3), x)
47303 return combineMulShlAddOrSub(9, 3, /*isAdd*/ true);
47304 case 13:
47305 // mul x, 13 => add ((shl (mul x, 3), 2), x)
47306 return combineMulShlAddOrSub(3, 2, /*isAdd*/ true);
47307 case 23:
47308 // mul x, 23 => sub ((shl (mul x, 3), 3), x)
47309 return combineMulShlAddOrSub(3, 3, /*isAdd*/ false);
47310 case 26:
47311 // mul x, 26 => add ((mul (mul x, 5), 5), x)
47312 return combineMulMulAddOrSub(5, 5, /*isAdd*/ true);
47313 case 28:
47314 // mul x, 28 => add ((mul (mul x, 9), 3), x)
47315 return combineMulMulAddOrSub(9, 3, /*isAdd*/ true);
47316 case 29:
47317 // mul x, 29 => add (add ((mul (mul x, 9), 3), x), x)
47318 return DAG.getNode(ISD::ADD, DL, VT, N->getOperand(0),
47319 combineMulMulAddOrSub(9, 3, /*isAdd*/ true));
47320 }
47321
47322  // Another trick. If this is a power of 2 plus 2/4/8, we can use a shift
47323  // followed by a single LEA.
47324  // First check if this is a sum of two powers of 2 because that's easy.
47325  // Then count the trailing zeros to get the smaller scale shift.
47326 // TODO: We can do this even without LEA at a cost of two shifts and an add.
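  // For example (illustrative value), MulAmt == 34 == 32 + 2: ShiftAmt == 5 and
  // ScaleShift == 1, so we emit (add (shl x, 5), (shl x, 1)), where the second
  // shift and the add can be folded into a single scaled-index LEA.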
47327 if (isPowerOf2_64(MulAmt & (MulAmt - 1))) {
47328 unsigned ScaleShift = countTrailingZeros(MulAmt);
47329 if (ScaleShift >= 1 && ScaleShift < 4) {
47330 unsigned ShiftAmt = Log2_64((MulAmt & (MulAmt - 1)));
47331 SDValue Shift1 = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
47332 DAG.getConstant(ShiftAmt, DL, MVT::i8));
47333 SDValue Shift2 = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
47334 DAG.getConstant(ScaleShift, DL, MVT::i8));
47335 return DAG.getNode(ISD::ADD, DL, VT, Shift1, Shift2);
47336 }
47337 }
47338
47339 return SDValue();
47340}
47341
47342// If the upper 17 bits of either element are zero and the upper 17 bits of
47343// the other element are all zero or sign bits, then we can use PMADDWD, which
47344// is always at least as quick as PMULLD, except on KNL.
47345static SDValue combineMulToPMADDWD(SDNode *N, SelectionDAG &DAG,
47346 const X86Subtarget &Subtarget) {
47347 if (!Subtarget.hasSSE2())
47348 return SDValue();
47349
47350 if (Subtarget.isPMADDWDSlow())
47351 return SDValue();
47352
47353 EVT VT = N->getValueType(0);
47354
47355 // Only support vXi32 vectors.
47356 if (!VT.isVector() || VT.getVectorElementType() != MVT::i32)
47357 return SDValue();
47358
47359 // Make sure the type is legal or can split/widen to a legal type.
47361 unsigned NumElts = VT.getVectorNumElements();
47362 if (NumElts == 1 || !isPowerOf2_32(NumElts))
47363 return SDValue();
47364
47365 // With AVX512 but without BWI, we would need to split v32i16.
47366 if (32 <= (2 * NumElts) && Subtarget.hasAVX512() && !Subtarget.hasBWI())
47367 return SDValue();
47368
47369 SDValue N0 = N->getOperand(0);
47370 SDValue N1 = N->getOperand(1);
47371
47372  // If we are zero/sign extending in two steps without SSE4.1, it's better to
47373 // reduce the vmul width instead.
47374 if (!Subtarget.hasSSE41() &&
47375 (((N0.getOpcode() == ISD::ZERO_EXTEND &&
47376 N0.getOperand(0).getScalarValueSizeInBits() <= 8) &&
47377 (N1.getOpcode() == ISD::ZERO_EXTEND &&
47378 N1.getOperand(0).getScalarValueSizeInBits() <= 8)) ||
47379 ((N0.getOpcode() == ISD::SIGN_EXTEND &&
47380 N0.getOperand(0).getScalarValueSizeInBits() <= 8) &&
47381 (N1.getOpcode() == ISD::SIGN_EXTEND &&
47382 N1.getOperand(0).getScalarValueSizeInBits() <= 8))))
47383 return SDValue();
47384
47385  // If we are sign extending a wide vector without SSE4.1, it's better to reduce
47386 // the vmul width instead.
47387 if (!Subtarget.hasSSE41() &&
47388 (N0.getOpcode() == ISD::SIGN_EXTEND &&
47389 N0.getOperand(0).getValueSizeInBits() > 128) &&
47390 (N1.getOpcode() == ISD::SIGN_EXTEND &&
47391 N1.getOperand(0).getValueSizeInBits() > 128))
47392 return SDValue();
47393
47394 // Sign bits must extend down to the lowest i16.
47395 if (DAG.ComputeMaxSignificantBits(N1) > 16 ||
47396 DAG.ComputeMaxSignificantBits(N0) > 16)
47397 return SDValue();
47398
47399 // At least one of the elements must be zero in the upper 17 bits, or can be
47400 // safely made zero without altering the final result.
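  // For example, a sign-extended vXi16 operand can be rewritten as a zero
  // extend: PMADDWD only reads the low 16 bits of each 32-bit lane as a signed
  // value, and the zeroed upper 16-bit lanes then contribute nothing to the sum.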
47401 auto GetZeroableOp = [&](SDValue Op) {
47402 APInt Mask17 = APInt::getHighBitsSet(32, 17);
47403 if (DAG.MaskedValueIsZero(Op, Mask17))
47404 return Op;
47405 // Mask off upper 16-bits of sign-extended constants.
47406 if (ISD::isBuildVectorOfConstantSDNodes(Op.getNode()))
47407 return DAG.getNode(ISD::AND, SDLoc(N), VT, Op,
47408 DAG.getConstant(0xFFFF, SDLoc(N), VT));
47409 if (Op.getOpcode() == ISD::SIGN_EXTEND && N->isOnlyUserOf(Op.getNode())) {
47410 SDValue Src = Op.getOperand(0);
47411 // Convert sext(vXi16) to zext(vXi16).
47412 if (Src.getScalarValueSizeInBits() == 16 && VT.getSizeInBits() <= 128)
47413 return DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), VT, Src);
47414 // Convert sext(vXi8) to zext(vXi16 sext(vXi8)) on pre-SSE41 targets
47415 // which will expand the extension.
47416 if (Src.getScalarValueSizeInBits() < 16 && !Subtarget.hasSSE41()) {
47417 EVT ExtVT = VT.changeVectorElementType(MVT::i16);
47418 Src = DAG.getNode(ISD::SIGN_EXTEND, SDLoc(N), ExtVT, Src);
47419 return DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), VT, Src);
47420 }
47421 }
47422    // Convert SIGN_EXTEND_VECTOR_INREG to ZERO_EXTEND_VECTOR_INREG.
47423 if (Op.getOpcode() == ISD::SIGN_EXTEND_VECTOR_INREG &&
47424 N->isOnlyUserOf(Op.getNode())) {
47425 SDValue Src = Op.getOperand(0);
47426 if (Src.getScalarValueSizeInBits() == 16)
47427 return DAG.getNode(ISD::ZERO_EXTEND_VECTOR_INREG, SDLoc(N), VT, Src);
47428 }
47429 // Convert VSRAI(Op, 16) to VSRLI(Op, 16).
47430 if (Op.getOpcode() == X86ISD::VSRAI && Op.getConstantOperandVal(1) == 16 &&
47431 N->isOnlyUserOf(Op.getNode())) {
47432 return DAG.getNode(X86ISD::VSRLI, SDLoc(N), VT, Op.getOperand(0),
47433 Op.getOperand(1));
47434 }
47435 return SDValue();
47436 };
47437 SDValue ZeroN0 = GetZeroableOp(N0);
47438 SDValue ZeroN1 = GetZeroableOp(N1);
47439 if (!ZeroN0 && !ZeroN1)
47440 return SDValue();
47441 N0 = ZeroN0 ? ZeroN0 : N0;
47442 N1 = ZeroN1 ? ZeroN1 : N1;
47443
47444 // Use SplitOpsAndApply to handle AVX splitting.
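  // e.g. a 256-bit v8i32 multiply on AVX1, where 256-bit integer ops are not
  // available, is applied as two 128-bit VPMADDWDs on the split halves.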
47445 auto PMADDWDBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
47446 ArrayRef<SDValue> Ops) {
47447 MVT ResVT = MVT::getVectorVT(MVT::i32, Ops[0].getValueSizeInBits() / 32);
47448 MVT OpVT = MVT::getVectorVT(MVT::i16, Ops[0].getValueSizeInBits() / 16);
47449 return DAG.getNode(X86ISD::VPMADDWD, DL, ResVT,
47450 DAG.getBitcast(OpVT, Ops[0]),
47451 DAG.getBitcast(OpVT, Ops[1]));
47452 };
47453 return SplitOpsAndApply(DAG, Subtarget, SDLoc(N), VT, {N0, N1},
47454 PMADDWDBuilder);
47455}
47456
47457static SDValue combineMulToPMULDQ(SDNode *N, SelectionDAG &DAG,
47458 const X86Subtarget &Subtarget) {
47459 if (!Subtarget.hasSSE2())
47460 return SDValue();
47461
47462 EVT VT = N->getValueType(0);
47463
47464 // Only support vXi64 vectors.
47465 if (!VT.isVector() || VT.getVectorElementType() != MVT::i64 ||
47466 VT.getVectorNumElements() < 2 ||
47467 !isPowerOf2_32(VT.getVectorNumElements()))
47468 return SDValue();
47469
47470 SDValue N0 = N->getOperand(0);
47471 SDValue N1 = N->getOperand(1);
47472
47473  // PMULDQ returns the 64-bit result of the signed multiplication of the lower
47474  // 32-bits. We can lower with this if the sign bits stretch that far.
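  // For example, (mul (sext v2i32 X to v2i64), (sext v2i32 Y to v2i64)) has
  // more than 32 sign bits in each operand, so PMULDQ on the 32-bit halves
  // produces the same 64-bit products.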
47475 if (Subtarget.hasSSE41() && DAG.ComputeNumSignBits(N0) > 32 &&
47476 DAG.ComputeNumSignBits(N1) > 32) {
47477 auto PMULDQBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
47478 ArrayRef<SDValue> Ops) {
47479 return DAG.getNode(X86ISD::PMULDQ, DL, Ops[0].getValueType(), Ops);
47480 };
47481 return SplitOpsAndApply(DAG, Subtarget, SDLoc(N), VT, { N0, N1 },
47482 PMULDQBuilder, /*CheckBWI*/false);
47483 }
47484
47485 // If the upper bits are zero we can use a single pmuludq.
47486 APInt Mask = APInt::getHighBitsSet(64, 32);
47487 if (DAG.MaskedValueIsZero(N0, Mask) && DAG.MaskedValueIsZero(N1, Mask)) {
47488 auto PMULUDQBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
47489 ArrayRef<SDValue> Ops) {
47490 return DAG.getNode(X86ISD::PMULUDQ, DL, Ops[0].getValueType(), Ops);
47491 };
47492 return SplitOpsAndApply(DAG, Subtarget, SDLoc(N), VT, { N0, N1 },
47493 PMULUDQBuilder, /*CheckBWI*/false);
47494 }
47495
47496 return SDValue();
47497}
47498
47499static SDValue combineMul(SDNode *N, SelectionDAG &DAG,
47500 TargetLowering::DAGCombinerInfo &DCI,
47501 const X86Subtarget &Subtarget) {
47502 EVT VT = N->getValueType(0);
47503
47504 if (SDValue V = combineMulToPMADDWD(N, DAG, Subtarget))
47505 return V;
47506
47507 if (SDValue V = combineMulToPMULDQ(N, DAG, Subtarget))
47508 return V;
47509
47510 if (DCI.isBeforeLegalize() && VT.isVector())
47511 return reduceVMULWidth(N, DAG, Subtarget);
47512
47513 // Optimize a single multiply with constant into two operations in order to
47514 // implement it with two cheaper instructions, e.g. LEA + SHL, LEA + LEA.
47515 if (!MulConstantOptimization)
47516 return SDValue();
47517
47518 // An imul is usually smaller than the alternative sequence.
47519 if (DAG.getMachineFunction().getFunction().hasMinSize())
47520 return SDValue();
47521
47522 if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
47523 return SDValue();
47524
47525 if (VT != MVT::i64 && VT != MVT::i32)
47526 return SDValue();
47527
47528 ConstantSDNode *C = dyn_cast<ConstantSDNode>(N->getOperand(1));
47529 if (!C)
47530 return SDValue();
47531 if (isPowerOf2_64(C->getZExtValue()))
47532 return SDValue();
47533
47534 int64_t SignMulAmt = C->getSExtValue();
47535  assert(SignMulAmt != INT64_MIN && "Int min should have been handled!");
47536 uint64_t AbsMulAmt = SignMulAmt < 0 ? -SignMulAmt : SignMulAmt;
47537
47538 SDLoc DL(N);
47539 if (AbsMulAmt == 3 || AbsMulAmt == 5 || AbsMulAmt == 9) {
47540 SDValue NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0),
47541 DAG.getConstant(AbsMulAmt, DL, VT));
47542 if (SignMulAmt < 0)
47543 NewMul = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT),
47544 NewMul);
47545
47546 return NewMul;
47547 }
47548
47549 uint64_t MulAmt1 = 0;
47550 uint64_t MulAmt2 = 0;
47551 if ((AbsMulAmt % 9) == 0) {
47552 MulAmt1 = 9;
47553 MulAmt2 = AbsMulAmt / 9;
47554 } else if ((AbsMulAmt % 5) == 0) {
47555 MulAmt1 = 5;
47556 MulAmt2 = AbsMulAmt / 5;
47557 } else if ((AbsMulAmt % 3) == 0) {
47558 MulAmt1 = 3;
47559 MulAmt2 = AbsMulAmt / 3;
47560 }
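  // For example (illustrative amounts), AbsMulAmt == 45 decomposes as 9 * 5 and
  // becomes two LEAs, while AbsMulAmt == 40 decomposes as 5 * 8 and becomes an
  // LEA plus a shift.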
47561
47562 SDValue NewMul;
47563 // For negative multiply amounts, only allow MulAmt2 to be a power of 2.
47564 if (MulAmt2 &&
47565 (isPowerOf2_64(MulAmt2) ||
47566 (SignMulAmt >= 0 && (MulAmt2 == 3 || MulAmt2 == 5 || MulAmt2 == 9)))) {
47567
47568 if (isPowerOf2_64(MulAmt2) &&
47569 !(SignMulAmt >= 0 && N->hasOneUse() &&
47570 N->use_begin()->getOpcode() == ISD::ADD))
47571      // If the second multiplier is pow2, issue it first. We want the multiply by
47572 // 3, 5, or 9 to be folded into the addressing mode unless the lone use
47573 // is an add. Only do this for positive multiply amounts since the
47574 // negate would prevent it from being used as an address mode anyway.
47575 std::swap(MulAmt1, MulAmt2);
47576
47577 if (isPowerOf2_64(MulAmt1))
47578 NewMul = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
47579 DAG.getConstant(Log2_64(MulAmt1), DL, MVT::i8));
47580 else
47581 NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0),
47582 DAG.getConstant(MulAmt1, DL, VT));
47583
47584 if (isPowerOf2_64(MulAmt2))
47585 NewMul = DAG.getNode(ISD::SHL, DL, VT, NewMul,
47586 DAG.getConstant(Log2_64(MulAmt2), DL, MVT::i8));
47587 else
47588 NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, NewMul,
47589 DAG.getConstant(MulAmt2, DL, VT));
47590
47591 // Negate the result.
47592 if (SignMulAmt < 0)
47593 NewMul = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT),
47594 NewMul);
47595 } else if (!Subtarget.slowLEA())
47596 NewMul = combineMulSpecial(C->getZExtValue(), N, DAG, VT, DL);
47597
47598 if (!NewMul) {
47599    assert(C->getZExtValue() != 0 &&
47600           C->getZExtValue() != (VT == MVT::i64 ? UINT64_MAX : UINT32_MAX) &&
47601           "Both cases that could cause potential overflows should have "
47602           "already been handled.");
47603 if (isPowerOf2_64(AbsMulAmt - 1)) {
47604 // (mul x, 2^N + 1) => (add (shl x, N), x)
47605 NewMul = DAG.getNode(
47606 ISD::ADD, DL, VT, N->getOperand(0),
47607 DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
47608 DAG.getConstant(Log2_64(AbsMulAmt - 1), DL,
47609 MVT::i8)));
47610 // To negate, subtract the number from zero
47611 if (SignMulAmt < 0)
47612 NewMul = DAG.getNode(ISD::SUB, DL, VT,
47613 DAG.getConstant(0, DL, VT), NewMul);
47614 } else if (isPowerOf2_64(AbsMulAmt + 1)) {
47615 // (mul x, 2^N - 1) => (sub (shl x, N), x)
47616 NewMul = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
47617 DAG.getConstant(Log2_64(AbsMulAmt + 1),
47618 DL, MVT::i8));
47619 // To negate, reverse the operands of the subtract.
47620 if (SignMulAmt < 0)
47621 NewMul = DAG.getNode(ISD::SUB, DL, VT, N->getOperand(0), NewMul);
47622 else
47623 NewMul = DAG.getNode(ISD::SUB, DL, VT, NewMul, N->getOperand(0));
47624 } else if (SignMulAmt >= 0 && isPowerOf2_64(AbsMulAmt - 2)) {
47625 // (mul x, 2^N + 2) => (add (shl x, N), (add x, x))
47626 NewMul = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
47627 DAG.getConstant(Log2_64(AbsMulAmt - 2),
47628 DL, MVT::i8));
47629 NewMul = DAG.getNode(
47630 ISD::ADD, DL, VT, NewMul,
47631 DAG.getNode(ISD::ADD, DL, VT, N->getOperand(0), N->getOperand(0)));
47632 } else if (SignMulAmt >= 0 && isPowerOf2_64(AbsMulAmt + 2)) {
47633 // (mul x, 2^N - 2) => (sub (shl x, N), (add x, x))
47634 NewMul = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
47635 DAG.getConstant(Log2_64(AbsMulAmt + 2),
47636 DL, MVT::i8));
47637 NewMul = DAG.getNode(
47638 ISD::SUB, DL, VT, NewMul,
47639 DAG.getNode(ISD::ADD, DL, VT, N->getOperand(0), N->getOperand(0)));
47640 }
47641 }
47642
47643 return NewMul;
47644}
47645
47646// Try to form a MULHU or MULHS node by looking for
47647// (srl (mul ext, ext), 16)
47648// TODO: This is X86 specific because we want to be able to handle wide types
47649// before type legalization. But we can only do it if the vector will be
47650// legalized via widening/splitting. Type legalization can't handle promotion
47651// of a MULHU/MULHS. There isn't a way to convey this to the generic DAG
47652// combiner.
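// For example, (srl (mul (zext v8i16 X to v8i32), (zext v8i16 Y to v8i32)),
// splat 16) becomes (zext (mulhu X, Y) to v8i32).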
47653static SDValue combineShiftToPMULH(SDNode *N, SelectionDAG &DAG,
47654 const X86Subtarget &Subtarget) {
47655  assert((N->getOpcode() == ISD::SRL || N->getOpcode() == ISD::SRA) &&
47656         "SRL or SRA node is required here!");
47657 SDLoc DL(N);
47658
47659 if (!Subtarget.hasSSE2())
47660 return SDValue();
47661
47662 // The operation feeding into the shift must be a multiply.
47663 SDValue ShiftOperand = N->getOperand(0);
47664 if (ShiftOperand.getOpcode() != ISD::MUL || !ShiftOperand.hasOneUse())
47665 return SDValue();
47666
47667 // Input type should be at least vXi32.
47668 EVT VT = N->getValueType(0);
47669 if (!VT.isVector() || VT.getVectorElementType().getSizeInBits() < 32)
47670 return SDValue();
47671
47672 // Need a shift by 16.
47673 APInt ShiftAmt;
47674 if (!ISD::isConstantSplatVector(N->getOperand(1).getNode(), ShiftAmt) ||
47675 ShiftAmt != 16)
47676 return SDValue();
47677
47678 SDValue LHS = ShiftOperand.getOperand(0);
47679 SDValue RHS = ShiftOperand.getOperand(1);
47680
47681 unsigned ExtOpc = LHS.getOpcode();
47682 if ((ExtOpc != ISD::SIGN_EXTEND && ExtOpc != ISD::ZERO_EXTEND) ||
47683 RHS.getOpcode() != ExtOpc)
47684 return SDValue();
47685
47686 // Peek through the extends.
47687 LHS = LHS.getOperand(0);
47688 RHS = RHS.getOperand(0);
47689
47690 // Ensure the input types match.
47691 EVT MulVT = LHS.getValueType();
47692 if (MulVT.getVectorElementType() != MVT::i16 || RHS.getValueType() != MulVT)
47693 return SDValue();
47694
47695 unsigned Opc = ExtOpc == ISD::SIGN_EXTEND ? ISD::MULHS : ISD::MULHU;
47696 SDValue Mulh = DAG.getNode(Opc, DL, MulVT, LHS, RHS);
47697
47698 ExtOpc = N->getOpcode() == ISD::SRA ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
47699 return DAG.getNode(ExtOpc, DL, VT, Mulh);
47700}
47701
47702static SDValue combineShiftLeft(SDNode *N, SelectionDAG &DAG) {
47703 SDValue N0 = N->getOperand(0);
47704 SDValue N1 = N->getOperand(1);
47705 ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1);
47706 EVT VT = N0.getValueType();
47707
47708 // fold (shl (and (setcc_c), c1), c2) -> (and setcc_c, (c1 << c2))
47709 // since the result of setcc_c is all zero's or all ones.
47710 if (VT.isInteger() && !VT.isVector() &&
47711 N1C && N0.getOpcode() == ISD::AND &&
47712 N0.getOperand(1).getOpcode() == ISD::Constant) {
47713 SDValue N00 = N0.getOperand(0);
47714 APInt Mask = N0.getConstantOperandAPInt(1);
47715 Mask <<= N1C->getAPIntValue();
47716 bool MaskOK = false;
47717 // We can handle cases concerning bit-widening nodes containing setcc_c if
47718 // we carefully interrogate the mask to make sure we are semantics
47719 // preserving.
47720 // The transform is not safe if the result of C1 << C2 exceeds the bitwidth
47721 // of the underlying setcc_c operation if the setcc_c was zero extended.
47722 // Consider the following example:
47723 // zext(setcc_c) -> i32 0x0000FFFF
47724 // c1 -> i32 0x0000FFFF
47725 // c2 -> i32 0x00000001
47726 // (shl (and (setcc_c), c1), c2) -> i32 0x0001FFFE
47727 // (and setcc_c, (c1 << c2)) -> i32 0x0000FFFE
47728 if (N00.getOpcode() == X86ISD::SETCC_CARRY) {
47729 MaskOK = true;
47730 } else if (N00.getOpcode() == ISD::SIGN_EXTEND &&
47731 N00.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY) {
47732 MaskOK = true;
47733 } else if ((N00.getOpcode() == ISD::ZERO_EXTEND ||
47734 N00.getOpcode() == ISD::ANY_EXTEND) &&
47735 N00.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY) {
47736 MaskOK = Mask.isIntN(N00.getOperand(0).getValueSizeInBits());
47737 }
47738 if (MaskOK && Mask != 0) {
47739 SDLoc DL(N);
47740 return DAG.getNode(ISD::AND, DL, VT, N00, DAG.getConstant(Mask, DL, VT));
47741 }
47742 }
47743
47744 return SDValue();
47745}
47746
47747static SDValue combineShiftRightArithmetic(SDNode *N, SelectionDAG &DAG,
47748 const X86Subtarget &Subtarget) {
47749 SDValue N0 = N->getOperand(0);
47750 SDValue N1 = N->getOperand(1);
47751 EVT VT = N0.getValueType();
47752 unsigned Size = VT.getSizeInBits();
47753
47754 if (SDValue V = combineShiftToPMULH(N, DAG, Subtarget))
47755 return V;
47756
47757  // fold (ashr (shl a, [56,48,32,24,16]), SarConst)
47758  // into (shl (sext_in_reg a), [56,48,32,24,16] - SarConst) or
47759  // into (sra (sext_in_reg a), SarConst - [56,48,32,24,16])
47760  // depending on the sign of (SarConst - [56,48,32,24,16])
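  // For example (i64, illustrative constants): (ashr (shl X, 56), 60) becomes
  // (sra (sext_in_reg X, i8), 4), while (ashr (shl X, 56), 52) becomes
  // (shl (sext_in_reg X, i8), 4).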
47761
47762  // sexts in X86 are MOVs. The MOVs have the same code size as the SHIFTs
47763  // above (only a SHIFT by 1 has a shorter encoding).
47764  // However, the MOVs have two advantages over a SHIFT:
47765  // 1. MOVs can write to a register that differs from the source.
47766  // 2. MOVs accept memory operands.
47767
47768 if (VT.isVector() || N1.getOpcode() != ISD::Constant ||
47769 N0.getOpcode() != ISD::SHL || !N0.hasOneUse() ||
47770 N0.getOperand(1).getOpcode() != ISD::Constant)
47771 return SDValue();
47772
47773 SDValue N00 = N0.getOperand(0);
47774 SDValue N01 = N0.getOperand(1);
47775 APInt ShlConst = (cast<ConstantSDNode>(N01))->getAPIntValue();
47776 APInt SarConst = (cast<ConstantSDNode>(N1))->getAPIntValue();
47777 EVT CVT = N1.getValueType();
47778
47779 if (SarConst.isNegative())
47780 return SDValue();
47781
47782 for (MVT SVT : { MVT::i8, MVT::i16, MVT::i32 }) {
47783 unsigned ShiftSize = SVT.getSizeInBits();
47784    // Skip types without a corresponding sext/zext and ShlConst values that
47785    // are not one of [56,48,32,24,16].
47786 if (ShiftSize >= Size || ShlConst != Size - ShiftSize)
47787 continue;
47788 SDLoc DL(N);
47789 SDValue NN =
47790 DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, N00, DAG.getValueType(SVT));
47791 SarConst = SarConst - (Size - ShiftSize);
47792 if (SarConst == 0)
47793 return NN;
47794 if (SarConst.isNegative())
47795 return DAG.getNode(ISD::SHL, DL, VT, NN,
47796 DAG.getConstant(-SarConst, DL, CVT));
47797 return DAG.getNode(ISD::SRA, DL, VT, NN,
47798 DAG.getConstant(SarConst, DL, CVT));
47799 }
47800 return SDValue();
47801}
47802
47803static SDValue combineShiftRightLogical(SDNode *N, SelectionDAG &DAG,
47804 TargetLowering::DAGCombinerInfo &DCI,
47805 const X86Subtarget &Subtarget) {
47806 SDValue N0 = N->getOperand(0);
47807 SDValue N1 = N->getOperand(1);
47808 EVT VT = N0.getValueType();
47809
47810 if (SDValue V = combineShiftToPMULH(N, DAG, Subtarget))
47811 return V;
47812
47813 // Only do this on the last DAG combine as it can interfere with other
47814 // combines.
47815 if (!DCI.isAfterLegalizeDAG())
47816 return SDValue();
47817
47818 // Try to improve a sequence of srl (and X, C1), C2 by inverting the order.
47819 // TODO: This is a generic DAG combine that became an x86-only combine to
47820 // avoid shortcomings in other folds such as bswap, bit-test ('bt'), and
47821 // and-not ('andn').
47822 if (N0.getOpcode() != ISD::AND || !N0.hasOneUse())
47823 return SDValue();
47824
47825 auto *ShiftC = dyn_cast<ConstantSDNode>(N1);
47826 auto *AndC = dyn_cast<ConstantSDNode>(N0.getOperand(1));
47827 if (!ShiftC || !AndC)
47828 return SDValue();
47829
47830 // If we can shrink the constant mask below 8-bits or 32-bits, then this
47831 // transform should reduce code size. It may also enable secondary transforms
47832 // from improved known-bits analysis or instruction selection.
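  // For example (illustrative mask), (srl (and X, 0x7F00), 8) becomes
  // (and (srl X, 8), 0x7F); the new mask fits in a sign-extended 8-bit
  // immediate, so the AND gets a shorter encoding.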
47833 APInt MaskVal = AndC->getAPIntValue();
47834
47835 // If this can be matched by a zero extend, don't optimize.
47836 if (MaskVal.isMask()) {
47837 unsigned TO = MaskVal.countTrailingOnes();
47838 if (TO >= 8 && isPowerOf2_32(TO))
47839 return SDValue();
47840 }
47841
47842 APInt NewMaskVal = MaskVal.lshr(ShiftC->getAPIntValue());
47843 unsigned OldMaskSize = MaskVal.getMinSignedBits();
47844 unsigned NewMaskSize = NewMaskVal.getMinSignedBits();
47845 if ((OldMaskSize > 8 && NewMaskSize <= 8) ||
47846 (OldMaskSize > 32 && NewMaskSize <= 32)) {
47847 // srl (and X, AndC), ShiftC --> and (srl X, ShiftC), (AndC >> ShiftC)
47848 SDLoc DL(N);
47849 SDValue NewMask = DAG.getConstant(NewMaskVal, DL, VT);
47850 SDValue NewShift = DAG.getNode(ISD::SRL, DL, VT, N0.getOperand(0), N1);
47851 return DAG.getNode(ISD::AND, DL, VT, NewShift, NewMask);
47852 }
47853 return SDValue();
47854}
47855
47856static SDValue combineHorizOpWithShuffle(SDNode *N, SelectionDAG &DAG,
47857 const X86Subtarget &Subtarget) {
47858 unsigned Opcode = N->getOpcode();
47859  assert(isHorizOp(Opcode) && "Unexpected hadd/hsub/pack opcode");
47860
47861 SDLoc DL(N);
47862 EVT VT = N->getValueType(0);
47863 SDValue N0 = N->getOperand(0);
47864 SDValue N1 = N->getOperand(1);
47865 EVT SrcVT = N0.getValueType();
47866
47867 SDValue BC0 =
47868 N->isOnlyUserOf(N0.getNode()) ? peekThroughOneUseBitcasts(N0) : N0;
47869 SDValue BC1 =
47870 N->isOnlyUserOf(N1.getNode()) ? peekThroughOneUseBitcasts(N1) : N1;
47871
47872 // Attempt to fold HOP(LOSUBVECTOR(SHUFFLE(X)),HISUBVECTOR(SHUFFLE(X)))
47873 // to SHUFFLE(HOP(LOSUBVECTOR(X),HISUBVECTOR(X))), this is mainly for
47874 // truncation trees that help us avoid lane crossing shuffles.
47875 // TODO: There's a lot more we can do for PACK/HADD style shuffle combines.
47876 // TODO: We don't handle vXf64 shuffles yet.
47877 if (VT.is128BitVector() && SrcVT.getScalarSizeInBits() <= 32) {
47878 if (SDValue BCSrc = getSplitVectorSrc(BC0, BC1, false)) {
47879 SmallVector<SDValue> ShuffleOps;
47880 SmallVector<int> ShuffleMask, ScaledMask;
47881 SDValue Vec = peekThroughBitcasts(BCSrc);
47882 if (getTargetShuffleInputs(Vec, ShuffleOps, ShuffleMask, DAG)) {
47883 resolveTargetShuffleInputsAndMask(ShuffleOps, ShuffleMask);
47884 // To keep the HOP LHS/RHS coherency, we must be able to scale the unary
47885 // shuffle to a v4X64 width - we can probably relax this in the future.
47886 if (!isAnyZero(ShuffleMask) && ShuffleOps.size() == 1 &&
47887 ShuffleOps[0].getValueType().is256BitVector() &&
47888 scaleShuffleElements(ShuffleMask, 4, ScaledMask)) {
47889 SDValue Lo, Hi;
47890 MVT ShufVT = VT.isFloatingPoint() ? MVT::v4f32 : MVT::v4i32;
47891 std::tie(Lo, Hi) = DAG.SplitVector(ShuffleOps[0], DL);
47892 Lo = DAG.getBitcast(SrcVT, Lo);
47893 Hi = DAG.getBitcast(SrcVT, Hi);
47894 SDValue Res = DAG.getNode(Opcode, DL, VT, Lo, Hi);
47895 Res = DAG.getBitcast(ShufVT, Res);
47896 Res = DAG.getVectorShuffle(ShufVT, DL, Res, Res, ScaledMask);
47897 return DAG.getBitcast(VT, Res);
47898 }
47899 }
47900 }
47901 }
47902
47903 // Attempt to fold HOP(SHUFFLE(X,Y),SHUFFLE(Z,W)) -> SHUFFLE(HOP()).
47904 if (VT.is128BitVector() && SrcVT.getScalarSizeInBits() <= 32) {
47905 // If either/both ops are a shuffle that can scale to v2x64,
47906 // then see if we can perform this as a v4x32 post shuffle.
47907 SmallVector<SDValue> Ops0, Ops1;
47908 SmallVector<int> Mask0, Mask1, ScaledMask0, ScaledMask1;
47909 bool IsShuf0 =
47910 getTargetShuffleInputs(BC0, Ops0, Mask0, DAG) && !isAnyZero(Mask0) &&
47911 scaleShuffleElements(Mask0, 2, ScaledMask0) &&
47912 all_of(Ops0, [](SDValue Op) { return Op.getValueSizeInBits() == 128; });
47913 bool IsShuf1 =
47914 getTargetShuffleInputs(BC1, Ops1, Mask1, DAG) && !isAnyZero(Mask1) &&
47915 scaleShuffleElements(Mask1, 2, ScaledMask1) &&
47916 all_of(Ops1, [](SDValue Op) { return Op.getValueSizeInBits() == 128; });
47917 if (IsShuf0 || IsShuf1) {
47918 if (!IsShuf0) {
47919 Ops0.assign({BC0});
47920 ScaledMask0.assign({0, 1});
47921 }
47922 if (!IsShuf1) {
47923 Ops1.assign({BC1});
47924 ScaledMask1.assign({0, 1});
47925 }
47926
47927 SDValue LHS, RHS;
47928 int PostShuffle[4] = {-1, -1, -1, -1};
47929 auto FindShuffleOpAndIdx = [&](int M, int &Idx, ArrayRef<SDValue> Ops) {
47930 if (M < 0)
47931 return true;
47932 Idx = M % 2;
47933 SDValue Src = Ops[M / 2];
47934 if (!LHS || LHS == Src) {
47935 LHS = Src;
47936 return true;
47937 }
47938 if (!RHS || RHS == Src) {
47939 Idx += 2;
47940 RHS = Src;
47941 return true;
47942 }
47943 return false;
47944 };
47945 if (FindShuffleOpAndIdx(ScaledMask0[0], PostShuffle[0], Ops0) &&
47946 FindShuffleOpAndIdx(ScaledMask0[1], PostShuffle[1], Ops0) &&
47947 FindShuffleOpAndIdx(ScaledMask1[0], PostShuffle[2], Ops1) &&
47948 FindShuffleOpAndIdx(ScaledMask1[1], PostShuffle[3], Ops1)) {
47949 LHS = DAG.getBitcast(SrcVT, LHS);
47950 RHS = DAG.getBitcast(SrcVT, RHS ? RHS : LHS);
47951 MVT ShufVT = VT.isFloatingPoint() ? MVT::v4f32 : MVT::v4i32;
47952 SDValue Res = DAG.getNode(Opcode, DL, VT, LHS, RHS);
47953 Res = DAG.getBitcast(ShufVT, Res);
47954 Res = DAG.getVectorShuffle(ShufVT, DL, Res, Res, PostShuffle);
47955 return DAG.getBitcast(VT, Res);
47956 }
47957 }
47958 }
47959
47960 // Attempt to fold HOP(SHUFFLE(X,Y),SHUFFLE(X,Y)) -> SHUFFLE(HOP(X,Y)).
47961 if (VT.is256BitVector() && Subtarget.hasInt256()) {
47962 SmallVector<int> Mask0, Mask1;
47963 SmallVector<SDValue> Ops0, Ops1;
47964 SmallVector<int, 2> ScaledMask0, ScaledMask1;
47965 if (getTargetShuffleInputs(BC0, Ops0, Mask0, DAG) && !isAnyZero(Mask0) &&
47966 getTargetShuffleInputs(BC1, Ops1, Mask1, DAG) && !isAnyZero(Mask1) &&
47967 !Ops0.empty() && !Ops1.empty() &&
47968 all_of(Ops0,
47969 [](SDValue Op) { return Op.getValueType().is256BitVector(); }) &&
47970 all_of(Ops1,
47971 [](SDValue Op) { return Op.getValueType().is256BitVector(); }) &&
47972 scaleShuffleElements(Mask0, 2, ScaledMask0) &&
47973 scaleShuffleElements(Mask1, 2, ScaledMask1)) {
47974 SDValue Op00 = peekThroughBitcasts(Ops0.front());
47975 SDValue Op10 = peekThroughBitcasts(Ops1.front());
47976 SDValue Op01 = peekThroughBitcasts(Ops0.back());
47977 SDValue Op11 = peekThroughBitcasts(Ops1.back());
47978 if ((Op00 == Op11) && (Op01 == Op10)) {
47979 std::swap(Op10, Op11);
47980 ShuffleVectorSDNode::commuteMask(ScaledMask1);
47981 }
47982 if ((Op00 == Op10) && (Op01 == Op11)) {
47983 const int Map[4] = {0, 2, 1, 3};
47984 SmallVector<int, 4> ShuffleMask(
47985 {Map[ScaledMask0[0]], Map[ScaledMask1[0]], Map[ScaledMask0[1]],
47986 Map[ScaledMask1[1]]});
47987 MVT ShufVT = VT.isFloatingPoint() ? MVT::v4f64 : MVT::v4i64;
47988 SDValue Res = DAG.getNode(Opcode, DL, VT, DAG.getBitcast(SrcVT, Op00),
47989 DAG.getBitcast(SrcVT, Op01));
47990 Res = DAG.getBitcast(ShufVT, Res);
47991 Res = DAG.getVectorShuffle(ShufVT, DL, Res, Res, ShuffleMask);
47992 return DAG.getBitcast(VT, Res);
47993 }
47994 }
47995 }
47996
47997 return SDValue();
47998}
47999
48000static SDValue combineVectorPack(SDNode *N, SelectionDAG &DAG,
48001 TargetLowering::DAGCombinerInfo &DCI,
48002 const X86Subtarget &Subtarget) {
48003 unsigned Opcode = N->getOpcode();
48004  assert((X86ISD::PACKSS == Opcode || X86ISD::PACKUS == Opcode) &&
48005         "Unexpected pack opcode");
48006
48007 EVT VT = N->getValueType(0);
48008 SDValue N0 = N->getOperand(0);
48009 SDValue N1 = N->getOperand(1);
48010 unsigned NumDstElts = VT.getVectorNumElements();
48011 unsigned DstBitsPerElt = VT.getScalarSizeInBits();
48012 unsigned SrcBitsPerElt = 2 * DstBitsPerElt;
48013  assert(N0.getScalarValueSizeInBits() == SrcBitsPerElt &&
48014         N1.getScalarValueSizeInBits() == SrcBitsPerElt &&
48015         "Unexpected PACKSS/PACKUS input type");
48016
48017 bool IsSigned = (X86ISD::PACKSS == Opcode);
48018
48019 // Constant Folding.
48020 APInt UndefElts0, UndefElts1;
48021 SmallVector<APInt, 32> EltBits0, EltBits1;
48022 if ((N0.isUndef() || N->isOnlyUserOf(N0.getNode())) &&
48023 (N1.isUndef() || N->isOnlyUserOf(N1.getNode())) &&
48024 getTargetConstantBitsFromNode(N0, SrcBitsPerElt, UndefElts0, EltBits0) &&
48025 getTargetConstantBitsFromNode(N1, SrcBitsPerElt, UndefElts1, EltBits1)) {
48026 unsigned NumLanes = VT.getSizeInBits() / 128;
48027 unsigned NumSrcElts = NumDstElts / 2;
48028 unsigned NumDstEltsPerLane = NumDstElts / NumLanes;
48029 unsigned NumSrcEltsPerLane = NumSrcElts / NumLanes;
48030
48031 APInt Undefs(NumDstElts, 0);
48032 SmallVector<APInt, 32> Bits(NumDstElts, APInt::getZero(DstBitsPerElt));
48033 for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
48034 for (unsigned Elt = 0; Elt != NumDstEltsPerLane; ++Elt) {
48035 unsigned SrcIdx = Lane * NumSrcEltsPerLane + Elt % NumSrcEltsPerLane;
48036 auto &UndefElts = (Elt >= NumSrcEltsPerLane ? UndefElts1 : UndefElts0);
48037 auto &EltBits = (Elt >= NumSrcEltsPerLane ? EltBits1 : EltBits0);
48038
48039 if (UndefElts[SrcIdx]) {
48040 Undefs.setBit(Lane * NumDstEltsPerLane + Elt);
48041 continue;
48042 }
48043
48044 APInt &Val = EltBits[SrcIdx];
48045 if (IsSigned) {
48046 // PACKSS: Truncate signed value with signed saturation.
48047 // Source values less than dst minint are saturated to minint.
48048 // Source values greater than dst maxint are saturated to maxint.
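          // e.g. for an i16 -> i8 PACKSS, 300 saturates to 127, -300 saturates
          // to -128, and 100 is truncated unchanged.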
48049 if (Val.isSignedIntN(DstBitsPerElt))
48050 Val = Val.trunc(DstBitsPerElt);
48051 else if (Val.isNegative())
48052 Val = APInt::getSignedMinValue(DstBitsPerElt);
48053 else
48054 Val = APInt::getSignedMaxValue(DstBitsPerElt);
48055 } else {
48056 // PACKUS: Truncate signed value with unsigned saturation.
48057 // Source values less than zero are saturated to zero.
48058 // Source values greater than dst maxuint are saturated to maxuint.
48059 if (Val.isIntN(DstBitsPerElt))
48060 Val = Val.trunc(DstBitsPerElt);
48061 else if (Val.isNegative())
48062 Val = APInt::getZero(DstBitsPerElt);
48063 else
48064 Val = APInt::getAllOnes(DstBitsPerElt);
48065 }
48066 Bits[Lane * NumDstEltsPerLane + Elt] = Val;
48067 }
48068 }
48069
48070 return getConstVector(Bits, Undefs, VT.getSimpleVT(), DAG, SDLoc(N));
48071 }
48072
48073 // Try to fold PACK(SHUFFLE(),SHUFFLE()) -> SHUFFLE(PACK()).
48074 if (SDValue V = combineHorizOpWithShuffle(N, DAG, Subtarget))
48075 return V;
48076
48077 // Try to combine a PACKUSWB/PACKSSWB implemented truncate with a regular
48078 // truncate to create a larger truncate.
48079 if (Subtarget.hasAVX512() &&
48080 N0.getOpcode() == ISD::TRUNCATE && N1.isUndef() && VT == MVT::v16i8 &&
48081 N0.getOperand(0).getValueType() == MVT::v8i32) {
48082 if ((IsSigned && DAG.ComputeNumSignBits(N0) > 8) ||
48083 (!IsSigned &&
48084 DAG.MaskedValueIsZero(N0, APInt::getHighBitsSet(16, 8)))) {
48085 if (Subtarget.hasVLX())
48086 return DAG.getNode(X86ISD::VTRUNC, SDLoc(N), VT, N0.getOperand(0));
48087
48088 // Widen input to v16i32 so we can truncate that.
48089 SDLoc dl(N);
48090 SDValue Concat = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v16i32,
48091 N0.getOperand(0), DAG.getUNDEF(MVT::v8i32));
48092 return DAG.getNode(ISD::TRUNCATE, SDLoc(N), VT, Concat);
48093 }
48094 }
48095
48096 // Try to fold PACK(EXTEND(X),EXTEND(Y)) -> CONCAT(X,Y) subvectors.
48097 if (VT.is128BitVector()) {
48098 unsigned ExtOpc = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
48099 SDValue Src0, Src1;
48100 if (N0.getOpcode() == ExtOpc &&
48101 N0.getOperand(0).getValueType().is64BitVector() &&
48102 N0.getOperand(0).getScalarValueSizeInBits() == DstBitsPerElt) {
48103 Src0 = N0.getOperand(0);
48104 }
48105 if (N1.getOpcode() == ExtOpc &&
48106 N1.getOperand(0).getValueType().is64BitVector() &&
48107 N1.getOperand(0).getScalarValueSizeInBits() == DstBitsPerElt) {
48108 Src1 = N1.getOperand(0);
48109 }
48110 if ((Src0 || N0.isUndef()) && (Src1 || N1.isUndef())) {
48111      assert((Src0 || Src1) && "Found PACK(UNDEF,UNDEF)");
48112 Src0 = Src0 ? Src0 : DAG.getUNDEF(Src1.getValueType());
48113 Src1 = Src1 ? Src1 : DAG.getUNDEF(Src0.getValueType());
48114 return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(N), VT, Src0, Src1);
48115 }
48116
48117 // Try again with pack(*_extend_vector_inreg, undef).
48118 unsigned VecInRegOpc = IsSigned ? ISD::SIGN_EXTEND_VECTOR_INREG
48119 : ISD::ZERO_EXTEND_VECTOR_INREG;
48120 if (N0.getOpcode() == VecInRegOpc && N1.isUndef() &&
48121 N0.getOperand(0).getScalarValueSizeInBits() < DstBitsPerElt)
48122 return getEXTEND_VECTOR_INREG(ExtOpc, SDLoc(N), VT, N0.getOperand(0),
48123 DAG);
48124 }
48125
48126 // Attempt to combine as shuffle.
48127 SDValue Op(N, 0);
48128 if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
48129 return Res;
48130
48131 return SDValue();
48132}
48133
48134static SDValue combineVectorHADDSUB(SDNode *N, SelectionDAG &DAG,
48135 TargetLowering::DAGCombinerInfo &DCI,
48136 const X86Subtarget &Subtarget) {
48137  assert((X86ISD::HADD == N->getOpcode() || X86ISD::FHADD == N->getOpcode() ||
48138          X86ISD::HSUB == N->getOpcode() || X86ISD::FHSUB == N->getOpcode()) &&
48139         "Unexpected horizontal add/sub opcode");
48140
48141 if (!shouldUseHorizontalOp(true, DAG, Subtarget)) {
48142 MVT VT = N->getSimpleValueType(0);
48143 SDValue LHS = N->getOperand(0);
48144 SDValue RHS = N->getOperand(1);
48145
48146    // HOP(HOP'(X,X),HOP'(Y,Y)) -> HOP(PERMUTE(HOP'(X,Y)),PERMUTE(HOP'(X,Y))).
48147 if (LHS != RHS && LHS.getOpcode() == N->getOpcode() &&
48148 LHS.getOpcode() == RHS.getOpcode() &&
48149 LHS.getValueType() == RHS.getValueType() &&
48150 N->isOnlyUserOf(LHS.getNode()) && N->isOnlyUserOf(RHS.getNode())) {
48151 SDValue LHS0 = LHS.getOperand(0);
48152 SDValue LHS1 = LHS.getOperand(1);
48153 SDValue RHS0 = RHS.getOperand(0);
48154 SDValue RHS1 = RHS.getOperand(1);
48155 if ((LHS0 == LHS1 || LHS0.isUndef() || LHS1.isUndef()) &&
48156 (RHS0 == RHS1 || RHS0.isUndef() || RHS1.isUndef())) {
48157 SDLoc DL(N);
48158 SDValue Res = DAG.getNode(LHS.getOpcode(), DL, LHS.getValueType(),
48159 LHS0.isUndef() ? LHS1 : LHS0,
48160 RHS0.isUndef() ? RHS1 : RHS0);
48161 MVT ShufVT = MVT::getVectorVT(MVT::i32, VT.getSizeInBits() / 32);
48162 Res = DAG.getBitcast(ShufVT, Res);
48163 SDValue NewLHS =
48164 DAG.getNode(X86ISD::PSHUFD, DL, ShufVT, Res,
48165 getV4X86ShuffleImm8ForMask({0, 1, 0, 1}, DL, DAG));
48166 SDValue NewRHS =
48167 DAG.getNode(X86ISD::PSHUFD, DL, ShufVT, Res,
48168 getV4X86ShuffleImm8ForMask({2, 3, 2, 3}, DL, DAG));
48169 return DAG.getNode(N->getOpcode(), DL, VT, DAG.getBitcast(VT, NewLHS),
48170 DAG.getBitcast(VT, NewRHS));
48171 }
48172 }
48173 }
48174
48175 // Try to fold HOP(SHUFFLE(),SHUFFLE()) -> SHUFFLE(HOP()).
48176 if (SDValue V = combineHorizOpWithShuffle(N, DAG, Subtarget))
48177 return V;
48178
48179 return SDValue();
48180}
48181
48182static SDValue combineVectorShiftVar(SDNode *N, SelectionDAG &DAG,
48183 TargetLowering::DAGCombinerInfo &DCI,
48184 const X86Subtarget &Subtarget) {
48185  assert((X86ISD::VSHL == N->getOpcode() || X86ISD::VSRA == N->getOpcode() ||
48186          X86ISD::VSRL == N->getOpcode()) &&
48187         "Unexpected shift opcode");
48188 EVT VT = N->getValueType(0);
48189 SDValue N0 = N->getOperand(0);
48190 SDValue N1 = N->getOperand(1);
48191
48192 // Shift zero -> zero.
48193 if (ISD::isBuildVectorAllZeros(N0.getNode()))
48194 return DAG.getConstant(0, SDLoc(N), VT);
48195
48196 // Detect constant shift amounts.
48197 APInt UndefElts;
48198 SmallVector<APInt, 32> EltBits;
48199 if (getTargetConstantBitsFromNode(N1, 64, UndefElts, EltBits, true, false)) {
48200 unsigned X86Opc = getTargetVShiftUniformOpcode(N->getOpcode(), false);
48201 return getTargetVShiftByConstNode(X86Opc, SDLoc(N), VT.getSimpleVT(), N0,
48202 EltBits[0].getZExtValue(), DAG);
48203 }
48204
48205 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
48206 APInt DemandedElts = APInt::getAllOnes(VT.getVectorNumElements());
48207 if (TLI.SimplifyDemandedVectorElts(SDValue(N, 0), DemandedElts, DCI))
48208 return SDValue(N, 0);
48209
48210 return SDValue();
48211}
48212
48213static SDValue combineVectorShiftImm(SDNode *N, SelectionDAG &DAG,
48214 TargetLowering::DAGCombinerInfo &DCI,
48215 const X86Subtarget &Subtarget) {
48216 unsigned Opcode = N->getOpcode();
48217  assert((X86ISD::VSHLI == Opcode || X86ISD::VSRAI == Opcode ||
48218          X86ISD::VSRLI == Opcode) &&
48219         "Unexpected shift opcode");
48220 bool LogicalShift = X86ISD::VSHLI == Opcode || X86ISD::VSRLI == Opcode;
48221 EVT VT = N->getValueType(0);
48222 SDValue N0 = N->getOperand(0);
48223 unsigned NumBitsPerElt = VT.getScalarSizeInBits();
48224  assert(VT == N0.getValueType() && (NumBitsPerElt % 8) == 0 &&
48225         "Unexpected value type");
48226  assert(N->getOperand(1).getValueType() == MVT::i8 &&
48227         "Unexpected shift amount type");
48228
48229 // (shift undef, X) -> 0
48230 if (N0.isUndef())
48231 return DAG.getConstant(0, SDLoc(N), VT);
48232
48233 // Out of range logical bit shifts are guaranteed to be zero.
48234 // Out of range arithmetic bit shifts splat the sign bit.
48235 unsigned ShiftVal = N->getConstantOperandVal(1);
48236 if (ShiftVal >= NumBitsPerElt) {
48237 if (LogicalShift)
48238 return DAG.getConstant(0, SDLoc(N), VT);
48239 ShiftVal = NumBitsPerElt - 1;
48240 }
48241
48242 // (shift X, 0) -> X
48243 if (!ShiftVal)
48244 return N0;
48245
48246 // (shift 0, C) -> 0
48247 if (ISD::isBuildVectorAllZeros(N0.getNode()))
48248 // N0 is all zeros or undef. We guarantee that the bits shifted into the
48249 // result are all zeros, not undef.
48250 return DAG.getConstant(0, SDLoc(N), VT);
48251
48252 // (VSRAI -1, C) -> -1
48253 if (!LogicalShift && ISD::isBuildVectorAllOnes(N0.getNode()))
48254 // N0 is all ones or undef. We guarantee that the bits shifted into the
48255 // result are all ones, not undef.
48256 return DAG.getConstant(-1, SDLoc(N), VT);
48257
48258 auto MergeShifts = [&](SDValue X, uint64_t Amt0, uint64_t Amt1) {
48259 unsigned NewShiftVal = Amt0 + Amt1;
48260 if (NewShiftVal >= NumBitsPerElt) {
48261 // Out of range logical bit shifts are guaranteed to be zero.
48262 // Out of range arithmetic bit shifts splat the sign bit.
48263 if (LogicalShift)
48264 return DAG.getConstant(0, SDLoc(N), VT);
48265 NewShiftVal = NumBitsPerElt - 1;
48266 }
48267 return DAG.getNode(Opcode, SDLoc(N), VT, N0.getOperand(0),
48268 DAG.getTargetConstant(NewShiftVal, SDLoc(N), MVT::i8));
48269 };
48270
48271 // (shift (shift X, C2), C1) -> (shift X, (C1 + C2))
48272 if (Opcode == N0.getOpcode())
48273 return MergeShifts(N0.getOperand(0), ShiftVal, N0.getConstantOperandVal(1));
48274
48275 // (shl (add X, X), C) -> (shl X, (C + 1))
48276 if (Opcode == X86ISD::VSHLI && N0.getOpcode() == ISD::ADD &&
48277 N0.getOperand(0) == N0.getOperand(1))
48278 return MergeShifts(N0.getOperand(0), ShiftVal, 1);
48279
48280 // We can decode 'whole byte' logical bit shifts as shuffles.
48281 if (LogicalShift && (ShiftVal % 8) == 0) {
48282 SDValue Op(N, 0);
48283 if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
48284 return Res;
48285 }
48286
48287 // Constant Folding.
48288 APInt UndefElts;
48289 SmallVector<APInt, 32> EltBits;
48290 if (N->isOnlyUserOf(N0.getNode()) &&
48291 getTargetConstantBitsFromNode(N0, NumBitsPerElt, UndefElts, EltBits)) {
48292    assert(EltBits.size() == VT.getVectorNumElements() &&
48293           "Unexpected shift value type");
48294 // Undef elements need to fold to 0. It's possible SimplifyDemandedBits
48295 // created an undef input due to no input bits being demanded, but user
48296 // still expects 0 in other bits.
48297 for (unsigned i = 0, e = EltBits.size(); i != e; ++i) {
48298 APInt &Elt = EltBits[i];
48299 if (UndefElts[i])
48300 Elt = 0;
48301 else if (X86ISD::VSHLI == Opcode)
48302 Elt <<= ShiftVal;
48303 else if (X86ISD::VSRAI == Opcode)
48304 Elt.ashrInPlace(ShiftVal);
48305 else
48306 Elt.lshrInPlace(ShiftVal);
48307 }
48308 // Reset undef elements since they were zeroed above.
48309 UndefElts = 0;
48310 return getConstVector(EltBits, UndefElts, VT.getSimpleVT(), DAG, SDLoc(N));
48311 }
48312
48313 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
48314 if (TLI.SimplifyDemandedBits(SDValue(N, 0), APInt::getAllOnes(NumBitsPerElt),
48315 DCI))
48316 return SDValue(N, 0);
48317
48318 return SDValue();
48319}
48320
48321static SDValue combineVectorInsert(SDNode *N, SelectionDAG &DAG,
48322 TargetLowering::DAGCombinerInfo &DCI,
48323 const X86Subtarget &Subtarget) {
48324 EVT VT = N->getValueType(0);
48325 unsigned Opcode = N->getOpcode();
48326  assert(((Opcode == X86ISD::PINSRB && VT == MVT::v16i8) ||
48327          (Opcode == X86ISD::PINSRW && VT == MVT::v8i16) ||
48328          Opcode == ISD::INSERT_VECTOR_ELT) &&
48329         "Unexpected vector insertion");
48330
48331 // Fold insert_vector_elt(undef, elt, 0) --> scalar_to_vector(elt).
48332 if (Opcode == ISD::INSERT_VECTOR_ELT && N->getOperand(0).isUndef() &&
48333 isNullConstant(N->getOperand(2)))
48334 return DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(N), VT, N->getOperand(1));
48335
48336 if (Opcode == X86ISD::PINSRB || Opcode == X86ISD::PINSRW) {
48337 unsigned NumBitsPerElt = VT.getScalarSizeInBits();
48338 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
48339 if (TLI.SimplifyDemandedBits(SDValue(N, 0),
48340 APInt::getAllOnes(NumBitsPerElt), DCI))
48341 return SDValue(N, 0);
48342 }
48343
48344 // Attempt to combine insertion patterns to a shuffle.
48345 if (VT.isSimple() && DCI.isAfterLegalizeDAG()) {
48346 SDValue Op(N, 0);
48347 if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
48348 return Res;
48349 }
48350
48351 return SDValue();
48352}
48353
48354/// Recognize the distinctive (AND (setcc ...) (setcc ..)) where both setccs
48355/// reference the same FP CMP, and rewrite for CMPEQSS and friends. Likewise for
48356/// OR -> CMPNEQSS.
48357static SDValue combineCompareEqual(SDNode *N, SelectionDAG &DAG,
48358 TargetLowering::DAGCombinerInfo &DCI,
48359 const X86Subtarget &Subtarget) {
48360 unsigned opcode;
48361
48362 // SSE1 supports CMP{eq|ne}SS, and SSE2 added CMP{eq|ne}SD, but
48363 // we're requiring SSE2 for both.
48364 if (Subtarget.hasSSE2() && isAndOrOfSetCCs(SDValue(N, 0U), opcode)) {
48365 SDValue N0 = N->getOperand(0);
48366 SDValue N1 = N->getOperand(1);
48367 SDValue CMP0 = N0.getOperand(1);
48368 SDValue CMP1 = N1.getOperand(1);
48369 SDLoc DL(N);
48370
48371 // The SETCCs should both refer to the same CMP.
48372 if (CMP0.getOpcode() != X86ISD::FCMP || CMP0 != CMP1)
48373 return SDValue();
48374
48375 SDValue CMP00 = CMP0->getOperand(0);
48376 SDValue CMP01 = CMP0->getOperand(1);
48377 EVT VT = CMP00.getValueType();
48378
48379 if (VT == MVT::f32 || VT == MVT::f64 ||
48380 (VT == MVT::f16 && Subtarget.hasFP16())) {
48381 bool ExpectingFlags = false;
48382 // Check for any users that want flags:
48383 for (const SDNode *U : N->uses()) {
48384 if (ExpectingFlags)
48385 break;
48386
48387 switch (U->getOpcode()) {
48388 default:
48389 case ISD::BR_CC:
48390 case ISD::BRCOND:
48391 case ISD::SELECT:
48392 ExpectingFlags = true;
48393 break;
48394 case ISD::CopyToReg:
48395 case ISD::SIGN_EXTEND:
48396 case ISD::ZERO_EXTEND:
48397 case ISD::ANY_EXTEND:
48398 break;
48399 }
48400 }
48401
48402 if (!ExpectingFlags) {
48403 enum X86::CondCode cc0 = (enum X86::CondCode)N0.getConstantOperandVal(0);
48404 enum X86::CondCode cc1 = (enum X86::CondCode)N1.getConstantOperandVal(0);
48405
48406 if (cc1 == X86::COND_E || cc1 == X86::COND_NE) {
48407 X86::CondCode tmp = cc0;
48408 cc0 = cc1;
48409 cc1 = tmp;
48410 }
48411
48412 if ((cc0 == X86::COND_E && cc1 == X86::COND_NP) ||
48413 (cc0 == X86::COND_NE && cc1 == X86::COND_P)) {
48414 // FIXME: need symbolic constants for these magic numbers.
48415 // See X86ATTInstPrinter.cpp:printSSECC().
48416 unsigned x86cc = (cc0 == X86::COND_E) ? 0 : 4;
48417 if (Subtarget.hasAVX512()) {
48418 SDValue FSetCC =
48419 DAG.getNode(X86ISD::FSETCCM, DL, MVT::v1i1, CMP00, CMP01,
48420 DAG.getTargetConstant(x86cc, DL, MVT::i8));
48421 // Need to fill with zeros to ensure the bitcast will produce zeroes
48422 // for the upper bits. An EXTRACT_ELEMENT here wouldn't guarantee that.
48423 SDValue Ins = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, MVT::v16i1,
48424 DAG.getConstant(0, DL, MVT::v16i1),
48425 FSetCC, DAG.getIntPtrConstant(0, DL));
48426 return DAG.getZExtOrTrunc(DAG.getBitcast(MVT::i16, Ins), DL,
48427 N->getSimpleValueType(0));
48428 }
48429 SDValue OnesOrZeroesF =
48430 DAG.getNode(X86ISD::FSETCC, DL, CMP00.getValueType(), CMP00,
48431 CMP01, DAG.getTargetConstant(x86cc, DL, MVT::i8));
48432
48433 bool is64BitFP = (CMP00.getValueType() == MVT::f64);
48434 MVT IntVT = is64BitFP ? MVT::i64 : MVT::i32;
48435
48436 if (is64BitFP && !Subtarget.is64Bit()) {
48437 // On a 32-bit target, we cannot bitcast the 64-bit float to a
48438 // 64-bit integer, since that's not a legal type. Since
48439 // OnesOrZeroesF is all ones or all zeroes, we don't need all the
48440 // bits, but can do this little dance to extract the lowest 32 bits
48441 // and work with those going forward.
48442 SDValue Vector64 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64,
48443 OnesOrZeroesF);
48444 SDValue Vector32 = DAG.getBitcast(MVT::v4f32, Vector64);
48445 OnesOrZeroesF = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32,
48446 Vector32, DAG.getIntPtrConstant(0, DL));
48447 IntVT = MVT::i32;
48448 }
48449
48450 SDValue OnesOrZeroesI = DAG.getBitcast(IntVT, OnesOrZeroesF);
48451 SDValue ANDed = DAG.getNode(ISD::AND, DL, IntVT, OnesOrZeroesI,
48452 DAG.getConstant(1, DL, IntVT));
48453 SDValue OneBitOfTruth = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8,
48454 ANDed);
48455 return OneBitOfTruth;
48456 }
48457 }
48458 }
48459 }
48460 return SDValue();
48461}
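// Illustrative sketch (editorial annotation, not part of this file; assumes
// SSE2 and <immintrin.h>): the combine above replaces flag-based scalar FP
// equality with a CMPEQSS-style mask, roughly equivalent to:
//   __m128i M = _mm_castps_si128(_mm_cmpeq_ss(A, B)); // lane 0: all-ones if A[0] == B[0], else 0
//   int Eq   = _mm_cvtsi128_si32(M) & 1;              // one bit of truth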
48462
48463/// Try to fold: (and (xor X, -1), Y) -> (andnp X, Y).
48464static SDValue combineAndNotIntoANDNP(SDNode *N, SelectionDAG &DAG) {
48465 assert(N->getOpcode() == ISD::AND && "Unexpected opcode combine into ANDNP");
48466
48467 MVT VT = N->getSimpleValueType(0);
48468 if (!VT.is128BitVector() && !VT.is256BitVector() && !VT.is512BitVector())
48469 return SDValue();
48470
48471 SDValue X, Y;
48472 SDValue N0 = N->getOperand(0);
48473 SDValue N1 = N->getOperand(1);
48474
48475 if (SDValue Not = IsNOT(N0, DAG)) {
48476 X = Not;
48477 Y = N1;
48478 } else if (SDValue Not = IsNOT(N1, DAG)) {
48479 X = Not;
48480 Y = N0;
48481 } else
48482 return SDValue();
48483
48484 X = DAG.getBitcast(VT, X);
48485 Y = DAG.getBitcast(VT, Y);
48486 return DAG.getNode(X86ISD::ANDNP, SDLoc(N), VT, X, Y);
48487}
48488
48489/// Try to fold:
48490/// and (vector_shuffle<Z,...,Z>
48491/// (insert_vector_elt undef, (xor X, -1), Z), undef), Y
48492/// ->
48493/// andnp (vector_shuffle<Z,...,Z>
48494/// (insert_vector_elt undef, X, Z), undef), Y
48495static SDValue combineAndShuffleNot(SDNode *N, SelectionDAG &DAG,
48496 const X86Subtarget &Subtarget) {
48497 assert(N->getOpcode() == ISD::AND && "Unexpected opcode combine into ANDNP");
48498
48499 EVT VT = N->getValueType(0);
48500 // Do not split 256 and 512 bit vectors with SSE2 as they overwrite original
48501 // value and require extra moves.
48502 if (!((VT.is128BitVector() && Subtarget.hasSSE2()) ||
48503 ((VT.is256BitVector() || VT.is512BitVector()) && Subtarget.hasAVX())))
48504 return SDValue();
48505
48506 auto GetNot = [&DAG](SDValue V) {
48507 auto *SVN = dyn_cast<ShuffleVectorSDNode>(peekThroughOneUseBitcasts(V));
48508 // TODO: SVN->hasOneUse() is a strong condition. It can be relaxed if all
48509 // end-users are ISD::AND including cases
48510 // (and(extract_vector_element(SVN), Y)).
48511 if (!SVN || !SVN->hasOneUse() || !SVN->isSplat() ||
48512 !SVN->getOperand(1).isUndef()) {
48513 return SDValue();
48514 }
48515 SDValue IVEN = SVN->getOperand(0);
48516 if (IVEN.getOpcode() != ISD::INSERT_VECTOR_ELT ||
48517 !IVEN.getOperand(0).isUndef() || !IVEN.hasOneUse())
48518 return SDValue();
48519 if (!isa<ConstantSDNode>(IVEN.getOperand(2)) ||
48520 IVEN.getConstantOperandAPInt(2) != SVN->getSplatIndex())
48521 return SDValue();
48522 SDValue Src = IVEN.getOperand(1);
48523 if (SDValue Not = IsNOT(Src, DAG)) {
48524 SDValue NotSrc = DAG.getBitcast(Src.getValueType(), Not);
48525 SDValue NotIVEN =
48526 DAG.getNode(ISD::INSERT_VECTOR_ELT, SDLoc(IVEN), IVEN.getValueType(),
48527 IVEN.getOperand(0), NotSrc, IVEN.getOperand(2));
48528 return DAG.getVectorShuffle(SVN->getValueType(0), SDLoc(SVN), NotIVEN,
48529 SVN->getOperand(1), SVN->getMask());
48530 }
48531 return SDValue();
48532 };
48533
48534 SDValue X, Y;
48535 SDValue N0 = N->getOperand(0);
48536 SDValue N1 = N->getOperand(1);
48537
48538 if (SDValue Not = GetNot(N0)) {
48539 X = Not;
48540 Y = N1;
48541 } else if (SDValue Not = GetNot(N1)) {
48542 X = Not;
48543 Y = N0;
48544 } else
48545 return SDValue();
48546
48547 X = DAG.getBitcast(VT, X);
48548 Y = DAG.getBitcast(VT, Y);
48549 SDLoc DL(N);
48550 // We do not split for SSE at all, but we need to split vectors for AVX1 and
48551 // AVX2.
48552 if (!Subtarget.useAVX512Regs() && VT.is512BitVector()) {
48553 SDValue LoX, HiX;
48554 std::tie(LoX, HiX) = splitVector(X, DAG, DL);
48555 SDValue LoY, HiY;
48556 std::tie(LoY, HiY) = splitVector(Y, DAG, DL);
48557 EVT SplitVT = LoX.getValueType();
48558 SDValue LoV = DAG.getNode(X86ISD::ANDNP, DL, SplitVT, {LoX, LoY});
48559 SDValue HiV = DAG.getNode(X86ISD::ANDNP, DL, SplitVT, {HiX, HiY});
48560 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, {LoV, HiV});
48561 }
48562 return DAG.getNode(X86ISD::ANDNP, DL, VT, {X, Y});
48563}
48564
48565// Try to widen AND, OR and XOR nodes to VT in order to remove casts around
48566// logical operations, like in the example below.
48567// or (and (truncate x, truncate y)),
48568// (xor (truncate z, build_vector (constants)))
48569// Given a target type \p VT, we generate
48570// or (and x, y), (xor z, zext(build_vector (constants)))
48571// given x, y and z are of type \p VT. We can do so, if operands are either
48572// truncates from VT types, the second operand is a vector of constants or can
48573// be recursively promoted.
48574static SDValue PromoteMaskArithmetic(SDNode *N, EVT VT, SelectionDAG &DAG,
48575 unsigned Depth) {
48576 // Limit recursion to avoid excessive compile times.
48577 if (Depth >= SelectionDAG::MaxRecursionDepth)
48578 return SDValue();
48579
48580 if (N->getOpcode() != ISD::XOR && N->getOpcode() != ISD::AND &&
48581 N->getOpcode() != ISD::OR)
48582 return SDValue();
48583
48584 SDValue N0 = N->getOperand(0);
48585 SDValue N1 = N->getOperand(1);
48586 SDLoc DL(N);
48587
48588 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
48589 if (!TLI.isOperationLegalOrPromote(N->getOpcode(), VT))
48590 return SDValue();
48591
48592 if (SDValue NN0 = PromoteMaskArithmetic(N0.getNode(), VT, DAG, Depth + 1))
48593 N0 = NN0;
48594 else {
48595 // The Left side has to be a trunc.
48596 if (N0.getOpcode() != ISD::TRUNCATE)
48597 return SDValue();
48598
48599 // The type of the truncated inputs.
48600 if (N0.getOperand(0).getValueType() != VT)
48601 return SDValue();
48602
48603 N0 = N0.getOperand(0);
48604 }
48605
48606 if (SDValue NN1 = PromoteMaskArithmetic(N1.getNode(), VT, DAG, Depth + 1))
48607 N1 = NN1;
48608 else {
48609 // The right side has to be a 'trunc' or a constant vector.
48610 bool RHSTrunc = N1.getOpcode() == ISD::TRUNCATE &&
48611 N1.getOperand(0).getValueType() == VT;
48612 if (!RHSTrunc && !ISD::isBuildVectorOfConstantSDNodes(N1.getNode()))
48613 return SDValue();
48614
48615 if (RHSTrunc)
48616 N1 = N1.getOperand(0);
48617 else
48618 N1 = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, N1);
48619 }
48620
48621 return DAG.getNode(N->getOpcode(), DL, VT, N0, N1);
48622}
48623
48624// On AVX/AVX2 the type v8i1 is legalized to v8i16, which is an XMM sized
48625// register. In most cases we actually compare or select YMM-sized registers
48626// and mixing the two types creates horrible code. This method optimizes
48627// some of the transition sequences.
48628// Even with AVX-512 this is still useful for removing casts around logical
48629// operations on vXi1 mask types.
48630static SDValue PromoteMaskArithmetic(SDNode *N, SelectionDAG &DAG,
48631 const X86Subtarget &Subtarget) {
48632 EVT VT = N->getValueType(0);
48633 assert(VT.isVector() && "Expected vector type");
48634
48635 SDLoc DL(N);
48636 assert((N->getOpcode() == ISD::ANY_EXTEND ||
48637         N->getOpcode() == ISD::ZERO_EXTEND ||
48638         N->getOpcode() == ISD::SIGN_EXTEND) && "Invalid Node");
48639
48640 SDValue Narrow = N->getOperand(0);
48641 EVT NarrowVT = Narrow.getValueType();
48642
48643 // Generate the wide operation.
48644 SDValue Op = PromoteMaskArithmetic(Narrow.getNode(), VT, DAG, 0);
48645 if (!Op)
48646 return SDValue();
48647 switch (N->getOpcode()) {
48648 default: llvm_unreachable("Unexpected opcode");
48649 case ISD::ANY_EXTEND:
48650 return Op;
48651 case ISD::ZERO_EXTEND:
48652 return DAG.getZeroExtendInReg(Op, DL, NarrowVT);
48653 case ISD::SIGN_EXTEND:
48654 return DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT,
48655 Op, DAG.getValueType(NarrowVT));
48656 }
48657}
48658
48659static unsigned convertIntLogicToFPLogicOpcode(unsigned Opcode) {
48660 unsigned FPOpcode;
48661 switch (Opcode) {
48662 default: llvm_unreachable("Unexpected input node for FP logic conversion");
48663 case ISD::AND: FPOpcode = X86ISD::FAND; break;
48664 case ISD::OR: FPOpcode = X86ISD::FOR; break;
48665 case ISD::XOR: FPOpcode = X86ISD::FXOR; break;
48666 }
48667 return FPOpcode;
48668}
48669
48670/// If both input operands of a logic op are being cast from floating-point
48671/// types or FP compares, try to convert this into a floating-point logic node
48672/// to avoid unnecessary moves from SSE to integer registers.
48673static SDValue convertIntLogicToFPLogic(SDNode *N, SelectionDAG &DAG,
48674 TargetLowering::DAGCombinerInfo &DCI,
48675 const X86Subtarget &Subtarget) {
48676 EVT VT = N->getValueType(0);
48677 SDValue N0 = N->getOperand(0);
48678 SDValue N1 = N->getOperand(1);
48679 SDLoc DL(N);
48680
48681 if (!((N0.getOpcode() == ISD::BITCAST && N1.getOpcode() == ISD::BITCAST) ||
48682 (N0.getOpcode() == ISD::SETCC && N1.getOpcode() == ISD::SETCC)))
48683 return SDValue();
48684
48685 SDValue N00 = N0.getOperand(0);
48686 SDValue N10 = N1.getOperand(0);
48687 EVT N00Type = N00.getValueType();
48688 EVT N10Type = N10.getValueType();
48689
48690 // Ensure that both types are the same and are legal scalar fp types.
48691 if (N00Type != N10Type || !((Subtarget.hasSSE1() && N00Type == MVT::f32) ||
48692 (Subtarget.hasSSE2() && N00Type == MVT::f64) ||
48693 (Subtarget.hasFP16() && N00Type == MVT::f16)))
48694 return SDValue();
48695
48696 if (N0.getOpcode() == ISD::BITCAST && !DCI.isBeforeLegalizeOps()) {
48697 unsigned FPOpcode = convertIntLogicToFPLogicOpcode(N->getOpcode());
48698 SDValue FPLogic = DAG.getNode(FPOpcode, DL, N00Type, N00, N10);
48699 return DAG.getBitcast(VT, FPLogic);
48700 }
48701
48702 if (VT != MVT::i1 || N0.getOpcode() != ISD::SETCC || !N0.hasOneUse() ||
48703 !N1.hasOneUse())
48704 return SDValue();
48705
48706 ISD::CondCode CC0 = cast<CondCodeSDNode>(N0.getOperand(2))->get();
48707 ISD::CondCode CC1 = cast<CondCodeSDNode>(N1.getOperand(2))->get();
48708
48709 // The vector ISA for FP predicates is incomplete before AVX, so converting
48710 // COMIS* to CMPS* may not be a win before AVX.
48711 if (!Subtarget.hasAVX() &&
48712 !(cheapX86FSETCC_SSE(CC0) && cheapX86FSETCC_SSE(CC1)))
48713 return SDValue();
48714
48715 // Convert scalar FP compares and logic to vector compares (COMIS* to CMPS*)
48716 // and vector logic:
48717 // logic (setcc N00, N01), (setcc N10, N11) -->
48718 // extelt (logic (setcc (s2v N00), (s2v N01)), setcc (s2v N10), (s2v N11))), 0
48719 unsigned NumElts = 128 / N00Type.getSizeInBits();
48720 EVT VecVT = EVT::getVectorVT(*DAG.getContext(), N00Type, NumElts);
48721 EVT BoolVecVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1, NumElts);
48722 SDValue ZeroIndex = DAG.getVectorIdxConstant(0, DL);
48723 SDValue N01 = N0.getOperand(1);
48724 SDValue N11 = N1.getOperand(1);
48725 SDValue Vec00 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, N00);
48726 SDValue Vec01 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, N01);
48727 SDValue Vec10 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, N10);
48728 SDValue Vec11 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, N11);
48729 SDValue Setcc0 = DAG.getSetCC(DL, BoolVecVT, Vec00, Vec01, CC0);
48730 SDValue Setcc1 = DAG.getSetCC(DL, BoolVecVT, Vec10, Vec11, CC1);
48731 SDValue Logic = DAG.getNode(N->getOpcode(), DL, BoolVecVT, Setcc0, Setcc1);
48732 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Logic, ZeroIndex);
48733}
48734
48735// Attempt to fold BITOP(MOVMSK(X),MOVMSK(Y)) -> MOVMSK(BITOP(X,Y))
48736// to reduce XMM->GPR traffic.
48737static SDValue combineBitOpWithMOVMSK(SDNode *N, SelectionDAG &DAG) {
48738 unsigned Opc = N->getOpcode();
48739 assert((Opc == ISD::OR || Opc == ISD::AND || Opc == ISD::XOR) &&
48740        "Unexpected bit opcode");
48741
48742 SDValue N0 = N->getOperand(0);
48743 SDValue N1 = N->getOperand(1);
48744
48745 // Both operands must be single use MOVMSK.
48746 if (N0.getOpcode() != X86ISD::MOVMSK || !N0.hasOneUse() ||
48747 N1.getOpcode() != X86ISD::MOVMSK || !N1.hasOneUse())
48748 return SDValue();
48749
48750 SDValue Vec0 = N0.getOperand(0);
48751 SDValue Vec1 = N1.getOperand(0);
48752 EVT VecVT0 = Vec0.getValueType();
48753 EVT VecVT1 = Vec1.getValueType();
48754
48755 // Both MOVMSK operands must be from vectors of the same size and same element
48756 // size, but it's OK for an fp/int difference.
48757 if (VecVT0.getSizeInBits() != VecVT1.getSizeInBits() ||
48758 VecVT0.getScalarSizeInBits() != VecVT1.getScalarSizeInBits())
48759 return SDValue();
48760
48761 SDLoc DL(N);
48762 unsigned VecOpc =
48763 VecVT0.isFloatingPoint() ? convertIntLogicToFPLogicOpcode(Opc) : Opc;
48764 SDValue Result =
48765 DAG.getNode(VecOpc, DL, VecVT0, Vec0, DAG.getBitcast(VecVT0, Vec1));
48766 return DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Result);
48767}
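// Illustrative note (editorial annotation, assuming SSE and <immintrin.h>):
// MOVMSK collects only sign bits, and sign bits distribute over bitwise ops,
// so for example
//   _mm_movemask_ps(A) & _mm_movemask_ps(B) == _mm_movemask_ps(_mm_and_ps(A, B))
// which is why a single MOVMSK after the vector bit op suffices.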
48768
48769// Attempt to fold BITOP(SHIFT(X,Z),SHIFT(Y,Z)) -> SHIFT(BITOP(X,Y),Z).
48770// NOTE: This is a very limited case of what SimplifyUsingDistributiveLaws
48771// handles in InstCombine.
48772static SDValue combineBitOpWithShift(SDNode *N, SelectionDAG &DAG) {
48773 unsigned Opc = N->getOpcode();
48774 assert((Opc == ISD::OR || Opc == ISD::AND || Opc == ISD::XOR) &&
48775        "Unexpected bit opcode");
48776
48777 SDValue N0 = N->getOperand(0);
48778 SDValue N1 = N->getOperand(1);
48779 EVT VT = N->getValueType(0);
48780
48781 // Both operands must be single use.
48782 if (!N0.hasOneUse() || !N1.hasOneUse())
48783 return SDValue();
48784
48785 // Search for matching shifts.
48786 SDValue BC0 = peekThroughOneUseBitcasts(N0);
48787 SDValue BC1 = peekThroughOneUseBitcasts(N1);
48788
48789 unsigned BCOpc = BC0.getOpcode();
48790 EVT BCVT = BC0.getValueType();
48791 if (BCOpc != BC1->getOpcode() || BCVT != BC1.getValueType())
48792 return SDValue();
48793
48794 switch (BCOpc) {
48795 case X86ISD::VSHLI:
48796 case X86ISD::VSRLI:
48797 case X86ISD::VSRAI: {
48798 if (BC0.getOperand(1) != BC1.getOperand(1))
48799 return SDValue();
48800
48801 SDLoc DL(N);
48802 SDValue BitOp =
48803 DAG.getNode(Opc, DL, BCVT, BC0.getOperand(0), BC1.getOperand(0));
48804 SDValue Shift = DAG.getNode(BCOpc, DL, BCVT, BitOp, BC0.getOperand(1));
48805 return DAG.getBitcast(VT, Shift);
48806 }
48807 }
48808
48809 return SDValue();
48810}
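// Illustrative note (editorial annotation): with equal shift amounts the bit
// op distributes over each of the shift kinds handled above, e.g.
// (X >> S) ^ (Y >> S) == (X ^ Y) >> S, so only one shift remains after the fold.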
48811
48812/// If this is a zero/all-bits result that is bitwise-anded with a low bits
48813/// mask. (Mask == 1 for the x86 lowering of a SETCC + ZEXT), replace the 'and'
48814/// with a shift-right to eliminate loading the vector constant mask value.
48815static SDValue combineAndMaskToShift(SDNode *N, SelectionDAG &DAG,
48816 const X86Subtarget &Subtarget) {
48817 SDValue Op0 = peekThroughBitcasts(N->getOperand(0));
48818 SDValue Op1 = peekThroughBitcasts(N->getOperand(1));
48819 EVT VT = Op0.getValueType();
48820 if (VT != Op1.getValueType() || !VT.isSimple() || !VT.isInteger())
48821 return SDValue();
48822
48823 // Try to convert an "is positive" signbit masking operation into arithmetic
48824 // shift and "andn". This saves a materialization of a -1 vector constant.
48825 // The "is negative" variant should be handled more generally because it only
48826 // requires "and" rather than "andn":
48827 // and (pcmpgt X, -1), Y --> pandn (vsrai X, BitWidth - 1), Y
48828 //
48829 // This is limited to the original type to avoid producing even more bitcasts.
48830 // If the bitcasts can't be eliminated, then it is unlikely that this fold
48831 // will be profitable.
48832 if (N->getValueType(0) == VT &&
48833 supportedVectorShiftWithImm(VT.getSimpleVT(), Subtarget, ISD::SRA)) {
48834 SDValue X, Y;
48835 if (Op1.hasOneUse() && Op1.getOpcode() == X86ISD::PCMPGT &&
48836 isAllOnesOrAllOnesSplat(Op1.getOperand(1))) {
48837 X = Op1.getOperand(0);
48838 Y = Op0;
48839 } else if (Op0.hasOneUse() && Op0.getOpcode() == X86ISD::PCMPGT &&
48840 isAllOnesOrAllOnesSplat(Op0.getOperand(1))) {
48841 X = Op0.getOperand(0);
48842 Y = Op1;
48843 }
48844 if (X && Y) {
48845 SDLoc DL(N);
48846 SDValue Sra =
48847 getTargetVShiftByConstNode(X86ISD::VSRAI, DL, VT.getSimpleVT(), X,
48848 VT.getScalarSizeInBits() - 1, DAG);
48849 return DAG.getNode(X86ISD::ANDNP, DL, VT, Sra, Y);
48850 }
48851 }
48852
48853 APInt SplatVal;
48854 if (!ISD::isConstantSplatVector(Op1.getNode(), SplatVal) ||
48855 !SplatVal.isMask())
48856 return SDValue();
48857
48858 // Don't prevent creation of ANDN.
48859 if (isBitwiseNot(Op0))
48860 return SDValue();
48861
48862 if (!supportedVectorShiftWithImm(VT.getSimpleVT(), Subtarget, ISD::SRL))
48863 return SDValue();
48864
48865 unsigned EltBitWidth = VT.getScalarSizeInBits();
48866 if (EltBitWidth != DAG.ComputeNumSignBits(Op0))
48867 return SDValue();
48868
48869 SDLoc DL(N);
48870 unsigned ShiftVal = SplatVal.countTrailingOnes();
48871 SDValue ShAmt = DAG.getTargetConstant(EltBitWidth - ShiftVal, DL, MVT::i8);
48872 SDValue Shift = DAG.getNode(X86ISD::VSRLI, DL, VT, Op0, ShAmt);
48873 return DAG.getBitcast(N->getValueType(0), Shift);
48874}
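// Illustrative note (editorial annotation): both rewrites rely on every
// element being all-ones or all-zero. For such a 32-bit element X and a mask
// of the K low bits (0 < K < 32), X & ((1u << K) - 1u) equals
// (uint32_t)X >> (32 - K), so the constant mask load becomes a VSRLI; and
// (pcmpgt X, -1) is the bitwise NOT of (vsrai X, 31), which is what makes the
// ANDNP form above equivalent.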
48875
48876// Get the index node from the lowered DAG of a GEP IR instruction with one
48877// indexing dimension.
48878static SDValue getIndexFromUnindexedLoad(LoadSDNode *Ld) {
48879 if (Ld->isIndexed())
48880 return SDValue();
48881
48882 SDValue Base = Ld->getBasePtr();
48883
48884 if (Base.getOpcode() != ISD::ADD)
48885 return SDValue();
48886
48887 SDValue ShiftedIndex = Base.getOperand(0);
48888
48889 if (ShiftedIndex.getOpcode() != ISD::SHL)
48890 return SDValue();
48891
48892 return ShiftedIndex.getOperand(0);
48893
48894}
48895
48896static bool hasBZHI(const X86Subtarget &Subtarget, MVT VT) {
48897 if (Subtarget.hasBMI2() && VT.isScalarInteger()) {
48898 switch (VT.getSizeInBits()) {
48899 default: return false;
48900 case 64: return Subtarget.is64Bit() ? true : false;
48901 case 32: return true;
48902 }
48903 }
48904 return false;
48905}
48906
48907// This function recognizes cases where the X86 BZHI instruction can replace an
48908// 'and-load' sequence.
48909// In case of loading integer value from an array of constants which is defined
48910// as follows:
48911//
48912// int array[SIZE] = {0x0, 0x1, 0x3, 0x7, 0xF ..., 2^(SIZE-1) - 1}
48913//
48914// then applying a bitwise and on the result with another input.
48915// It's equivalent to performing bzhi (zero high bits) on the input, with the
48916// same index of the load.
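// Illustrative note (editorial annotation, assuming BMI2 and <immintrin.h>):
// with the array defined as above, "array[idx] & x" is the same as zeroing
// the bits of x at positions >= idx, e.g. for idx < 32:
//   unsigned Ref = x & ((1u << idx) - 1u); // the and-load form
//   unsigned Opt = _bzhi_u32(x, idx);      // a single BZHI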
48917static SDValue combineAndLoadToBZHI(SDNode *Node, SelectionDAG &DAG,
48918 const X86Subtarget &Subtarget) {
48919 MVT VT = Node->getSimpleValueType(0);
48920 SDLoc dl(Node);
48921
48922 // Check if subtarget has BZHI instruction for the node's type
48923 if (!hasBZHI(Subtarget, VT))
48924 return SDValue();
48925
48926 // Try matching the pattern for both operands.
48927 for (unsigned i = 0; i < 2; i++) {
48928 SDValue N = Node->getOperand(i);
48929 LoadSDNode *Ld = dyn_cast<LoadSDNode>(N.getNode());
48930
48931 // continue if the operand is not a load instruction
48932 if (!Ld)
48933 return SDValue();
48934
48935 const Value *MemOp = Ld->getMemOperand()->getValue();
48936
48937 if (!MemOp)
48938 return SDValue();
48939
48940 if (const GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(MemOp)) {
48941 if (GlobalVariable *GV = dyn_cast<GlobalVariable>(GEP->getOperand(0))) {
48942 if (GV->isConstant() && GV->hasDefinitiveInitializer()) {
48943
48944 Constant *Init = GV->getInitializer();
48945 Type *Ty = Init->getType();
48946 if (!isa<ConstantDataArray>(Init) ||
48947 !Ty->getArrayElementType()->isIntegerTy() ||
48948 Ty->getArrayElementType()->getScalarSizeInBits() !=
48949 VT.getSizeInBits() ||
48950 Ty->getArrayNumElements() >
48951 Ty->getArrayElementType()->getScalarSizeInBits())
48952 continue;
48953
48954 // Check if the array's constant elements are suitable to our case.
48955 uint64_t ArrayElementCount = Init->getType()->getArrayNumElements();
48956 bool ConstantsMatch = true;
48957 for (uint64_t j = 0; j < ArrayElementCount; j++) {
48958 auto *Elem = cast<ConstantInt>(Init->getAggregateElement(j));
48959 if (Elem->getZExtValue() != (((uint64_t)1 << j) - 1)) {
48960 ConstantsMatch = false;
48961 break;
48962 }
48963 }
48964 if (!ConstantsMatch)
48965 continue;
48966
48967 // Do the transformation (For 32-bit type):
48968 // -> (and (load arr[idx]), inp)
48969 // <- (and (srl 0xFFFFFFFF, (sub 32, idx)))
48970 // that will be replaced with one bzhi instruction.
48971 SDValue Inp = (i == 0) ? Node->getOperand(1) : Node->getOperand(0);
48972 SDValue SizeC = DAG.getConstant(VT.getSizeInBits(), dl, MVT::i32);
48973
48974 // Get the Node which indexes into the array.
48975 SDValue Index = getIndexFromUnindexedLoad(Ld);
48976 if (!Index)
48977 return SDValue();
48978 Index = DAG.getZExtOrTrunc(Index, dl, MVT::i32);
48979
48980 SDValue Sub = DAG.getNode(ISD::SUB, dl, MVT::i32, SizeC, Index);
48981 Sub = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Sub);
48982
48983 SDValue AllOnes = DAG.getAllOnesConstant(dl, VT);
48984 SDValue LShr = DAG.getNode(ISD::SRL, dl, VT, AllOnes, Sub);
48985
48986 return DAG.getNode(ISD::AND, dl, VT, Inp, LShr);
48987 }
48988 }
48989 }
48990 }
48991 return SDValue();
48992}
48993
48994// Look for (and (bitcast (vXi1 (concat_vectors (vYi1 setcc), undef,))), C)
48995// Where C is a mask containing the same number of bits as the setcc and
48996// where the setcc will freely 0 upper bits of k-register. We can replace the
48997// undef in the concat with 0s and remove the AND. This mainly helps with
48998// v2i1/v4i1 setcc being casted to scalar.
48999static SDValue combineScalarAndWithMaskSetcc(SDNode *N, SelectionDAG &DAG,
49000 const X86Subtarget &Subtarget) {
49001 assert(N->getOpcode() == ISD::AND && "Unexpected opcode!");
49002
49003 EVT VT = N->getValueType(0);
49004
49005 // Make sure this is an AND with constant. We will check the value of the
49006 // constant later.
49007 auto *C1 = dyn_cast<ConstantSDNode>(N->getOperand(1));
49008 if (!C1)
49009 return SDValue();
49010
49011 // This is implied by the ConstantSDNode.
49012 assert(!VT.isVector() && "Expected scalar VT!");
49013
49014 SDValue Src = N->getOperand(0);
49015 if (!Src.hasOneUse())
49016 return SDValue();
49017
49018 // (Optionally) peek through any_extend().
49019 if (Src.getOpcode() == ISD::ANY_EXTEND) {
49020 if (!Src.getOperand(0).hasOneUse())
49021 return SDValue();
49022 Src = Src.getOperand(0);
49023 }
49024
49025 if (Src.getOpcode() != ISD::BITCAST || !Src.getOperand(0).hasOneUse())
49026 return SDValue();
49027
49028 Src = Src.getOperand(0);
49029 EVT SrcVT = Src.getValueType();
49030
49031 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
49032 if (!SrcVT.isVector() || SrcVT.getVectorElementType() != MVT::i1 ||
49033 !TLI.isTypeLegal(SrcVT))
49034 return SDValue();
49035
49036 if (Src.getOpcode() != ISD::CONCAT_VECTORS)
49037 return SDValue();
49038
49039 // We only care about the first subvector of the concat, we expect the
49040 // other subvectors to be ignored due to the AND if we make the change.
49041 SDValue SubVec = Src.getOperand(0);
49042 EVT SubVecVT = SubVec.getValueType();
49043
49044 // The RHS of the AND should be a mask with as many bits as SubVec.
49045 if (!TLI.isTypeLegal(SubVecVT) ||
49046 !C1->getAPIntValue().isMask(SubVecVT.getVectorNumElements()))
49047 return SDValue();
49048
49049 // First subvector should be a setcc with a legal result type or a
49050 // AND containing at least one setcc with a legal result type.
49051 auto IsLegalSetCC = [&](SDValue V) {
49052 if (V.getOpcode() != ISD::SETCC)
49053 return false;
49054 EVT SetccVT = V.getOperand(0).getValueType();
49055 if (!TLI.isTypeLegal(SetccVT) ||
49056 !(Subtarget.hasVLX() || SetccVT.is512BitVector()))
49057 return false;
49058 if (!(Subtarget.hasBWI() || SetccVT.getScalarSizeInBits() >= 32))
49059 return false;
49060 return true;
49061 };
49062 if (!(IsLegalSetCC(SubVec) || (SubVec.getOpcode() == ISD::AND &&
49063 (IsLegalSetCC(SubVec.getOperand(0)) ||
49064 IsLegalSetCC(SubVec.getOperand(1))))))
49065 return SDValue();
49066
49067 // We passed all the checks. Rebuild the concat_vectors with zeroes
49068 // and cast it back to VT.
49069 SDLoc dl(N);
49070 SmallVector<SDValue, 4> Ops(Src.getNumOperands(),
49071 DAG.getConstant(0, dl, SubVecVT));
49072 Ops[0] = SubVec;
49073 SDValue Concat = DAG.getNode(ISD::CONCAT_VECTORS, dl, SrcVT,
49074 Ops);
49075 EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), SrcVT.getSizeInBits());
49076 return DAG.getZExtOrTrunc(DAG.getBitcast(IntVT, Concat), dl, VT);
49077}
49078
49079static SDValue combineAnd(SDNode *N, SelectionDAG &DAG,
49080 TargetLowering::DAGCombinerInfo &DCI,
49081 const X86Subtarget &Subtarget) {
49082 SDValue N0 = N->getOperand(0);
49083 SDValue N1 = N->getOperand(1);
49084 EVT VT = N->getValueType(0);
49085 SDLoc dl(N);
49086 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
49087
49088 // If this is SSE1 only convert to FAND to avoid scalarization.
49089 if (Subtarget.hasSSE1() && !Subtarget.hasSSE2() && VT == MVT::v4i32) {
49090 return DAG.getBitcast(MVT::v4i32,
49091 DAG.getNode(X86ISD::FAND, dl, MVT::v4f32,
49092 DAG.getBitcast(MVT::v4f32, N0),
49093 DAG.getBitcast(MVT::v4f32, N1)));
49094 }
49095
49096 // Use a 32-bit and+zext if upper bits known zero.
49097 if (VT == MVT::i64 && Subtarget.is64Bit() && !isa<ConstantSDNode>(N1)) {
49098 APInt HiMask = APInt::getHighBitsSet(64, 32);
49099 if (DAG.MaskedValueIsZero(N1, HiMask) ||
49100 DAG.MaskedValueIsZero(N0, HiMask)) {
49101 SDValue LHS = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, N0);
49102 SDValue RHS = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, N1);
49103 return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64,
49104 DAG.getNode(ISD::AND, dl, MVT::i32, LHS, RHS));
49105 }
49106 }
49107
49108 // Match all-of bool scalar reductions into a bitcast/movmsk + cmp.
49109 // TODO: Support multiple SrcOps.
49110 if (VT == MVT::i1) {
49111 SmallVector<SDValue, 2> SrcOps;
49112 SmallVector<APInt, 2> SrcPartials;
49113 if (matchScalarReduction(SDValue(N, 0), ISD::AND, SrcOps, &SrcPartials) &&
49114 SrcOps.size() == 1) {
49115 unsigned NumElts = SrcOps[0].getValueType().getVectorNumElements();
49116 EVT MaskVT = EVT::getIntegerVT(*DAG.getContext(), NumElts);
49117 SDValue Mask = combineBitcastvxi1(DAG, MaskVT, SrcOps[0], dl, Subtarget);
49118 if (!Mask && TLI.isTypeLegal(SrcOps[0].getValueType()))
49119 Mask = DAG.getBitcast(MaskVT, SrcOps[0]);
49120 if (Mask) {
49121 assert(SrcPartials[0].getBitWidth() == NumElts &&
49122        "Unexpected partial reduction mask");
49123 SDValue PartialBits = DAG.getConstant(SrcPartials[0], dl, MaskVT);
49124 Mask = DAG.getNode(ISD::AND, dl, MaskVT, Mask, PartialBits);
49125 return DAG.getSetCC(dl, MVT::i1, Mask, PartialBits, ISD::SETEQ);
49126 }
49127 }
49128 }
49129
49130 if (SDValue V = combineScalarAndWithMaskSetcc(N, DAG, Subtarget))
49131 return V;
49132
49133 if (SDValue R = combineBitOpWithMOVMSK(N, DAG))
49134 return R;
49135
49136 if (SDValue R = combineBitOpWithShift(N, DAG))
49137 return R;
49138
49139 if (SDValue FPLogic = convertIntLogicToFPLogic(N, DAG, DCI, Subtarget))
49140 return FPLogic;
49141
49142 if (SDValue R = combineAndShuffleNot(N, DAG, Subtarget))
49143 return R;
49144
49145 if (DCI.isBeforeLegalizeOps())
49146 return SDValue();
49147
49148 if (SDValue R = combineCompareEqual(N, DAG, DCI, Subtarget))
49149 return R;
49150
49151 if (SDValue R = combineAndNotIntoANDNP(N, DAG))
49152 return R;
49153
49154 if (SDValue ShiftRight = combineAndMaskToShift(N, DAG, Subtarget))
49155 return ShiftRight;
49156
49157 if (SDValue R = combineAndLoadToBZHI(N, DAG, Subtarget))
49158 return R;
49159
49160 // fold (and (mul x, c1), c2) -> (mul x, (and c1, c2))
49161 // iff c2 is all/no bits mask - i.e. a select-with-zero mask.
49162 // TODO: Handle PMULDQ/PMULUDQ/VPMADDWD/VPMADDUBSW?
49163 if (VT.isVector() && getTargetConstantFromNode(N1)) {
49164 unsigned Opc0 = N0.getOpcode();
49165 if ((Opc0 == ISD::MUL || Opc0 == ISD::MULHU || Opc0 == ISD::MULHS) &&
49166 getTargetConstantFromNode(N0.getOperand(1)) &&
49167 DAG.ComputeNumSignBits(N1) == VT.getScalarSizeInBits() &&
49168 N0->hasOneUse() && N0.getOperand(1)->hasOneUse()) {
49169 SDValue MaskMul = DAG.getNode(ISD::AND, dl, VT, N0.getOperand(1), N1);
49170 return DAG.getNode(Opc0, dl, VT, N0.getOperand(0), MaskMul);
49171 }
49172 }
49173
49174 // Fold AND(SRL(X,Y),1) -> SETCC(BT(X,Y), COND_B) iff Y is not a constant
49175 // avoids slow variable shift (moving shift amount to ECX etc.)
49176 if (isOneConstant(N1) && N0->hasOneUse()) {
49177 SDValue Src = N0;
49178 while ((Src.getOpcode() == ISD::ZERO_EXTEND ||
49179 Src.getOpcode() == ISD::TRUNCATE) &&
49180 Src.getOperand(0)->hasOneUse())
49181 Src = Src.getOperand(0);
49182 bool ContainsNOT = false;
49183 X86::CondCode X86CC = X86::COND_B;
49184 // Peek through AND(NOT(SRL(X,Y)),1).
49185 if (isBitwiseNot(Src)) {
49186 Src = Src.getOperand(0);
49187 X86CC = X86::COND_AE;
49188 ContainsNOT = true;
49189 }
49190 if (Src.getOpcode() == ISD::SRL &&
49191 !isa<ConstantSDNode>(Src.getOperand(1))) {
49192 SDValue BitNo = Src.getOperand(1);
49193 Src = Src.getOperand(0);
49194 // Peek through AND(SRL(NOT(X),Y),1).
49195 if (isBitwiseNot(Src)) {
49196 Src = Src.getOperand(0);
49197 X86CC = X86CC == X86::COND_AE ? X86::COND_B : X86::COND_AE;
49198 ContainsNOT = true;
49199 }
49200 // If we have BMI2 then SHRX should be faster for i32/i64 cases.
49201 if (!(Subtarget.hasBMI2() && !ContainsNOT && VT.getSizeInBits() >= 32))
49202 if (SDValue BT = getBT(Src, BitNo, dl, DAG))
49203 return DAG.getZExtOrTrunc(getSETCC(X86CC, BT, dl, DAG), dl, VT);
49204 }
49205 }
49206
49207 if (VT.isVector() && (VT.getScalarSizeInBits() % 8) == 0) {
49208 // Attempt to recursively combine a bitmask AND with shuffles.
49209 SDValue Op(N, 0);
49210 if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
49211 return Res;
49212
49213 // If either operand is a constant mask, then only the elements that aren't
49214 // zero are actually demanded by the other operand.
49215 auto GetDemandedMasks = [&](SDValue Op) {
49216 APInt UndefElts;
49217 SmallVector<APInt> EltBits;
49218 int NumElts = VT.getVectorNumElements();
49219 int EltSizeInBits = VT.getScalarSizeInBits();
49220 APInt DemandedBits = APInt::getAllOnes(EltSizeInBits);
49221 APInt DemandedElts = APInt::getAllOnes(NumElts);
49222 if (getTargetConstantBitsFromNode(Op, EltSizeInBits, UndefElts,
49223 EltBits)) {
49224 DemandedBits.clearAllBits();
49225 DemandedElts.clearAllBits();
49226 for (int I = 0; I != NumElts; ++I) {
49227 if (UndefElts[I]) {
49228 // We can't assume an undef src element gives an undef dst - the
49229 // other src might be zero.
49230 DemandedBits.setAllBits();
49231 DemandedElts.setBit(I);
49232 } else if (!EltBits[I].isZero()) {
49233 DemandedBits |= EltBits[I];
49234 DemandedElts.setBit(I);
49235 }
49236 }
49237 }
49238 return std::make_pair(DemandedBits, DemandedElts);
49239 };
49240 APInt Bits0, Elts0;
49241 APInt Bits1, Elts1;
49242 std::tie(Bits0, Elts0) = GetDemandedMasks(N1);
49243 std::tie(Bits1, Elts1) = GetDemandedMasks(N0);
49244
49245 if (TLI.SimplifyDemandedVectorElts(N0, Elts0, DCI) ||
49246 TLI.SimplifyDemandedVectorElts(N1, Elts1, DCI) ||
49247 TLI.SimplifyDemandedBits(N0, Bits0, Elts0, DCI) ||
49248 TLI.SimplifyDemandedBits(N1, Bits1, Elts1, DCI)) {
49249 if (N->getOpcode() != ISD::DELETED_NODE)
49250 DCI.AddToWorklist(N);
49251 return SDValue(N, 0);
49252 }
49253
49254 SDValue NewN0 = TLI.SimplifyMultipleUseDemandedBits(N0, Bits0, Elts0, DAG);
49255 SDValue NewN1 = TLI.SimplifyMultipleUseDemandedBits(N1, Bits1, Elts1, DAG);
49256 if (NewN0 || NewN1)
49257 return DAG.getNode(ISD::AND, dl, VT, NewN0 ? NewN0 : N0,
49258 NewN1 ? NewN1 : N1);
49259 }
49260
49261 // Attempt to combine a scalar bitmask AND with an extracted shuffle.
49262 if ((VT.getScalarSizeInBits() % 8) == 0 &&
49263 N0.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
49264 isa<ConstantSDNode>(N0.getOperand(1))) {
49265 SDValue BitMask = N1;
49266 SDValue SrcVec = N0.getOperand(0);
49267 EVT SrcVecVT = SrcVec.getValueType();
49268
49269 // Check that the constant bitmask masks whole bytes.
49270 APInt UndefElts;
49271 SmallVector<APInt, 64> EltBits;
49272 if (VT == SrcVecVT.getScalarType() && N0->isOnlyUserOf(SrcVec.getNode()) &&
49273 getTargetConstantBitsFromNode(BitMask, 8, UndefElts, EltBits) &&
49274 llvm::all_of(EltBits, [](const APInt &M) {
49275 return M.isZero() || M.isAllOnes();
49276 })) {
49277 unsigned NumElts = SrcVecVT.getVectorNumElements();
49278 unsigned Scale = SrcVecVT.getScalarSizeInBits() / 8;
49279 unsigned Idx = N0.getConstantOperandVal(1);
49280
49281 // Create a root shuffle mask from the byte mask and the extracted index.
49282 SmallVector<int, 16> ShuffleMask(NumElts * Scale, SM_SentinelUndef);
49283 for (unsigned i = 0; i != Scale; ++i) {
49284 if (UndefElts[i])
49285 continue;
49286 int VecIdx = Scale * Idx + i;
49287 ShuffleMask[VecIdx] = EltBits[i].isZero() ? SM_SentinelZero : VecIdx;
49288 }
49289
49290 if (SDValue Shuffle = combineX86ShufflesRecursively(
49291 {SrcVec}, 0, SrcVec, ShuffleMask, {}, /*Depth*/ 1,
49292 X86::MaxShuffleCombineDepth,
49293 /*HasVarMask*/ false, /*AllowVarCrossLaneMask*/ true,
49294 /*AllowVarPerLaneMask*/ true, DAG, Subtarget))
49295 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Shuffle,
49296 N0.getOperand(1));
49297 }
49298 }
49299
49300 return SDValue();
49301}
49302
49303// Canonicalize OR(AND(X,C),AND(Y,~C)) -> OR(AND(X,C),ANDNP(C,Y))
49304static SDValue canonicalizeBitSelect(SDNode *N, SelectionDAG &DAG,
49305 const X86Subtarget &Subtarget) {
49306 assert(N->getOpcode() == ISD::OR && "Unexpected Opcode");
49307
49308 MVT VT = N->getSimpleValueType(0);
49309 unsigned EltSizeInBits = VT.getScalarSizeInBits();
49310 if (!VT.isVector() || (EltSizeInBits % 8) != 0)
49311 return SDValue();
49312
49313 SDValue N0 = peekThroughBitcasts(N->getOperand(0));
49314 SDValue N1 = peekThroughBitcasts(N->getOperand(1));
49315 if (N0.getOpcode() != ISD::AND || N1.getOpcode() != ISD::AND)
49316 return SDValue();
49317
49318 // On XOP we'll lower to PCMOV so accept one use. With AVX512, we can use
49319 // VPTERNLOG. Otherwise only do this if either mask has multiple uses already.
49320 if (!(Subtarget.hasXOP() || useVPTERNLOG(Subtarget, VT) ||
49321 !N0.getOperand(1).hasOneUse() || !N1.getOperand(1).hasOneUse()))
49322 return SDValue();
49323
49324 // Attempt to extract constant byte masks.
49325 APInt UndefElts0, UndefElts1;
49326 SmallVector<APInt, 32> EltBits0, EltBits1;
49327 if (!getTargetConstantBitsFromNode(N0.getOperand(1), 8, UndefElts0, EltBits0,
49328 false, false))
49329 return SDValue();
49330 if (!getTargetConstantBitsFromNode(N1.getOperand(1), 8, UndefElts1, EltBits1,
49331 false, false))
49332 return SDValue();
49333
49334 for (unsigned i = 0, e = EltBits0.size(); i != e; ++i) {
49335 // TODO - add UNDEF elts support.
49336 if (UndefElts0[i] || UndefElts1[i])
49337 return SDValue();
49338 if (EltBits0[i] != ~EltBits1[i])
49339 return SDValue();
49340 }
49341
49342 SDLoc DL(N);
49343
49344 if (useVPTERNLOG(Subtarget, VT)) {
49345 // Emit a VPTERNLOG node directly - 0xCA is the imm code for A?B:C.
49346 // VPTERNLOG is only available as vXi32/64-bit types.
49347 MVT OpSVT = EltSizeInBits == 32 ? MVT::i32 : MVT::i64;
49348 MVT OpVT =
49349 MVT::getVectorVT(OpSVT, VT.getSizeInBits() / OpSVT.getSizeInBits());
49350 SDValue A = DAG.getBitcast(OpVT, N0.getOperand(1));
49351 SDValue B = DAG.getBitcast(OpVT, N0.getOperand(0));
49352 SDValue C = DAG.getBitcast(OpVT, N1.getOperand(0));
49353 SDValue Imm = DAG.getTargetConstant(0xCA, DL, MVT::i8);
49354 SDValue Res = getAVX512Node(X86ISD::VPTERNLOG, DL, OpVT, {A, B, C, Imm},
49355 DAG, Subtarget);
49356 return DAG.getBitcast(VT, Res);
49357 }
49358
49359 SDValue X = N->getOperand(0);
49360 SDValue Y =
49361 DAG.getNode(X86ISD::ANDNP, DL, VT, DAG.getBitcast(VT, N0.getOperand(1)),
49362 DAG.getBitcast(VT, N1.getOperand(0)));
49363 return DAG.getNode(ISD::OR, DL, VT, X, Y);
49364}
49365
49366// Try to match OR(AND(~MASK,X),AND(MASK,Y)) logic pattern.
49367static bool matchLogicBlend(SDNode *N, SDValue &X, SDValue &Y, SDValue &Mask) {
49368 if (N->getOpcode() != ISD::OR)
49369 return false;
49370
49371 SDValue N0 = N->getOperand(0);
49372 SDValue N1 = N->getOperand(1);
49373
49374 // Canonicalize AND to LHS.
49375 if (N1.getOpcode() == ISD::AND)
49376 std::swap(N0, N1);
49377
49378 // Attempt to match OR(AND(M,Y),ANDNP(M,X)).
49379 if (N0.getOpcode() != ISD::AND || N1.getOpcode() != X86ISD::ANDNP)
49380 return false;
49381
49382 Mask = N1.getOperand(0);
49383 X = N1.getOperand(1);
49384
49385 // Check to see if the mask appeared in both the AND and ANDNP.
49386 if (N0.getOperand(0) == Mask)
49387 Y = N0.getOperand(1);
49388 else if (N0.getOperand(1) == Mask)
49389 Y = N0.getOperand(0);
49390 else
49391 return false;
49392
49393 // TODO: Attempt to match against AND(XOR(-1,M),Y) as well, waiting for
49394 // ANDNP combine allows other combines to happen that prevent matching.
49395 return true;
49396}
49397
49398// Try to fold:
49399// (or (and (m, y), (pandn m, x)))
49400// into:
49401// (vselect m, x, y)
49402// As a special case, try to fold:
49403// (or (and (m, (sub 0, x)), (pandn m, x)))
49404// into:
49405// (sub (xor X, M), M)
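// Illustrative note (editorial annotation): the special case relies on M
// being 0 or -1 per element, so (xor X, M) - M computes M ? -X : X, since
// (X ^ -1) - (-1) == ~X + 1 == -X and (X ^ 0) - 0 == X.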
49406static SDValue combineLogicBlendIntoPBLENDV(SDNode *N, SelectionDAG &DAG,
49407 const X86Subtarget &Subtarget) {
49408 assert(N->getOpcode() == ISD::OR && "Unexpected Opcode");
49409
49410 EVT VT = N->getValueType(0);
49411 if (!((VT.is128BitVector() && Subtarget.hasSSE2()) ||
49412 (VT.is256BitVector() && Subtarget.hasInt256())))
49413 return SDValue();
49414
49415 SDValue X, Y, Mask;
49416 if (!matchLogicBlend(N, X, Y, Mask))
49417 return SDValue();
49418
49419 // Validate that X, Y, and Mask are bitcasts, and see through them.
49420 Mask = peekThroughBitcasts(Mask);
49421 X = peekThroughBitcasts(X);
49422 Y = peekThroughBitcasts(Y);
49423
49424 EVT MaskVT = Mask.getValueType();
49425 unsigned EltBits = MaskVT.getScalarSizeInBits();
49426
49427 // TODO: Attempt to handle floating point cases as well?
49428 if (!MaskVT.isInteger() || DAG.ComputeNumSignBits(Mask) != EltBits)
49429 return SDValue();
49430
49431 SDLoc DL(N);
49432
49433 // Attempt to combine to conditional negate: (sub (xor X, M), M)
49434 if (SDValue Res = combineLogicBlendIntoConditionalNegate(VT, Mask, X, Y, DL,
49435 DAG, Subtarget))
49436 return Res;
49437
49438 // PBLENDVB is only available on SSE 4.1.
49439 if (!Subtarget.hasSSE41())
49440 return SDValue();
49441
49442 // If we have VPTERNLOG we should prefer that since PBLENDVB is multiple uops.
49443 if (Subtarget.hasVLX())
49444 return SDValue();
49445
49446 MVT BlendVT = VT.is256BitVector() ? MVT::v32i8 : MVT::v16i8;
49447
49448 X = DAG.getBitcast(BlendVT, X);
49449 Y = DAG.getBitcast(BlendVT, Y);
49450 Mask = DAG.getBitcast(BlendVT, Mask);
49451 Mask = DAG.getSelect(DL, BlendVT, Mask, Y, X);
49452 return DAG.getBitcast(VT, Mask);
49453}
49454
49455// Helper function for combineOrCmpEqZeroToCtlzSrl
49456// Transforms:
49457// seteq(cmp x, 0)
49458// into:
49459// srl(ctlz x), log2(bitsize(x))
49460// Input pattern is checked by caller.
49461static SDValue lowerX86CmpEqZeroToCtlzSrl(SDValue Op, SelectionDAG &DAG) {
49462 SDValue Cmp = Op.getOperand(1);
49463 EVT VT = Cmp.getOperand(0).getValueType();
49464 unsigned Log2b = Log2_32(VT.getSizeInBits());
49465 SDLoc dl(Op);
49466 SDValue Clz = DAG.getNode(ISD::CTLZ, dl, VT, Cmp->getOperand(0));
49467 // The result of the shift is true or false, and on X86, the 32-bit
49468 // encoding of shr and lzcnt is more desirable.
49469 SDValue Trunc = DAG.getZExtOrTrunc(Clz, dl, MVT::i32);
49470 SDValue Scc = DAG.getNode(ISD::SRL, dl, MVT::i32, Trunc,
49471 DAG.getConstant(Log2b, dl, MVT::i8));
49472 return Scc;
49473}
49474
49475// Try to transform:
49476// zext(or(setcc(eq, (cmp x, 0)), setcc(eq, (cmp y, 0))))
49477// into:
49478// srl(or(ctlz(x), ctlz(y)), log2(bitsize(x))
49479// Will also attempt to match more generic cases, eg:
49480// zext(or(or(setcc(eq, cmp 0), setcc(eq, cmp 0)), setcc(eq, cmp 0)))
49481// Only applies if the target supports the FastLZCNT feature.
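// Illustrative note (editorial annotation, assuming LZCNT and <immintrin.h>):
// for a 32-bit value x, (x == 0) is equivalent to _lzcnt_u32(x) >> 5, since
// _lzcnt_u32(0) == 32 (bit 5 set) while any non-zero x has lzcnt <= 31.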
49482static SDValue combineOrCmpEqZeroToCtlzSrl(SDNode *N, SelectionDAG &DAG,
49483 TargetLowering::DAGCombinerInfo &DCI,
49484 const X86Subtarget &Subtarget) {
49485 if (DCI.isBeforeLegalize() || !Subtarget.getTargetLowering()->isCtlzFast())
49486 return SDValue();
49487
49488 auto isORCandidate = [](SDValue N) {
49489 return (N->getOpcode() == ISD::OR && N->hasOneUse());
49490 };
49491
49492 // Check the zero extend is extending to 32-bit or more. The code generated by
49493 // srl(ctlz) for 16-bit or less variants of the pattern would require extra
49494 // instructions to clear the upper bits.
49495 if (!N->hasOneUse() || !N->getSimpleValueType(0).bitsGE(MVT::i32) ||
49496 !isORCandidate(N->getOperand(0)))
49497 return SDValue();
49498
49499 // Check the node matches: setcc(eq, cmp 0)
49500 auto isSetCCCandidate = [](SDValue N) {
49501 return N->getOpcode() == X86ISD::SETCC && N->hasOneUse() &&
49502 X86::CondCode(N->getConstantOperandVal(0)) == X86::COND_E &&
49503 N->getOperand(1).getOpcode() == X86ISD::CMP &&
49504 isNullConstant(N->getOperand(1).getOperand(1)) &&
49505 N->getOperand(1).getValueType().bitsGE(MVT::i32);
49506 };
49507
49508 SDNode *OR = N->getOperand(0).getNode();
49509 SDValue LHS = OR->getOperand(0);
49510 SDValue RHS = OR->getOperand(1);
49511
49512 // Save nodes matching or(or, setcc(eq, cmp 0)).
49513 SmallVector<SDNode *, 2> ORNodes;
49514 while (((isORCandidate(LHS) && isSetCCCandidate(RHS)) ||
49515 (isORCandidate(RHS) && isSetCCCandidate(LHS)))) {
49516 ORNodes.push_back(OR);
49517 OR = (LHS->getOpcode() == ISD::OR) ? LHS.getNode() : RHS.getNode();
49518 LHS = OR->getOperand(0);
49519 RHS = OR->getOperand(1);
49520 }
49521
49522 // The last OR node should match or(setcc(eq, cmp 0), setcc(eq, cmp 0)).
49523 if (!(isSetCCCandidate(LHS) && isSetCCCandidate(RHS)) ||
49524 !isORCandidate(SDValue(OR, 0)))
49525 return SDValue();
49526
49527 // We have a or(setcc(eq, cmp 0), setcc(eq, cmp 0)) pattern, try to lower it
49528 // to
49529 // or(srl(ctlz),srl(ctlz)).
49530 // The dag combiner can then fold it into:
49531 // srl(or(ctlz, ctlz)).
49532 SDValue NewLHS = lowerX86CmpEqZeroToCtlzSrl(LHS, DAG);
49533 SDValue Ret, NewRHS;
49534 if (NewLHS && (NewRHS = lowerX86CmpEqZeroToCtlzSrl(RHS, DAG)))
49535 Ret = DAG.getNode(ISD::OR, SDLoc(OR), MVT::i32, NewLHS, NewRHS);
49536
49537 if (!Ret)
49538 return SDValue();
49539
49540 // Try to lower nodes matching the or(or, setcc(eq, cmp 0)) pattern.
49541 while (ORNodes.size() > 0) {
49542 OR = ORNodes.pop_back_val();
49543 LHS = OR->getOperand(0);
49544 RHS = OR->getOperand(1);
49545 // Swap rhs with lhs to match or(setcc(eq, cmp, 0), or).
49546 if (RHS->getOpcode() == ISD::OR)
49547 std::swap(LHS, RHS);
49548 NewRHS = lowerX86CmpEqZeroToCtlzSrl(RHS, DAG);
49549 if (!NewRHS)
49550 return SDValue();
49551 Ret = DAG.getNode(ISD::OR, SDLoc(OR), MVT::i32, Ret, NewRHS);
49552 }
49553
49554 return DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), N->getValueType(0), Ret);
49555}
49556
49557static SDValue foldMaskedMergeImpl(SDValue And0_L, SDValue And0_R,
49558 SDValue And1_L, SDValue And1_R,
49559 const SDLoc &DL, SelectionDAG &DAG) {
49560 if (!isBitwiseNot(And0_L, true) || !And0_L->hasOneUse())
49561 return SDValue();
49562 SDValue NotOp = And0_L->getOperand(0);
49563 if (NotOp == And1_R)
49564 std::swap(And1_R, And1_L);
49565 if (NotOp != And1_L)
49566 return SDValue();
49567
49568 // (~(NotOp) & And0_R) | (NotOp & And1_R)
49569 // --> ((And0_R ^ And1_R) & NotOp) ^ And1_R
49570 EVT VT = And1_L->getValueType(0);
49571 SDValue Freeze_And0_R = DAG.getNode(ISD::FREEZE, SDLoc(), VT, And0_R);
49572 SDValue Xor0 = DAG.getNode(ISD::XOR, DL, VT, And1_R, Freeze_And0_R);
49573 SDValue And = DAG.getNode(ISD::AND, DL, VT, Xor0, NotOp);
49574 SDValue Xor1 = DAG.getNode(ISD::XOR, DL, VT, And, Freeze_And0_R);
49575 return Xor1;
49576}
49577
49578/// Fold "masked merge" expressions like `(m & x) | (~m & y)` into the
49579/// equivalent `((x ^ y) & m) ^ y` pattern.
49580/// This is typically a better representation for targets without a fused
49581/// "and-not" operation. This function is intended to be called from a
49582/// `TargetLowering::PerformDAGCombine` callback on `ISD::OR` nodes.
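// Illustrative note (editorial annotation): the two forms agree bit by bit.
// Where a mask bit m is 1, ((x ^ y) & m) ^ y gives (x ^ y) ^ y == x; where m
// is 0, it gives 0 ^ y == y, matching (m & x) | (~m & y).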
49583static SDValue foldMaskedMerge(SDNode *Node, SelectionDAG &DAG) {
49584 // Note that masked-merge variants using XOR or ADD expressions are
49585 // normalized to OR by InstCombine so we only check for OR.
49586 assert(Node->getOpcode() == ISD::OR && "Must be called with ISD::OR node");
49587 SDValue N0 = Node->getOperand(0);
49588 if (N0->getOpcode() != ISD::AND || !N0->hasOneUse())
49589 return SDValue();
49590 SDValue N1 = Node->getOperand(1);
49591 if (N1->getOpcode() != ISD::AND || !N1->hasOneUse())
49592 return SDValue();
49593
49594 SDLoc DL(Node);
49595 SDValue N00 = N0->getOperand(0);
49596 SDValue N01 = N0->getOperand(1);
49597 SDValue N10 = N1->getOperand(0);
49598 SDValue N11 = N1->getOperand(1);
49599 if (SDValue Result = foldMaskedMergeImpl(N00, N01, N10, N11, DL, DAG))
49600 return Result;
49601 if (SDValue Result = foldMaskedMergeImpl(N01, N00, N10, N11, DL, DAG))
49602 return Result;
49603 if (SDValue Result = foldMaskedMergeImpl(N10, N11, N00, N01, DL, DAG))
49604 return Result;
49605 if (SDValue Result = foldMaskedMergeImpl(N11, N10, N00, N01, DL, DAG))
49606 return Result;
49607 return SDValue();
49608}
49609
49610static SDValue combineOr(SDNode *N, SelectionDAG &DAG,
49611 TargetLowering::DAGCombinerInfo &DCI,
49612 const X86Subtarget &Subtarget) {
49613 SDValue N0 = N->getOperand(0);
49614 SDValue N1 = N->getOperand(1);
49615 EVT VT = N->getValueType(0);
49616 SDLoc dl(N);
49617 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
49618
49619 // If this is SSE1 only convert to FOR to avoid scalarization.
49620 if (Subtarget.hasSSE1() && !Subtarget.hasSSE2() && VT == MVT::v4i32) {
49621 return DAG.getBitcast(MVT::v4i32,
49622 DAG.getNode(X86ISD::FOR, dl, MVT::v4f32,
49623 DAG.getBitcast(MVT::v4f32, N0),
49624 DAG.getBitcast(MVT::v4f32, N1)));
49625 }
49626
49627 // Match any-of bool scalar reductions into a bitcast/movmsk + cmp.
49628 // TODO: Support multiple SrcOps.
49629 if (VT == MVT::i1) {
49630 SmallVector<SDValue, 2> SrcOps;
49631 SmallVector<APInt, 2> SrcPartials;
49632 if (matchScalarReduction(SDValue(N, 0), ISD::OR, SrcOps, &SrcPartials) &&
49633 SrcOps.size() == 1) {
49634 unsigned NumElts = SrcOps[0].getValueType().getVectorNumElements();
49635 EVT MaskVT = EVT::getIntegerVT(*DAG.getContext(), NumElts);
49636 SDValue Mask = combineBitcastvxi1(DAG, MaskVT, SrcOps[0], dl, Subtarget);
49637 if (!Mask && TLI.isTypeLegal(SrcOps[0].getValueType()))
49638 Mask = DAG.getBitcast(MaskVT, SrcOps[0]);
49639 if (Mask) {
49640 assert(SrcPartials[0].getBitWidth() == NumElts &&
49641        "Unexpected partial reduction mask");
49642 SDValue ZeroBits = DAG.getConstant(0, dl, MaskVT);
49643 SDValue PartialBits = DAG.getConstant(SrcPartials[0], dl, MaskVT);
49644 Mask = DAG.getNode(ISD::AND, dl, MaskVT, Mask, PartialBits);
49645 return DAG.getSetCC(dl, MVT::i1, Mask, ZeroBits, ISD::SETNE);
49646 }
49647 }
49648 }
49649
49650 if (SDValue R = combineBitOpWithMOVMSK(N, DAG))
49651 return R;
49652
49653 if (SDValue R = combineBitOpWithShift(N, DAG))
49654 return R;
49655
49656 if (SDValue FPLogic = convertIntLogicToFPLogic(N, DAG, DCI, Subtarget))
49657 return FPLogic;
49658
49659 if (DCI.isBeforeLegalizeOps())
49660 return SDValue();
49661
49662 if (SDValue R = combineCompareEqual(N, DAG, DCI, Subtarget))
49663 return R;
49664
49665 if (SDValue R = canonicalizeBitSelect(N, DAG, Subtarget))
49666 return R;
49667
49668 if (SDValue R = combineLogicBlendIntoPBLENDV(N, DAG, Subtarget))
49669 return R;
49670
49671 // (0 - SetCC) | C -> (zext (not SetCC)) * (C + 1) - 1 if we can get a LEA out of it.
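  // [Editorial illustration, not part of the original file] With SetCC == 1:
  // (0 - 1) | C == -1 and (zext !1) * (C + 1) - 1 == 0 - 1 == -1; with
  // SetCC == 0: (0 - 0) | C == C and (zext !0) * (C + 1) - 1 == C. The
  // accepted constants below (1,2,3,4,7,8) give multipliers 2,3,4,5,8,9,
  // which a single LEA can materialize.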
49672 if ((VT == MVT::i32 || VT == MVT::i64) &&
49673 N0.getOpcode() == ISD::SUB && N0.hasOneUse() &&
49674 isNullConstant(N0.getOperand(0))) {
49675 SDValue Cond = N0.getOperand(1);
49676 if (Cond.getOpcode() == ISD::ZERO_EXTEND && Cond.hasOneUse())
49677 Cond = Cond.getOperand(0);
49678
49679 if (Cond.getOpcode() == X86ISD::SETCC && Cond.hasOneUse()) {
49680 if (auto *CN = dyn_cast<ConstantSDNode>(N1)) {
49681 uint64_t Val = CN->getZExtValue();
49682 if (Val == 1 || Val == 2 || Val == 3 || Val == 4 || Val == 7 || Val == 8) {
49683 X86::CondCode CCode = (X86::CondCode)Cond.getConstantOperandVal(0);
49684 CCode = X86::GetOppositeBranchCondition(CCode);
49685 SDValue NotCond = getSETCC(CCode, Cond.getOperand(1), SDLoc(Cond), DAG);
49686
49687 SDValue R = DAG.getZExtOrTrunc(NotCond, dl, VT);
49688 R = DAG.getNode(ISD::MUL, dl, VT, R, DAG.getConstant(Val + 1, dl, VT));
49689 R = DAG.getNode(ISD::SUB, dl, VT, R, DAG.getConstant(1, dl, VT));
49690 return R;
49691 }
49692 }
49693 }
49694 }
49695
49696 // Combine OR(X,KSHIFTL(Y,Elts/2)) -> CONCAT_VECTORS(X,Y) == KUNPCK(X,Y).
49697 // Combine OR(KSHIFTL(X,Elts/2),Y) -> CONCAT_VECTORS(Y,X) == KUNPCK(Y,X).
49698 // iff the upper elements of the non-shifted arg are zero.
49699 // KUNPCK require 16+ bool vector elements.
49700 if (N0.getOpcode() == X86ISD::KSHIFTL || N1.getOpcode() == X86ISD::KSHIFTL) {
49701 unsigned NumElts = VT.getVectorNumElements();
49702 unsigned HalfElts = NumElts / 2;
49703 APInt UpperElts = APInt::getHighBitsSet(NumElts, HalfElts);
49704 if (NumElts >= 16 && N1.getOpcode() == X86ISD::KSHIFTL &&
49705 N1.getConstantOperandAPInt(1) == HalfElts &&
49706 DAG.MaskedVectorIsZero(N0, UpperElts)) {
49707 return DAG.getNode(
49708 ISD::CONCAT_VECTORS, dl, VT,
49709 extractSubVector(N0, 0, DAG, dl, HalfElts),
49710 extractSubVector(N1.getOperand(0), 0, DAG, dl, HalfElts));
49711 }
49712 if (NumElts >= 16 && N0.getOpcode() == X86ISD::KSHIFTL &&
49713 N0.getConstantOperandAPInt(1) == HalfElts &&
49714 DAG.MaskedVectorIsZero(N1, UpperElts)) {
49715 return DAG.getNode(
49716 ISD::CONCAT_VECTORS, dl, VT,
49717 extractSubVector(N1, 0, DAG, dl, HalfElts),
49718 extractSubVector(N0.getOperand(0), 0, DAG, dl, HalfElts));
49719 }
49720 }
49721
49722 if (VT.isVector() && (VT.getScalarSizeInBits() % 8) == 0) {
49723 // Attempt to recursively combine an OR of shuffles.
49724 SDValue Op(N, 0);
49725 if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
49726 return Res;
49727
49728 // If either operand is a constant mask, then only the elements that aren't
49729 // allones are actually demanded by the other operand.
49730 auto SimplifyUndemandedElts = [&](SDValue Op, SDValue OtherOp) {
49731 APInt UndefElts;
49732 SmallVector<APInt> EltBits;
49733 int NumElts = VT.getVectorNumElements();
49734 int EltSizeInBits = VT.getScalarSizeInBits();
49735 if (!getTargetConstantBitsFromNode(Op, EltSizeInBits, UndefElts, EltBits))
49736 return false;
49737
49738 APInt DemandedElts = APInt::getZero(NumElts);
49739 for (int I = 0; I != NumElts; ++I)
49740 if (!EltBits[I].isAllOnes())
49741 DemandedElts.setBit(I);
49742
49743 return TLI.SimplifyDemandedVectorElts(OtherOp, DemandedElts, DCI);
49744 };
49745 if (SimplifyUndemandedElts(N0, N1) || SimplifyUndemandedElts(N1, N0)) {
49746 if (N->getOpcode() != ISD::DELETED_NODE)
49747 DCI.AddToWorklist(N);
49748 return SDValue(N, 0);
49749 }
49750 }
49751
49752 // We should fold "masked merge" patterns when `andn` is not available.
49753 if (!Subtarget.hasBMI() && VT.isScalarInteger() && VT != MVT::i1)
49754 if (SDValue R = foldMaskedMerge(N, DAG))
49755 return R;
49756
49757 return SDValue();
49758}
49759
49760/// Try to turn tests against the signbit in the form of:
49761/// XOR(TRUNCATE(SRL(X, size(X)-1)), 1)
49762/// into:
49763/// SETGT(X, -1)
49764static SDValue foldXorTruncShiftIntoCmp(SDNode *N, SelectionDAG &DAG) {
49765 // This is only worth doing if the output type is i8 or i1.
49766 EVT ResultType = N->getValueType(0);
49767 if (ResultType != MVT::i8 && ResultType != MVT::i1)
49768 return SDValue();
49769
49770 SDValue N0 = N->getOperand(0);
49771 SDValue N1 = N->getOperand(1);
49772
49773 // We should be performing an xor against a truncated shift.
49774 if (N0.getOpcode() != ISD::TRUNCATE || !N0.hasOneUse())
49775 return SDValue();
49776
49777 // Make sure we are performing an xor against one.
49778 if (!isOneConstant(N1))
49779 return SDValue();
49780
49781 // SetCC on x86 zero extends so only act on this if it's a logical shift.
49782 SDValue Shift = N0.getOperand(0);
49783 if (Shift.getOpcode() != ISD::SRL || !Shift.hasOneUse())
49784 return SDValue();
49785
49786 // Make sure we are truncating from one of i16, i32 or i64.
49787 EVT ShiftTy = Shift.getValueType();
49788 if (ShiftTy != MVT::i16 && ShiftTy != MVT::i32 && ShiftTy != MVT::i64)
49789 return SDValue();
49790
49791 // Make sure the shift amount extracts the sign bit.
49792 if (!isa<ConstantSDNode>(Shift.getOperand(1)) ||
49793 Shift.getConstantOperandAPInt(1) != (ShiftTy.getSizeInBits() - 1))
49794 return SDValue();
49795
49796 // Create a greater-than comparison against -1.
49797  // N.B. Using SETGE against 0 works but we want a canonical-looking
49798  // comparison; using SETGT matches up with what TranslateX86CC expects.
49799 SDLoc DL(N);
49800 SDValue ShiftOp = Shift.getOperand(0);
49801 EVT ShiftOpTy = ShiftOp.getValueType();
49802 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
49803 EVT SetCCResultType = TLI.getSetCCResultType(DAG.getDataLayout(),
49804 *DAG.getContext(), ResultType);
49805 SDValue Cond = DAG.getSetCC(DL, SetCCResultType, ShiftOp,
49806 DAG.getConstant(-1, DL, ShiftOpTy), ISD::SETGT);
49807 if (SetCCResultType != ResultType)
49808 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, ResultType, Cond);
49809 return Cond;
49810}
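// [Editorial sketch, not part of the original file] The scalar identity behind
// the fold above, shown for a 32-bit source; names are hypothetical.
static bool signBitClearRef(int32_t X) {
  // xor(trunc(srl(X, 31)), 1): 1 exactly when the sign bit of X is clear.
  return ((static_cast<uint32_t>(X) >> 31) ^ 1u) != 0;
}
static bool signBitClearFolded(int32_t X) {
  // setgt(X, -1) is true exactly when the sign bit is clear.
  return X > -1;
}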
49811
49812/// Turn vector tests of the signbit in the form of:
49813/// xor (sra X, elt_size(X)-1), -1
49814/// into:
49815/// pcmpgt X, -1
49816///
49817/// This should be called before type legalization because the pattern may not
49818/// persist after that.
49819static SDValue foldVectorXorShiftIntoCmp(SDNode *N, SelectionDAG &DAG,
49820 const X86Subtarget &Subtarget) {
49821 EVT VT = N->getValueType(0);
49822 if (!VT.isSimple())
49823 return SDValue();
49824
49825 switch (VT.getSimpleVT().SimpleTy) {
49826 default: return SDValue();
49827 case MVT::v16i8:
49828 case MVT::v8i16:
49829 case MVT::v4i32:
49830 case MVT::v2i64: if (!Subtarget.hasSSE2()) return SDValue(); break;
49831 case MVT::v32i8:
49832 case MVT::v16i16:
49833 case MVT::v8i32:
49834 case MVT::v4i64: if (!Subtarget.hasAVX2()) return SDValue(); break;
49835 }
49836
49837 // There must be a shift right algebraic before the xor, and the xor must be a
49838 // 'not' operation.
49839 SDValue Shift = N->getOperand(0);
49840 SDValue Ones = N->getOperand(1);
49841 if (Shift.getOpcode() != ISD::SRA || !Shift.hasOneUse() ||
49842 !ISD::isBuildVectorAllOnes(Ones.getNode()))
49843 return SDValue();
49844
49845 // The shift should be smearing the sign bit across each vector element.
49846 auto *ShiftAmt =
49847 isConstOrConstSplat(Shift.getOperand(1), /*AllowUndefs*/ true);
49848 if (!ShiftAmt ||
49849 ShiftAmt->getAPIntValue() != (Shift.getScalarValueSizeInBits() - 1))
49850 return SDValue();
49851
49852 // Create a greater-than comparison against -1. We don't use the more obvious
49853 // greater-than-or-equal-to-zero because SSE/AVX don't have that instruction.
49854 return DAG.getSetCC(SDLoc(N), VT, Shift.getOperand(0), Ones, ISD::SETGT);
49855}
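// [Editorial note, not part of the original file] Per-lane view of the fold
// above for 32-bit elements: xor(sra(x, 31), -1) smears the sign bit and then
// inverts it, yielding all-ones when x >= 0 and all-zeros when x < 0 -- the
// same lane values PCMPGT(x, -1) produces, so the xor+shift pair can be
// replaced by the single compare.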
49856
49857/// Detect patterns of truncation with unsigned saturation:
49858///
49859/// 1. (truncate (umin (x, unsigned_max_of_dest_type)) to dest_type).
49860/// Return the source value x to be truncated or SDValue() if the pattern was
49861/// not matched.
49862///
49863/// 2. (truncate (smin (smax (x, C1), C2)) to dest_type),
49864/// where C1 >= 0 and C2 is unsigned max of destination type.
49865///
49866/// (truncate (smax (smin (x, C2), C1)) to dest_type)
49867/// where C1 >= 0, C2 is unsigned max of destination type and C1 <= C2.
49868///
49869/// These two patterns are equivalent to:
49870/// (truncate (umin (smax(x, C1), unsigned_max_of_dest_type)) to dest_type)
49871/// So return the smax(x, C1) value to be truncated or SDValue() if the
49872/// pattern was not matched.
49873static SDValue detectUSatPattern(SDValue In, EVT VT, SelectionDAG &DAG,
49874 const SDLoc &DL) {
49875 EVT InVT = In.getValueType();
49876
49877 // Saturation with truncation. We truncate from InVT to VT.
49878  assert(InVT.getScalarSizeInBits() > VT.getScalarSizeInBits() &&
49879         "Unexpected types for truncate operation");
49880
49881 // Match min/max and return limit value as a parameter.
49882 auto MatchMinMax = [](SDValue V, unsigned Opcode, APInt &Limit) -> SDValue {
49883 if (V.getOpcode() == Opcode &&
49884 ISD::isConstantSplatVector(V.getOperand(1).getNode(), Limit))
49885 return V.getOperand(0);
49886 return SDValue();
49887 };
49888
49889 APInt C1, C2;
49890 if (SDValue UMin = MatchMinMax(In, ISD::UMIN, C2))
49891    // C2 should be equal to UINT32_MAX / UINT16_MAX / UINT8_MAX according to
49892    // the element size of the destination type.
49893 if (C2.isMask(VT.getScalarSizeInBits()))
49894 return UMin;
49895
49896 if (SDValue SMin = MatchMinMax(In, ISD::SMIN, C2))
49897 if (MatchMinMax(SMin, ISD::SMAX, C1))
49898 if (C1.isNonNegative() && C2.isMask(VT.getScalarSizeInBits()))
49899 return SMin;
49900
49901 if (SDValue SMax = MatchMinMax(In, ISD::SMAX, C1))
49902 if (SDValue SMin = MatchMinMax(SMax, ISD::SMIN, C2))
49903 if (C1.isNonNegative() && C2.isMask(VT.getScalarSizeInBits()) &&
49904 C2.uge(C1)) {
49905 return DAG.getNode(ISD::SMAX, DL, InVT, SMin, In.getOperand(1));
49906 }
49907
49908 return SDValue();
49909}
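// [Editorial sketch, not part of the original file] Scalar form of pattern 1
// above for an i32 -> i8 truncate; the helper name is hypothetical.
static uint8_t usatTruncRef(uint32_t X) {
  // (truncate (umin (x, UINT8_MAX))): clamp to the unsigned max, then narrow.
  return static_cast<uint8_t>(X < 255u ? X : 255u);
}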
49910
49911/// Detect patterns of truncation with signed saturation:
49912/// (truncate (smin ((smax (x, signed_min_of_dest_type)),
49913/// signed_max_of_dest_type)) to dest_type)
49914/// or:
49915/// (truncate (smax ((smin (x, signed_max_of_dest_type)),
49916/// signed_min_of_dest_type)) to dest_type).
49917/// With MatchPackUS, the smax/smin range is [0, unsigned_max_of_dest_type].
49918/// Return the source value to be truncated or SDValue() if the pattern was not
49919/// matched.
49920static SDValue detectSSatPattern(SDValue In, EVT VT, bool MatchPackUS = false) {
49921 unsigned NumDstBits = VT.getScalarSizeInBits();
49922 unsigned NumSrcBits = In.getScalarValueSizeInBits();
49923  assert(NumSrcBits > NumDstBits && "Unexpected types for truncate operation");
49924
49925 auto MatchMinMax = [](SDValue V, unsigned Opcode,
49926 const APInt &Limit) -> SDValue {
49927 APInt C;
49928 if (V.getOpcode() == Opcode &&
49929 ISD::isConstantSplatVector(V.getOperand(1).getNode(), C) && C == Limit)
49930 return V.getOperand(0);
49931 return SDValue();
49932 };
49933
49934 APInt SignedMax, SignedMin;
49935 if (MatchPackUS) {
49936 SignedMax = APInt::getAllOnes(NumDstBits).zext(NumSrcBits);
49937 SignedMin = APInt(NumSrcBits, 0);
49938 } else {
49939 SignedMax = APInt::getSignedMaxValue(NumDstBits).sext(NumSrcBits);
49940 SignedMin = APInt::getSignedMinValue(NumDstBits).sext(NumSrcBits);
49941 }
49942
49943 if (SDValue SMin = MatchMinMax(In, ISD::SMIN, SignedMax))
49944 if (SDValue SMax = MatchMinMax(SMin, ISD::SMAX, SignedMin))
49945 return SMax;
49946
49947 if (SDValue SMax = MatchMinMax(In, ISD::SMAX, SignedMin))
49948 if (SDValue SMin = MatchMinMax(SMax, ISD::SMIN, SignedMax))
49949 return SMin;
49950
49951 return SDValue();
49952}
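// [Editorial sketch, not part of the original file] Scalar forms of the clamps
// matched above, for an i32 -> i8 truncate; the helper names are hypothetical.
static int8_t ssatTruncRef(int32_t X) {
  // smin(smax(x, INT8_MIN), INT8_MAX): signed saturation to [-128, 127].
  int32_t Clamped = X > -128 ? X : -128;
  Clamped = Clamped < 127 ? Clamped : 127;
  return static_cast<int8_t>(Clamped);
}
static uint8_t packusTruncRef(int32_t X) {
  // With MatchPackUS the clamp range is [0, 255] instead (PACKUS semantics).
  int32_t Clamped = X > 0 ? X : 0;
  Clamped = Clamped < 255 ? Clamped : 255;
  return static_cast<uint8_t>(Clamped);
}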
49953
49954static SDValue combineTruncateWithSat(SDValue In, EVT VT, const SDLoc &DL,
49955 SelectionDAG &DAG,
49956 const X86Subtarget &Subtarget) {
49957 if (!Subtarget.hasSSE2() || !VT.isVector())
49958 return SDValue();
49959
49960 EVT SVT = VT.getVectorElementType();
49961 EVT InVT = In.getValueType();
49962 EVT InSVT = InVT.getVectorElementType();
49963
49964 // If we're clamping a signed 32-bit vector to 0-255 and the 32-bit vector is
49965  // split across two registers, we can use a packusdw+perm to clamp to 0-65535
49966 // and concatenate at the same time. Then we can use a final vpmovuswb to
49967 // clip to 0-255.
49968 if (Subtarget.hasBWI() && !Subtarget.useAVX512Regs() &&
49969 InVT == MVT::v16i32 && VT == MVT::v16i8) {
49970 if (SDValue USatVal = detectSSatPattern(In, VT, true)) {
49971 // Emit a VPACKUSDW+VPERMQ followed by a VPMOVUSWB.
49972 SDValue Mid = truncateVectorWithPACK(X86ISD::PACKUS, MVT::v16i16, USatVal,
49973 DL, DAG, Subtarget);
49974      assert(Mid && "Failed to pack!");
49975 return DAG.getNode(X86ISD::VTRUNCUS, DL, VT, Mid);
49976 }
49977 }
49978
49979 // vXi32 truncate instructions are available with AVX512F.
49980 // vXi16 truncate instructions are only available with AVX512BW.
49981 // For 256-bit or smaller vectors, we require VLX.
49982 // FIXME: We could widen truncates to 512 to remove the VLX restriction.
49983  // If the result type is 256 bits or larger and we have disabled 512-bit
49984 // registers, we should go ahead and use the pack instructions if possible.
49985 bool PreferAVX512 = ((Subtarget.hasAVX512() && InSVT == MVT::i32) ||
49986 (Subtarget.hasBWI() && InSVT == MVT::i16)) &&
49987 (InVT.getSizeInBits() > 128) &&
49988 (Subtarget.hasVLX() || InVT.getSizeInBits() > 256) &&
49989 !(!Subtarget.useAVX512Regs() && VT.getSizeInBits() >= 256);
49990
49991 if (isPowerOf2_32(VT.getVectorNumElements()) && !PreferAVX512 &&
49992 VT.getSizeInBits() >= 64 &&
49993 (SVT == MVT::i8 || SVT == MVT::i16) &&
49994 (InSVT == MVT::i16 || InSVT == MVT::i32)) {
49995 if (SDValue USatVal = detectSSatPattern(In, VT, true)) {
49996 // vXi32 -> vXi8 must be performed as PACKUSWB(PACKSSDW,PACKSSDW).
49997      // Only do this when the result is at least 64 bits or we'd be leaving
49998 // dangling PACKSSDW nodes.
49999 if (SVT == MVT::i8 && InSVT == MVT::i32) {
50000 EVT MidVT = VT.changeVectorElementType(MVT::i16);
50001 SDValue Mid = truncateVectorWithPACK(X86ISD::PACKSS, MidVT, USatVal, DL,
50002 DAG, Subtarget);
50003        assert(Mid && "Failed to pack!");
50004 SDValue V = truncateVectorWithPACK(X86ISD::PACKUS, VT, Mid, DL, DAG,
50005 Subtarget);
50006        assert(V && "Failed to pack!");
50007 return V;
50008 } else if (SVT == MVT::i8 || Subtarget.hasSSE41())
50009 return truncateVectorWithPACK(X86ISD::PACKUS, VT, USatVal, DL, DAG,
50010 Subtarget);
50011 }
50012 if (SDValue SSatVal = detectSSatPattern(In, VT))
50013 return truncateVectorWithPACK(X86ISD::PACKSS, VT, SSatVal, DL, DAG,
50014 Subtarget);
50015 }
50016
50017 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
50018 if (TLI.isTypeLegal(InVT) && InVT.isVector() && SVT != MVT::i1 &&
50019 Subtarget.hasAVX512() && (InSVT != MVT::i16 || Subtarget.hasBWI()) &&
50020 (SVT == MVT::i32 || SVT == MVT::i16 || SVT == MVT::i8)) {
50021 unsigned TruncOpc = 0;
50022 SDValue SatVal;
50023 if (SDValue SSatVal = detectSSatPattern(In, VT)) {
50024 SatVal = SSatVal;
50025 TruncOpc = X86ISD::VTRUNCS;
50026 } else if (SDValue USatVal = detectUSatPattern(In, VT, DAG, DL)) {
50027 SatVal = USatVal;
50028 TruncOpc = X86ISD::VTRUNCUS;
50029 }
50030 if (SatVal) {
50031 unsigned ResElts = VT.getVectorNumElements();
50032 // If the input type is less than 512 bits and we don't have VLX, we need
50033 // to widen to 512 bits.
50034 if (!Subtarget.hasVLX() && !InVT.is512BitVector()) {
50035 unsigned NumConcats = 512 / InVT.getSizeInBits();
50036 ResElts *= NumConcats;
50037 SmallVector<SDValue, 4> ConcatOps(NumConcats, DAG.getUNDEF(InVT));
50038 ConcatOps[0] = SatVal;
50039 InVT = EVT::getVectorVT(*DAG.getContext(), InSVT,
50040 NumConcats * InVT.getVectorNumElements());
50041 SatVal = DAG.getNode(ISD::CONCAT_VECTORS, DL, InVT, ConcatOps);
50042 }
50044      // Widen the result if it's narrower than 128 bits.
50044 if (ResElts * SVT.getSizeInBits() < 128)
50045 ResElts = 128 / SVT.getSizeInBits();
50046 EVT TruncVT = EVT::getVectorVT(*DAG.getContext(), SVT, ResElts);
50047 SDValue Res = DAG.getNode(TruncOpc, DL, TruncVT, SatVal);
50048 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
50049 DAG.getIntPtrConstant(0, DL));
50050 }
50051 }
50052
50053 return SDValue();
50054}
50055
50056/// This function detects the AVG pattern between vectors of unsigned i8/i16,
50057 /// which is c = (a + b + 1) / 2, and replaces this operation with the efficient
50058/// ISD::AVGCEILU (AVG) instruction.
50059static SDValue detectAVGPattern(SDValue In, EVT VT, SelectionDAG &DAG,
50060 const X86Subtarget &Subtarget,
50061 const SDLoc &DL) {
50062 if (!VT.isVector())
50063 return SDValue();
50064 EVT InVT = In.getValueType();
50065 unsigned NumElems = VT.getVectorNumElements();
50066
50067 EVT ScalarVT = VT.getVectorElementType();
50068 if (!((ScalarVT == MVT::i8 || ScalarVT == MVT::i16) && NumElems >= 2))
50069 return SDValue();
50070
50071  // InScalarVT is the intermediate type in the AVG pattern and it should be
50072  // wider than the original input type (i8/i16).
50073 EVT InScalarVT = InVT.getVectorElementType();
50074 if (InScalarVT.getFixedSizeInBits() <= ScalarVT.getFixedSizeInBits())
50075 return SDValue();
50076
50077 if (!Subtarget.hasSSE2())
50078 return SDValue();
50079
50080 // Detect the following pattern:
50081 //
50082 // %1 = zext <N x i8> %a to <N x i32>
50083 // %2 = zext <N x i8> %b to <N x i32>
50084 // %3 = add nuw nsw <N x i32> %1, <i32 1 x N>
50085 // %4 = add nuw nsw <N x i32> %3, %2
50086  //   %5 = lshr <N x i32> %4, <i32 1 x N>
50087 // %6 = trunc <N x i32> %5 to <N x i8>
50088 //
50089 // In AVX512, the last instruction can also be a trunc store.
50090 if (In.getOpcode() != ISD::SRL)
50091 return SDValue();
50092
50093 // A lambda checking the given SDValue is a constant vector and each element
50094 // is in the range [Min, Max].
50095 auto IsConstVectorInRange = [](SDValue V, unsigned Min, unsigned Max) {
50096 return ISD::matchUnaryPredicate(V, [Min, Max](ConstantSDNode *C) {
50097 return !(C->getAPIntValue().ult(Min) || C->getAPIntValue().ugt(Max));
50098 });
50099 };
50100
50101 auto IsZExtLike = [DAG = &DAG, ScalarVT](SDValue V) {
50102 unsigned MaxActiveBits = DAG->computeKnownBits(V).countMaxActiveBits();
50103 return MaxActiveBits <= ScalarVT.getSizeInBits();
50104 };
50105
50106 // Check if each element of the vector is right-shifted by one.
50107 SDValue LHS = In.getOperand(0);
50108 SDValue RHS = In.getOperand(1);
50109 if (!IsConstVectorInRange(RHS, 1, 1))
50110 return SDValue();
50111 if (LHS.getOpcode() != ISD::ADD)
50112 return SDValue();
50113
50114 // Detect a pattern of a + b + 1 where the order doesn't matter.
50115 SDValue Operands[3];
50116 Operands[0] = LHS.getOperand(0);
50117 Operands[1] = LHS.getOperand(1);
50118
50119 auto AVGBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
50120 ArrayRef<SDValue> Ops) {
50121 return DAG.getNode(ISD::AVGCEILU, DL, Ops[0].getValueType(), Ops);
50122 };
50123
50124 auto AVGSplitter = [&](std::array<SDValue, 2> Ops) {
50125 for (SDValue &Op : Ops)
50126 if (Op.getValueType() != VT)
50127 Op = DAG.getNode(ISD::TRUNCATE, DL, VT, Op);
50128 // Pad to a power-of-2 vector, split+apply and extract the original vector.
50129 unsigned NumElemsPow2 = PowerOf2Ceil(NumElems);
50130 EVT Pow2VT = EVT::getVectorVT(*DAG.getContext(), ScalarVT, NumElemsPow2);
50131 if (NumElemsPow2 != NumElems) {
50132 for (SDValue &Op : Ops) {
50133 SmallVector<SDValue, 32> EltsOfOp(NumElemsPow2, DAG.getUNDEF(ScalarVT));
50134 for (unsigned i = 0; i != NumElems; ++i) {
50135 SDValue Idx = DAG.getIntPtrConstant(i, DL);
50136 EltsOfOp[i] =
50137 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ScalarVT, Op, Idx);
50138 }
50139 Op = DAG.getBuildVector(Pow2VT, DL, EltsOfOp);
50140 }
50141 }
50142 SDValue Res = SplitOpsAndApply(DAG, Subtarget, DL, Pow2VT, Ops, AVGBuilder);
50143 if (NumElemsPow2 == NumElems)
50144 return Res;
50145 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
50146 DAG.getIntPtrConstant(0, DL));
50147 };
50148
50149 // Take care of the case when one of the operands is a constant vector whose
50150 // element is in the range [1, 256].
50151 if (IsConstVectorInRange(Operands[1], 1, ScalarVT == MVT::i8 ? 256 : 65536) &&
50152 IsZExtLike(Operands[0])) {
50153    // The pattern is detected. Subtract one from the constant vector, then
50154    // demote it and emit the AVG instruction.
50155 SDValue VecOnes = DAG.getConstant(1, DL, InVT);
50156 Operands[1] = DAG.getNode(ISD::SUB, DL, InVT, Operands[1], VecOnes);
50157 return AVGSplitter({Operands[0], Operands[1]});
50158 }
50159
50160 // Matches 'add like' patterns: add(Op0,Op1) + zext(or(Op0,Op1)).
50161  // Match the or case only if it's 'add-like' - can be replaced by an add.
50162 auto FindAddLike = [&](SDValue V, SDValue &Op0, SDValue &Op1) {
50163 if (ISD::ADD == V.getOpcode()) {
50164 Op0 = V.getOperand(0);
50165 Op1 = V.getOperand(1);
50166 return true;
50167 }
50168 if (ISD::ZERO_EXTEND != V.getOpcode())
50169 return false;
50170 V = V.getOperand(0);
50171 if (V.getValueType() != VT || ISD::OR != V.getOpcode() ||
50172 !DAG.haveNoCommonBitsSet(V.getOperand(0), V.getOperand(1)))
50173 return false;
50174 Op0 = V.getOperand(0);
50175 Op1 = V.getOperand(1);
50176 return true;
50177 };
50178
50179 SDValue Op0, Op1;
50180 if (FindAddLike(Operands[0], Op0, Op1))
50181 std::swap(Operands[0], Operands[1]);
50182 else if (!FindAddLike(Operands[1], Op0, Op1))
50183 return SDValue();
50184 Operands[2] = Op0;
50185 Operands[1] = Op1;
50186
50187 // Now we have three operands of two additions. Check that one of them is a
50188 // constant vector with ones, and the other two can be promoted from i8/i16.
50189 for (SDValue &Op : Operands) {
50190 if (!IsConstVectorInRange(Op, 1, 1))
50191 continue;
50192 std::swap(Op, Operands[2]);
50193
50194 // Check if Operands[0] and Operands[1] are results of type promotion.
50195 for (int j = 0; j < 2; ++j)
50196 if (Operands[j].getValueType() != VT)
50197 if (!IsZExtLike(Operands[j]))
50198 return SDValue();
50199
50200    // The pattern is detected, emit the AVG instruction(s).
50201 return AVGSplitter({Operands[0], Operands[1]});
50202 }
50203
50204 return SDValue();
50205}
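// [Editorial sketch, not part of the original file] The rounding-average
// identity the matcher above is built on; widening to 32 bits mirrors the
// zext/add/lshr/trunc IR sequence shown in the comment.
static uint8_t avgCeilRef(uint8_t A, uint8_t B) {
  // c = (a + b + 1) / 2, computed without overflow in a wider type; this is
  // what ISD::AVGCEILU (PAVGB/PAVGW) computes per element.
  return static_cast<uint8_t>((static_cast<uint32_t>(A) + B + 1) >> 1);
}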
50206
50207static SDValue combineLoad(SDNode *N, SelectionDAG &DAG,
50208 TargetLowering::DAGCombinerInfo &DCI,
50209 const X86Subtarget &Subtarget) {
50210 LoadSDNode *Ld = cast<LoadSDNode>(N);
50211 EVT RegVT = Ld->getValueType(0);
50212 EVT MemVT = Ld->getMemoryVT();
50213 SDLoc dl(Ld);
50214 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
50215
50216 // For chips with slow 32-byte unaligned loads, break the 32-byte operation
50217 // into two 16-byte operations. Also split non-temporal aligned loads on
50218 // pre-AVX2 targets as 32-byte loads will lower to regular temporal loads.
50219 ISD::LoadExtType Ext = Ld->getExtensionType();
50220 unsigned Fast;
50221 if (RegVT.is256BitVector() && !DCI.isBeforeLegalizeOps() &&
50222 Ext == ISD::NON_EXTLOAD &&
50223 ((Ld->isNonTemporal() && !Subtarget.hasInt256() &&
50224 Ld->getAlign() >= Align(16)) ||
50225 (TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), RegVT,
50226 *Ld->getMemOperand(), &Fast) &&
50227 !Fast))) {
50228 unsigned NumElems = RegVT.getVectorNumElements();
50229 if (NumElems < 2)
50230 return SDValue();
50231
50232 unsigned HalfOffset = 16;
50233 SDValue Ptr1 = Ld->getBasePtr();
50234 SDValue Ptr2 =
50235 DAG.getMemBasePlusOffset(Ptr1, TypeSize::Fixed(HalfOffset), dl);
50236 EVT HalfVT = EVT::getVectorVT(*DAG.getContext(), MemVT.getScalarType(),
50237 NumElems / 2);
50238 SDValue Load1 =
50239 DAG.getLoad(HalfVT, dl, Ld->getChain(), Ptr1, Ld->getPointerInfo(),
50240 Ld->getOriginalAlign(),
50241 Ld->getMemOperand()->getFlags());
50242 SDValue Load2 = DAG.getLoad(HalfVT, dl, Ld->getChain(), Ptr2,
50243 Ld->getPointerInfo().getWithOffset(HalfOffset),
50244 Ld->getOriginalAlign(),
50245 Ld->getMemOperand()->getFlags());
50246 SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
50247 Load1.getValue(1), Load2.getValue(1));
50248
50249 SDValue NewVec = DAG.getNode(ISD::CONCAT_VECTORS, dl, RegVT, Load1, Load2);
50250 return DCI.CombineTo(N, NewVec, TF, true);
50251 }
50252
50253 // Bool vector load - attempt to cast to an integer, as we have good
50254 // (vXiY *ext(vXi1 bitcast(iX))) handling.
50255 if (Ext == ISD::NON_EXTLOAD && !Subtarget.hasAVX512() && RegVT.isVector() &&
50256 RegVT.getScalarType() == MVT::i1 && DCI.isBeforeLegalize()) {
50257 unsigned NumElts = RegVT.getVectorNumElements();
50258 EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), NumElts);
50259 if (TLI.isTypeLegal(IntVT)) {
50260 SDValue IntLoad = DAG.getLoad(IntVT, dl, Ld->getChain(), Ld->getBasePtr(),
50261 Ld->getPointerInfo(),
50262 Ld->getOriginalAlign(),
50263 Ld->getMemOperand()->getFlags());
50264 SDValue BoolVec = DAG.getBitcast(RegVT, IntLoad);
50265 return DCI.CombineTo(N, BoolVec, IntLoad.getValue(1), true);
50266 }
50267 }
50268
50269 // If we also broadcast this as a subvector to a wider type, then just extract
50270 // the lowest subvector.
50271 if (Ext == ISD::NON_EXTLOAD && Subtarget.hasAVX() && Ld->isSimple() &&
50272 (RegVT.is128BitVector() || RegVT.is256BitVector())) {
50273 SDValue Ptr = Ld->getBasePtr();
50274 SDValue Chain = Ld->getChain();
50275 for (SDNode *User : Ptr->uses()) {
50276 if (User != N && User->getOpcode() == X86ISD::SUBV_BROADCAST_LOAD &&
50277 cast<MemIntrinsicSDNode>(User)->getBasePtr() == Ptr &&
50278 cast<MemIntrinsicSDNode>(User)->getChain() == Chain &&
50279 cast<MemIntrinsicSDNode>(User)->getMemoryVT().getSizeInBits() ==
50280 MemVT.getSizeInBits() &&
50281 !User->hasAnyUseOfValue(1) &&
50282 User->getValueSizeInBits(0).getFixedValue() >
50283 RegVT.getFixedSizeInBits()) {
50284 SDValue Extract = extractSubVector(SDValue(User, 0), 0, DAG, SDLoc(N),
50285 RegVT.getSizeInBits());
50286 Extract = DAG.getBitcast(RegVT, Extract);
50287 return DCI.CombineTo(N, Extract, SDValue(User, 1));
50288 }
50289 }
50290 }
50291
50292 // Cast ptr32 and ptr64 pointers to the default address space before a load.
50293 unsigned AddrSpace = Ld->getAddressSpace();
50294 if (AddrSpace == X86AS::PTR64 || AddrSpace == X86AS::PTR32_SPTR ||
50295 AddrSpace == X86AS::PTR32_UPTR) {
50296 MVT PtrVT = TLI.getPointerTy(DAG.getDataLayout());
50297 if (PtrVT != Ld->getBasePtr().getSimpleValueType()) {
50298 SDValue Cast =
50299 DAG.getAddrSpaceCast(dl, PtrVT, Ld->getBasePtr(), AddrSpace, 0);
50300 return DAG.getLoad(RegVT, dl, Ld->getChain(), Cast, Ld->getPointerInfo(),
50301 Ld->getOriginalAlign(),
50302 Ld->getMemOperand()->getFlags());
50303 }
50304 }
50305
50306 return SDValue();
50307}
50308
50309/// If V is a build vector of boolean constants and exactly one of those
50310/// constants is true, return the operand index of that true element.
50311/// Otherwise, return -1.
50312static int getOneTrueElt(SDValue V) {
50313 // This needs to be a build vector of booleans.
50314 // TODO: Checking for the i1 type matches the IR definition for the mask,
50315 // but the mask check could be loosened to i8 or other types. That might
50316 // also require checking more than 'allOnesValue'; eg, the x86 HW
50317 // instructions only require that the MSB is set for each mask element.
50318 // The ISD::MSTORE comments/definition do not specify how the mask operand
50319 // is formatted.
50320 auto *BV = dyn_cast<BuildVectorSDNode>(V);
50321 if (!BV || BV->getValueType(0).getVectorElementType() != MVT::i1)
50322 return -1;
50323
50324 int TrueIndex = -1;
50325 unsigned NumElts = BV->getValueType(0).getVectorNumElements();
50326 for (unsigned i = 0; i < NumElts; ++i) {
50327 const SDValue &Op = BV->getOperand(i);
50328 if (Op.isUndef())
50329 continue;
50330 auto *ConstNode = dyn_cast<ConstantSDNode>(Op);
50331 if (!ConstNode)
50332 return -1;
50333 if (ConstNode->getAPIntValue().countTrailingOnes() >= 1) {
50334 // If we already found a one, this is too many.
50335 if (TrueIndex >= 0)
50336 return -1;
50337 TrueIndex = i;
50338 }
50339 }
50340 return TrueIndex;
50341}
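// [Editorial sketch, not part of the original file] The same "exactly one true
// element" scan over a plain bool array, without the undef/constant handling.
static int getOneTrueEltRef(const bool *Mask, unsigned NumElts) {
  int TrueIndex = -1;
  for (unsigned I = 0; I != NumElts; ++I) {
    if (!Mask[I])
      continue;
    if (TrueIndex >= 0)
      return -1; // More than one element is set.
    TrueIndex = I;
  }
  return TrueIndex; // -1 if no element is set.
}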
50342
50343/// Given a masked memory load/store operation, return true if it has one mask
50344/// bit set. If it has one mask bit set, then also return the memory address of
50345/// the scalar element to load/store, the vector index to insert/extract that
50346/// scalar element, and the alignment for the scalar memory access.
50347static bool getParamsForOneTrueMaskedElt(MaskedLoadStoreSDNode *MaskedOp,
50348 SelectionDAG &DAG, SDValue &Addr,
50349 SDValue &Index, Align &Alignment,
50350 unsigned &Offset) {
50351 int TrueMaskElt = getOneTrueElt(MaskedOp->getMask());
50352 if (TrueMaskElt < 0)
50353 return false;
50354
50355 // Get the address of the one scalar element that is specified by the mask
50356 // using the appropriate offset from the base pointer.
50357 EVT EltVT = MaskedOp->getMemoryVT().getVectorElementType();
50358 Offset = 0;
50359 Addr = MaskedOp->getBasePtr();
50360 if (TrueMaskElt != 0) {
50361 Offset = TrueMaskElt * EltVT.getStoreSize();
50362 Addr = DAG.getMemBasePlusOffset(Addr, TypeSize::Fixed(Offset),
50363 SDLoc(MaskedOp));
50364 }
50365
50366 Index = DAG.getIntPtrConstant(TrueMaskElt, SDLoc(MaskedOp));
50367 Alignment = commonAlignment(MaskedOp->getOriginalAlign(),
50368 EltVT.getStoreSize());
50369 return true;
50370}
50371
50372/// If exactly one element of the mask is set for a non-extending masked load,
50373/// it is a scalar load and vector insert.
50374/// Note: It is expected that the degenerate cases of an all-zeros or all-ones
50375/// mask have already been optimized in IR, so we don't bother with those here.
50376static SDValue
50377reduceMaskedLoadToScalarLoad(MaskedLoadSDNode *ML, SelectionDAG &DAG,
50378 TargetLowering::DAGCombinerInfo &DCI,
50379 const X86Subtarget &Subtarget) {
50380  assert(ML->isUnindexed() && "Unexpected indexed masked load!");
50381 // TODO: This is not x86-specific, so it could be lifted to DAGCombiner.
50382 // However, some target hooks may need to be added to know when the transform
50383 // is profitable. Endianness would also have to be considered.
50384
50385 SDValue Addr, VecIndex;
50386 Align Alignment;
50387 unsigned Offset;
50388 if (!getParamsForOneTrueMaskedElt(ML, DAG, Addr, VecIndex, Alignment, Offset))
50389 return SDValue();
50390
50391 // Load the one scalar element that is specified by the mask using the
50392 // appropriate offset from the base pointer.
50393 SDLoc DL(ML);
50394 EVT VT = ML->getValueType(0);
50395 EVT EltVT = VT.getVectorElementType();
50396
50397 EVT CastVT = VT;
50398 if (EltVT == MVT::i64 && !Subtarget.is64Bit()) {
50399 EltVT = MVT::f64;
50400 CastVT = VT.changeVectorElementType(EltVT);
50401 }
50402
50403 SDValue Load =
50404 DAG.getLoad(EltVT, DL, ML->getChain(), Addr,
50405 ML->getPointerInfo().getWithOffset(Offset),
50406 Alignment, ML->getMemOperand()->getFlags());
50407
50408 SDValue PassThru = DAG.getBitcast(CastVT, ML->getPassThru());
50409
50410 // Insert the loaded element into the appropriate place in the vector.
50411 SDValue Insert =
50412 DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, CastVT, PassThru, Load, VecIndex);
50413 Insert = DAG.getBitcast(VT, Insert);
50414 return DCI.CombineTo(ML, Insert, Load.getValue(1), true);
50415}
50416
50417static SDValue
50418combineMaskedLoadConstantMask(MaskedLoadSDNode *ML, SelectionDAG &DAG,
50419 TargetLowering::DAGCombinerInfo &DCI) {
50420  assert(ML->isUnindexed() && "Unexpected indexed masked load!");
50421 if (!ISD::isBuildVectorOfConstantSDNodes(ML->getMask().getNode()))
50422 return SDValue();
50423
50424 SDLoc DL(ML);
50425 EVT VT = ML->getValueType(0);
50426
50427 // If we are loading the first and last elements of a vector, it is safe and
50428 // always faster to load the whole vector. Replace the masked load with a
50429 // vector load and select.
50430 unsigned NumElts = VT.getVectorNumElements();
50431 BuildVectorSDNode *MaskBV = cast<BuildVectorSDNode>(ML->getMask());
50432 bool LoadFirstElt = !isNullConstant(MaskBV->getOperand(0));
50433 bool LoadLastElt = !isNullConstant(MaskBV->getOperand(NumElts - 1));
50434 if (LoadFirstElt && LoadLastElt) {
50435 SDValue VecLd = DAG.getLoad(VT, DL, ML->getChain(), ML->getBasePtr(),
50436 ML->getMemOperand());
50437 SDValue Blend = DAG.getSelect(DL, VT, ML->getMask(), VecLd,
50438 ML->getPassThru());
50439 return DCI.CombineTo(ML, Blend, VecLd.getValue(1), true);
50440 }
50441
50442 // Convert a masked load with a constant mask into a masked load and a select.
50443 // This allows the select operation to use a faster kind of select instruction
50444 // (for example, vblendvps -> vblendps).
50445
50446 // Don't try this if the pass-through operand is already undefined. That would
50447 // cause an infinite loop because that's what we're about to create.
50448 if (ML->getPassThru().isUndef())
50449 return SDValue();
50450
50451 if (ISD::isBuildVectorAllZeros(ML->getPassThru().getNode()))
50452 return SDValue();
50453
50454 // The new masked load has an undef pass-through operand. The select uses the
50455 // original pass-through operand.
50456 SDValue NewML = DAG.getMaskedLoad(
50457 VT, DL, ML->getChain(), ML->getBasePtr(), ML->getOffset(), ML->getMask(),
50458 DAG.getUNDEF(VT), ML->getMemoryVT(), ML->getMemOperand(),
50459 ML->getAddressingMode(), ML->getExtensionType());
50460 SDValue Blend = DAG.getSelect(DL, VT, ML->getMask(), NewML,
50461 ML->getPassThru());
50462
50463 return DCI.CombineTo(ML, Blend, NewML.getValue(1), true);
50464}
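// [Editorial illustration, not part of the original file] For a constant mask
// such as <1,0,0,1> on v4f32, the first rewrite above loads the whole vector
// and blends: masked_load(p, <1,0,0,1>, pass) becomes
// select(<1,0,0,1>, load(p), pass), which can typically be selected as a
// cheaper blend (e.g. vblendps) instead of a masked-move instruction.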
50465
50466static SDValue combineMaskedLoad(SDNode *N, SelectionDAG &DAG,
50467 TargetLowering::DAGCombinerInfo &DCI,
50468 const X86Subtarget &Subtarget) {
50469 auto *Mld = cast<MaskedLoadSDNode>(N);
50470
50471 // TODO: Expanding load with constant mask may be optimized as well.
50472 if (Mld->isExpandingLoad())
50473 return SDValue();
50474
50475 if (Mld->getExtensionType() == ISD::NON_EXTLOAD) {
50476 if (SDValue ScalarLoad =
50477 reduceMaskedLoadToScalarLoad(Mld, DAG, DCI, Subtarget))
50478 return ScalarLoad;
50479
50480 // TODO: Do some AVX512 subsets benefit from this transform?
50481 if (!Subtarget.hasAVX512())
50482 if (SDValue Blend = combineMaskedLoadConstantMask(Mld, DAG, DCI))
50483 return Blend;
50484 }
50485
50486 // If the mask value has been legalized to a non-boolean vector, try to
50487 // simplify ops leading up to it. We only demand the MSB of each lane.
50488 SDValue Mask = Mld->getMask();
50489 if (Mask.getScalarValueSizeInBits() != 1) {
50490 EVT VT = Mld->getValueType(0);
50491 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
50492 APInt DemandedBits(APInt::getSignMask(VT.getScalarSizeInBits()));
50493 if (TLI.SimplifyDemandedBits(Mask, DemandedBits, DCI)) {
50494 if (N->getOpcode() != ISD::DELETED_NODE)
50495 DCI.AddToWorklist(N);
50496 return SDValue(N, 0);
50497 }
50498 if (SDValue NewMask =
50499 TLI.SimplifyMultipleUseDemandedBits(Mask, DemandedBits, DAG))
50500 return DAG.getMaskedLoad(
50501 VT, SDLoc(N), Mld->getChain(), Mld->getBasePtr(), Mld->getOffset(),
50502 NewMask, Mld->getPassThru(), Mld->getMemoryVT(), Mld->getMemOperand(),
50503 Mld->getAddressingMode(), Mld->getExtensionType());
50504 }
50505
50506 return SDValue();
50507}
50508
50509/// If exactly one element of the mask is set for a non-truncating masked store,
50510/// it is a vector extract and scalar store.
50511/// Note: It is expected that the degenerate cases of an all-zeros or all-ones
50512/// mask have already been optimized in IR, so we don't bother with those here.
50513static SDValue reduceMaskedStoreToScalarStore(MaskedStoreSDNode *MS,
50514 SelectionDAG &DAG,
50515 const X86Subtarget &Subtarget) {
50516 // TODO: This is not x86-specific, so it could be lifted to DAGCombiner.
50517 // However, some target hooks may need to be added to know when the transform
50518 // is profitable. Endianness would also have to be considered.
50519
50520 SDValue Addr, VecIndex;
50521 Align Alignment;
50522 unsigned Offset;
50523 if (!getParamsForOneTrueMaskedElt(MS, DAG, Addr, VecIndex, Alignment, Offset))
50524 return SDValue();
50525
50526 // Extract the one scalar element that is actually being stored.
50527 SDLoc DL(MS);
50528 SDValue Value = MS->getValue();
50529 EVT VT = Value.getValueType();
50530 EVT EltVT = VT.getVectorElementType();
50531 if (EltVT == MVT::i64 && !Subtarget.is64Bit()) {
50532 EltVT = MVT::f64;
50533 EVT CastVT = VT.changeVectorElementType(EltVT);
50534 Value = DAG.getBitcast(CastVT, Value);
50535 }
50536 SDValue Extract =
50537 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, Value, VecIndex);
50538
50539 // Store that element at the appropriate offset from the base pointer.
50540 return DAG.getStore(MS->getChain(), DL, Extract, Addr,
50541 MS->getPointerInfo().getWithOffset(Offset),
50542 Alignment, MS->getMemOperand()->getFlags());
50543}
50544
50545static SDValue combineMaskedStore(SDNode *N, SelectionDAG &DAG,
50546 TargetLowering::DAGCombinerInfo &DCI,
50547 const X86Subtarget &Subtarget) {
50548 MaskedStoreSDNode *Mst = cast<MaskedStoreSDNode>(N);
50549 if (Mst->isCompressingStore())
50550 return SDValue();
50551
50552 EVT VT = Mst->getValue().getValueType();
50553 SDLoc dl(Mst);
50554 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
50555
50556 if (Mst->isTruncatingStore())
50557 return SDValue();
50558
50559 if (SDValue ScalarStore = reduceMaskedStoreToScalarStore(Mst, DAG, Subtarget))
50560 return ScalarStore;
50561
50562 // If the mask value has been legalized to a non-boolean vector, try to
50563 // simplify ops leading up to it. We only demand the MSB of each lane.
50564 SDValue Mask = Mst->getMask();
50565 if (Mask.getScalarValueSizeInBits() != 1) {
50566 APInt DemandedBits(APInt::getSignMask(VT.getScalarSizeInBits()));
50567 if (TLI.SimplifyDemandedBits(Mask, DemandedBits, DCI)) {
50568 if (N->getOpcode() != ISD::DELETED_NODE)
50569 DCI.AddToWorklist(N);
50570 return SDValue(N, 0);
50571 }
50572 if (SDValue NewMask =
50573 TLI.SimplifyMultipleUseDemandedBits(Mask, DemandedBits, DAG))
50574 return DAG.getMaskedStore(Mst->getChain(), SDLoc(N), Mst->getValue(),
50575 Mst->getBasePtr(), Mst->getOffset(), NewMask,
50576 Mst->getMemoryVT(), Mst->getMemOperand(),
50577 Mst->getAddressingMode());
50578 }
50579
50580 SDValue Value = Mst->getValue();
50581 if (Value.getOpcode() == ISD::TRUNCATE && Value.getNode()->hasOneUse() &&
50582 TLI.isTruncStoreLegal(Value.getOperand(0).getValueType(),
50583 Mst->getMemoryVT())) {
50584 return DAG.getMaskedStore(Mst->getChain(), SDLoc(N), Value.getOperand(0),
50585 Mst->getBasePtr(), Mst->getOffset(), Mask,
50586 Mst->getMemoryVT(), Mst->getMemOperand(),
50587 Mst->getAddressingMode(), true);
50588 }
50589
50590 return SDValue();
50591}
50592
50593static SDValue combineStore(SDNode *N, SelectionDAG &DAG,
50594 TargetLowering::DAGCombinerInfo &DCI,
50595 const X86Subtarget &Subtarget) {
50596 StoreSDNode *St = cast<StoreSDNode>(N);
50597 EVT StVT = St->getMemoryVT();
50598 SDLoc dl(St);
50599 SDValue StoredVal = St->getValue();
50600 EVT VT = StoredVal.getValueType();
50601 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
50602
50603 // Convert a store of vXi1 into a store of iX and a bitcast.
50604 if (!Subtarget.hasAVX512() && VT == StVT && VT.isVector() &&
50605 VT.getVectorElementType() == MVT::i1) {
50606
50607 EVT NewVT = EVT::getIntegerVT(*DAG.getContext(), VT.getVectorNumElements());
50608 StoredVal = DAG.getBitcast(NewVT, StoredVal);
50609
50610 return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(),
50611 St->getPointerInfo(), St->getOriginalAlign(),
50612 St->getMemOperand()->getFlags());
50613 }
50614
50615 // If this is a store of a scalar_to_vector to v1i1, just use a scalar store.
50616 // This will avoid a copy to k-register.
50617 if (VT == MVT::v1i1 && VT == StVT && Subtarget.hasAVX512() &&
50618 StoredVal.getOpcode() == ISD::SCALAR_TO_VECTOR &&
50619 StoredVal.getOperand(0).getValueType() == MVT::i8) {
50620 SDValue Val = StoredVal.getOperand(0);
50621 // We must store zeros to the unused bits.
50622 Val = DAG.getZeroExtendInReg(Val, dl, MVT::i1);
50623 return DAG.getStore(St->getChain(), dl, Val,
50624 St->getBasePtr(), St->getPointerInfo(),
50625 St->getOriginalAlign(),
50626 St->getMemOperand()->getFlags());
50627 }
50628
50629  // Widen v1i1/v2i1/v4i1 stores to v8i1.
50630 if ((VT == MVT::v1i1 || VT == MVT::v2i1 || VT == MVT::v4i1) && VT == StVT &&
50631 Subtarget.hasAVX512()) {
50632 unsigned NumConcats = 8 / VT.getVectorNumElements();
50633 // We must store zeros to the unused bits.
50634 SmallVector<SDValue, 4> Ops(NumConcats, DAG.getConstant(0, dl, VT));
50635 Ops[0] = StoredVal;
50636 StoredVal = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i1, Ops);
50637 return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(),
50638 St->getPointerInfo(), St->getOriginalAlign(),
50639 St->getMemOperand()->getFlags());
50640 }
50641
50642 // Turn vXi1 stores of constants into a scalar store.
50643 if ((VT == MVT::v8i1 || VT == MVT::v16i1 || VT == MVT::v32i1 ||
50644 VT == MVT::v64i1) && VT == StVT && TLI.isTypeLegal(VT) &&
50645 ISD::isBuildVectorOfConstantSDNodes(StoredVal.getNode())) {
50646    // If it's a v64i1 store without 64-bit support, we need two stores.
50647 if (!DCI.isBeforeLegalize() && VT == MVT::v64i1 && !Subtarget.is64Bit()) {
50648 SDValue Lo = DAG.getBuildVector(MVT::v32i1, dl,
50649 StoredVal->ops().slice(0, 32));
50650 Lo = combinevXi1ConstantToInteger(Lo, DAG);
50651 SDValue Hi = DAG.getBuildVector(MVT::v32i1, dl,
50652 StoredVal->ops().slice(32, 32));
50653 Hi = combinevXi1ConstantToInteger(Hi, DAG);
50654
50655 SDValue Ptr0 = St->getBasePtr();
50656 SDValue Ptr1 = DAG.getMemBasePlusOffset(Ptr0, TypeSize::Fixed(4), dl);
50657
50658 SDValue Ch0 =
50659 DAG.getStore(St->getChain(), dl, Lo, Ptr0, St->getPointerInfo(),
50660 St->getOriginalAlign(),
50661 St->getMemOperand()->getFlags());
50662 SDValue Ch1 =
50663 DAG.getStore(St->getChain(), dl, Hi, Ptr1,
50664 St->getPointerInfo().getWithOffset(4),
50665 St->getOriginalAlign(),
50666 St->getMemOperand()->getFlags());
50667 return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Ch0, Ch1);
50668 }
50669
50670 StoredVal = combinevXi1ConstantToInteger(StoredVal, DAG);
50671 return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(),
50672 St->getPointerInfo(), St->getOriginalAlign(),
50673 St->getMemOperand()->getFlags());
50674 }
50675
50676 // If we are saving a 32-byte vector and 32-byte stores are slow, such as on
50677 // Sandy Bridge, perform two 16-byte stores.
50678 unsigned Fast;
50679 if (VT.is256BitVector() && StVT == VT &&
50680 TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), VT,
50681 *St->getMemOperand(), &Fast) &&
50682 !Fast) {
50683 unsigned NumElems = VT.getVectorNumElements();
50684 if (NumElems < 2)
50685 return SDValue();
50686
50687 return splitVectorStore(St, DAG);
50688 }
50689
50690 // Split under-aligned vector non-temporal stores.
50691 if (St->isNonTemporal() && StVT == VT &&
50692 St->getAlign().value() < VT.getStoreSize()) {
50693 // ZMM/YMM nt-stores - either it can be stored as a series of shorter
50694 // vectors or the legalizer can scalarize it to use MOVNTI.
50695 if (VT.is256BitVector() || VT.is512BitVector()) {
50696 unsigned NumElems = VT.getVectorNumElements();
50697 if (NumElems < 2)
50698 return SDValue();
50699 return splitVectorStore(St, DAG);
50700 }
50701
50702 // XMM nt-stores - scalarize this to f64 nt-stores on SSE4A, else i32/i64
50703 // to use MOVNTI.
50704 if (VT.is128BitVector() && Subtarget.hasSSE2()) {
50705 MVT NTVT = Subtarget.hasSSE4A()
50706 ? MVT::v2f64
50707 : (TLI.isTypeLegal(MVT::i64) ? MVT::v2i64 : MVT::v4i32);
50708 return scalarizeVectorStore(St, NTVT, DAG);
50709 }
50710 }
50711
50712  // Try to optimize v16i16->v16i8 truncating stores when BWI is not
50713  // supported but AVX512F is, by extending to v16i32 and truncating.
50714 if (!St->isTruncatingStore() && VT == MVT::v16i8 && !Subtarget.hasBWI() &&
50715 St->getValue().getOpcode() == ISD::TRUNCATE &&
50716 St->getValue().getOperand(0).getValueType() == MVT::v16i16 &&
50717 TLI.isTruncStoreLegal(MVT::v16i32, MVT::v16i8) &&
50718 St->getValue().hasOneUse() && !DCI.isBeforeLegalizeOps()) {
50719 SDValue Ext = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::v16i32,
50720 St->getValue().getOperand(0));
50721 return DAG.getTruncStore(St->getChain(), dl, Ext, St->getBasePtr(),
50722 MVT::v16i8, St->getMemOperand());
50723 }
50724
50725 // Try to fold a VTRUNCUS or VTRUNCS into a truncating store.
50726 if (!St->isTruncatingStore() &&
50727 (StoredVal.getOpcode() == X86ISD::VTRUNCUS ||
50728 StoredVal.getOpcode() == X86ISD::VTRUNCS) &&
50729 StoredVal.hasOneUse() &&
50730 TLI.isTruncStoreLegal(StoredVal.getOperand(0).getValueType(), VT)) {
50731 bool IsSigned = StoredVal.getOpcode() == X86ISD::VTRUNCS;
50732 return EmitTruncSStore(IsSigned, St->getChain(),
50733 dl, StoredVal.getOperand(0), St->getBasePtr(),
50734 VT, St->getMemOperand(), DAG);
50735 }
50736
50737  // Try to fold an extract_element(VTRUNC) pattern into a truncating store.
50738 if (!St->isTruncatingStore()) {
50739 auto IsExtractedElement = [](SDValue V) {
50740 if (V.getOpcode() == ISD::TRUNCATE && V.hasOneUse())
50741 V = V.getOperand(0);
50742 unsigned Opc = V.getOpcode();
50743 if ((Opc == ISD::EXTRACT_VECTOR_ELT || Opc == X86ISD::PEXTRW) &&
50744 isNullConstant(V.getOperand(1)) && V.hasOneUse() &&
50745 V.getOperand(0).hasOneUse())
50746 return V.getOperand(0);
50747 return SDValue();
50748 };
50749 if (SDValue Extract = IsExtractedElement(StoredVal)) {
50750 SDValue Trunc = peekThroughOneUseBitcasts(Extract);
50751 if (Trunc.getOpcode() == X86ISD::VTRUNC) {
50752 SDValue Src = Trunc.getOperand(0);
50753 MVT DstVT = Trunc.getSimpleValueType();
50754 MVT SrcVT = Src.getSimpleValueType();
50755 unsigned NumSrcElts = SrcVT.getVectorNumElements();
50756 unsigned NumTruncBits = DstVT.getScalarSizeInBits() * NumSrcElts;
50757 MVT TruncVT = MVT::getVectorVT(DstVT.getScalarType(), NumSrcElts);
50758 if (NumTruncBits == VT.getSizeInBits() &&
50759 TLI.isTruncStoreLegal(SrcVT, TruncVT)) {
50760 return DAG.getTruncStore(St->getChain(), dl, Src, St->getBasePtr(),
50761 TruncVT, St->getMemOperand());
50762 }
50763 }
50764 }
50765 }
50766
50767 // Optimize trunc store (of multiple scalars) to shuffle and store.
50768 // First, pack all of the elements in one place. Next, store to memory
50769 // in fewer chunks.
50770 if (St->isTruncatingStore() && VT.isVector()) {
50771    // Check if we can detect an AVG pattern from the truncation. If so,
50772    // replace the trunc store with a normal store of the result of the AVG
50773    // instruction.
50774 if (DCI.isBeforeLegalize() || TLI.isTypeLegal(St->getMemoryVT()))
50775 if (SDValue Avg = detectAVGPattern(St->getValue(), St->getMemoryVT(), DAG,
50776 Subtarget, dl))
50777 return DAG.getStore(St->getChain(), dl, Avg, St->getBasePtr(),
50778 St->getPointerInfo(), St->getOriginalAlign(),
50779 St->getMemOperand()->getFlags());
50780
50781 if (TLI.isTruncStoreLegal(VT, StVT)) {
50782 if (SDValue Val = detectSSatPattern(St->getValue(), St->getMemoryVT()))
50783 return EmitTruncSStore(true /* Signed saturation */, St->getChain(),
50784 dl, Val, St->getBasePtr(),
50785 St->getMemoryVT(), St->getMemOperand(), DAG);
50786 if (SDValue Val = detectUSatPattern(St->getValue(), St->getMemoryVT(),
50787 DAG, dl))
50788 return EmitTruncSStore(false /* Unsigned saturation */, St->getChain(),
50789 dl, Val, St->getBasePtr(),
50790 St->getMemoryVT(), St->getMemOperand(), DAG);
50791 }
50792
50793 return SDValue();
50794 }
50795
50796 // Cast ptr32 and ptr64 pointers to the default address space before a store.
50797 unsigned AddrSpace = St->getAddressSpace();
50798 if (AddrSpace == X86AS::PTR64 || AddrSpace == X86AS::PTR32_SPTR ||
50799 AddrSpace == X86AS::PTR32_UPTR) {
50800 MVT PtrVT = TLI.getPointerTy(DAG.getDataLayout());
50801 if (PtrVT != St->getBasePtr().getSimpleValueType()) {
50802 SDValue Cast =
50803 DAG.getAddrSpaceCast(dl, PtrVT, St->getBasePtr(), AddrSpace, 0);
50804 return DAG.getStore(St->getChain(), dl, StoredVal, Cast,
50805 St->getPointerInfo(), St->getOriginalAlign(),
50806 St->getMemOperand()->getFlags(), St->getAAInfo());
50807 }
50808 }
50809
50810 // Turn load->store of MMX types into GPR load/stores. This avoids clobbering
50811 // the FP state in cases where an emms may be missing.
50812 // A preferable solution to the general problem is to figure out the right
50813 // places to insert EMMS. This qualifies as a quick hack.
50814
50815 // Similarly, turn load->store of i64 into double load/stores in 32-bit mode.
50816 if (VT.getSizeInBits() != 64)
50817 return SDValue();
50818
50819 const Function &F = DAG.getMachineFunction().getFunction();
50820 bool NoImplicitFloatOps = F.hasFnAttribute(Attribute::NoImplicitFloat);
50821 bool F64IsLegal =
50822 !Subtarget.useSoftFloat() && !NoImplicitFloatOps && Subtarget.hasSSE2();
50823 if ((VT == MVT::i64 && F64IsLegal && !Subtarget.is64Bit()) &&
50824 isa<LoadSDNode>(St->getValue()) &&
50825 cast<LoadSDNode>(St->getValue())->isSimple() &&
50826 St->getChain().hasOneUse() && St->isSimple()) {
50827 LoadSDNode *Ld = cast<LoadSDNode>(St->getValue().getNode());
50828
50829 if (!ISD::isNormalLoad(Ld))
50830 return SDValue();
50831
50832 // Avoid the transformation if there are multiple uses of the loaded value.
50833 if (!Ld->hasNUsesOfValue(1, 0))
50834 return SDValue();
50835
50836 SDLoc LdDL(Ld);
50837 SDLoc StDL(N);
50838 // Lower to a single movq load/store pair.
50839 SDValue NewLd = DAG.getLoad(MVT::f64, LdDL, Ld->getChain(),
50840 Ld->getBasePtr(), Ld->getMemOperand());
50841
50842 // Make sure new load is placed in same chain order.
50843 DAG.makeEquivalentMemoryOrdering(Ld, NewLd);
50844 return DAG.getStore(St->getChain(), StDL, NewLd, St->getBasePtr(),
50845 St->getMemOperand());
50846 }
50847
50848 // This is similar to the above case, but here we handle a scalar 64-bit
50849 // integer store that is extracted from a vector on a 32-bit target.
50850 // If we have SSE2, then we can treat it like a floating-point double
50851 // to get past legalization. The execution dependencies fixup pass will
50852 // choose the optimal machine instruction for the store if this really is
50853 // an integer or v2f32 rather than an f64.
50854 if (VT == MVT::i64 && F64IsLegal && !Subtarget.is64Bit() &&
50855 St->getOperand(1).getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
50856 SDValue OldExtract = St->getOperand(1);
50857 SDValue ExtOp0 = OldExtract.getOperand(0);
50858 unsigned VecSize = ExtOp0.getValueSizeInBits();
50859 EVT VecVT = EVT::getVectorVT(*DAG.getContext(), MVT::f64, VecSize / 64);
50860 SDValue BitCast = DAG.getBitcast(VecVT, ExtOp0);
50861 SDValue NewExtract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
50862 BitCast, OldExtract.getOperand(1));
50863 return DAG.getStore(St->getChain(), dl, NewExtract, St->getBasePtr(),
50864 St->getPointerInfo(), St->getOriginalAlign(),
50865 St->getMemOperand()->getFlags());
50866 }
50867
50868 return SDValue();
50869}
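// [Editorial note, not part of the original file] The last two blocks above
// funnel a 32-bit-mode i64 load->store (or extract->store) through f64 so a
// single 64-bit move pair is emitted instead of two 32-bit GPR memory
// operations; as the earlier comment notes, the execution-dependency fixup
// pass later picks the best domain for the final machine instruction.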
50870
50871static SDValue combineVEXTRACT_STORE(SDNode *N, SelectionDAG &DAG,
50872 TargetLowering::DAGCombinerInfo &DCI,
50873 const X86Subtarget &Subtarget) {
50874 auto *St = cast<MemIntrinsicSDNode>(N);
50875
50876 SDValue StoredVal = N->getOperand(1);
50877 MVT VT = StoredVal.getSimpleValueType();
50878 EVT MemVT = St->getMemoryVT();
50879
50880 // Figure out which elements we demand.
50881 unsigned StElts = MemVT.getSizeInBits() / VT.getScalarSizeInBits();
50882 APInt DemandedElts = APInt::getLowBitsSet(VT.getVectorNumElements(), StElts);
50883
50884 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
50885 if (TLI.SimplifyDemandedVectorElts(StoredVal, DemandedElts, DCI)) {
50886 if (N->getOpcode() != ISD::DELETED_NODE)
50887 DCI.AddToWorklist(N);
50888 return SDValue(N, 0);
50889 }
50890
50891 return SDValue();
50892}
50893
50894/// Return 'true' if this vector operation is "horizontal"
50895/// and return the operands for the horizontal operation in LHS and RHS. A
50896/// horizontal operation performs the binary operation on successive elements
50897/// of its first operand, then on successive elements of its second operand,
50898/// returning the resulting values in a vector. For example, if
50899/// A = < float a0, float a1, float a2, float a3 >
50900/// and
50901/// B = < float b0, float b1, float b2, float b3 >
50902/// then the result of doing a horizontal operation on A and B is
50903/// A horizontal-op B = < a0 op a1, a2 op a3, b0 op b1, b2 op b3 >.
50904/// In short, LHS and RHS are inspected to see if LHS op RHS is of the form
50905/// A horizontal-op B, for some already available A and B, and if so then LHS is
50906/// set to A, RHS to B, and the routine returns 'true'.
50907static bool isHorizontalBinOp(unsigned HOpcode, SDValue &LHS, SDValue &RHS,
50908 SelectionDAG &DAG, const X86Subtarget &Subtarget,
50909 bool IsCommutative,
50910 SmallVectorImpl<int> &PostShuffleMask) {
50911 // If either operand is undef, bail out. The binop should be simplified.
50912 if (LHS.isUndef() || RHS.isUndef())
50913 return false;
50914
50915 // Look for the following pattern:
50916 // A = < float a0, float a1, float a2, float a3 >
50917 // B = < float b0, float b1, float b2, float b3 >
50918 // and
50919 // LHS = VECTOR_SHUFFLE A, B, <0, 2, 4, 6>
50920 // RHS = VECTOR_SHUFFLE A, B, <1, 3, 5, 7>
50921 // then LHS op RHS = < a0 op a1, a2 op a3, b0 op b1, b2 op b3 >
50922 // which is A horizontal-op B.
50923
50924 MVT VT = LHS.getSimpleValueType();
50925 assert((VT.is128BitVector() || VT.is256BitVector()) &&
50926 "Unsupported vector type for horizontal add/sub");
50927 unsigned NumElts = VT.getVectorNumElements();
50928
50929 auto GetShuffle = [&](SDValue Op, SDValue &N0, SDValue &N1,
50930 SmallVectorImpl<int> &ShuffleMask) {
50931 bool UseSubVector = false;
50932 if (Op.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
50933 Op.getOperand(0).getValueType().is256BitVector() &&
50934 llvm::isNullConstant(Op.getOperand(1))) {
50935 Op = Op.getOperand(0);
50936 UseSubVector = true;
50937 }
50938 SmallVector<SDValue, 2> SrcOps;
50939 SmallVector<int, 16> SrcMask, ScaledMask;
50940 SDValue BC = peekThroughBitcasts(Op);
50941 if (getTargetShuffleInputs(BC, SrcOps, SrcMask, DAG) &&
50942 !isAnyZero(SrcMask) && all_of(SrcOps, [BC](SDValue Op) {
50943 return Op.getValueSizeInBits() == BC.getValueSizeInBits();
50944 })) {
50945 resolveTargetShuffleInputsAndMask(SrcOps, SrcMask);
50946 if (!UseSubVector && SrcOps.size() <= 2 &&
50947 scaleShuffleElements(SrcMask, NumElts, ScaledMask)) {
50948 N0 = SrcOps.size() > 0 ? SrcOps[0] : SDValue();
50949 N1 = SrcOps.size() > 1 ? SrcOps[1] : SDValue();
50950 ShuffleMask.assign(ScaledMask.begin(), ScaledMask.end());
50951 }
50952 if (UseSubVector && SrcOps.size() == 1 &&
50953 scaleShuffleElements(SrcMask, 2 * NumElts, ScaledMask)) {
50954 std::tie(N0, N1) = DAG.SplitVector(SrcOps[0], SDLoc(Op));
50955 ArrayRef<int> Mask = ArrayRef<int>(ScaledMask).slice(0, NumElts);
50956 ShuffleMask.assign(Mask.begin(), Mask.end());
50957 }
50958 }
50959 };
50960
50961 // View LHS in the form
50962 // LHS = VECTOR_SHUFFLE A, B, LMask
50963 // If LHS is not a shuffle, then pretend it is the identity shuffle:
50964 // LHS = VECTOR_SHUFFLE LHS, undef, <0, 1, ..., N-1>
50965 // NOTE: A default initialized SDValue represents an UNDEF of type VT.
50966 SDValue A, B;
50967 SmallVector<int, 16> LMask;
50968 GetShuffle(LHS, A, B, LMask);
50969
50970 // Likewise, view RHS in the form
50971 // RHS = VECTOR_SHUFFLE C, D, RMask
50972 SDValue C, D;
50973 SmallVector<int, 16> RMask;
50974 GetShuffle(RHS, C, D, RMask);
50975
50976 // At least one of the operands should be a vector shuffle.
50977 unsigned NumShuffles = (LMask.empty() ? 0 : 1) + (RMask.empty() ? 0 : 1);
50978 if (NumShuffles == 0)
50979 return false;
50980
50981 if (LMask.empty()) {
50982 A = LHS;
50983 for (unsigned i = 0; i != NumElts; ++i)
50984 LMask.push_back(i);
50985 }
50986
50987 if (RMask.empty()) {
50988 C = RHS;
50989 for (unsigned i = 0; i != NumElts; ++i)
50990 RMask.push_back(i);
50991 }
50992
50993 // If we have a unary mask, ensure the other op is set to null.
50994 if (isUndefOrInRange(LMask, 0, NumElts))
50995 B = SDValue();
50996 else if (isUndefOrInRange(LMask, NumElts, NumElts * 2))
50997 A = SDValue();
50998
50999 if (isUndefOrInRange(RMask, 0, NumElts))
51000 D = SDValue();
51001 else if (isUndefOrInRange(RMask, NumElts, NumElts * 2))
51002 C = SDValue();
51003
51004 // If A and B occur in reverse order in RHS, then canonicalize by commuting
51005 // RHS operands and shuffle mask.
51006 if (A != C) {
51007 std::swap(C, D);
51008 ShuffleVectorSDNode::commuteMask(RMask);
51009 }
51010 // Check that the shuffles are both shuffling the same vectors.
51011 if (!(A == C && B == D))
51012 return false;
51013
51014 PostShuffleMask.clear();
51015 PostShuffleMask.append(NumElts, SM_SentinelUndef);
51016
51017 // LHS and RHS are now:
51018 // LHS = shuffle A, B, LMask
51019 // RHS = shuffle A, B, RMask
51020 // Check that the masks correspond to performing a horizontal operation.
51021 // AVX defines horizontal add/sub to operate independently on 128-bit lanes,
51022 // so we just repeat the inner loop if this is a 256-bit op.
51023 unsigned Num128BitChunks = VT.getSizeInBits() / 128;
51024 unsigned NumEltsPer128BitChunk = NumElts / Num128BitChunks;
51025 unsigned NumEltsPer64BitChunk = NumEltsPer128BitChunk / 2;
51026 assert((NumEltsPer128BitChunk % 2 == 0) &&
51027 "Vector type should have an even number of elements in each lane");
51028 for (unsigned j = 0; j != NumElts; j += NumEltsPer128BitChunk) {
51029 for (unsigned i = 0; i != NumEltsPer128BitChunk; ++i) {
51030 // Ignore undefined components.
51031 int LIdx = LMask[i + j], RIdx = RMask[i + j];
51032 if (LIdx < 0 || RIdx < 0 ||
51033 (!A.getNode() && (LIdx < (int)NumElts || RIdx < (int)NumElts)) ||
51034 (!B.getNode() && (LIdx >= (int)NumElts || RIdx >= (int)NumElts)))
51035 continue;
51036
51037 // Check that successive odd/even elements are being operated on. If not,
51038 // this is not a horizontal operation.
51039 if (!((RIdx & 1) == 1 && (LIdx + 1) == RIdx) &&
51040 !((LIdx & 1) == 1 && (RIdx + 1) == LIdx && IsCommutative))
51041 return false;
51042
51043 // Compute the post-shuffle mask index based on where the element
51044 // is stored in the HOP result, and where it needs to be moved to.
51045 int Base = LIdx & ~1u;
51046 int Index = ((Base % NumEltsPer128BitChunk) / 2) +
51047 ((Base % NumElts) & ~(NumEltsPer128BitChunk - 1));
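// Worked example (illustrative, v4f32 case): LIdx = 6 selects b2, so Base = 6
// and Index = (6 % 4) / 2 + ((6 % 4) & ~3) = 1; the adjustment below then adds
// NumEltsPer64BitChunk (2) because B is used and Base >= NumElts, giving
// Index = 3, the slot of b2 op b3 in <a0 op a1, a2 op a3, b0 op b1, b2 op b3>.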
51048
51049 // The low half of the 128-bit result must choose from A.
51050 // The high half of the 128-bit result must choose from B,
51051 // unless B is undef. In that case, we are always choosing from A.
51052 if ((B && Base >= (int)NumElts) || (!B && i >= NumEltsPer64BitChunk))
51053 Index += NumEltsPer64BitChunk;
51054 PostShuffleMask[i + j] = Index;
51055 }
51056 }
51057
51058 SDValue NewLHS = A.getNode() ? A : B; // If A is 'UNDEF', use B for it.
51059 SDValue NewRHS = B.getNode() ? B : A; // If B is 'UNDEF', use A for it.
51060
51061 bool IsIdentityPostShuffle =
51062 isSequentialOrUndefInRange(PostShuffleMask, 0, NumElts, 0);
51063 if (IsIdentityPostShuffle)
51064 PostShuffleMask.clear();
51065
51066 // Avoid 128-bit multi lane shuffles if pre-AVX2 and FP (integer will split).
51067 if (!IsIdentityPostShuffle && !Subtarget.hasAVX2() && VT.isFloatingPoint() &&
51068 isMultiLaneShuffleMask(128, VT.getScalarSizeInBits(), PostShuffleMask))
51069 return false;
51070
51071 // If the source nodes are already used in HorizOps then always accept this.
51072 // Shuffle folding should merge these back together.
51073 bool FoundHorizLHS = llvm::any_of(NewLHS->uses(), [&](SDNode *User) {
51074 return User->getOpcode() == HOpcode && User->getValueType(0) == VT;
51075 });
51076 bool FoundHorizRHS = llvm::any_of(NewRHS->uses(), [&](SDNode *User) {
51077 return User->getOpcode() == HOpcode && User->getValueType(0) == VT;
51078 });
51079 bool ForceHorizOp = FoundHorizLHS && FoundHorizRHS;
51080
51081 // Assume a SingleSource HOP if we only shuffle one input and don't need to
51082 // shuffle the result.
51083 if (!ForceHorizOp &&
51084 !shouldUseHorizontalOp(NewLHS == NewRHS &&
51085 (NumShuffles < 2 || !IsIdentityPostShuffle),
51086 DAG, Subtarget))
51087 return false;
51088
51089 LHS = DAG.getBitcast(VT, NewLHS);
51090 RHS = DAG.getBitcast(VT, NewRHS);
51091 return true;
51092}
51093
51094// Try to synthesize horizontal (f)hadd/hsub from (f)adds/subs of shuffles.
51095static SDValue combineToHorizontalAddSub(SDNode *N, SelectionDAG &DAG,
51096 const X86Subtarget &Subtarget) {
51097 EVT VT = N->getValueType(0);
51098 unsigned Opcode = N->getOpcode();
51099 bool IsAdd = (Opcode == ISD::FADD) || (Opcode == ISD::ADD);
51100 SmallVector<int, 8> PostShuffleMask;
51101
51102 switch (Opcode) {
51103 case ISD::FADD:
51104 case ISD::FSUB:
51105 if ((Subtarget.hasSSE3() && (VT == MVT::v4f32 || VT == MVT::v2f64)) ||
51106 (Subtarget.hasAVX() && (VT == MVT::v8f32 || VT == MVT::v4f64))) {
51107 SDValue LHS = N->getOperand(0);
51108 SDValue RHS = N->getOperand(1);
51109 auto HorizOpcode = IsAdd ? X86ISD::FHADD : X86ISD::FHSUB;
51110 if (isHorizontalBinOp(HorizOpcode, LHS, RHS, DAG, Subtarget, IsAdd,
51111 PostShuffleMask)) {
51112 SDValue HorizBinOp = DAG.getNode(HorizOpcode, SDLoc(N), VT, LHS, RHS);
51113 if (!PostShuffleMask.empty())
51114 HorizBinOp = DAG.getVectorShuffle(VT, SDLoc(HorizBinOp), HorizBinOp,
51115 DAG.getUNDEF(VT), PostShuffleMask);
51116 return HorizBinOp;
51117 }
51118 }
51119 break;
51120 case ISD::ADD:
51121 case ISD::SUB:
51122 if (Subtarget.hasSSSE3() && (VT == MVT::v8i16 || VT == MVT::v4i32 ||
51123 VT == MVT::v16i16 || VT == MVT::v8i32)) {
51124 SDValue LHS = N->getOperand(0);
51125 SDValue RHS = N->getOperand(1);
51126 auto HorizOpcode = IsAdd ? X86ISD::HADD : X86ISD::HSUB;
51127 if (isHorizontalBinOp(HorizOpcode, LHS, RHS, DAG, Subtarget, IsAdd,
51128 PostShuffleMask)) {
51129 auto HOpBuilder = [HorizOpcode](SelectionDAG &DAG, const SDLoc &DL,
51130 ArrayRef<SDValue> Ops) {
51131 return DAG.getNode(HorizOpcode, DL, Ops[0].getValueType(), Ops);
51132 };
51133 SDValue HorizBinOp = SplitOpsAndApply(DAG, Subtarget, SDLoc(N), VT,
51134 {LHS, RHS}, HOpBuilder);
51135 if (!PostShuffleMask.empty())
51136 HorizBinOp = DAG.getVectorShuffle(VT, SDLoc(HorizBinOp), HorizBinOp,
51137 DAG.getUNDEF(VT), PostShuffleMask);
51138 return HorizBinOp;
51139 }
51140 }
51141 break;
51142 }
51143
51144 return SDValue();
51145}
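// Illustrative sketch (not from the listed source): with SSE3 and v4f32,
//   LHS = vector_shuffle A, B, <0, 2, 4, 6>
//   RHS = vector_shuffle A, B, <1, 3, 5, 7>
//   fadd LHS, RHS
// is matched by isHorizontalBinOp and rebuilt as (X86ISD::FHADD A, B),
// followed by an optional shuffle when PostShuffleMask is not the identity.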
51146
51147// Try to combine the following nodes
51148// t29: i64 = X86ISD::Wrapper TargetConstantPool:i64
51149// <i32 -2147483648[float -0.000000e+00]> 0
51150// t27: v16i32[v16f32],ch = X86ISD::VBROADCAST_LOAD
51151// <(load 4 from constant-pool)> t0, t29
51152// [t30: v16i32 = bitcast t27]
51153// t6: v16i32 = xor t7, t27[t30]
51154// t11: v16f32 = bitcast t6
51155// t21: v16f32 = X86ISD::VFMULC[X86ISD::VCFMULC] t11, t8
51156// into X86ISD::VFCMULC[X86ISD::VFMULC] if possible:
51157// t22: v16f32 = bitcast t7
51158// t23: v16f32 = X86ISD::VFCMULC[X86ISD::VFMULC] t8, t22
51159// t24: v32f16 = bitcast t23
51160static SDValue combineFMulcFCMulc(SDNode *N, SelectionDAG &DAG,
51161 const X86Subtarget &Subtarget) {
51162 EVT VT = N->getValueType(0);
51163 SDValue LHS = N->getOperand(0);
51164 SDValue RHS = N->getOperand(1);
51165 int CombineOpcode =
51166 N->getOpcode() == X86ISD::VFCMULC ? X86ISD::VFMULC : X86ISD::VFCMULC;
51167 auto isConjugationConstant = [](const Constant *c) {
51168 if (const auto *CI = dyn_cast<ConstantInt>(c)) {
51169 APInt ConjugationInt32 = APInt(32, 0x80000000, true);
51170 APInt ConjugationInt64 = APInt(64, 0x8000000080000000ULL, true);
51171 switch (CI->getBitWidth()) {
51172 case 16:
51173 return false;
51174 case 32:
51175 return CI->getValue() == ConjugationInt32;
51176 case 64:
51177 return CI->getValue() == ConjugationInt64;
51178 default:
51179 llvm_unreachable("Unexpected bit width")::llvm::llvm_unreachable_internal("Unexpected bit width", "llvm/lib/Target/X86/X86ISelLowering.cpp"
, 51179)
;
51180 }
51181 }
51182 if (const auto *CF = dyn_cast<ConstantFP>(c))
51183 return CF->isNegativeZeroValue();
51184 return false;
51185 };
51186 auto combineConjugation = [&](SDValue &r) {
51187 if (LHS->getOpcode() == ISD::BITCAST && RHS.hasOneUse()) {
51188 SDValue XOR = LHS.getOperand(0);
51189 if (XOR->getOpcode() == ISD::XOR && XOR.hasOneUse()) {
51190 SDValue XORRHS = XOR.getOperand(1);
51191 if (XORRHS.getOpcode() == ISD::BITCAST && XORRHS.hasOneUse())
51192 XORRHS = XORRHS.getOperand(0);
51193 if (XORRHS.getOpcode() == X86ISD::VBROADCAST_LOAD &&
51194 XORRHS.getOperand(1).getNumOperands()) {
51195 ConstantPoolSDNode *CP =
51196 dyn_cast<ConstantPoolSDNode>(XORRHS.getOperand(1).getOperand(0));
51197 if (CP && isConjugationConstant(CP->getConstVal())) {
51198 SelectionDAG::FlagInserter FlagsInserter(DAG, N);
51199 SDValue I2F = DAG.getBitcast(VT, LHS.getOperand(0).getOperand(0));
51200 SDValue FCMulC = DAG.getNode(CombineOpcode, SDLoc(N), VT, RHS, I2F);
51201 r = DAG.getBitcast(VT, FCMulC);
51202 return true;
51203 }
51204 }
51205 }
51206 }
51207 return false;
51208 };
51209 SDValue Res;
51210 if (combineConjugation(Res))
51211 return Res;
51212 std::swap(LHS, RHS);
51213 if (combineConjugation(Res))
51214 return Res;
51215 return Res;
51216}
51217
51218// Try to combine the following nodes:
51219// FADD(A, FMA(B, C, 0)) and FADD(A, FMUL(B, C)) to FMA(B, C, A)
51220static SDValue combineFaddCFmul(SDNode *N, SelectionDAG &DAG,
51221 const X86Subtarget &Subtarget) {
51222 auto AllowContract = [&DAG](const SDNodeFlags &Flags) {
51223 return DAG.getTarget().Options.AllowFPOpFusion == FPOpFusion::Fast ||
51224 Flags.hasAllowContract();
51225 };
51226
51227 auto HasNoSignedZero = [&DAG](const SDNodeFlags &Flags) {
51228 return DAG.getTarget().Options.NoSignedZerosFPMath ||
51229 Flags.hasNoSignedZeros();
51230 };
51231 auto IsVectorAllNegativeZero = [](const SDNode *N) {
51232 if (N->getOpcode() != X86ISD::VBROADCAST_LOAD)
51233 return false;
51234 assert(N->getSimpleValueType(0).getScalarType() == MVT::f32 &&
51235 "Unexpected vector type!");
51236 if (ConstantPoolSDNode *CP =
51237 dyn_cast<ConstantPoolSDNode>(N->getOperand(1)->getOperand(0))) {
51238 APInt AI = APInt(32, 0x80008000, true);
51239 if (const auto *CI = dyn_cast<ConstantInt>(CP->getConstVal()))
51240 return CI->getValue() == AI;
51241 if (const auto *CF = dyn_cast<ConstantFP>(CP->getConstVal()))
51242 return CF->getValue() == APFloat(APFloat::IEEEsingle(), AI);
51243 }
51244 return false;
51245 };
51246
51247 if (N->getOpcode() != ISD::FADD || !Subtarget.hasFP16() ||
51248 !AllowContract(N->getFlags()))
51249 return SDValue();
51250
51251 EVT VT = N->getValueType(0);
51252 if (VT != MVT::v8f16 && VT != MVT::v16f16 && VT != MVT::v32f16)
51253 return SDValue();
51254
51255 SDValue LHS = N->getOperand(0);
51256 SDValue RHS = N->getOperand(1);
51257 bool IsConj;
51258 SDValue FAddOp1, MulOp0, MulOp1;
51259 auto GetCFmulFrom = [&MulOp0, &MulOp1, &IsConj, &AllowContract,
51260 &IsVectorAllNegativeZero,
51261 &HasNoSignedZero](SDValue N) -> bool {
51262 if (!N.hasOneUse() || N.getOpcode() != ISD::BITCAST)
51263 return false;
51264 SDValue Op0 = N.getOperand(0);
51265 unsigned Opcode = Op0.getOpcode();
51266 if (Op0.hasOneUse() && AllowContract(Op0->getFlags())) {
51267 if ((Opcode == X86ISD::VFMULC || Opcode == X86ISD::VFCMULC)) {
51268 MulOp0 = Op0.getOperand(0);
51269 MulOp1 = Op0.getOperand(1);
51270 IsConj = Opcode == X86ISD::VFCMULC;
51271 return true;
51272 }
51273 if ((Opcode == X86ISD::VFMADDC || Opcode == X86ISD::VFCMADDC) &&
51274 ((ISD::isBuildVectorAllZeros(Op0->getOperand(2).getNode()) &&
51275 HasNoSignedZero(Op0->getFlags())) ||
51276 IsVectorAllNegativeZero(Op0->getOperand(2).getNode()))) {
51277 MulOp0 = Op0.getOperand(0);
51278 MulOp1 = Op0.getOperand(1);
51279 IsConj = Opcode == X86ISD::VFCMADDC;
51280 return true;
51281 }
51282 }
51283 return false;
51284 };
51285
51286 if (GetCFmulFrom(LHS))
51287 FAddOp1 = RHS;
51288 else if (GetCFmulFrom(RHS))
51289 FAddOp1 = LHS;
51290 else
51291 return SDValue();
51292
51293 MVT CVT = MVT::getVectorVT(MVT::f32, VT.getVectorNumElements() / 2);
51294 FAddOp1 = DAG.getBitcast(CVT, FAddOp1);
51295 unsigned NewOp = IsConj ? X86ISD::VFCMADDC : X86ISD::VFMADDC;
51296 // FIXME: How do we handle when fast math flags of FADD are different from
51297 // CFMUL's?
51298 SDValue CFmul =
51299 DAG.getNode(NewOp, SDLoc(N), CVT, MulOp0, MulOp1, FAddOp1, N->getFlags());
51300 return DAG.getBitcast(VT, CFmul);
51301}
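// Illustrative sketch (assumed v32f16/v16f32 types, not from the listed source):
//   t1: v16f32 = X86ISD::VFMULC B, C
//   t2: v32f16 = bitcast t1
//   t3: v32f16 = fadd A, t2
// becomes
//   t4: v16f32 = X86ISD::VFMADDC B, C, (v16f32 bitcast A)
//   t5: v32f16 = bitcast t4
// provided contraction is allowed on both the fadd and the multiply.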
51302
51303/// Do target-specific dag combines on floating-point adds/subs.
51304static SDValue combineFaddFsub(SDNode *N, SelectionDAG &DAG,
51305 const X86Subtarget &Subtarget) {
51306 if (SDValue HOp = combineToHorizontalAddSub(N, DAG, Subtarget))
51307 return HOp;
51308
51309 if (SDValue COp = combineFaddCFmul(N, DAG, Subtarget))
51310 return COp;
51311
51312 return SDValue();
51313}
51314
51315/// Attempt to pre-truncate inputs to arithmetic ops if it will simplify
51316/// the codegen.
51317/// e.g. TRUNC( BINOP( X, Y ) ) --> BINOP( TRUNC( X ), TRUNC( Y ) )
51318/// TODO: This overlaps with the generic combiner's visitTRUNCATE. Remove
51319/// anything that is guaranteed to be transformed by DAGCombiner.
51320static SDValue combineTruncatedArithmetic(SDNode *N, SelectionDAG &DAG,
51321 const X86Subtarget &Subtarget,
51322 const SDLoc &DL) {
51323 assert(N->getOpcode() == ISD::TRUNCATE && "Wrong opcode");
51324 SDValue Src = N->getOperand(0);
51325 unsigned SrcOpcode = Src.getOpcode();
51326 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
51327
51328 EVT VT = N->getValueType(0);
51329 EVT SrcVT = Src.getValueType();
51330
51331 auto IsFreeTruncation = [VT](SDValue Op) {
51332 unsigned TruncSizeInBits = VT.getScalarSizeInBits();
51333
51334 // See if this has been extended from a smaller/equal size to
51335 // the truncation size, allowing a truncation to combine with the extend.
51336 unsigned Opcode = Op.getOpcode();
51337 if ((Opcode == ISD::ANY_EXTEND || Opcode == ISD::SIGN_EXTEND ||
51338 Opcode == ISD::ZERO_EXTEND) &&
51339 Op.getOperand(0).getScalarValueSizeInBits() <= TruncSizeInBits)
51340 return true;
51341
51342 // See if this is a single use constant which can be constant folded.
51343 // NOTE: We don't peek through bitcasts here because there is currently
51344 // no support for constant folding truncate+bitcast+vector_of_constants. So
51345 // we'll just end up with a truncate on both operands which will
51346 // get turned back into (truncate (binop)) causing an infinite loop.
51347 return ISD::isBuildVectorOfConstantSDNodes(Op.getNode());
51348 };
51349
51350 auto TruncateArithmetic = [&](SDValue N0, SDValue N1) {
51351 SDValue Trunc0 = DAG.getNode(ISD::TRUNCATE, DL, VT, N0);
51352 SDValue Trunc1 = DAG.getNode(ISD::TRUNCATE, DL, VT, N1);
51353 return DAG.getNode(SrcOpcode, DL, VT, Trunc0, Trunc1);
51354 };
51355
51356 // Don't combine if the operation has other uses.
51357 if (!Src.hasOneUse())
51358 return SDValue();
51359
51360 // Only support vector truncation for now.
51361 // TODO: i64 scalar math would benefit as well.
51362 if (!VT.isVector())
51363 return SDValue();
51364
51365 // In most cases it's only worth pre-truncating if we're only facing the cost
51366 // of one truncation.
51367 // i.e. if one of the inputs will constant fold or the input is repeated.
51368 switch (SrcOpcode) {
51369 case ISD::MUL:
51370 // X86 is rubbish at scalar and vector i64 multiplies (until AVX512DQ) - it's
51371 // better to truncate if we have the chance.
51372 if (SrcVT.getScalarType() == MVT::i64 &&
51373 TLI.isOperationLegal(SrcOpcode, VT) &&
51374 !TLI.isOperationLegal(SrcOpcode, SrcVT))
51375 return TruncateArithmetic(Src.getOperand(0), Src.getOperand(1));
51376 [[fallthrough]];
51377 case ISD::AND:
51378 case ISD::XOR:
51379 case ISD::OR:
51380 case ISD::ADD:
51381 case ISD::SUB: {
51382 SDValue Op0 = Src.getOperand(0);
51383 SDValue Op1 = Src.getOperand(1);
51384 if (TLI.isOperationLegal(SrcOpcode, VT) &&
51385 (Op0 == Op1 || IsFreeTruncation(Op0) || IsFreeTruncation(Op1)))
51386 return TruncateArithmetic(Op0, Op1);
51387 break;
51388 }
51389 }
51390
51391 return SDValue();
51392}
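// Illustrative sketch (not from the listed source): on a target where v4i32
// multiplies are legal but v4i64 multiplies are not,
//   (v4i32 trunc (v4i64 mul X, Y))
// is pre-truncated to
//   (v4i32 mul (trunc X), (trunc Y))
// so the expensive wide multiply never needs to be legalized.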
51393
51394/// Truncate using ISD::AND mask and X86ISD::PACKUS.
51395/// e.g. trunc <8 x i32> X to <8 x i16> -->
51396/// MaskX = X & 0xffff (clear high bits to prevent saturation)
51397/// packus (extract_subv MaskX, 0), (extract_subv MaskX, 1)
51398static SDValue combineVectorTruncationWithPACKUS(SDNode *N, const SDLoc &DL,
51399 const X86Subtarget &Subtarget,
51400 SelectionDAG &DAG) {
51401 SDValue In = N->getOperand(0);
51402 EVT InVT = In.getValueType();
51403 EVT OutVT = N->getValueType(0);
51404
51405 APInt Mask = APInt::getLowBitsSet(InVT.getScalarSizeInBits(),
51406 OutVT.getScalarSizeInBits());
51407 In = DAG.getNode(ISD::AND, DL, InVT, In, DAG.getConstant(Mask, DL, InVT));
51408 return truncateVectorWithPACK(X86ISD::PACKUS, OutVT, In, DL, DAG, Subtarget);
51409}
51410
51411/// Truncate a group of v4i32 into v8i16 using X86ISD::PACKSS.
51412static SDValue combineVectorTruncationWithPACKSS(SDNode *N, const SDLoc &DL,
51413 const X86Subtarget &Subtarget,
51414 SelectionDAG &DAG) {
51415 SDValue In = N->getOperand(0);
51416 EVT InVT = In.getValueType();
51417 EVT OutVT = N->getValueType(0);
51418 In = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, InVT, In,
51419 DAG.getValueType(OutVT));
51420 return truncateVectorWithPACK(X86ISD::PACKSS, OutVT, In, DL, DAG, Subtarget);
51421}
51422
51423/// This function transforms truncation from vXi32/vXi64 to vXi8/vXi16 into
51424/// X86ISD::PACKUS/X86ISD::PACKSS operations. We do it here because after type
51425/// legalization the truncation will be translated into a BUILD_VECTOR with each
51426/// element that is extracted from a vector and then truncated, and it is
51427/// difficult to do this optimization based on them.
51428static SDValue combineVectorTruncation(SDNode *N, SelectionDAG &DAG,
51429 const X86Subtarget &Subtarget) {
51430 EVT OutVT = N->getValueType(0);
51431 if (!OutVT.isVector())
51432 return SDValue();
51433
51434 SDValue In = N->getOperand(0);
51435 if (!In.getValueType().isSimple())
51436 return SDValue();
51437
51438 EVT InVT = In.getValueType();
51439 unsigned NumElems = OutVT.getVectorNumElements();
51440
51441 // AVX512 provides fast truncate ops.
51442 if (!Subtarget.hasSSE2() || Subtarget.hasAVX512())
51443 return SDValue();
51444
51445 EVT OutSVT = OutVT.getVectorElementType();
51446 EVT InSVT = InVT.getVectorElementType();
51447 if (!((InSVT == MVT::i16 || InSVT == MVT::i32 || InSVT == MVT::i64) &&
51448 (OutSVT == MVT::i8 || OutSVT == MVT::i16) && isPowerOf2_32(NumElems) &&
51449 NumElems >= 8))
51450 return SDValue();
51451
51452 // SSSE3's pshufb results in fewer instructions in the cases below.
51453 if (Subtarget.hasSSSE3() && NumElems == 8) {
51454 if (InSVT == MVT::i16)
51455 return SDValue();
51456 if (InSVT == MVT::i32 &&
51457 (OutSVT == MVT::i8 || !Subtarget.hasSSE41() || Subtarget.hasInt256()))
51458 return SDValue();
51459 }
51460
51461 SDLoc DL(N);
51462 // SSE2 provides PACKUS for only 2 x v8i16 -> v16i8 and SSE4.1 provides PACKUS
51463 // for 2 x v4i32 -> v8i16. For SSSE3 and below, we need to use PACKSS to
51464 // truncate 2 x v4i32 to v8i16.
51465 if (Subtarget.hasSSE41() || OutSVT == MVT::i8)
51466 return combineVectorTruncationWithPACKUS(N, DL, Subtarget, DAG);
51467 if (InSVT == MVT::i32)
51468 return combineVectorTruncationWithPACKSS(N, DL, Subtarget, DAG);
51469
51470 return SDValue();
51471}
51472
51473/// This function transforms vector truncation of 'extended sign-bits' or
51474/// 'extended zero-bits' values.
51475/// vXi16/vXi32/vXi64 to vXi8/vXi16/vXi32 into X86ISD::PACKSS/PACKUS operations.
51476static SDValue combineVectorSignBitsTruncation(SDNode *N, const SDLoc &DL,
51477 SelectionDAG &DAG,
51478 const X86Subtarget &Subtarget) {
51479 // Requires SSE2.
51480 if (!Subtarget.hasSSE2())
51481 return SDValue();
51482
51483 if (!N->getValueType(0).isVector() || !N->getValueType(0).isSimple())
51484 return SDValue();
51485
51486 SDValue In = N->getOperand(0);
51487 if (!In.getValueType().isSimple())
51488 return SDValue();
51489
51490 MVT VT = N->getValueType(0).getSimpleVT();
51491 MVT SVT = VT.getScalarType();
51492
51493 MVT InVT = In.getValueType().getSimpleVT();
51494 MVT InSVT = InVT.getScalarType();
51495
51496 // Check we have a truncation suited for PACKSS/PACKUS.
51497 if (!isPowerOf2_32(VT.getVectorNumElements()))
51498 return SDValue();
51499 if (SVT != MVT::i8 && SVT != MVT::i16 && SVT != MVT::i32)
51500 return SDValue();
51501 if (InSVT != MVT::i16 && InSVT != MVT::i32 && InSVT != MVT::i64)
51502 return SDValue();
51503
51504 // Truncation to sub-128bit vXi32 can be better handled with shuffles.
51505 if (SVT == MVT::i32 && VT.getSizeInBits() < 128)
51506 return SDValue();
51507
51508 // AVX512 has fast truncate, but if the input is already going to be split,
51509 // there's no harm in trying pack.
51510 if (Subtarget.hasAVX512() &&
51511 !(!Subtarget.useAVX512Regs() && VT.is256BitVector() &&
51512 InVT.is512BitVector())) {
51513 // PACK should still be worth it for 128-bit vectors if the sources were
51514 // originally concatenated from subvectors.
51515 SmallVector<SDValue> ConcatOps;
51516 if (VT.getSizeInBits() > 128 ||
51517 !collectConcatOps(In.getNode(), ConcatOps, DAG))
51518 return SDValue();
51519 }
51520
51521 unsigned NumPackedSignBits = std::min<unsigned>(SVT.getSizeInBits(), 16);
51522 unsigned NumPackedZeroBits = Subtarget.hasSSE41() ? NumPackedSignBits : 8;
51523
51524 // Use PACKUS if the input has zero-bits that extend all the way to the
51525 // packed/truncated value. e.g. masks, zext_in_reg, etc.
51526 KnownBits Known = DAG.computeKnownBits(In);
51527 unsigned NumLeadingZeroBits = Known.countMinLeadingZeros();
51528 if (NumLeadingZeroBits >= (InSVT.getSizeInBits() - NumPackedZeroBits))
51529 return truncateVectorWithPACK(X86ISD::PACKUS, VT, In, DL, DAG, Subtarget);
51530
51531 // Use PACKSS if the input has sign-bits that extend all the way to the
51532 // packed/truncated value. e.g. Comparison result, sext_in_reg, etc.
51533 unsigned NumSignBits = DAG.ComputeNumSignBits(In);
51534
51535 // Don't use PACKSS for vXi64 -> vXi32 truncations unless we're dealing with
51536 // a sign splat. ComputeNumSignBits struggles to see through BITCASTs later
51537 // on and combines/simplifications can't then use it.
51538 if (SVT == MVT::i32 && NumSignBits != InSVT.getSizeInBits())
51539 return SDValue();
51540
51541 unsigned MinSignBits = InSVT.getSizeInBits() - NumPackedSignBits;
51542 if (NumSignBits > MinSignBits)
51543 return truncateVectorWithPACK(X86ISD::PACKSS, VT, In, DL, DAG, Subtarget);
51544
51545 // If we have a srl that only generates signbits that we will discard in
51546 // the truncation then we can use PACKSS by converting the srl to a sra.
51547 // SimplifyDemandedBits often relaxes sra to srl so we need to reverse it.
51548 if (In.getOpcode() == ISD::SRL && N->isOnlyUserOf(In.getNode()))
51549 if (const APInt *ShAmt = DAG.getValidShiftAmountConstant(
51550 In, APInt::getAllOnes(VT.getVectorNumElements()))) {
51551 if (*ShAmt == MinSignBits) {
51552 SDValue NewIn = DAG.getNode(ISD::SRA, DL, InVT, In->ops());
51553 return truncateVectorWithPACK(X86ISD::PACKSS, VT, NewIn, DL, DAG,
51554 Subtarget);
51555 }
51556 }
51557
51558 return SDValue();
51559}
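// Illustrative sketch (not from the listed source): truncating a v8i32
// comparison result to v8i16 sees 32 sign bits per element, so the PACKSS
// path above is taken; a value already masked to its low 16 bits instead
// passes the leading-zero test (with SSE4.1) and is packed with PACKUS.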
51560
51561// Try to form a MULHU or MULHS node by looking for
51562// (trunc (srl (mul ext, ext), 16))
51563// TODO: This is X86 specific because we want to be able to handle wide types
51564// before type legalization. But we can only do it if the vector will be
51565// legalized via widening/splitting. Type legalization can't handle promotion
51566// of a MULHU/MULHS. There isn't a way to convey this to the generic DAG
51567// combiner.
51568static SDValue combinePMULH(SDValue Src, EVT VT, const SDLoc &DL,
51569 SelectionDAG &DAG, const X86Subtarget &Subtarget) {
51570 // First instruction should be a right shift of a multiply.
51571 if (Src.getOpcode() != ISD::SRL ||
51572 Src.getOperand(0).getOpcode() != ISD::MUL)
51573 return SDValue();
51574
51575 if (!Subtarget.hasSSE2())
51576 return SDValue();
51577
51578 // Only handle vXi16 types that are at least 128-bits unless they will be
51579 // widened.
51580 if (!VT.isVector() || VT.getVectorElementType() != MVT::i16)
51581 return SDValue();
51582
51583 // Input type should be at least vXi32.
51584 EVT InVT = Src.getValueType();
51585 if (InVT.getVectorElementType().getSizeInBits() < 32)
51586 return SDValue();
51587
51588 // Need a shift by 16.
51589 APInt ShiftAmt;
51590 if (!ISD::isConstantSplatVector(Src.getOperand(1).getNode(), ShiftAmt) ||
51591 ShiftAmt != 16)
51592 return SDValue();
51593
51594 SDValue LHS = Src.getOperand(0).getOperand(0);
51595 SDValue RHS = Src.getOperand(0).getOperand(1);
51596
51597 // Count leading sign/zero bits on both inputs - if there are enough then
51598 // truncation back to vXi16 will be cheap - either as a pack/shuffle
51599 // sequence or using AVX512 truncations. If the inputs are sext/zext then the
51600 // truncations may actually be free by peeking through to the ext source.
51601 auto IsSext = [&DAG](SDValue V) {
51602 return DAG.ComputeMaxSignificantBits(V) <= 16;
51603 };
51604 auto IsZext = [&DAG](SDValue V) {
51605 return DAG.computeKnownBits(V).countMaxActiveBits() <= 16;
51606 };
51607
51608 bool IsSigned = IsSext(LHS) && IsSext(RHS);
51609 bool IsUnsigned = IsZext(LHS) && IsZext(RHS);
51610 if (!IsSigned && !IsUnsigned)
51611 return SDValue();
51612
51613 // Check if both inputs are extensions, which will be removed by truncation.
51614 bool IsTruncateFree = (LHS.getOpcode() == ISD::SIGN_EXTEND ||
51615 LHS.getOpcode() == ISD::ZERO_EXTEND) &&
51616 (RHS.getOpcode() == ISD::SIGN_EXTEND ||
51617 RHS.getOpcode() == ISD::ZERO_EXTEND) &&
51618 LHS.getOperand(0).getScalarValueSizeInBits() <= 16 &&
51619 RHS.getOperand(0).getScalarValueSizeInBits() <= 16;
51620
51621 // For AVX2+ targets, with the upper bits known zero, we can perform MULHU on
51622 // the (bitcasted) inputs directly, and then cheaply pack/truncate the result
51623 // (upper elts will be zero). Don't attempt this with just AVX512F as MULHU
51624 // will have to split anyway.
51625 unsigned InSizeInBits = InVT.getSizeInBits();
51626 if (IsUnsigned && !IsTruncateFree && Subtarget.hasInt256() &&
51627 !(Subtarget.hasAVX512() && !Subtarget.hasBWI() && VT.is256BitVector()) &&
51628 (InSizeInBits % 16) == 0) {
51629 EVT BCVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16,
51630 InVT.getSizeInBits() / 16);
51631 SDValue Res = DAG.getNode(ISD::MULHU, DL, BCVT, DAG.getBitcast(BCVT, LHS),
51632 DAG.getBitcast(BCVT, RHS));
51633 return DAG.getNode(ISD::TRUNCATE, DL, VT, DAG.getBitcast(InVT, Res));
51634 }
51635
51636 // Truncate back to source type.
51637 LHS = DAG.getNode(ISD::TRUNCATE, DL, VT, LHS);
51638 RHS = DAG.getNode(ISD::TRUNCATE, DL, VT, RHS);
51639
51640 unsigned Opc = IsSigned ? ISD::MULHS : ISD::MULHU;
51641 return DAG.getNode(Opc, DL, VT, LHS, RHS);
51642}
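// Illustrative sketch (assumed v8i16/v8i32 types, not from the listed source):
//   (v8i16 trunc (v8i32 srl (mul (zext X:v8i16), (zext Y:v8i16)), 16))
// passes the sign/zero-bit checks above, the inputs are truncated back to
// v8i16, and the expression becomes (v8i16 mulhu X, Y) (or mulhs for
// sign-extended inputs).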
51643
51644// Attempt to match PMADDUBSW, which multiplies corresponding unsigned bytes
51645// from one vector with signed bytes from another vector, adds together
51646// adjacent pairs of 16-bit products, and saturates the result before
51647// truncating to 16-bits.
51648//
51649// Which looks something like this:
51650// (i16 (ssat (add (mul (zext (even elts (i8 A))), (sext (even elts (i8 B)))),
51651// (mul (zext (odd elts (i8 A)), (sext (odd elts (i8 B))))))))
51652static SDValue detectPMADDUBSW(SDValue In, EVT VT, SelectionDAG &DAG,
51653 const X86Subtarget &Subtarget,
51654 const SDLoc &DL) {
51655 if (!VT.isVector() || !Subtarget.hasSSSE3())
51656 return SDValue();
51657
51658 unsigned NumElems = VT.getVectorNumElements();
51659 EVT ScalarVT = VT.getVectorElementType();
51660 if (ScalarVT != MVT::i16 || NumElems < 8 || !isPowerOf2_32(NumElems))
51661 return SDValue();
51662
51663 SDValue SSatVal = detectSSatPattern(In, VT);
51664 if (!SSatVal || SSatVal.getOpcode() != ISD::ADD)
51665 return SDValue();
51666
51667 // Ok this is a signed saturation of an ADD. See if this ADD is adding pairs
51668 // of multiplies from even/odd elements.
51669 SDValue N0 = SSatVal.getOperand(0);
51670 SDValue N1 = SSatVal.getOperand(1);
51671
51672 if (N0.getOpcode() != ISD::MUL || N1.getOpcode() != ISD::MUL)
51673 return SDValue();
51674
51675 SDValue N00 = N0.getOperand(0);
51676 SDValue N01 = N0.getOperand(1);
51677 SDValue N10 = N1.getOperand(0);
51678 SDValue N11 = N1.getOperand(1);
51679
51680 // TODO: Handle constant vectors and use knownbits/computenumsignbits?
51681 // Canonicalize zero_extend to LHS.
51682 if (N01.getOpcode() == ISD::ZERO_EXTEND)
51683 std::swap(N00, N01);
51684 if (N11.getOpcode() == ISD::ZERO_EXTEND)
51685 std::swap(N10, N11);
51686
51687 // Ensure we have a zero_extend and a sign_extend.
51688 if (N00.getOpcode() != ISD::ZERO_EXTEND ||
51689 N01.getOpcode() != ISD::SIGN_EXTEND ||
51690 N10.getOpcode() != ISD::ZERO_EXTEND ||
51691 N11.getOpcode() != ISD::SIGN_EXTEND)
51692 return SDValue();
51693
51694 // Peek through the extends.
51695 N00 = N00.getOperand(0);
51696 N01 = N01.getOperand(0);
51697 N10 = N10.getOperand(0);
51698 N11 = N11.getOperand(0);
51699
51700 // Ensure the extend is from vXi8.
51701 if (N00.getValueType().getVectorElementType() != MVT::i8 ||
51702 N01.getValueType().getVectorElementType() != MVT::i8 ||
51703 N10.getValueType().getVectorElementType() != MVT::i8 ||
51704 N11.getValueType().getVectorElementType() != MVT::i8)
51705 return SDValue();
51706
51707 // All inputs should be build_vectors.
51708 if (N00.getOpcode() != ISD::BUILD_VECTOR ||
51709 N01.getOpcode() != ISD::BUILD_VECTOR ||
51710 N10.getOpcode() != ISD::BUILD_VECTOR ||
51711 N11.getOpcode() != ISD::BUILD_VECTOR)
51712 return SDValue();
51713
51714 // N00/N10 are zero extended. N01/N11 are sign extended.
51715
51716 // For each element, we need to ensure we have an odd element from one vector
51717 // multiplied by the odd element of another vector and the even element from
51718 // one of the same vectors being multiplied by the even element from the
51719 // other vector. So we need to make sure for each element i, this operator
51720 // is being performed:
51721 // A[2 * i] * B[2 * i] + A[2 * i + 1] * B[2 * i + 1]
51722 SDValue ZExtIn, SExtIn;
51723 for (unsigned i = 0; i != NumElems; ++i) {
51724 SDValue N00Elt = N00.getOperand(i);
51725 SDValue N01Elt = N01.getOperand(i);
51726 SDValue N10Elt = N10.getOperand(i);
51727 SDValue N11Elt = N11.getOperand(i);
51728 // TODO: Be more tolerant of undefs.
51729 if (N00Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
51730 N01Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
51731 N10Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
51732 N11Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
51733 return SDValue();
51734 auto *ConstN00Elt = dyn_cast<ConstantSDNode>(N00Elt.getOperand(1));
51735 auto *ConstN01Elt = dyn_cast<ConstantSDNode>(N01Elt.getOperand(1));
51736 auto *ConstN10Elt = dyn_cast<ConstantSDNode>(N10Elt.getOperand(1));
51737 auto *ConstN11Elt = dyn_cast<ConstantSDNode>(N11Elt.getOperand(1));
51738 if (!ConstN00Elt || !ConstN01Elt || !ConstN10Elt || !ConstN11Elt)
51739 return SDValue();
51740 unsigned IdxN00 = ConstN00Elt->getZExtValue();
51741 unsigned IdxN01 = ConstN01Elt->getZExtValue();
51742 unsigned IdxN10 = ConstN10Elt->getZExtValue();
51743 unsigned IdxN11 = ConstN11Elt->getZExtValue();
51744 // Add is commutative so indices can be reordered.
51745 if (IdxN00 > IdxN10) {
51746 std::swap(IdxN00, IdxN10);
51747 std::swap(IdxN01, IdxN11);
51748 }
51749 // N0 indices must be the even element. N1 indices must be the next odd element.
51750 if (IdxN00 != 2 * i || IdxN10 != 2 * i + 1 ||
51751 IdxN01 != 2 * i || IdxN11 != 2 * i + 1)
51752 return SDValue();
51753 SDValue N00In = N00Elt.getOperand(0);
51754 SDValue N01In = N01Elt.getOperand(0);
51755 SDValue N10In = N10Elt.getOperand(0);
51756 SDValue N11In = N11Elt.getOperand(0);
51757 // The first time we find an input, capture it.
51758 if (!ZExtIn) {
51759 ZExtIn = N00In;
51760 SExtIn = N01In;
51761 }
51762 if (ZExtIn != N00In || SExtIn != N01In ||
51763 ZExtIn != N10In || SExtIn != N11In)
51764 return SDValue();
51765 }
51766
51767 auto PMADDBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
51768 ArrayRef<SDValue> Ops) {
51769 // Shrink by adding truncate nodes and let DAGCombine fold with the
51770 // sources.
51771 EVT InVT = Ops[0].getValueType();
51772 assert(InVT.getScalarType() == MVT::i8 &&
51773 "Unexpected scalar element type");
51774 assert(InVT == Ops[1].getValueType() && "Operands' types mismatch");
51775 EVT ResVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16,
51776 InVT.getVectorNumElements() / 2);
51777 return DAG.getNode(X86ISD::VPMADDUBSW, DL, ResVT, Ops[0], Ops[1]);
51778 };
51779 return SplitOpsAndApply(DAG, Subtarget, DL, VT, { ZExtIn, SExtIn },
51780 PMADDBuilder);
51781}
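// Illustrative sketch (not from the listed source): for a v8i16 result the
// loop above accepts, per element i,
//   ssat(zext(A[2*i]) * sext(B[2*i]) + zext(A[2*i+1]) * sext(B[2*i+1]))
// with A and B vXi8 build_vector sources, and emits (X86ISD::VPMADDUBSW A, B),
// split into 128-bit pieces by SplitOpsAndApply when needed.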
51782
51783static SDValue combineTruncate(SDNode *N, SelectionDAG &DAG,
51784 const X86Subtarget &Subtarget) {
51785 EVT VT = N->getValueType(0);
51786 SDValue Src = N->getOperand(0);
51787 SDLoc DL(N);
51788
51789 // Attempt to pre-truncate inputs to arithmetic ops instead.
51790 if (SDValue V = combineTruncatedArithmetic(N, DAG, Subtarget, DL))
51791 return V;
51792
51793 // Try to detect AVG pattern first.
51794 if (SDValue Avg = detectAVGPattern(Src, VT, DAG, Subtarget, DL))
51795 return Avg;
51796
51797 // Try to detect PMADD
51798 if (SDValue PMAdd = detectPMADDUBSW(Src, VT, DAG, Subtarget, DL))
51799 return PMAdd;
51800
51801 // Try to combine truncation with signed/unsigned saturation.
51802 if (SDValue Val = combineTruncateWithSat(Src, VT, DL, DAG, Subtarget))
51803 return Val;
51804
51805 // Try to combine PMULHUW/PMULHW for vXi16.
51806 if (SDValue V = combinePMULH(Src, VT, DL, DAG, Subtarget))
51807 return V;
51808
51809 // The bitcast source is a direct mmx result.
51810 // Detect bitcasts between i32 to x86mmx
51811 if (Src.getOpcode() == ISD::BITCAST && VT == MVT::i32) {
51812 SDValue BCSrc = Src.getOperand(0);
51813 if (BCSrc.getValueType() == MVT::x86mmx)
51814 return DAG.getNode(X86ISD::MMX_MOVD2W, DL, MVT::i32, BCSrc);
51815 }
51816
51817 // Try to truncate extended sign/zero bits with PACKSS/PACKUS.
51818 if (SDValue V = combineVectorSignBitsTruncation(N, DL, DAG, Subtarget))
51819 return V;
51820
51821 return combineVectorTruncation(N, DAG, Subtarget);
51822}
51823
51824static SDValue combineVTRUNC(SDNode *N, SelectionDAG &DAG,
51825 TargetLowering::DAGCombinerInfo &DCI) {
51826 EVT VT = N->getValueType(0);
51827 SDValue In = N->getOperand(0);
51828 SDLoc DL(N);
51829
51830 if (SDValue SSatVal = detectSSatPattern(In, VT))
51831 return DAG.getNode(X86ISD::VTRUNCS, DL, VT, SSatVal);
51832 if (SDValue USatVal = detectUSatPattern(In, VT, DAG, DL))
51833 return DAG.getNode(X86ISD::VTRUNCUS, DL, VT, USatVal);
51834
51835 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
51836 APInt DemandedMask(APInt::getAllOnes(VT.getScalarSizeInBits()));
51837 if (TLI.SimplifyDemandedBits(SDValue(N, 0), DemandedMask, DCI))
51838 return SDValue(N, 0);
51839
51840 return SDValue();
51841}
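// Illustrative sketch (assumed vXi8 result, not from the listed source): if
// the wide input is already clamped to the signed i8 range, e.g.
//   (X86ISD::VTRUNC (smin (smax X, -128), 127))
// then detectSSatPattern fires and the node becomes (X86ISD::VTRUNCS X).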
51842
51843/// Returns the negated value if the node \p N flips sign of FP value.
51844///
51845/// FP-negation node may have different forms: FNEG(x), FXOR (x, 0x80000000)
51846/// or FSUB(0, x)
51847/// AVX512F does not have FXOR, so FNEG is lowered as
51848/// (bitcast (xor (bitcast x), (bitcast ConstantFP(0x80000000)))).
51849 /// In this case we go through all bitcasts.
51850/// This also recognizes splat of a negated value and returns the splat of that
51851/// value.
51852static SDValue isFNEG(SelectionDAG &DAG, SDNode *N, unsigned Depth = 0) {
51853 if (N->getOpcode() == ISD::FNEG)
51854 return N->getOperand(0);
51855
51856 // Don't recurse exponentially.
51857 if (Depth > SelectionDAG::MaxRecursionDepth)
51858 return SDValue();
51859
51860 unsigned ScalarSize = N->getValueType(0).getScalarSizeInBits();
51861
51862 SDValue Op = peekThroughBitcasts(SDValue(N, 0));
51863 EVT VT = Op->getValueType(0);
51864
51865 // Make sure the element size doesn't change.
51866 if (VT.getScalarSizeInBits() != ScalarSize)
51867 return SDValue();
51868
51869 unsigned Opc = Op.getOpcode();
51870 switch (Opc) {
51871 case ISD::VECTOR_SHUFFLE: {
51872 // For a VECTOR_SHUFFLE(VEC1, VEC2), if the VEC2 is undef, then the negate
51873 // of this is VECTOR_SHUFFLE(-VEC1, UNDEF). The mask can be anything here.
51874 if (!Op.getOperand(1).isUndef())
51875 return SDValue();
51876 if (SDValue NegOp0 = isFNEG(DAG, Op.getOperand(0).getNode(), Depth + 1))
51877 if (NegOp0.getValueType() == VT) // FIXME: Can we do better?
51878 return DAG.getVectorShuffle(VT, SDLoc(Op), NegOp0, DAG.getUNDEF(VT),
51879 cast<ShuffleVectorSDNode>(Op)->getMask());
51880 break;
51881 }
51882 case ISD::INSERT_VECTOR_ELT: {
51883 // Negate of INSERT_VECTOR_ELT(UNDEF, V, INDEX) is INSERT_VECTOR_ELT(UNDEF,
51884 // -V, INDEX).
51885 SDValue InsVector = Op.getOperand(0);
51886 SDValue InsVal = Op.getOperand(1);
51887 if (!InsVector.isUndef())
51888 return SDValue();
51889 if (SDValue NegInsVal = isFNEG(DAG, InsVal.getNode(), Depth + 1))
51890 if (NegInsVal.getValueType() == VT.getVectorElementType()) // FIXME
51891 return DAG.getNode(ISD::INSERT_VECTOR_ELT, SDLoc(Op), VT, InsVector,
51892 NegInsVal, Op.getOperand(2));
51893 break;
51894 }
51895 case ISD::FSUB:
51896 case ISD::XOR:
51897 case X86ISD::FXOR: {
51898 SDValue Op1 = Op.getOperand(1);
51899 SDValue Op0 = Op.getOperand(0);
51900
51901 // For XOR and FXOR, we want to check if constant
51902 // bits of Op1 are sign bit masks. For FSUB, we
51903 // have to check if constant bits of Op0 are sign
51904 // bit masks and hence we swap the operands.
51905 if (Opc == ISD::FSUB)
51906 std::swap(Op0, Op1);
51907
51908 APInt UndefElts;
51909 SmallVector<APInt, 16> EltBits;
51910 // Extract constant bits and see if they are all
51911 // sign bit masks. Ignore the undef elements.
51912 if (getTargetConstantBitsFromNode(Op1, ScalarSize, UndefElts, EltBits,
51913 /* AllowWholeUndefs */ true,
51914 /* AllowPartialUndefs */ false)) {
51915 for (unsigned I = 0, E = EltBits.size(); I < E; I++)
51916 if (!UndefElts[I] && !EltBits[I].isSignMask())
51917 return SDValue();
51918
51919 // Only allow bitcast from correctly-sized constant.
51920 Op0 = peekThroughBitcasts(Op0);
51921 if (Op0.getScalarValueSizeInBits() == ScalarSize)
51922 return Op0;
51923 }
51924 break;
51925 } // case
51926 } // switch
51927
51928 return SDValue();
51929}
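// Illustrative sketch (not from the listed source): both
//   (X86ISD::FXOR X, <splat of 0x80000000>)   and   (fsub -0.0, X)
// are recognized above as negations of X, since the constant operand is a
// sign-bit mask in each element.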
51930
51931static unsigned negateFMAOpcode(unsigned Opcode, bool NegMul, bool NegAcc,
51932 bool NegRes) {
51933 if (NegMul) {
51934 switch (Opcode) {
51935 default: llvm_unreachable("Unexpected opcode");
51936 case ISD::FMA: Opcode = X86ISD::FNMADD; break;
51937 case ISD::STRICT_FMA: Opcode = X86ISD::STRICT_FNMADD; break;
51938 case X86ISD::FMADD_RND: Opcode = X86ISD::FNMADD_RND; break;
51939 case X86ISD::FMSUB: Opcode = X86ISD::FNMSUB; break;
51940 case X86ISD::STRICT_FMSUB: Opcode = X86ISD::STRICT_FNMSUB; break;
51941 case X86ISD::FMSUB_RND: Opcode = X86ISD::FNMSUB_RND; break;
51942 case X86ISD::FNMADD: Opcode = ISD::FMA; break;
51943 case X86ISD::STRICT_FNMADD: Opcode = ISD::STRICT_FMA; break;
51944 case X86ISD::FNMADD_RND: Opcode = X86ISD::FMADD_RND; break;
51945 case X86ISD::FNMSUB: Opcode = X86ISD::FMSUB; break;
51946 case X86ISD::STRICT_FNMSUB: Opcode = X86ISD::STRICT_FMSUB; break;
51947 case X86ISD::FNMSUB_RND: Opcode = X86ISD::FMSUB_RND; break;
51948 }
51949 }
51950
51951 if (NegAcc) {
51952 switch (Opcode) {
51953 default: llvm_unreachable("Unexpected opcode");
51954 case ISD::FMA: Opcode = X86ISD::FMSUB; break;
51955 case ISD::STRICT_FMA: Opcode = X86ISD::STRICT_FMSUB; break;
51956 case X86ISD::FMADD_RND: Opcode = X86ISD::FMSUB_RND; break;
51957 case X86ISD::FMSUB: Opcode = ISD::FMA; break;
51958 case X86ISD::STRICT_FMSUB: Opcode = ISD::STRICT_FMA; break;
51959 case X86ISD::FMSUB_RND: Opcode = X86ISD::FMADD_RND; break;
51960 case X86ISD::FNMADD: Opcode = X86ISD::FNMSUB; break;
51961 case X86ISD::STRICT_FNMADD: Opcode = X86ISD::STRICT_FNMSUB; break;
51962 case X86ISD::FNMADD_RND: Opcode = X86ISD::FNMSUB_RND; break;
51963 case X86ISD::FNMSUB: Opcode = X86ISD::FNMADD; break;
51964 case X86ISD::STRICT_FNMSUB: Opcode = X86ISD::STRICT_FNMADD; break;
51965 case X86ISD::FNMSUB_RND: Opcode = X86ISD::FNMADD_RND; break;
51966 case X86ISD::FMADDSUB: Opcode = X86ISD::FMSUBADD; break;
51967 case X86ISD::FMADDSUB_RND: Opcode = X86ISD::FMSUBADD_RND; break;
51968 case X86ISD::FMSUBADD: Opcode = X86ISD::FMADDSUB; break;
51969 case X86ISD::FMSUBADD_RND: Opcode = X86ISD::FMADDSUB_RND; break;
51970 }
51971 }
51972
51973 if (NegRes) {
51974 switch (Opcode) {
51975 // For accuracy reasons, we never combine fneg and fma under strict FP.
51976 default: llvm_unreachable("Unexpected opcode");
51977 case ISD::FMA: Opcode = X86ISD::FNMSUB; break;
51978 case X86ISD::FMADD_RND: Opcode = X86ISD::FNMSUB_RND; break;
51979 case X86ISD::FMSUB: Opcode = X86ISD::FNMADD; break;
51980 case X86ISD::FMSUB_RND: Opcode = X86ISD::FNMADD_RND; break;
51981 case X86ISD::FNMADD: Opcode = X86ISD::FMSUB; break;
51982 case X86ISD::FNMADD_RND: Opcode = X86ISD::FMSUB_RND; break;
51983 case X86ISD::FNMSUB: Opcode = ISD::FMA; break;
51984 case X86ISD::FNMSUB_RND: Opcode = X86ISD::FMADD_RND; break;
51985 }
51986 }
51987
51988 return Opcode;
51989}
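// Illustrative usage (not from the listed source):
//   negateFMAOpcode(ISD::FMA, /*NegMul=*/true, /*NegAcc=*/false, /*NegRes=*/false)
// yields X86ISD::FNMADD, while
//   negateFMAOpcode(ISD::FMA, /*NegMul=*/false, /*NegAcc=*/true, /*NegRes=*/false)
// yields X86ISD::FMSUB, matching the switch tables above.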
51990
51991/// Do target-specific dag combines on floating point negations.
51992static SDValue combineFneg(SDNode *N, SelectionDAG &DAG,
51993 TargetLowering::DAGCombinerInfo &DCI,
51994 const X86Subtarget &Subtarget) {
51995 EVT OrigVT = N->getValueType(0);
51996 SDValue Arg = isFNEG(DAG, N);
51997 if (!Arg)
51998 return SDValue();
51999
52000 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
52001 EVT VT = Arg.getValueType();
52002 EVT SVT = VT.getScalarType();
52003 SDLoc DL(N);
52004
52005 // Let legalize expand this if it isn't a legal type yet.
52006 if (!TLI.isTypeLegal(VT))
52007 return SDValue();
52008
52009 // If we're negating a FMUL node on a target with FMA, then we can avoid the
52010 // use of a constant by performing (-0 - A*B) instead.
52011 // FIXME: Check rounding control flags as well once it becomes available.
52012 if (Arg.getOpcode() == ISD::FMUL && (SVT == MVT::f32 || SVT == MVT::f64) &&
52013 Arg->getFlags().hasNoSignedZeros() && Subtarget.hasAnyFMA()) {
52014 SDValue Zero = DAG.getConstantFP(0.0, DL, VT);
52015 SDValue NewNode = DAG.getNode(X86ISD::FNMSUB, DL, VT, Arg.getOperand(0),
52016 Arg.getOperand(1), Zero);
52017 return DAG.getBitcast(OrigVT, NewNode);
52018 }
52019
52020 bool CodeSize = DAG.getMachineFunction().getFunction().hasOptSize();
52021 bool LegalOperations = !DCI.isBeforeLegalizeOps();
52022 if (SDValue NegArg =
52023 TLI.getNegatedExpression(Arg, DAG, LegalOperations, CodeSize))
52024 return DAG.getBitcast(OrigVT, NegArg);
52025
52026 return SDValue();
52027}
52028
52029SDValue X86TargetLowering::getNegatedExpression(SDValue Op, SelectionDAG &DAG,
52030 bool LegalOperations,
52031 bool ForCodeSize,
52032 NegatibleCost &Cost,
52033 unsigned Depth) const {
52034 // fneg patterns are removable even if they have multiple uses.
52035 if (SDValue Arg = isFNEG(DAG, Op.getNode(), Depth)) {
52036 Cost = NegatibleCost::Cheaper;
52037 return DAG.getBitcast(Op.getValueType(), Arg);
52038 }
52039
52040 EVT VT = Op.getValueType();
52041 EVT SVT = VT.getScalarType();
52042 unsigned Opc = Op.getOpcode();
52043 SDNodeFlags Flags = Op.getNode()->getFlags();
52044 switch (Opc) {
52045 case ISD::FMA:
52046 case X86ISD::FMSUB:
52047 case X86ISD::FNMADD:
52048 case X86ISD::FNMSUB:
52049 case X86ISD::FMADD_RND:
52050 case X86ISD::FMSUB_RND:
52051 case X86ISD::FNMADD_RND:
52052 case X86ISD::FNMSUB_RND: {
52053 if (!Op.hasOneUse() || !Subtarget.hasAnyFMA() || !isTypeLegal(VT) ||
52054 !(SVT == MVT::f32 || SVT == MVT::f64) ||
52055 !isOperationLegal(ISD::FMA, VT))
52056 break;
52057
52058 // Don't fold (fneg (fma (fneg x), y, (fneg z))) to (fma x, y, z)
52059 // if it may have signed zeros.
52060 if (!Flags.hasNoSignedZeros())
52061 break;
52062
52063 // This is always negatible for free but we might be able to remove some
52064 // extra operand negations as well.
52065 SmallVector<SDValue, 4> NewOps(Op.getNumOperands(), SDValue());
52066 for (int i = 0; i != 3; ++i)
52067 NewOps[i] = getCheaperNegatedExpression(
52068 Op.getOperand(i), DAG, LegalOperations, ForCodeSize, Depth + 1);
52069
52070 bool NegA = !!NewOps[0];
52071 bool NegB = !!NewOps[1];
52072 bool NegC = !!NewOps[2];
52073 unsigned NewOpc = negateFMAOpcode(Opc, NegA != NegB, NegC, true);
52074
52075 Cost = (NegA || NegB || NegC) ? NegatibleCost::Cheaper
52076 : NegatibleCost::Neutral;
52077
52078 // Fill in the non-negated ops with the original values.
52079 for (int i = 0, e = Op.getNumOperands(); i != e; ++i)
52080 if (!NewOps[i])
52081 NewOps[i] = Op.getOperand(i);
52082 return DAG.getNode(NewOpc, SDLoc(Op), VT, NewOps);
52083 }
52084 case X86ISD::FRCP:
52085 if (SDValue NegOp0 =
52086 getNegatedExpression(Op.getOperand(0), DAG, LegalOperations,
52087 ForCodeSize, Cost, Depth + 1))
52088 return DAG.getNode(Opc, SDLoc(Op), VT, NegOp0);
52089 break;
52090 }
52091
52092 return TargetLowering::getNegatedExpression(Op, DAG, LegalOperations,
52093 ForCodeSize, Cost, Depth);
52094}
52095
52096static SDValue lowerX86FPLogicOp(SDNode *N, SelectionDAG &DAG,
52097 const X86Subtarget &Subtarget) {
52098 MVT VT = N->getSimpleValueType(0);
52099 // If we have integer vector types available, use the integer opcodes.
52100 if (!VT.isVector() || !Subtarget.hasSSE2())
52101 return SDValue();
52102
52103 SDLoc dl(N);
52104
52105 unsigned IntBits = VT.getScalarSizeInBits();
52106 MVT IntSVT = MVT::getIntegerVT(IntBits);
52107 MVT IntVT = MVT::getVectorVT(IntSVT, VT.getSizeInBits() / IntBits);
52108
52109 SDValue Op0 = DAG.getBitcast(IntVT, N->getOperand(0));
52110 SDValue Op1 = DAG.getBitcast(IntVT, N->getOperand(1));
52111 unsigned IntOpcode;
52112 switch (N->getOpcode()) {
52113 default: llvm_unreachable("Unexpected FP logic op");
52114 case X86ISD::FOR: IntOpcode = ISD::OR; break;
52115 case X86ISD::FXOR: IntOpcode = ISD::XOR; break;
52116 case X86ISD::FAND: IntOpcode = ISD::AND; break;
52117 case X86ISD::FANDN: IntOpcode = X86ISD::ANDNP; break;
52118 }
52119 SDValue IntOp = DAG.getNode(IntOpcode, dl, IntVT, Op0, Op1);
52120 return DAG.getBitcast(VT, IntOp);
52121}
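// Illustrative sketch (not from the listed source): with SSE2,
//   (X86ISD::FAND v4f32 X, Y)
// is lowered here as
//   (v4f32 bitcast (and v4i32 (bitcast X), (bitcast Y)))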
52122
52123
52124/// Fold a xor(setcc cond, val), 1 --> setcc (inverted(cond), val)
52125static SDValue foldXor1SetCC(SDNode *N, SelectionDAG &DAG) {
52126 if (N->getOpcode() != ISD::XOR)
52127 return SDValue();
52128
52129 SDValue LHS = N->getOperand(0);
52130 if (!isOneConstant(N->getOperand(1)) || LHS->getOpcode() != X86ISD::SETCC)
52131 return SDValue();
52132
52133 X86::CondCode NewCC = X86::GetOppositeBranchCondition(
52134 X86::CondCode(LHS->getConstantOperandVal(0)));
52135 SDLoc DL(N);
52136 return getSETCC(NewCC, LHS->getOperand(1), DL, DAG);
52137}
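// Illustrative sketch (not from the listed source):
//   (xor (X86ISD::SETCC COND_E, flags), 1)
// becomes
//   (X86ISD::SETCC COND_NE, flags)
// by taking the opposite branch condition instead of xor'ing the result.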
52138
52139static SDValue combineXor(SDNode *N, SelectionDAG &DAG,
52140 TargetLowering::DAGCombinerInfo &DCI,
52141 const X86Subtarget &Subtarget) {
52142 SDValue N0 = N->getOperand(0);
52143 SDValue N1 = N->getOperand(1);
52144 EVT VT = N->getValueType(0);
52145
52146 // If this is SSE1 only convert to FXOR to avoid scalarization.
52147 if (Subtarget.hasSSE1() && !Subtarget.hasSSE2() && VT == MVT::v4i32) {
52148 return DAG.getBitcast(MVT::v4i32,
52149 DAG.getNode(X86ISD::FXOR, SDLoc(N), MVT::v4f32,
52150 DAG.getBitcast(MVT::v4f32, N0),
52151 DAG.getBitcast(MVT::v4f32, N1)));
52152 }
52153
52154 if (SDValue Cmp = foldVectorXorShiftIntoCmp(N, DAG, Subtarget))
52155 return Cmp;
52156
52157 if (SDValue R = combineBitOpWithMOVMSK(N, DAG))
52158 return R;
52159
52160 if (SDValue R = combineBitOpWithShift(N, DAG))
52161 return R;
52162
52163 if (SDValue FPLogic = convertIntLogicToFPLogic(N, DAG, DCI, Subtarget))
52164 return FPLogic;
52165
52166 if (DCI.isBeforeLegalizeOps())
52167 return SDValue();
52168
52169 if (SDValue SetCC = foldXor1SetCC(N, DAG))
52170 return SetCC;
52171
52172 if (SDValue RV = foldXorTruncShiftIntoCmp(N, DAG))
52173 return RV;
52174
52175 // Fold not(iX bitcast(vXi1)) -> (iX bitcast(not(vec))) for legal boolvecs.
52176 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
52177 if (llvm::isAllOnesConstant(N1) && N0.getOpcode() == ISD::BITCAST &&
52178 N0.getOperand(0).getValueType().isVector() &&
52179 N0.getOperand(0).getValueType().getVectorElementType() == MVT::i1 &&
52180 TLI.isTypeLegal(N0.getOperand(0).getValueType()) && N0.hasOneUse()) {
52181 return DAG.getBitcast(VT, DAG.getNOT(SDLoc(N), N0.getOperand(0),
52182 N0.getOperand(0).getValueType()));
52183 }
52184
52185 // Handle AVX512 mask widening.
52186 // Fold not(insert_subvector(undef,sub)) -> insert_subvector(undef,not(sub))
52187 if (ISD::isBuildVectorAllOnes(N1.getNode()) && VT.isVector() &&
52188 VT.getVectorElementType() == MVT::i1 &&
52189 N0.getOpcode() == ISD::INSERT_SUBVECTOR && N0.getOperand(0).isUndef() &&
52190 TLI.isTypeLegal(N0.getOperand(1).getValueType())) {
52191 return DAG.getNode(
52192 ISD::INSERT_SUBVECTOR, SDLoc(N), VT, N0.getOperand(0),
52193 DAG.getNOT(SDLoc(N), N0.getOperand(1), N0.getOperand(1).getValueType()),
52194 N0.getOperand(2));
52195 }
52196
52197 // Fold xor(zext(xor(x,c1)),c2) -> xor(zext(x),xor(zext(c1),c2))
52198 // Fold xor(truncate(xor(x,c1)),c2) -> xor(truncate(x),xor(truncate(c1),c2))
52199 // TODO: Under what circumstances could this be performed in DAGCombine?
52200 if ((N0.getOpcode() == ISD::TRUNCATE || N0.getOpcode() == ISD::ZERO_EXTEND) &&
52201 N0.getOperand(0).getOpcode() == N->getOpcode()) {
52202 SDValue TruncExtSrc = N0.getOperand(0);
52203 auto *N1C = dyn_cast<ConstantSDNode>(N1);
52204 auto *N001C = dyn_cast<ConstantSDNode>(TruncExtSrc.getOperand(1));
52205 if (N1C && !N1C->isOpaque() && N001C && !N001C->isOpaque()) {
52206 SDLoc DL(N);
52207 SDValue LHS = DAG.getZExtOrTrunc(TruncExtSrc.getOperand(0), DL, VT);
52208 SDValue RHS = DAG.getZExtOrTrunc(TruncExtSrc.getOperand(1), DL, VT);
52209 return DAG.getNode(ISD::XOR, DL, VT, LHS,
52210 DAG.getNode(ISD::XOR, DL, VT, RHS, N1));
52211 }
52212 }
52213
52214 return combineFneg(N, DAG, DCI, Subtarget);
52215}
52216
52217static SDValue combineBEXTR(SDNode *N, SelectionDAG &DAG,
52218 TargetLowering::DAGCombinerInfo &DCI,
52219 const X86Subtarget &Subtarget) {
52220 EVT VT = N->getValueType(0);
52221 unsigned NumBits = VT.getSizeInBits();
52222
52223 // TODO - Constant Folding.
52224
52225 // Simplify the inputs.
52226 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
52227 APInt DemandedMask(APInt::getAllOnes(NumBits));
52228 if (TLI.SimplifyDemandedBits(SDValue(N, 0), DemandedMask, DCI))
52229 return SDValue(N, 0);
52230
52231 return SDValue();
52232}
52233
52234static bool isNullFPScalarOrVectorConst(SDValue V) {
52235 return isNullFPConstant(V) || ISD::isBuildVectorAllZeros(V.getNode());
52236}
52237
52238/// If a value is a scalar FP zero or a vector FP zero (potentially including
52239/// undefined elements), return a zero constant that may be used to fold away
52240/// that value. In the case of a vector, the returned constant will not contain
52241/// undefined elements even if the input parameter does. This makes it suitable
52242/// to be used as a replacement operand with operations (eg, bitwise-and) where
52243/// an undef should not propagate.
52244static SDValue getNullFPConstForNullVal(SDValue V, SelectionDAG &DAG,
52245 const X86Subtarget &Subtarget) {
52246 if (!isNullFPScalarOrVectorConst(V))
52247 return SDValue();
52248
52249 if (V.getValueType().isVector())
52250 return getZeroVector(V.getSimpleValueType(), Subtarget, DAG, SDLoc(V));
52251
52252 return V;
52253}
52254
52255static SDValue combineFAndFNotToFAndn(SDNode *N, SelectionDAG &DAG,
52256 const X86Subtarget &Subtarget) {
52257 SDValue N0 = N->getOperand(0);
52258 SDValue N1 = N->getOperand(1);
52259 EVT VT = N->getValueType(0);
52260 SDLoc DL(N);
52261
52262 // Vector types are handled in combineANDXORWithAllOnesIntoANDNP().
52263 if (!((VT == MVT::f32 && Subtarget.hasSSE1()) ||
52264 (VT == MVT::f64 && Subtarget.hasSSE2()) ||
52265 (VT == MVT::v4f32 && Subtarget.hasSSE1() && !Subtarget.hasSSE2())))
52266 return SDValue();
52267
52268 auto isAllOnesConstantFP = [](SDValue V) {
52269 if (V.getSimpleValueType().isVector())
52270 return ISD::isBuildVectorAllOnes(V.getNode());
52271 auto *C = dyn_cast<ConstantFPSDNode>(V);
52272 return C && C->getConstantFPValue()->isAllOnesValue();
52273 };
52274
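// An FP "constant" with an all-ones bit pattern is just a bitwise NOT mask, so
// fxor X, -1 acts as NOT(X) in the folds below.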
52275 // fand (fxor X, -1), Y --> fandn X, Y
52276 if (N0.getOpcode() == X86ISD::FXOR && isAllOnesConstantFP(N0.getOperand(1)))
52277 return DAG.getNode(X86ISD::FANDN, DL, VT, N0.getOperand(0), N1);
52278
52279 // fand X, (fxor Y, -1) --> fandn Y, X
52280 if (N1.getOpcode() == X86ISD::FXOR && isAllOnesConstantFP(N1.getOperand(1)))
52281 return DAG.getNode(X86ISD::FANDN, DL, VT, N1.getOperand(0), N0);
52282
52283 return SDValue();
52284}
52285
52286/// Do target-specific dag combines on X86ISD::FAND nodes.
52287static SDValue combineFAnd(SDNode *N, SelectionDAG &DAG,
52288 const X86Subtarget &Subtarget) {
52289 // FAND(0.0, x) -> 0.0
52290 if (SDValue V = getNullFPConstForNullVal(N->getOperand(0), DAG, Subtarget))
52291 return V;
52292
52293 // FAND(x, 0.0) -> 0.0
52294 if (SDValue V = getNullFPConstForNullVal(N->getOperand(1), DAG, Subtarget))
52295 return V;
52296
52297 if (SDValue V = combineFAndFNotToFAndn(N, DAG, Subtarget))
52298 return V;
52299
52300 return lowerX86FPLogicOp(N, DAG, Subtarget);
52301}
52302
52303/// Do target-specific dag combines on X86ISD::FANDN nodes.
52304static SDValue combineFAndn(SDNode *N, SelectionDAG &DAG,
52305 const X86Subtarget &Subtarget) {
52306 // FANDN(0.0, x) -> x
52307 if (isNullFPScalarOrVectorConst(N->getOperand(0)))
52308 return N->getOperand(1);
52309
52310 // FANDN(x, 0.0) -> 0.0
52311 if (SDValue V = getNullFPConstForNullVal(N->getOperand(1), DAG, Subtarget))
52312 return V;
52313
52314 return lowerX86FPLogicOp(N, DAG, Subtarget);
52315}
52316
52317/// Do target-specific dag combines on X86ISD::FOR and X86ISD::FXOR nodes.
52318static SDValue combineFOr(SDNode *N, SelectionDAG &DAG,
52319 TargetLowering::DAGCombinerInfo &DCI,
52320 const X86Subtarget &Subtarget) {
52321 assert(N->getOpcode() == X86ISD::FOR || N->getOpcode() == X86ISD::FXOR);
52322
52323 // F[X]OR(0.0, x) -> x
52324 if (isNullFPScalarOrVectorConst(N->getOperand(0)))
52325 return N->getOperand(1);
52326
52327 // F[X]OR(x, 0.0) -> x
52328 if (isNullFPScalarOrVectorConst(N->getOperand(1)))
52329 return N->getOperand(0);
52330
52331 if (SDValue NewVal = combineFneg(N, DAG, DCI, Subtarget))
52332 return NewVal;
52333
52334 return lowerX86FPLogicOp(N, DAG, Subtarget);
52335}
52336
52337/// Do target-specific dag combines on X86ISD::FMIN and X86ISD::FMAX nodes.
52338static SDValue combineFMinFMax(SDNode *N, SelectionDAG &DAG) {
52339 assert(N->getOpcode() == X86ISD::FMIN || N->getOpcode() == X86ISD::FMAX);
52340
52341 // FMIN/FMAX are commutative if no NaNs and no negative zeros are allowed.
52342 if (!DAG.getTarget().Options.NoNaNsFPMath ||
52343 !DAG.getTarget().Options.NoSignedZerosFPMath)
52344 return SDValue();
52345
52346 // When NaNs and signed zeros can be ignored, convert the FMAX and FMIN nodes
52347 // into FMINC and FMAXC, which are commutative operations.
52348 unsigned NewOp = 0;
52349 switch (N->getOpcode()) {
52350 default: llvm_unreachable("unknown opcode");
52351 case X86ISD::FMIN: NewOp = X86ISD::FMINC; break;
52352 case X86ISD::FMAX: NewOp = X86ISD::FMAXC; break;
52353 }
52354
52355 return DAG.getNode(NewOp, SDLoc(N), N->getValueType(0),
52356 N->getOperand(0), N->getOperand(1));
52357}
52358
52359static SDValue combineFMinNumFMaxNum(SDNode *N, SelectionDAG &DAG,
52360 const X86Subtarget &Subtarget) {
52361 EVT VT = N->getValueType(0);
52362 if (Subtarget.useSoftFloat() || isSoftFP16(VT, Subtarget))
52363 return SDValue();
52364
52365 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
52366
52367 if (!((Subtarget.hasSSE1() && VT == MVT::f32) ||
52368 (Subtarget.hasSSE2() && VT == MVT::f64) ||
52369 (Subtarget.hasFP16() && VT == MVT::f16) ||
52370 (VT.isVector() && TLI.isTypeLegal(VT))))
52371 return SDValue();
52372
52373 SDValue Op0 = N->getOperand(0);
52374 SDValue Op1 = N->getOperand(1);
52375 SDLoc DL(N);
52376 auto MinMaxOp = N->getOpcode() == ISD::FMAXNUM ? X86ISD::FMAX : X86ISD::FMIN;
52377
52378 // If we don't have to respect NaN inputs, this is a direct translation to x86
52379 // min/max instructions.
52380 if (DAG.getTarget().Options.NoNaNsFPMath || N->getFlags().hasNoNaNs())
52381 return DAG.getNode(MinMaxOp, DL, VT, Op0, Op1, N->getFlags());
52382
52383 // If one of the operands is known non-NaN use the native min/max instructions
52384 // with the non-NaN input as second operand.
52385 if (DAG.isKnownNeverNaN(Op1))
52386 return DAG.getNode(MinMaxOp, DL, VT, Op0, Op1, N->getFlags());
52387 if (DAG.isKnownNeverNaN(Op0))
52388 return DAG.getNode(MinMaxOp, DL, VT, Op1, Op0, N->getFlags());
52389
52390 // If we have to respect NaN inputs, this takes at least 3 instructions.
52391 // Favor a library call when operating on a scalar and minimizing code size.
52392 if (!VT.isVector() && DAG.getMachineFunction().getFunction().hasMinSize())
52393 return SDValue();
52394
52395 EVT SetCCType = TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(),
52396 VT);
52397
52398 // There are 4 possibilities involving NaN inputs, and these are the required
52399 // outputs:
52400 // Op1
52401 // Num NaN
52402 // ----------------
52403 // Num | Max | Op0 |
52404 // Op0 ----------------
52405 // NaN | Op1 | NaN |
52406 // ----------------
52407 //
52408 // The SSE FP max/min instructions were not designed for this case, but rather
52409 // to implement:
52410 // Min = Op1 < Op0 ? Op1 : Op0
52411 // Max = Op1 > Op0 ? Op1 : Op0
52412 //
52413 // So they always return Op0 if either input is a NaN. However, we can still
52414 // use those instructions for fmaxnum by selecting away a NaN input.
52415
52416 // If either operand is NaN, the 2nd source operand (Op0) is passed through.
52417 SDValue MinOrMax = DAG.getNode(MinMaxOp, DL, VT, Op1, Op0);
52418 SDValue IsOp0Nan = DAG.getSetCC(DL, SetCCType, Op0, Op0, ISD::SETUO);
52419
52420 // If Op0 is a NaN, select Op1. Otherwise, select the max. If both operands
52421 // are NaN, the NaN value of Op1 is the result.
52422 return DAG.getSelect(DL, VT, IsOp0Nan, Op1, MinOrMax);
52423}
52424
52425static SDValue combineX86INT_TO_FP(SDNode *N, SelectionDAG &DAG,
52426 TargetLowering::DAGCombinerInfo &DCI) {
52427 EVT VT = N->getValueType(0);
52428 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
52429
52430 APInt DemandedElts = APInt::getAllOnes(VT.getVectorNumElements());
52431 if (TLI.SimplifyDemandedVectorElts(SDValue(N, 0), DemandedElts, DCI))
52432 return SDValue(N, 0);
52433
52434 // Convert a full vector load into vzload when not all bits are needed.
52435 SDValue In = N->getOperand(0);
52436 MVT InVT = In.getSimpleValueType();
52437 if (VT.getVectorNumElements() < InVT.getVectorNumElements() &&
52438 ISD::isNormalLoad(In.getNode()) && In.hasOneUse()) {
52439 assert(InVT.is128BitVector() && "Expected 128-bit input vector");
52440 LoadSDNode *LN = cast<LoadSDNode>(N->getOperand(0));
52441 unsigned NumBits = InVT.getScalarSizeInBits() * VT.getVectorNumElements();
52442 MVT MemVT = MVT::getIntegerVT(NumBits);
52443 MVT LoadVT = MVT::getVectorVT(MemVT, 128 / NumBits);
52444 if (SDValue VZLoad = narrowLoadToVZLoad(LN, MemVT, LoadVT, DAG)) {
52445 SDLoc dl(N);
52446 SDValue Convert = DAG.getNode(N->getOpcode(), dl, VT,
52447 DAG.getBitcast(InVT, VZLoad));
52448 DCI.CombineTo(N, Convert);
52449 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), VZLoad.getValue(1));
52450 DCI.recursivelyDeleteUnusedNodes(LN);
52451 return SDValue(N, 0);
52452 }
52453 }
52454
52455 return SDValue();
52456}
52457
52458static SDValue combineCVTP2I_CVTTP2I(SDNode *N, SelectionDAG &DAG,
52459 TargetLowering::DAGCombinerInfo &DCI) {
52460 bool IsStrict = N->isTargetStrictFPOpcode();
52461 EVT VT = N->getValueType(0);
52462
52463 // Convert a full vector load into vzload when not all bits are needed.
52464 SDValue In = N->getOperand(IsStrict ? 1 : 0);
52465 MVT InVT = In.getSimpleValueType();
52466 if (VT.getVectorNumElements() < InVT.getVectorNumElements() &&
52467 ISD::isNormalLoad(In.getNode()) && In.hasOneUse()) {
52468 assert(InVT.is128BitVector() && "Expected 128-bit input vector");
52469 LoadSDNode *LN = cast<LoadSDNode>(In);
52470 unsigned NumBits = InVT.getScalarSizeInBits() * VT.getVectorNumElements();
52471 MVT MemVT = MVT::getFloatingPointVT(NumBits);
52472 MVT LoadVT = MVT::getVectorVT(MemVT, 128 / NumBits);
52473 if (SDValue VZLoad = narrowLoadToVZLoad(LN, MemVT, LoadVT, DAG)) {
52474 SDLoc dl(N);
52475 if (IsStrict) {
52476 SDValue Convert =
52477 DAG.getNode(N->getOpcode(), dl, {VT, MVT::Other},
52478 {N->getOperand(0), DAG.getBitcast(InVT, VZLoad)});
52479 DCI.CombineTo(N, Convert, Convert.getValue(1));
52480 } else {
52481 SDValue Convert =
52482 DAG.getNode(N->getOpcode(), dl, VT, DAG.getBitcast(InVT, VZLoad));
52483 DCI.CombineTo(N, Convert);
52484 }
52485 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), VZLoad.getValue(1));
52486 DCI.recursivelyDeleteUnusedNodes(LN);
52487 return SDValue(N, 0);
52488 }
52489 }
52490
52491 return SDValue();
52492}
52493
52494/// Do target-specific dag combines on X86ISD::ANDNP nodes.
52495static SDValue combineAndnp(SDNode *N, SelectionDAG &DAG,
52496 TargetLowering::DAGCombinerInfo &DCI,
52497 const X86Subtarget &Subtarget) {
52498 SDValue N0 = N->getOperand(0);
52499 SDValue N1 = N->getOperand(1);
52500 MVT VT = N->getSimpleValueType(0);
52501 int NumElts = VT.getVectorNumElements();
52502 unsigned EltSizeInBits = VT.getScalarSizeInBits();
52503
52504 // ANDNP(undef, x) -> 0
52505 // ANDNP(x, undef) -> 0
52506 if (N0.isUndef() || N1.isUndef())
52507 return DAG.getConstant(0, SDLoc(N), VT);
52508
52509 // ANDNP(0, x) -> x
52510 if (ISD::isBuildVectorAllZeros(N0.getNode()))
52511 return N1;
52512
52513 // ANDNP(x, 0) -> 0
52514 if (ISD::isBuildVectorAllZeros(N1.getNode()))
52515 return DAG.getConstant(0, SDLoc(N), VT);
52516
52517 // Turn ANDNP back to AND if input is inverted.
52518 if (SDValue Not = IsNOT(N0, DAG))
52519 return DAG.getNode(ISD::AND, SDLoc(N), VT, DAG.getBitcast(VT, Not), N1);
52520
52521 // Constant Folding
52522 APInt Undefs0, Undefs1;
52523 SmallVector<APInt> EltBits0, EltBits1;
52524 if (getTargetConstantBitsFromNode(N0, EltSizeInBits, Undefs0, EltBits0)) {
52525 SDLoc DL(N);
52526 APInt ResultUndefs = APInt::getZero(NumElts);
52527
52528 if (getTargetConstantBitsFromNode(N1, EltSizeInBits, Undefs1, EltBits1)) {
52529 SmallVector<APInt> ResultBits;
52530 for (int I = 0; I != NumElts; ++I)
52531 ResultBits.push_back(~EltBits0[I] & EltBits1[I]);
52532 return getConstVector(ResultBits, ResultUndefs, VT, DAG, DL);
52533 }
52534
52535 // Constant fold NOT(N0) to allow us to use AND.
52536 // Ensure this is only performed if we can confirm that the bitcasted source
52537 // has a single use, to prevent an infinite loop with canonicalizeBitSelect.
52538 if (N0->hasOneUse()) {
52539 SDValue BC0 = peekThroughOneUseBitcasts(N0);
52540 if (BC0.getOpcode() != ISD::BITCAST) {
52541 for (APInt &Elt : EltBits0)
52542 Elt = ~Elt;
52543 SDValue Not = getConstVector(EltBits0, ResultUndefs, VT, DAG, DL);
52544 return DAG.getNode(ISD::AND, DL, VT, Not, N1);
52545 }
52546 }
52547 }
52548
52549 // Attempt to recursively combine a bitmask ANDNP with shuffles.
52550 if (VT.isVector() && (VT.getScalarSizeInBits() % 8) == 0) {
52551 SDValue Op(N, 0);
52552 if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
52553 return Res;
52554
52555 // If either operand is a constant mask, then only the elements that aren't
52556 // zero are actually demanded by the other operand.
52557 auto GetDemandedMasks = [&](SDValue Op, bool Invert = false) {
52558 APInt UndefElts;
52559 SmallVector<APInt> EltBits;
52560 APInt DemandedBits = APInt::getAllOnes(EltSizeInBits);
52561 APInt DemandedElts = APInt::getAllOnes(NumElts);
52562 if (getTargetConstantBitsFromNode(Op, EltSizeInBits, UndefElts,
52563 EltBits)) {
52564 DemandedBits.clearAllBits();
52565 DemandedElts.clearAllBits();
52566 for (int I = 0; I != NumElts; ++I) {
52567 if (UndefElts[I]) {
52568 // We can't assume an undef src element gives an undef dst - the
52569 // other src might be zero.
52570 DemandedBits.setAllBits();
52571 DemandedElts.setBit(I);
52572 } else if ((Invert && !EltBits[I].isAllOnes()) ||
52573 (!Invert && !EltBits[I].isZero())) {
52574 DemandedBits |= Invert ? ~EltBits[I] : EltBits[I];
52575 DemandedElts.setBit(I);
52576 }
52577 }
52578 }
52579 return std::make_pair(DemandedBits, DemandedElts);
52580 };
52581 APInt Bits0, Elts0;
52582 APInt Bits1, Elts1;
52583 std::tie(Bits0, Elts0) = GetDemandedMasks(N1);
52584 std::tie(Bits1, Elts1) = GetDemandedMasks(N0, true);
52585
52586 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
52587 if (TLI.SimplifyDemandedVectorElts(N0, Elts0, DCI) ||
52588 TLI.SimplifyDemandedVectorElts(N1, Elts1, DCI) ||
52589 TLI.SimplifyDemandedBits(N0, Bits0, Elts0, DCI) ||
52590 TLI.SimplifyDemandedBits(N1, Bits1, Elts1, DCI)) {
52591 if (N->getOpcode() != ISD::DELETED_NODE)
52592 DCI.AddToWorklist(N);
52593 return SDValue(N, 0);
52594 }
52595 }
52596
52597 return SDValue();
52598}
52599
52600static SDValue combineBT(SDNode *N, SelectionDAG &DAG,
52601 TargetLowering::DAGCombinerInfo &DCI) {
52602 SDValue N1 = N->getOperand(1);
52603
52604 // BT ignores high bits in the bit index operand.
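// e.g. for a 32-bit index only the low Log2_32(32) = 5 bits (mask 0x1F) are
// demanded, which is the mask computed just below.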
52605 unsigned BitWidth = N1.getValueSizeInBits();
52606 APInt DemandedMask = APInt::getLowBitsSet(BitWidth, Log2_32(BitWidth));
52607 if (DAG.getTargetLoweringInfo().SimplifyDemandedBits(N1, DemandedMask, DCI)) {
52608 if (N->getOpcode() != ISD::DELETED_NODE)
52609 DCI.AddToWorklist(N);
52610 return SDValue(N, 0);
52611 }
52612
52613 return SDValue();
52614}
52615
52616static SDValue combineCVTPH2PS(SDNode *N, SelectionDAG &DAG,
52617 TargetLowering::DAGCombinerInfo &DCI) {
52618 bool IsStrict = N->getOpcode() == X86ISD::STRICT_CVTPH2PS;
52619 SDValue Src = N->getOperand(IsStrict ? 1 : 0);
52620
52621 if (N->getValueType(0) == MVT::v4f32 && Src.getValueType() == MVT::v8i16) {
52622 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
52623 APInt DemandedElts = APInt::getLowBitsSet(8, 4);
52624 if (TLI.SimplifyDemandedVectorElts(Src, DemandedElts, DCI)) {
52625 if (N->getOpcode() != ISD::DELETED_NODE)
52626 DCI.AddToWorklist(N);
52627 return SDValue(N, 0);
52628 }
52629
52630 // Convert a full vector load into vzload when not all bits are needed.
52631 if (ISD::isNormalLoad(Src.getNode()) && Src.hasOneUse()) {
52632 LoadSDNode *LN = cast<LoadSDNode>(N->getOperand(IsStrict ? 1 : 0));
52633 if (SDValue VZLoad = narrowLoadToVZLoad(LN, MVT::i64, MVT::v2i64, DAG)) {
52634 SDLoc dl(N);
52635 if (IsStrict) {
52636 SDValue Convert = DAG.getNode(
52637 N->getOpcode(), dl, {MVT::v4f32, MVT::Other},
52638 {N->getOperand(0), DAG.getBitcast(MVT::v8i16, VZLoad)});
52639 DCI.CombineTo(N, Convert, Convert.getValue(1));
52640 } else {
52641 SDValue Convert = DAG.getNode(N->getOpcode(), dl, MVT::v4f32,
52642 DAG.getBitcast(MVT::v8i16, VZLoad));
52643 DCI.CombineTo(N, Convert);
52644 }
52645
52646 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), VZLoad.getValue(1));
52647 DCI.recursivelyDeleteUnusedNodes(LN);
52648 return SDValue(N, 0);
52649 }
52650 }
52651 }
52652
52653 return SDValue();
52654}
52655
52656// Try to combine sext_in_reg of a cmov of constants by extending the constants.
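// e.g. sext_in_reg(cmov C1, C2, cond), i8 -> cmov(sext_in_reg C1, sext_in_reg C2, cond),
// where both sext_in_reg operations on the constants then fold away.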
52657static SDValue combineSextInRegCmov(SDNode *N, SelectionDAG &DAG) {
52658 assert(N->getOpcode() == ISD::SIGN_EXTEND_INREG);
52659
52660 EVT DstVT = N->getValueType(0);
52661
52662 SDValue N0 = N->getOperand(0);
52663 SDValue N1 = N->getOperand(1);
52664 EVT ExtraVT = cast<VTSDNode>(N1)->getVT();
52665
52666 if (ExtraVT != MVT::i8 && ExtraVT != MVT::i16)
52667 return SDValue();
52668
52669 // Look through single use any_extends / truncs.
52670 SDValue IntermediateBitwidthOp;
52671 if ((N0.getOpcode() == ISD::ANY_EXTEND || N0.getOpcode() == ISD::TRUNCATE) &&
52672 N0.hasOneUse()) {
52673 IntermediateBitwidthOp = N0;
52674 N0 = N0.getOperand(0);
52675 }
52676
52677 // See if we have a single use cmov.
52678 if (N0.getOpcode() != X86ISD::CMOV || !N0.hasOneUse())
52679 return SDValue();
52680
52681 SDValue CMovOp0 = N0.getOperand(0);
52682 SDValue CMovOp1 = N0.getOperand(1);
52683
52684 // Make sure both operands are constants.
52685 if (!isa<ConstantSDNode>(CMovOp0.getNode()) ||
52686 !isa<ConstantSDNode>(CMovOp1.getNode()))
52687 return SDValue();
52688
52689 SDLoc DL(N);
52690
52691 // If we looked through an any_extend/trunc above, apply the same op to the constants.
52692 if (IntermediateBitwidthOp) {
52693 unsigned IntermediateOpc = IntermediateBitwidthOp.getOpcode();
52694 CMovOp0 = DAG.getNode(IntermediateOpc, DL, DstVT, CMovOp0);
52695 CMovOp1 = DAG.getNode(IntermediateOpc, DL, DstVT, CMovOp1);
52696 }
52697
52698 CMovOp0 = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, DstVT, CMovOp0, N1);
52699 CMovOp1 = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, DstVT, CMovOp1, N1);
52700
52701 EVT CMovVT = DstVT;
52702 // We do not want i16 CMOV's. Promote to i32 and truncate afterwards.
52703 if (DstVT == MVT::i16) {
52704 CMovVT = MVT::i32;
52705 CMovOp0 = DAG.getNode(ISD::ZERO_EXTEND, DL, CMovVT, CMovOp0);
52706 CMovOp1 = DAG.getNode(ISD::ZERO_EXTEND, DL, CMovVT, CMovOp1);
52707 }
52708
52709 SDValue CMov = DAG.getNode(X86ISD::CMOV, DL, CMovVT, CMovOp0, CMovOp1,
52710 N0.getOperand(2), N0.getOperand(3));
52711
52712 if (CMovVT != DstVT)
52713 CMov = DAG.getNode(ISD::TRUNCATE, DL, DstVT, CMov);
52714
52715 return CMov;
52716}
52717
52718static SDValue combineSignExtendInReg(SDNode *N, SelectionDAG &DAG,
52719 const X86Subtarget &Subtarget) {
52720 assert(N->getOpcode() == ISD::SIGN_EXTEND_INREG);
52721
52722 if (SDValue V = combineSextInRegCmov(N, DAG))
52723 return V;
52724
52725 EVT VT = N->getValueType(0);
52726 SDValue N0 = N->getOperand(0);
52727 SDValue N1 = N->getOperand(1);
52728 EVT ExtraVT = cast<VTSDNode>(N1)->getVT();
52729 SDLoc dl(N);
52730
52731 // SIGN_EXTEND_INREG to v4i64 is an expensive operation on both SSE and AVX2
52732 // since there is no sign-extended shift right operation on a vector with
52733 // 64-bit elements.
52734 //(sext_in_reg (v4i64 anyext (v4i32 x )), ExtraVT) ->
52735 // (v4i64 sext (v4i32 sext_in_reg (v4i32 x , ExtraVT)))
52736 if (VT == MVT::v4i64 && (N0.getOpcode() == ISD::ANY_EXTEND ||
52737 N0.getOpcode() == ISD::SIGN_EXTEND)) {
52738 SDValue N00 = N0.getOperand(0);
52739
52740 // EXTLOAD has a better solution on AVX2: it may be replaced with an
52741 // X86ISD::VSEXT node.
52742 if (N00.getOpcode() == ISD::LOAD && Subtarget.hasInt256())
52743 if (!ISD::isNormalLoad(N00.getNode()))
52744 return SDValue();
52745
52746 // Attempt to promote any comparison mask ops before the SIGN_EXTEND_INREG
52747 // gets in the way.
52748 if (SDValue Promote = PromoteMaskArithmetic(N0.getNode(), DAG, Subtarget))
52749 return DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, VT, Promote, N1);
52750
52751 if (N00.getValueType() == MVT::v4i32 && ExtraVT.getSizeInBits() < 128) {
52752 SDValue Tmp =
52753 DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, MVT::v4i32, N00, N1);
52754 return DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i64, Tmp);
52755 }
52756 }
52757 return SDValue();
52758}
52759
52760/// sext(add_nsw(x, C)) --> add(sext(x), C_sext)
52761/// zext(add_nuw(x, C)) --> add(zext(x), C_zext)
52762/// Promoting a sign/zero extension ahead of a no overflow 'add' exposes
52763/// opportunities to combine math ops, use an LEA, or use a complex addressing
52764/// mode. This can eliminate extend, add, and shift instructions.
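/// For example, (i64 sext (add nsw i32 %x, 7)) becomes
/// (add nsw (i64 sext %x), 7), which add/shl users can then fold into an LEA
/// or a complex addressing mode.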
52765static SDValue promoteExtBeforeAdd(SDNode *Ext, SelectionDAG &DAG,
52766 const X86Subtarget &Subtarget) {
52767 if (Ext->getOpcode() != ISD::SIGN_EXTEND &&
52768 Ext->getOpcode() != ISD::ZERO_EXTEND)
52769 return SDValue();
52770
52771 // TODO: This should be valid for other integer types.
52772 EVT VT = Ext->getValueType(0);
52773 if (VT != MVT::i64)
52774 return SDValue();
52775
52776 SDValue Add = Ext->getOperand(0);
52777 if (Add.getOpcode() != ISD::ADD)
52778 return SDValue();
52779
52780 bool Sext = Ext->getOpcode() == ISD::SIGN_EXTEND;
52781 bool NSW = Add->getFlags().hasNoSignedWrap();
52782 bool NUW = Add->getFlags().hasNoUnsignedWrap();
52783
52784 // We need an 'add nsw' feeding into the 'sext' or 'add nuw' feeding
52785 // into the 'zext'.
52786 if ((Sext && !NSW) || (!Sext && !NUW))
52787 return SDValue();
52788
52789 // Having a constant operand to the 'add' ensures that we are not increasing
52790 // the instruction count because the constant is extended for free below.
52791 // A constant operand can also become the displacement field of an LEA.
52792 auto *AddOp1 = dyn_cast<ConstantSDNode>(Add.getOperand(1));
52793 if (!AddOp1)
52794 return SDValue();
52795
52796 // Don't make the 'add' bigger if there's no hope of combining it with some
52797 // other 'add' or 'shl' instruction.
52798 // TODO: It may be profitable to generate simpler LEA instructions in place
52799 // of single 'add' instructions, but the cost model for selecting an LEA
52800 // currently has a high threshold.
52801 bool HasLEAPotential = false;
52802 for (auto *User : Ext->uses()) {
52803 if (User->getOpcode() == ISD::ADD || User->getOpcode() == ISD::SHL) {
52804 HasLEAPotential = true;
52805 break;
52806 }
52807 }
52808 if (!HasLEAPotential)
52809 return SDValue();
52810
52811 // Everything looks good, so pull the '{s|z}ext' ahead of the 'add'.
52812 int64_t AddConstant = Sext ? AddOp1->getSExtValue() : AddOp1->getZExtValue();
52813 SDValue AddOp0 = Add.getOperand(0);
52814 SDValue NewExt = DAG.getNode(Ext->getOpcode(), SDLoc(Ext), VT, AddOp0);
52815 SDValue NewConstant = DAG.getConstant(AddConstant, SDLoc(Add), VT);
52816
52817 // The wider add is guaranteed to not wrap because both operands are
52818 // extended (sign- or zero-extended to match the hoisted extend).
52819 SDNodeFlags Flags;
52820 Flags.setNoSignedWrap(NSW);
52821 Flags.setNoUnsignedWrap(NUW);
52822 return DAG.getNode(ISD::ADD, SDLoc(Add), VT, NewExt, NewConstant, Flags);
52823}
52824
52825// If we face {ANY,SIGN,ZERO}_EXTEND that is applied to a CMOV with constant
52826// operands and the result of CMOV is not used anywhere else - promote CMOV
52827// itself instead of promoting its result. This could be beneficial, because:
52828// 1) X86TargetLowering::EmitLoweredSelect later can do merging of two
52829// (or more) pseudo-CMOVs only when they go one-after-another and
52830// getting rid of result extension code after CMOV will help that.
52831// 2) Promotion of constant CMOV arguments is free, hence the
52832// {ANY,SIGN,ZERO}_EXTEND will just be deleted.
52833// 3) 16-bit CMOV encoding is 4 bytes, 32-bit CMOV is 3-byte, so this
52834// promotion is also good in terms of code-size.
52835// (64-bit CMOV is 4-bytes, that's why we don't do 32-bit => 64-bit
52836// promotion).
52837static SDValue combineToExtendCMOV(SDNode *Extend, SelectionDAG &DAG) {
52838 SDValue CMovN = Extend->getOperand(0);
52839 if (CMovN.getOpcode() != X86ISD::CMOV || !CMovN.hasOneUse())
52840 return SDValue();
52841
52842 EVT TargetVT = Extend->getValueType(0);
52843 unsigned ExtendOpcode = Extend->getOpcode();
52844 SDLoc DL(Extend);
52845
52846 EVT VT = CMovN.getValueType();
52847 SDValue CMovOp0 = CMovN.getOperand(0);
52848 SDValue CMovOp1 = CMovN.getOperand(1);
52849
52850 if (!isa<ConstantSDNode>(CMovOp0.getNode()) ||
52851 !isa<ConstantSDNode>(CMovOp1.getNode()))
52852 return SDValue();
52853
52854 // Only extend to i32 or i64.
52855 if (TargetVT != MVT::i32 && TargetVT != MVT::i64)
52856 return SDValue();
52857
52858 // Only extend from i16 unless it's a sign_extend from i32. Zext/aext from i32
52859 // are free.
52860 if (VT != MVT::i16 && !(ExtendOpcode == ISD::SIGN_EXTEND && VT == MVT::i32))
52861 return SDValue();
52862
52863 // If this is a zero extend to i64, we should only extend to i32 and use a free
52864 // zero extend to finish.
52865 EVT ExtendVT = TargetVT;
52866 if (TargetVT == MVT::i64 && ExtendOpcode != ISD::SIGN_EXTEND)
52867 ExtendVT = MVT::i32;
52868
52869 CMovOp0 = DAG.getNode(ExtendOpcode, DL, ExtendVT, CMovOp0);
52870 CMovOp1 = DAG.getNode(ExtendOpcode, DL, ExtendVT, CMovOp1);
52871
52872 SDValue Res = DAG.getNode(X86ISD::CMOV, DL, ExtendVT, CMovOp0, CMovOp1,
52873 CMovN.getOperand(2), CMovN.getOperand(3));
52874
52875 // Finish extending if needed.
52876 if (ExtendVT != TargetVT)
52877 Res = DAG.getNode(ExtendOpcode, DL, TargetVT, Res);
52878
52879 return Res;
52880}
52881
52882 // Attempt to combine a (sext/zext (setcc)) to a setcc with an xmm/ymm/zmm
52883// result type.
52884static SDValue combineExtSetcc(SDNode *N, SelectionDAG &DAG,
52885 const X86Subtarget &Subtarget) {
52886 SDValue N0 = N->getOperand(0);
52887 EVT VT = N->getValueType(0);
52888 SDLoc dl(N);
52889
52890 // Only do this combine with AVX512 for vector extends.
52891 if (!Subtarget.hasAVX512() || !VT.isVector() || N0.getOpcode() != ISD::SETCC)
52892 return SDValue();
52893
52894 // Only combine legal element types.
52895 EVT SVT = VT.getVectorElementType();
52896 if (SVT != MVT::i8 && SVT != MVT::i16 && SVT != MVT::i32 &&
52897 SVT != MVT::i64 && SVT != MVT::f32 && SVT != MVT::f64)
52898 return SDValue();
52899
52900 // We don't have a CMPP instruction for vXf16.
52901 if (N0.getOperand(0).getValueType().getVectorElementType() == MVT::f16)
52902 return SDValue();
52903 // We can only do this if the vector size is 256 bits or less.
52904 unsigned Size = VT.getSizeInBits();
52905 if (Size > 256 && Subtarget.useAVX512Regs())
52906 return SDValue();
52907
52908 // Don't fold if the condition code can't be handled by PCMPEQ/PCMPGT since
52909 // those are the only integer compares we have.
52910 ISD::CondCode CC = cast<CondCodeSDNode>(N0.getOperand(2))->get();
52911 if (ISD::isUnsignedIntSetCC(CC))
52912 return SDValue();
52913
52914 // Only do this combine if the extension will be fully consumed by the setcc.
52915 EVT N00VT = N0.getOperand(0).getValueType();
52916 EVT MatchingVecType = N00VT.changeVectorElementTypeToInteger();
52917 if (Size != MatchingVecType.getSizeInBits())
52918 return SDValue();
52919
52920 SDValue Res = DAG.getSetCC(dl, VT, N0.getOperand(0), N0.getOperand(1), CC);
52921
52922 if (N->getOpcode() == ISD::ZERO_EXTEND)
52923 Res = DAG.getZeroExtendInReg(Res, dl, N0.getValueType());
52924
52925 return Res;
52926}
52927
52928static SDValue combineSext(SDNode *N, SelectionDAG &DAG,
52929 TargetLowering::DAGCombinerInfo &DCI,
52930 const X86Subtarget &Subtarget) {
52931 SDValue N0 = N->getOperand(0);
52932 EVT VT = N->getValueType(0);
52933 SDLoc DL(N);
52934
52935 // (i32 (sext (i8 (x86isd::setcc_carry)))) -> (i32 (x86isd::setcc_carry))
52936 if (!DCI.isBeforeLegalizeOps() &&
52937 N0.getOpcode() == X86ISD::SETCC_CARRY) {
52938 SDValue Setcc = DAG.getNode(X86ISD::SETCC_CARRY, DL, VT, N0->getOperand(0),
52939 N0->getOperand(1));
52940 bool ReplaceOtherUses = !N0.hasOneUse();
52941 DCI.CombineTo(N, Setcc);
52942 // Replace other uses with a truncate of the widened setcc_carry.
52943 if (ReplaceOtherUses) {
52944 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SDLoc(N0),
52945 N0.getValueType(), Setcc);
52946 DCI.CombineTo(N0.getNode(), Trunc);
52947 }
52948
52949 return SDValue(N, 0);
52950 }
52951
52952 if (SDValue NewCMov = combineToExtendCMOV(N, DAG))
52953 return NewCMov;
52954
52955 if (!DCI.isBeforeLegalizeOps())
52956 return SDValue();
52957
52958 if (SDValue V = combineExtSetcc(N, DAG, Subtarget))
52959 return V;
52960
52961 if (SDValue V = combineToExtendBoolVectorInReg(N->getOpcode(), DL, VT, N0,
52962 DAG, DCI, Subtarget))
52963 return V;
52964
52965 if (VT.isVector()) {
52966 if (SDValue R = PromoteMaskArithmetic(N, DAG, Subtarget))
52967 return R;
52968
52969 if (N0.getOpcode() == ISD::SIGN_EXTEND_VECTOR_INREG)
52970 return DAG.getNode(N0.getOpcode(), DL, VT, N0.getOperand(0));
52971 }
52972
52973 if (SDValue NewAdd = promoteExtBeforeAdd(N, DAG, Subtarget))
52974 return NewAdd;
52975
52976 return SDValue();
52977}
52978
52979static SDValue combineFMA(SDNode *N, SelectionDAG &DAG,
52980 TargetLowering::DAGCombinerInfo &DCI,
52981 const X86Subtarget &Subtarget) {
52982 SDLoc dl(N);
52983 EVT VT = N->getValueType(0);
52984 bool IsStrict = N->isStrictFPOpcode() || N->isTargetStrictFPOpcode();
52985
52986 // Let legalize expand this if it isn't a legal type yet.
52987 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
52988 if (!TLI.isTypeLegal(VT))
52989 return SDValue();
52990
52991 SDValue A = N->getOperand(IsStrict ? 1 : 0);
52992 SDValue B = N->getOperand(IsStrict ? 2 : 1);
52993 SDValue C = N->getOperand(IsStrict ? 3 : 2);
52994
52995 // If the operation allows fast-math and the target does not support FMA,
52996 // split this into mul+add to avoid libcall(s).
52997 SDNodeFlags Flags = N->getFlags();
52998 if (!IsStrict && Flags.hasAllowReassociation() &&
52999 TLI.isOperationExpand(ISD::FMA, VT)) {
53000 SDValue Fmul = DAG.getNode(ISD::FMUL, dl, VT, A, B, Flags);
53001 return DAG.getNode(ISD::FADD, dl, VT, Fmul, C, Flags);
53002 }
53003
53004 EVT ScalarVT = VT.getScalarType();
53005 if (((ScalarVT != MVT::f32 && ScalarVT != MVT::f64) ||
53006 !Subtarget.hasAnyFMA()) &&
53007 !(ScalarVT == MVT::f16 && Subtarget.hasFP16()))
53008 return SDValue();
53009
53010 auto invertIfNegative = [&DAG, &TLI, &DCI](SDValue &V) {
53011 bool CodeSize = DAG.getMachineFunction().getFunction().hasOptSize();
53012 bool LegalOperations = !DCI.isBeforeLegalizeOps();
53013 if (SDValue NegV = TLI.getCheaperNegatedExpression(V, DAG, LegalOperations,
53014 CodeSize)) {
53015 V = NegV;
53016 return true;
53017 }
53018 // Look through extract_vector_elts. If it comes from an FNEG, create a
53019 // new extract from the FNEG input.
53020 if (V.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
53021 isNullConstant(V.getOperand(1))) {
53022 SDValue Vec = V.getOperand(0);
53023 if (SDValue NegV = TLI.getCheaperNegatedExpression(
53024 Vec, DAG, LegalOperations, CodeSize)) {
53025 V = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SDLoc(V), V.getValueType(),
53026 NegV, V.getOperand(1));
53027 return true;
53028 }
53029 }
53030
53031 return false;
53032 };
53033
53034 // Do not convert the passthru input of scalar intrinsics.
53035 // FIXME: We could allow negations of the lower element only.
53036 bool NegA = invertIfNegative(A);
53037 bool NegB = invertIfNegative(B);
53038 bool NegC = invertIfNegative(C);
53039
53040 if (!NegA && !NegB && !NegC)
53041 return SDValue();
53042
53043 unsigned NewOpcode =
53044 negateFMAOpcode(N->getOpcode(), NegA != NegB, NegC, false);
53045
53046 // Propagate fast-math-flags to new FMA node.
53047 SelectionDAG::FlagInserter FlagsInserter(DAG, Flags);
53048 if (IsStrict) {
53049 assert(N->getNumOperands() == 4 && "Shouldn't be greater than 4");
53050 return DAG.getNode(NewOpcode, dl, {VT, MVT::Other},
53051 {N->getOperand(0), A, B, C});
53052 } else {
53053 if (N->getNumOperands() == 4)
53054 return DAG.getNode(NewOpcode, dl, VT, A, B, C, N->getOperand(3));
53055 return DAG.getNode(NewOpcode, dl, VT, A, B, C);
53056 }
53057}
53058
53059// Combine FMADDSUB(A, B, FNEG(C)) -> FMSUBADD(A, B, C)
53060// Combine FMSUBADD(A, B, FNEG(C)) -> FMADDSUB(A, B, C)
53061static SDValue combineFMADDSUB(SDNode *N, SelectionDAG &DAG,
53062 TargetLowering::DAGCombinerInfo &DCI) {
53063 SDLoc dl(N);
53064 EVT VT = N->getValueType(0);
53065 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
53066 bool CodeSize = DAG.getMachineFunction().getFunction().hasOptSize();
53067 bool LegalOperations = !DCI.isBeforeLegalizeOps();
53068
53069 SDValue N2 = N->getOperand(2);
53070
53071 SDValue NegN2 =
53072 TLI.getCheaperNegatedExpression(N2, DAG, LegalOperations, CodeSize);
53073 if (!NegN2)
53074 return SDValue();
53075 unsigned NewOpcode = negateFMAOpcode(N->getOpcode(), false, true, false);
53076
53077 if (N->getNumOperands() == 4)
53078 return DAG.getNode(NewOpcode, dl, VT, N->getOperand(0), N->getOperand(1),
53079 NegN2, N->getOperand(3));
53080 return DAG.getNode(NewOpcode, dl, VT, N->getOperand(0), N->getOperand(1),
53081 NegN2);
53082}
53083
53084static SDValue combineZext(SDNode *N, SelectionDAG &DAG,
53085 TargetLowering::DAGCombinerInfo &DCI,
53086 const X86Subtarget &Subtarget) {
53087 SDLoc dl(N);
53088 SDValue N0 = N->getOperand(0);
53089 EVT VT = N->getValueType(0);
53090
53091 // (i32 (aext (i8 (x86isd::setcc_carry)))) -> (i32 (x86isd::setcc_carry))
53092 // FIXME: Is this needed? We don't seem to have any tests for it.
53093 if (!DCI.isBeforeLegalizeOps() && N->getOpcode() == ISD::ANY_EXTEND &&
53094 N0.getOpcode() == X86ISD::SETCC_CARRY) {
53095 SDValue Setcc = DAG.getNode(X86ISD::SETCC_CARRY, dl, VT, N0->getOperand(0),
53096 N0->getOperand(1));
53097 bool ReplaceOtherUses = !N0.hasOneUse();
53098 DCI.CombineTo(N, Setcc);
53099 // Replace other uses with a truncate of the widened setcc_carry.
53100 if (ReplaceOtherUses) {
53101 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SDLoc(N0),
53102 N0.getValueType(), Setcc);
53103 DCI.CombineTo(N0.getNode(), Trunc);
53104 }
53105
53106 return SDValue(N, 0);
53107 }
53108
53109 if (SDValue NewCMov = combineToExtendCMOV(N, DAG))
53110 return NewCMov;
53111
53112 if (DCI.isBeforeLegalizeOps())
53113 if (SDValue V = combineExtSetcc(N, DAG, Subtarget))
53114 return V;
53115
53116 if (SDValue V = combineToExtendBoolVectorInReg(N->getOpcode(), dl, VT, N0,
53117 DAG, DCI, Subtarget))
53118 return V;
53119
53120 if (VT.isVector())
53121 if (SDValue R = PromoteMaskArithmetic(N, DAG, Subtarget))
53122 return R;
53123
53124 if (SDValue NewAdd = promoteExtBeforeAdd(N, DAG, Subtarget))
53125 return NewAdd;
53126
53127 if (SDValue R = combineOrCmpEqZeroToCtlzSrl(N, DAG, DCI, Subtarget))
53128 return R;
53129
53130 // TODO: Combine with any target/faux shuffle.
53131 if (N0.getOpcode() == X86ISD::PACKUS && N0.getValueSizeInBits() == 128 &&
53132 VT.getScalarSizeInBits() == N0.getOperand(0).getScalarValueSizeInBits()) {
53133 SDValue N00 = N0.getOperand(0);
53134 SDValue N01 = N0.getOperand(1);
53135 unsigned NumSrcEltBits = N00.getScalarValueSizeInBits();
53136 APInt ZeroMask = APInt::getHighBitsSet(NumSrcEltBits, NumSrcEltBits / 2);
53137 if ((N00.isUndef() || DAG.MaskedValueIsZero(N00, ZeroMask)) &&
53138 (N01.isUndef() || DAG.MaskedValueIsZero(N01, ZeroMask))) {
53139 return concatSubVectors(N00, N01, DAG, dl);
53140 }
53141 }
53142
53143 return SDValue();
53144}
53145
53146/// Recursive helper for combineVectorSizedSetCCEquality() to see if we have a
53147/// recognizable memcmp expansion.
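/// e.g. (or (or (xor a0, b0), (xor a1, b1)), (xor a2, b2)) compared to zero,
/// as produced when memcmp of wide buffers is expanded to integer loads.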
53148static bool isOrXorXorTree(SDValue X, bool Root = true) {
53149 if (X.getOpcode() == ISD::OR)
53150 return isOrXorXorTree(X.getOperand(0), false) &&
53151 isOrXorXorTree(X.getOperand(1), false);
53152 if (Root)
53153 return false;
53154 return X.getOpcode() == ISD::XOR;
53155}
53156
53157/// Recursive helper for combineVectorSizedSetCCEquality() to emit the memcmp
53158/// expansion.
53159template <typename F>
53160static SDValue emitOrXorXorTree(SDValue X, SDLoc &DL, SelectionDAG &DAG,
53161 EVT VecVT, EVT CmpVT, bool HasPT, F SToV) {
53162 SDValue Op0 = X.getOperand(0);
53163 SDValue Op1 = X.getOperand(1);
53164 if (X.getOpcode() == ISD::OR) {
53165 SDValue A = emitOrXorXorTree(Op0, DL, DAG, VecVT, CmpVT, HasPT, SToV);
53166 SDValue B = emitOrXorXorTree(Op1, DL, DAG, VecVT, CmpVT, HasPT, SToV);
53167 if (VecVT != CmpVT)
53168 return DAG.getNode(ISD::OR, DL, CmpVT, A, B);
53169 if (HasPT)
53170 return DAG.getNode(ISD::OR, DL, VecVT, A, B);
53171 return DAG.getNode(ISD::AND, DL, CmpVT, A, B);
53172 }
53173 if (X.getOpcode() == ISD::XOR) {
53174 SDValue A = SToV(Op0);
53175 SDValue B = SToV(Op1);
53176 if (VecVT != CmpVT)
53177 return DAG.getSetCC(DL, CmpVT, A, B, ISD::SETNE);
53178 if (HasPT)
53179 return DAG.getNode(ISD::XOR, DL, VecVT, A, B);
53180 return DAG.getSetCC(DL, CmpVT, A, B, ISD::SETEQ);
53181 }
53182 llvm_unreachable("Impossible");
53183}
53184
53185/// Try to map a 128-bit or larger integer comparison to vector instructions
53186/// before type legalization splits it up into chunks.
53187static SDValue combineVectorSizedSetCCEquality(SDNode *SetCC, SelectionDAG &DAG,
53188 const X86Subtarget &Subtarget) {
53189 ISD::CondCode CC = cast<CondCodeSDNode>(SetCC->getOperand(2))->get();
53190 assert((CC == ISD::SETNE || CC == ISD::SETEQ) && "Bad comparison predicate");
53191
53192 // We're looking for an oversized integer equality comparison.
53193 SDValue X = SetCC->getOperand(0);
53194 SDValue Y = SetCC->getOperand(1);
53195 EVT OpVT = X.getValueType();
53196 unsigned OpSize = OpVT.getSizeInBits();
53197 if (!OpVT.isScalarInteger() || OpSize < 128)
53198 return SDValue();
53199
53200 // Ignore a comparison with zero because that gets special treatment in
53201 // EmitTest(). But make an exception for the special case of a pair of
53202 // logically-combined vector-sized operands compared to zero. This pattern may
53203 // be generated by the memcmp expansion pass with oversized integer compares
53204 // (see PR33325).
53205 bool IsOrXorXorTreeCCZero = isNullConstant(Y) && isOrXorXorTree(X);
53206 if (isNullConstant(Y) && !IsOrXorXorTreeCCZero)
53207 return SDValue();
53208
53209 // Don't perform this combine if constructing the vector will be expensive.
53210 auto IsVectorBitCastCheap = [](SDValue X) {
53211 X = peekThroughBitcasts(X);
53212 return isa<ConstantSDNode>(X) || X.getValueType().isVector() ||
53213 X.getOpcode() == ISD::LOAD;
53214 };
53215 if ((!IsVectorBitCastCheap(X) || !IsVectorBitCastCheap(Y)) &&
53216 !IsOrXorXorTreeCCZero)
53217 return SDValue();
53218
53219 EVT VT = SetCC->getValueType(0);
53220 SDLoc DL(SetCC);
53221
53222 // Use XOR (plus OR) and PTEST after SSE4.1 for 128/256-bit operands.
53223 // Use PCMPNEQ (plus OR) and KORTEST for 512-bit operands.
53224 // Otherwise use PCMPEQ (plus AND) and mask testing.
53225 bool NoImplicitFloatOps =
53226 DAG.getMachineFunction().getFunction().hasFnAttribute(
53227 Attribute::NoImplicitFloat);
53228 if (!Subtarget.useSoftFloat() && !NoImplicitFloatOps &&
53229 ((OpSize == 128 && Subtarget.hasSSE2()) ||
53230 (OpSize == 256 && Subtarget.hasAVX()) ||
53231 (OpSize == 512 && Subtarget.useAVX512Regs()))) {
53232 bool HasPT = Subtarget.hasSSE41();
53233
53234 // PTEST and MOVMSK are slow on Knights Landing and Knights Mill, and widened
53235 // vector registers are essentially free. (Technically, widening registers
53236 // prevents load folding, but the tradeoff is worth it.)
53237 bool PreferKOT = Subtarget.preferMaskRegisters();
53238 bool NeedZExt = PreferKOT && !Subtarget.hasVLX() && OpSize != 512;
53239
53240 EVT VecVT = MVT::v16i8;
53241 EVT CmpVT = PreferKOT ? MVT::v16i1 : VecVT;
53242 if (OpSize == 256) {
53243 VecVT = MVT::v32i8;
53244 CmpVT = PreferKOT ? MVT::v32i1 : VecVT;
53245 }
53246 EVT CastVT = VecVT;
53247 bool NeedsAVX512FCast = false;
53248 if (OpSize == 512 || NeedZExt) {
53249 if (Subtarget.hasBWI()) {
53250 VecVT = MVT::v64i8;
53251 CmpVT = MVT::v64i1;
53252 if (OpSize == 512)
53253 CastVT = VecVT;
53254 } else {
53255 VecVT = MVT::v16i32;
53256 CmpVT = MVT::v16i1;
53257 CastVT = OpSize == 512 ? VecVT :
53258 OpSize == 256 ? MVT::v8i32 : MVT::v4i32;
53259 NeedsAVX512FCast = true;
53260 }
53261 }
53262
53263 auto ScalarToVector = [&](SDValue X) -> SDValue {
53264 bool TmpZext = false;
53265 EVT TmpCastVT = CastVT;
53266 if (X.getOpcode() == ISD::ZERO_EXTEND) {
53267 SDValue OrigX = X.getOperand(0);
53268 unsigned OrigSize = OrigX.getScalarValueSizeInBits();
53269 if (OrigSize < OpSize) {
53270 if (OrigSize == 128) {
53271 TmpCastVT = NeedsAVX512FCast ? MVT::v4i32 : MVT::v16i8;
53272 X = OrigX;
53273 TmpZext = true;
53274 } else if (OrigSize == 256) {
53275 TmpCastVT = NeedsAVX512FCast ? MVT::v8i32 : MVT::v32i8;
53276 X = OrigX;
53277 TmpZext = true;
53278 }
53279 }
53280 }
53281 X = DAG.getBitcast(TmpCastVT, X);
53282 if (!NeedZExt && !TmpZext)
53283 return X;
53284 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VecVT,
53285 DAG.getConstant(0, DL, VecVT), X,
53286 DAG.getVectorIdxConstant(0, DL));
53287 };
53288
53289 SDValue Cmp;
53290 if (IsOrXorXorTreeCCZero) {
53291 // This is a bitwise-combined equality comparison of 2 pairs of vectors:
53292 // setcc i128 (or (xor A, B), (xor C, D)), 0, eq|ne
53293 // Use 2 vector equality compares and 'and' the results before doing a
53294 // MOVMSK.
53295 Cmp = emitOrXorXorTree(X, DL, DAG, VecVT, CmpVT, HasPT, ScalarToVector);
53296 } else {
53297 SDValue VecX = ScalarToVector(X);
53298 SDValue VecY = ScalarToVector(Y);
53299 if (VecVT != CmpVT) {
53300 Cmp = DAG.getSetCC(DL, CmpVT, VecX, VecY, ISD::SETNE);
53301 } else if (HasPT) {
53302 Cmp = DAG.getNode(ISD::XOR, DL, VecVT, VecX, VecY);
53303 } else {
53304 Cmp = DAG.getSetCC(DL, CmpVT, VecX, VecY, ISD::SETEQ);
53305 }
53306 }
53307 // AVX512 should emit a setcc that will lower to kortest.
53308 if (VecVT != CmpVT) {
53309 EVT KRegVT = CmpVT == MVT::v64i1 ? MVT::i64 :
53310 CmpVT == MVT::v32i1 ? MVT::i32 : MVT::i16;
53311 return DAG.getSetCC(DL, VT, DAG.getBitcast(KRegVT, Cmp),
53312 DAG.getConstant(0, DL, KRegVT), CC);
53313 }
53314 if (HasPT) {
53315 SDValue BCCmp = DAG.getBitcast(OpSize == 256 ? MVT::v4i64 : MVT::v2i64,
53316 Cmp);
53317 SDValue PT = DAG.getNode(X86ISD::PTEST, DL, MVT::i32, BCCmp, BCCmp);
53318 X86::CondCode X86CC = CC == ISD::SETEQ ? X86::COND_E : X86::COND_NE;
53319 SDValue X86SetCC = getSETCC(X86CC, PT, DL, DAG);
53320 return DAG.getNode(ISD::TRUNCATE, DL, VT, X86SetCC.getValue(0));
53321 }
53322 // If all bytes match (bitmask is 0x(FFFF)FFFF), that's equality.
53323 // setcc i128 X, Y, eq --> setcc (pmovmskb (pcmpeqb X, Y)), 0xFFFF, eq
53324 // setcc i128 X, Y, ne --> setcc (pmovmskb (pcmpeqb X, Y)), 0xFFFF, ne
53325 assert(Cmp.getValueType() == MVT::v16i8 &&
53326 "Non 128-bit vector on pre-SSE41 target");
53327 SDValue MovMsk = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Cmp);
53328 SDValue FFFFs = DAG.getConstant(0xFFFF, DL, MVT::i32);
53329 return DAG.getSetCC(DL, VT, MovMsk, FFFFs, CC);
53330 }
53331
53332 return SDValue();
53333}
53334
53335static SDValue combineSetCC(SDNode *N, SelectionDAG &DAG,
53336 TargetLowering::DAGCombinerInfo &DCI,
53337 const X86Subtarget &Subtarget) {
53338 const ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get();
53339 const SDValue LHS = N->getOperand(0);
53340 const SDValue RHS = N->getOperand(1);
53341 EVT VT = N->getValueType(0);
53342 EVT OpVT = LHS.getValueType();
53343 SDLoc DL(N);
53344
53345 if (CC == ISD::SETNE || CC == ISD::SETEQ) {
53346 if (SDValue V = combineVectorSizedSetCCEquality(N, DAG, Subtarget))
53347 return V;
53348
53349 if (VT == MVT::i1 && isNullConstant(RHS)) {
53350 SDValue X86CC;
53351 if (SDValue V =
53352 MatchVectorAllZeroTest(LHS, CC, DL, Subtarget, DAG, X86CC))
53353 return DAG.getNode(ISD::TRUNCATE, DL, VT,
53354 DAG.getNode(X86ISD::SETCC, DL, MVT::i8, X86CC, V));
53355 }
53356
53357 if (OpVT.isScalarInteger()) {
53358 // cmpeq(or(X,Y),X) --> cmpeq(and(~X,Y),0)
53359 // cmpne(or(X,Y),X) --> cmpne(and(~X,Y),0)
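// or(X,Y) == X iff every bit set in Y is already set in X, i.e. and(~X,Y) == 0.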
53360 auto MatchOrCmpEq = [&](SDValue N0, SDValue N1) {
53361 if (N0.getOpcode() == ISD::OR && N0->hasOneUse()) {
53362 if (N0.getOperand(0) == N1)
53363 return DAG.getNode(ISD::AND, DL, OpVT, DAG.getNOT(DL, N1, OpVT),
53364 N0.getOperand(1));
53365 if (N0.getOperand(1) == N1)
53366 return DAG.getNode(ISD::AND, DL, OpVT, DAG.getNOT(DL, N1, OpVT),
53367 N0.getOperand(0));
53368 }
53369 return SDValue();
53370 };
53371 if (SDValue AndN = MatchOrCmpEq(LHS, RHS))
53372 return DAG.getSetCC(DL, VT, AndN, DAG.getConstant(0, DL, OpVT), CC);
53373 if (SDValue AndN = MatchOrCmpEq(RHS, LHS))
53374 return DAG.getSetCC(DL, VT, AndN, DAG.getConstant(0, DL, OpVT), CC);
53375
53376 // cmpeq(and(X,Y),Y) --> cmpeq(and(~X,Y),0)
53377 // cmpne(and(X,Y),Y) --> cmpne(and(~X,Y),0)
53378 auto MatchAndCmpEq = [&](SDValue N0, SDValue N1) {
53379 if (N0.getOpcode() == ISD::AND && N0->hasOneUse()) {
53380 if (N0.getOperand(0) == N1)
53381 return DAG.getNode(ISD::AND, DL, OpVT, N1,
53382 DAG.getNOT(DL, N0.getOperand(1), OpVT));
53383 if (N0.getOperand(1) == N1)
53384 return DAG.getNode(ISD::AND, DL, OpVT, N1,
53385 DAG.getNOT(DL, N0.getOperand(0), OpVT));
53386 }
53387 return SDValue();
53388 };
53389 if (SDValue AndN = MatchAndCmpEq(LHS, RHS))
53390 return DAG.getSetCC(DL, VT, AndN, DAG.getConstant(0, DL, OpVT), CC);
53391 if (SDValue AndN = MatchAndCmpEq(RHS, LHS))
53392 return DAG.getSetCC(DL, VT, AndN, DAG.getConstant(0, DL, OpVT), CC);
53393
53394 // cmpeq(trunc(x),0) --> cmpeq(x,0)
53395 // cmpne(trunc(x),0) --> cmpne(x,0)
53396 // iff x upper bits are zero.
53397 // TODO: Add support for RHS to be truncate as well?
53398 if (LHS.getOpcode() == ISD::TRUNCATE &&
53399 LHS.getOperand(0).getScalarValueSizeInBits() >= 32 &&
53400 isNullConstant(RHS) && !DCI.isBeforeLegalize()) {
53401 EVT SrcVT = LHS.getOperand(0).getValueType();
53402 APInt UpperBits = APInt::getBitsSetFrom(SrcVT.getScalarSizeInBits(),
53403 OpVT.getScalarSizeInBits());
53404 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
53405 if (DAG.MaskedValueIsZero(LHS.getOperand(0), UpperBits) &&
53406 TLI.isTypeLegal(LHS.getOperand(0).getValueType()))
53407 return DAG.getSetCC(DL, VT, LHS.getOperand(0),
53408 DAG.getConstant(0, DL, SrcVT), CC);
53409 }
53410 }
53411 }
53412
53413 if (VT.isVector() && VT.getVectorElementType() == MVT::i1 &&
53414 (CC == ISD::SETNE || CC == ISD::SETEQ || ISD::isSignedIntSetCC(CC))) {
53415 // Using temporaries to avoid messing up operand ordering for later
53416 // transformations if this doesn't work.
53417 SDValue Op0 = LHS;
53418 SDValue Op1 = RHS;
53419 ISD::CondCode TmpCC = CC;
53420 // Put build_vector on the right.
53421 if (Op0.getOpcode() == ISD::BUILD_VECTOR) {
53422 std::swap(Op0, Op1);
53423 TmpCC = ISD::getSetCCSwappedOperands(TmpCC);
53424 }
53425
53426 bool IsSEXT0 =
53427 (Op0.getOpcode() == ISD::SIGN_EXTEND) &&
53428 (Op0.getOperand(0).getValueType().getVectorElementType() == MVT::i1);
53429 bool IsVZero1 = ISD::isBuildVectorAllZeros(Op1.getNode());
53430
53431 if (IsSEXT0 && IsVZero1) {
53432 assert(VT == Op0.getOperand(0).getValueType() &&
53433 "Unexpected operand type");
53434 if (TmpCC == ISD::SETGT)
53435 return DAG.getConstant(0, DL, VT);
53436 if (TmpCC == ISD::SETLE)
53437 return DAG.getConstant(1, DL, VT);
53438 if (TmpCC == ISD::SETEQ || TmpCC == ISD::SETGE)
53439 return DAG.getNOT(DL, Op0.getOperand(0), VT);
53440
53441 assert((TmpCC == ISD::SETNE || TmpCC == ISD::SETLT) &&
53442 "Unexpected condition code!");
53443 return Op0.getOperand(0);
53444 }
53445 }
53446
53447 // If we have AVX512, but not BWI and this is a vXi16/vXi8 setcc, just
53448 // pre-promote its result type since vXi1 vectors don't get promoted
53449 // during type legalization.
53450 // NOTE: The element count check is to ignore operand types that need to
53451 // go through type promotion to a 128-bit vector.
53452 if (Subtarget.hasAVX512() && !Subtarget.hasBWI() && VT.isVector() &&
53453 VT.getVectorElementType() == MVT::i1 &&
53454 (OpVT.getVectorElementType() == MVT::i8 ||
53455 OpVT.getVectorElementType() == MVT::i16)) {
53456 SDValue Setcc = DAG.getSetCC(DL, OpVT, LHS, RHS, CC);
53457 return DAG.getNode(ISD::TRUNCATE, DL, VT, Setcc);
53458 }
53459
53460 // For an SSE1-only target, lower a comparison of v4f32 to X86ISD::CMPP early
53461 // to avoid scalarization via legalization because v4i32 is not a legal type.
53462 if (Subtarget.hasSSE1() && !Subtarget.hasSSE2() && VT == MVT::v4i32 &&
53463 LHS.getValueType() == MVT::v4f32)
53464 return LowerVSETCC(SDValue(N, 0), Subtarget, DAG);
53465
53466 // X pred 0.0 --> X pred -X
53467 // If the negation of X already exists, use it in the comparison. This removes
53468 // the need to materialize 0.0 and allows matching to SSE's MIN/MAX
53469 // instructions in patterns with a 'select' node.
53470 if (isNullFPScalarOrVectorConst(RHS)) {
53471 SDVTList FNegVT = DAG.getVTList(OpVT);
53472 if (SDNode *FNeg = DAG.getNodeIfExists(ISD::FNEG, FNegVT, {LHS}))
53473 return DAG.getSetCC(DL, VT, LHS, SDValue(FNeg, 0), CC);
53474 }
53475
53476 return SDValue();
53477}
53478
53479static SDValue combineMOVMSK(SDNode *N, SelectionDAG &DAG,
53480 TargetLowering::DAGCombinerInfo &DCI,
53481 const X86Subtarget &Subtarget) {
53482 SDValue Src = N->getOperand(0);
53483 MVT SrcVT = Src.getSimpleValueType();
53484 MVT VT = N->getSimpleValueType(0);
53485 unsigned NumBits = VT.getScalarSizeInBits();
53486 unsigned NumElts = SrcVT.getVectorNumElements();
53487 unsigned NumBitsPerElt = SrcVT.getScalarSizeInBits();
53488 assert(VT == MVT::i32 && NumElts <= NumBits && "Unexpected MOVMSK types");
53489
53490 // Perform constant folding.
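// e.g. for a vXi8 input, each constant element with its sign bit set
// (0x80..0xFF) contributes a 1 to the corresponding bit of the i32 result.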
53491 APInt UndefElts;
53492 SmallVector<APInt, 32> EltBits;
53493 if (getTargetConstantBitsFromNode(Src, NumBitsPerElt, UndefElts, EltBits)) {
53494 APInt Imm(32, 0);
53495 for (unsigned Idx = 0; Idx != NumElts; ++Idx)
53496 if (!UndefElts[Idx] && EltBits[Idx].isNegative())
53497 Imm.setBit(Idx);
53498
53499 return DAG.getConstant(Imm, SDLoc(N), VT);
53500 }
53501
53502 // Look through int->fp bitcasts that don't change the element width.
53503 unsigned EltWidth = SrcVT.getScalarSizeInBits();
53504 if (Subtarget.hasSSE2() && Src.getOpcode() == ISD::BITCAST &&
53505 Src.getOperand(0).getScalarValueSizeInBits() == EltWidth)
53506 return DAG.getNode(X86ISD::MOVMSK, SDLoc(N), VT, Src.getOperand(0));
53507
53508 // Fold movmsk(not(x)) -> not(movmsk(x)) to improve folding of movmsk results
53509 // with scalar comparisons.
53510 if (SDValue NotSrc = IsNOT(Src, DAG)) {
53511 SDLoc DL(N);
53512 APInt NotMask = APInt::getLowBitsSet(NumBits, NumElts);
53513 NotSrc = DAG.getBitcast(SrcVT, NotSrc);
53514 return DAG.getNode(ISD::XOR, DL, VT,
53515 DAG.getNode(X86ISD::MOVMSK, DL, VT, NotSrc),
53516 DAG.getConstant(NotMask, DL, VT));
53517 }
53518
53519 // Fold movmsk(icmp_sgt(x,-1)) -> not(movmsk(x)) to improve folding of movmsk
53520 // results with scalar comparisons.
53521 if (Src.getOpcode() == X86ISD::PCMPGT &&
53522 ISD::isBuildVectorAllOnes(Src.getOperand(1).getNode())) {
53523 SDLoc DL(N);
53524 APInt NotMask = APInt::getLowBitsSet(NumBits, NumElts);
53525 return DAG.getNode(ISD::XOR, DL, VT,
53526 DAG.getNode(X86ISD::MOVMSK, DL, VT, Src.getOperand(0)),
53527 DAG.getConstant(NotMask, DL, VT));
53528 }
53529
53530 // Fold movmsk(icmp_eq(and(x,c1),0)) -> movmsk(not(shl(x,c2)))
53531 // iff pow2splat(c1).
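// e.g. for vXi8 and c1 == 0x04: shifting left by countLeadingZeros(0x04) == 5
// moves bit 2 into the sign bit, so movmsk(not(shl(x,5))) tests bit 2 == 0.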
53532 if (Src.getOpcode() == X86ISD::PCMPEQ &&
53533 Src.getOperand(0).getOpcode() == ISD::AND &&
53534 ISD::isBuildVectorAllZeros(Src.getOperand(1).getNode())) {
53535 SDValue LHS = Src.getOperand(0).getOperand(0);
53536 SDValue RHS = Src.getOperand(0).getOperand(1);
53537 KnownBits KnownRHS = DAG.computeKnownBits(RHS);
53538 if (KnownRHS.isConstant() && KnownRHS.getConstant().isPowerOf2()) {
53539 SDLoc DL(N);
53540 MVT ShiftVT = SrcVT;
53541 if (ShiftVT.getScalarType() == MVT::i8) {
53542 // vXi8 shifts - we only care about the signbit so can use PSLLW.
53543 ShiftVT = MVT::getVectorVT(MVT::i16, NumElts / 2);
53544 LHS = DAG.getBitcast(ShiftVT, LHS);
53545 }
53546 unsigned ShiftAmt = KnownRHS.getConstant().countLeadingZeros();
53547 LHS = getTargetVShiftByConstNode(X86ISD::VSHLI, DL, ShiftVT, LHS,
53548 ShiftAmt, DAG);
53549 LHS = DAG.getNOT(DL, DAG.getBitcast(SrcVT, LHS), SrcVT);
53550 return DAG.getNode(X86ISD::MOVMSK, DL, VT, LHS);
53551 }
53552 }
53553
53554 // Simplify the inputs.
53555 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
53556 APInt DemandedMask(APInt::getAllOnes(NumBits));
53557 if (TLI.SimplifyDemandedBits(SDValue(N, 0), DemandedMask, DCI))
53558 return SDValue(N, 0);
53559
53560 return SDValue();
53561}
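
Note on the movmsk(not(x)) fold above: only the low NumElts bits of the i32 result are meaningful, which is why the XOR uses APInt::getLowBitsSet(NumBits, NumElts) rather than an all-ones constant. A minimal standalone sketch of the underlying bit identity, assuming an illustrative 4 x i32 vector and hypothetical helper names:

    #include <cassert>
    #include <cstdint>

    // For a 4-lane vector, MOVMSK packs the 4 sign bits into the low 4 bits
    // of the result. Complementing every lane complements exactly those bits:
    //   movmsk(~x) == movmsk(x) ^ 0b1111
    uint32_t movmsk4(const int32_t lanes[4]) {
      uint32_t m = 0;
      for (int i = 0; i != 4; ++i)
        if (lanes[i] < 0)                 // lane sign bit set
          m |= 1u << i;
      return m;
    }

    int main() {
      int32_t x[4] = {-1, 7, -42, 0};
      int32_t notx[4];
      for (int i = 0; i != 4; ++i)
        notx[i] = ~x[i];
      uint32_t notMask = (1u << 4) - 1;   // role of getLowBitsSet(NumBits, NumElts)
      assert(movmsk4(notx) == (movmsk4(x) ^ notMask));
      return 0;
    }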
53562
53563static SDValue combineX86GatherScatter(SDNode *N, SelectionDAG &DAG,
53564 TargetLowering::DAGCombinerInfo &DCI,
53565 const X86Subtarget &Subtarget) {
53566 auto *MemOp = cast<X86MaskedGatherScatterSDNode>(N);
53567 SDValue BasePtr = MemOp->getBasePtr();
53568 SDValue Index = MemOp->getIndex();
53569 SDValue Scale = MemOp->getScale();
53570 SDValue Mask = MemOp->getMask();
53571
53572 // Attempt to fold an index scale into the scale value directly.
53573 // For smaller indices, implicit sext is performed BEFORE scale, preventing
53574 // this fold under most circumstances.
53575 // TODO: Move this into X86DAGToDAGISel::matchVectorAddressRecursively?
53576 if ((Index.getOpcode() == X86ISD::VSHLI ||
53577 (Index.getOpcode() == ISD::ADD &&
53578 Index.getOperand(0) == Index.getOperand(1))) &&
53579 isa<ConstantSDNode>(Scale) &&
53580 BasePtr.getScalarValueSizeInBits() == Index.getScalarValueSizeInBits()) {
53581 unsigned ShiftAmt =
53582 Index.getOpcode() == ISD::ADD ? 1 : Index.getConstantOperandVal(1);
53583 uint64_t ScaleAmt = cast<ConstantSDNode>(Scale)->getZExtValue();
53584 uint64_t NewScaleAmt = ScaleAmt * (1ULL << ShiftAmt);
53585 if (isPowerOf2_64(NewScaleAmt) && NewScaleAmt <= 8) {
53586 SDValue NewIndex = Index.getOperand(0);
53587 SDValue NewScale =
53588 DAG.getTargetConstant(NewScaleAmt, SDLoc(N), Scale.getValueType());
53589 if (N->getOpcode() == X86ISD::MGATHER)
53590 return getAVX2GatherNode(N->getOpcode(), SDValue(N, 0), DAG,
53591 MemOp->getOperand(1), Mask,
53592 MemOp->getBasePtr(), NewIndex, NewScale,
53593 MemOp->getChain(), Subtarget);
53594 if (N->getOpcode() == X86ISD::MSCATTER)
53595 return getScatterNode(N->getOpcode(), SDValue(N, 0), DAG,
53596 MemOp->getOperand(1), Mask, MemOp->getBasePtr(),
53597 NewIndex, NewScale, MemOp->getChain(), Subtarget);
53598 }
53599 }
53600
53601 // With vector masks we only demand the upper bit of the mask.
53602 if (Mask.getScalarValueSizeInBits() != 1) {
53603 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
53604 APInt DemandedMask(APInt::getSignMask(Mask.getScalarValueSizeInBits()));
53605 if (TLI.SimplifyDemandedBits(Mask, DemandedMask, DCI)) {
53606 if (N->getOpcode() != ISD::DELETED_NODE)
53607 DCI.AddToWorklist(N);
53608 return SDValue(N, 0);
53609 }
53610 }
53611
53612 return SDValue();
53613}
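
Note on the index-scale fold above: it only fires when ScaleAmt * (1 << ShiftAmt) is still a power of two no larger than 8, i.e. one of the encodable 1/2/4/8 addressing scales. A small sketch of that legality check (the helper name is illustrative, not from the source):

    #include <cassert>
    #include <cstdint>

    // Mirrors the guard "isPowerOf2_64(NewScaleAmt) && NewScaleAmt <= 8".
    bool canFoldShiftIntoScale(uint64_t ScaleAmt, unsigned ShiftAmt) {
      uint64_t NewScaleAmt = ScaleAmt * (1ULL << ShiftAmt);
      bool IsPow2 = NewScaleAmt != 0 && (NewScaleAmt & (NewScaleAmt - 1)) == 0;
      return IsPow2 && NewScaleAmt <= 8;
    }

    int main() {
      assert(canFoldShiftIntoScale(1, 3));    // scale 1, index << 3 -> scale 8
      assert(canFoldShiftIntoScale(2, 1));    // scale 2, index << 1 -> scale 4
      assert(!canFoldShiftIntoScale(4, 2));   // would need scale 16 -> not encodable
      return 0;
    }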
53614
53615static SDValue rebuildGatherScatter(MaskedGatherScatterSDNode *GorS,
53616 SDValue Index, SDValue Base, SDValue Scale,
53617 SelectionDAG &DAG) {
53618 SDLoc DL(GorS);
53619
53620 if (auto *Gather = dyn_cast<MaskedGatherSDNode>(GorS)) {
53621 SDValue Ops[] = { Gather->getChain(), Gather->getPassThru(),
53622 Gather->getMask(), Base, Index, Scale } ;
53623 return DAG.getMaskedGather(Gather->getVTList(),
53624 Gather->getMemoryVT(), DL, Ops,
53625 Gather->getMemOperand(),
53626 Gather->getIndexType(),
53627 Gather->getExtensionType());
53628 }
53629 auto *Scatter = cast<MaskedScatterSDNode>(GorS);
53630 SDValue Ops[] = { Scatter->getChain(), Scatter->getValue(),
53631 Scatter->getMask(), Base, Index, Scale };
53632 return DAG.getMaskedScatter(Scatter->getVTList(),
53633 Scatter->getMemoryVT(), DL,
53634 Ops, Scatter->getMemOperand(),
53635 Scatter->getIndexType(),
53636 Scatter->isTruncatingStore());
53637}
53638
53639static SDValue combineGatherScatter(SDNode *N, SelectionDAG &DAG,
53640 TargetLowering::DAGCombinerInfo &DCI) {
53641 SDLoc DL(N);
53642 auto *GorS = cast<MaskedGatherScatterSDNode>(N);
53643 SDValue Index = GorS->getIndex();
53644 SDValue Base = GorS->getBasePtr();
53645 SDValue Scale = GorS->getScale();
53646
53647 if (DCI.isBeforeLegalize()) {
53648 unsigned IndexWidth = Index.getScalarValueSizeInBits();
53649
53650 // Shrink constant indices if they are larger than 32-bits.
53651 // Only do this before legalize types since v2i64 could become v2i32.
53652 // FIXME: We could check that the type is legal if we're after legalize
53653 // types, but then we would need to construct test cases where that happens.
53654 // FIXME: We could support more than just constant vectors, but we need to
53655 // be careful with costing. A truncate that can be optimized out would be fine.
53656 // Otherwise we might only want to create a truncate if it avoids a split.
53657 if (auto *BV = dyn_cast<BuildVectorSDNode>(Index)) {
53658 if (BV->isConstant() && IndexWidth > 32 &&
53659 DAG.ComputeNumSignBits(Index) > (IndexWidth - 32)) {
53660 EVT NewVT = Index.getValueType().changeVectorElementType(MVT::i32);
53661 Index = DAG.getNode(ISD::TRUNCATE, DL, NewVT, Index);
53662 return rebuildGatherScatter(GorS, Index, Base, Scale, DAG);
53663 }
53664 }
53665
53666 // Shrink any sign/zero extends from 32 or smaller to larger than 32 if
53667 // there are sufficient sign bits. Only do this before legalize types to
53668 // avoid creating illegal types in truncate.
53669 if ((Index.getOpcode() == ISD::SIGN_EXTEND ||
53670 Index.getOpcode() == ISD::ZERO_EXTEND) &&
53671 IndexWidth > 32 &&
53672 Index.getOperand(0).getScalarValueSizeInBits() <= 32 &&
53673 DAG.ComputeNumSignBits(Index) > (IndexWidth - 32)) {
53674 EVT NewVT = Index.getValueType().changeVectorElementType(MVT::i32);
53675 Index = DAG.getNode(ISD::TRUNCATE, DL, NewVT, Index);
53676 return rebuildGatherScatter(GorS, Index, Base, Scale, DAG);
53677 }
53678 }
53679
53680 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
53681 EVT PtrVT = TLI.getPointerTy(DAG.getDataLayout());
53682 // Try to move splat constant adders from the index operand to the base
53683 // pointer operand, taking care to multiply by the scale. We can only do
53684 // this when the index element type is the same as the pointer type.
53685 // Otherwise we need to be sure the math doesn't wrap before the scale.
53686 if (Index.getOpcode() == ISD::ADD &&
53687 Index.getValueType().getVectorElementType() == PtrVT &&
53688 isa<ConstantSDNode>(Scale)) {
53689 uint64_t ScaleAmt = cast<ConstantSDNode>(Scale)->getZExtValue();
53690 if (auto *BV = dyn_cast<BuildVectorSDNode>(Index.getOperand(1))) {
53691 BitVector UndefElts;
53692 if (ConstantSDNode *C = BV->getConstantSplatNode(&UndefElts)) {
53693 // FIXME: Allow non-constant?
53694 if (UndefElts.none()) {
53695 // Apply the scale.
53696 APInt Adder = C->getAPIntValue() * ScaleAmt;
53697 // Add it to the existing base.
53698 Base = DAG.getNode(ISD::ADD, DL, PtrVT, Base,
53699 DAG.getConstant(Adder, DL, PtrVT));
53700 Index = Index.getOperand(0);
53701 return rebuildGatherScatter(GorS, Index, Base, Scale, DAG);
53702 }
53703 }
53704
53705 // It's also possible base is just a constant. In that case, just
53706 // replace it with 0 and move the displacement into the index.
53707 if (BV->isConstant() && isa<ConstantSDNode>(Base) &&
53708 isOneConstant(Scale)) {
53709 SDValue Splat = DAG.getSplatBuildVector(Index.getValueType(), DL, Base);
53710 // Combine the constant build_vector and the constant base.
53711 Splat = DAG.getNode(ISD::ADD, DL, Index.getValueType(),
53712 Index.getOperand(1), Splat);
53713 // Add to the LHS of the original Index add.
53714 Index = DAG.getNode(ISD::ADD, DL, Index.getValueType(),
53715 Index.getOperand(0), Splat);
53716 Base = DAG.getConstant(0, DL, Base.getValueType());
53717 return rebuildGatherScatter(GorS, Index, Base, Scale, DAG);
53718 }
53719 }
53720 }
53721
53722 if (DCI.isBeforeLegalizeOps()) {
53723 unsigned IndexWidth = Index.getScalarValueSizeInBits();
53724
53725 // Make sure the index is either i32 or i64
53726 if (IndexWidth != 32 && IndexWidth != 64) {
53727 MVT EltVT = IndexWidth > 32 ? MVT::i64 : MVT::i32;
53728 EVT IndexVT = Index.getValueType().changeVectorElementType(EltVT);
53729 Index = DAG.getSExtOrTrunc(Index, DL, IndexVT);
53730 return rebuildGatherScatter(GorS, Index, Base, Scale, DAG);
53731 }
53732 }
53733
53734 // With vector masks we only demand the upper bit of the mask.
53735 SDValue Mask = GorS->getMask();
53736 if (Mask.getScalarValueSizeInBits() != 1) {
53737 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
53738 APInt DemandedMask(APInt::getSignMask(Mask.getScalarValueSizeInBits()));
53739 if (TLI.SimplifyDemandedBits(Mask, DemandedMask, DCI)) {
53740 if (N->getOpcode() != ISD::DELETED_NODE)
53741 DCI.AddToWorklist(N);
53742 return SDValue(N, 0);
53743 }
53744 }
53745
53746 return SDValue();
53747}
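
Note on the splat-adder fold above: it is plain address arithmetic. Since the effective address is Base + Index * Scale, adding a constant C to every index element is the same as adding C * Scale to the base pointer once. A hedged sketch with illustrative values:

    #include <cassert>
    #include <cstdint>

    int main() {
      uint64_t Base = 0x1000, Scale = 4, C = 5;
      int64_t Index = 7;
      // Original addressing:   Base + (Index + C) * Scale
      // Rewritten addressing: (Base + C * Scale) + Index * Scale
      uint64_t Orig = Base + (uint64_t)(Index + (int64_t)C) * Scale;
      uint64_t Folded = (Base + C * Scale) + (uint64_t)Index * Scale;
      assert(Orig == Folded);
      return 0;
    }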
53748
53749// Optimize RES = X86ISD::SETCC CONDCODE, EFLAG_INPUT
53750static SDValue combineX86SetCC(SDNode *N, SelectionDAG &DAG,
53751 const X86Subtarget &Subtarget) {
53752 SDLoc DL(N);
53753 X86::CondCode CC = X86::CondCode(N->getConstantOperandVal(0));
53754 SDValue EFLAGS = N->getOperand(1);
53755
53756 // Try to simplify the EFLAGS and condition code operands.
53757 if (SDValue Flags = combineSetCCEFLAGS(EFLAGS, CC, DAG, Subtarget))
53758 return getSETCC(CC, Flags, DL, DAG);
53759
53760 return SDValue();
53761}
53762
53763/// Optimize branch condition evaluation.
53764static SDValue combineBrCond(SDNode *N, SelectionDAG &DAG,
53765 const X86Subtarget &Subtarget) {
53766 SDLoc DL(N);
53767 SDValue EFLAGS = N->getOperand(3);
53768 X86::CondCode CC = X86::CondCode(N->getConstantOperandVal(2));
53769
53770 // Try to simplify the EFLAGS and condition code operands.
53771 // Make sure to not keep references to operands, as combineSetCCEFLAGS can
53772 // RAUW them under us.
53773 if (SDValue Flags = combineSetCCEFLAGS(EFLAGS, CC, DAG, Subtarget)) {
53774 SDValue Cond = DAG.getTargetConstant(CC, DL, MVT::i8);
53775 return DAG.getNode(X86ISD::BRCOND, DL, N->getVTList(), N->getOperand(0),
53776 N->getOperand(1), Cond, Flags);
53777 }
53778
53779 return SDValue();
53780}
53781
53782// TODO: Could we move this to DAGCombine?
53783static SDValue combineVectorCompareAndMaskUnaryOp(SDNode *N,
53784 SelectionDAG &DAG) {
53785 // Take advantage of vector comparisons (etc.) producing 0 or -1 in each lane
53786 // to optimize away the operation when it's from a constant.
53787 //
53788 // The general transformation is:
53789 // UNARYOP(AND(VECTOR_CMP(x,y), constant)) -->
53790 // AND(VECTOR_CMP(x,y), constant2)
53791 // constant2 = UNARYOP(constant)
53792
53793 // Early exit if this isn't a vector operation, the operand of the
53794 // unary operation isn't a bitwise AND, or if the sizes of the operations
53795 // aren't the same.
53796 EVT VT = N->getValueType(0);
53797 bool IsStrict = N->isStrictFPOpcode();
53798 unsigned NumEltBits = VT.getScalarSizeInBits();
53799 SDValue Op0 = N->getOperand(IsStrict ? 1 : 0);
53800 if (!VT.isVector() || Op0.getOpcode() != ISD::AND ||
53801 DAG.ComputeNumSignBits(Op0.getOperand(0)) != NumEltBits ||
53802 VT.getSizeInBits() != Op0.getValueSizeInBits())
53803 return SDValue();
53804
53805 // Now check that the other operand of the AND is a constant. We could
53806 // make the transformation for non-constant splats as well, but it's unclear
53807 // that would be a benefit as it would not eliminate any operations, just
53808 // perform one more step in scalar code before moving to the vector unit.
53809 if (auto *BV = dyn_cast<BuildVectorSDNode>(Op0.getOperand(1))) {
53810 // Bail out if the vector isn't a constant.
53811 if (!BV->isConstant())
53812 return SDValue();
53813
53814 // Everything checks out. Build up the new and improved node.
53815 SDLoc DL(N);
53816 EVT IntVT = BV->getValueType(0);
53817 // Create a new constant of the appropriate type for the transformed
53818 // DAG.
53819 SDValue SourceConst;
53820 if (IsStrict)
53821 SourceConst = DAG.getNode(N->getOpcode(), DL, {VT, MVT::Other},
53822 {N->getOperand(0), SDValue(BV, 0)});
53823 else
53824 SourceConst = DAG.getNode(N->getOpcode(), DL, VT, SDValue(BV, 0));
53825 // The AND node needs bitcasts to/from an integer vector type around it.
53826 SDValue MaskConst = DAG.getBitcast(IntVT, SourceConst);
53827 SDValue NewAnd = DAG.getNode(ISD::AND, DL, IntVT, Op0->getOperand(0),
53828 MaskConst);
53829 SDValue Res = DAG.getBitcast(VT, NewAnd);
53830 if (IsStrict)
53831 return DAG.getMergeValues({Res, SourceConst.getValue(1)}, DL);
53832 return Res;
53833 }
53834
53835 return SDValue();
53836}
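
Note on the transform above: each vector-compare lane is all-zeros or all-ones, so AND-ing with a constant selects either 0 or the constant, and the conversion can therefore be applied to the constant up front and masked afterwards. A one-lane scalar sketch, modeling the 0/-1 lane as a plain int32 (illustrative values only):

    #include <cassert>
    #include <cstdint>
    #include <cstring>

    int main() {
      const int32_t C = 42;
      for (int32_t lane : {0, -1}) {            // vector-compare lanes are 0 or -1
        // UNARYOP(AND(cmp, C)): mask first, then convert.
        float before = (float)(lane & C);
        // AND(cmp, UNARYOP(C)): convert the constant once, then mask its bits.
        float fc = (float)C;
        uint32_t fcBits, maskedBits;
        std::memcpy(&fcBits, &fc, sizeof(fc));
        maskedBits = (uint32_t)lane & fcBits;   // bitwise AND on the FP bit pattern
        float after;
        std::memcpy(&after, &maskedBits, sizeof(after));
        assert(before == after);
      }
      return 0;
    }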
53837
53838/// If we are converting a value to floating-point, try to replace scalar
53839/// truncate of an extracted vector element with a bitcast. This tries to keep
53840/// the sequence on XMM registers rather than moving between vector and GPRs.
53841static SDValue combineToFPTruncExtElt(SDNode *N, SelectionDAG &DAG) {
53842 // TODO: This is currently only used by combineSIntToFP, but it is generalized
53843 // to allow being called by any similar cast opcode.
53844 // TODO: Consider merging this into lowering: vectorizeExtractedCast().
53845 SDValue Trunc = N->getOperand(0);
53846 if (!Trunc.hasOneUse() || Trunc.getOpcode() != ISD::TRUNCATE)
53847 return SDValue();
53848
53849 SDValue ExtElt = Trunc.getOperand(0);
53850 if (!ExtElt.hasOneUse() || ExtElt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
53851 !isNullConstant(ExtElt.getOperand(1)))
53852 return SDValue();
53853
53854 EVT TruncVT = Trunc.getValueType();
53855 EVT SrcVT = ExtElt.getValueType();
53856 unsigned DestWidth = TruncVT.getSizeInBits();
53857 unsigned SrcWidth = SrcVT.getSizeInBits();
53858 if (SrcWidth % DestWidth != 0)
53859 return SDValue();
53860
53861 // inttofp (trunc (extelt X, 0)) --> inttofp (extelt (bitcast X), 0)
53862 EVT SrcVecVT = ExtElt.getOperand(0).getValueType();
53863 unsigned VecWidth = SrcVecVT.getSizeInBits();
53864 unsigned NumElts = VecWidth / DestWidth;
53865 EVT BitcastVT = EVT::getVectorVT(*DAG.getContext(), TruncVT, NumElts);
53866 SDValue BitcastVec = DAG.getBitcast(BitcastVT, ExtElt.getOperand(0));
53867 SDLoc DL(N);
53868 SDValue NewExtElt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, TruncVT,
53869 BitcastVec, ExtElt.getOperand(1));
53870 return DAG.getNode(N->getOpcode(), DL, N->getValueType(0), NewExtElt);
53871}
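
Note on the bitcast rewrite above: on a little-endian target, truncating lane 0 of a wider-element vector reads the same bytes as reinterpreting the vector with narrower elements and extracting lane 0, which is what lets the value stay on XMM registers. A little-endian sketch with illustrative types (v2i64 viewed as v4i32):

    #include <cassert>
    #include <cstdint>
    #include <cstring>

    int main() {
      // v2i64 source vector; we want (i32)trunc(extractelement(v, 0)).
      uint64_t v2i64[2] = {0x1122334455667788ULL, 0x99aabbccddeeff00ULL};
      uint32_t truncOfExtract = (uint32_t)v2i64[0];

      // Bitcast v2i64 -> v4i32 and extract element 0 instead (little-endian).
      uint32_t v4i32[4];
      std::memcpy(v4i32, v2i64, sizeof(v2i64));
      assert(truncOfExtract == v4i32[0]);
      return 0;
    }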
53872
53873static SDValue combineUIntToFP(SDNode *N, SelectionDAG &DAG,
53874 const X86Subtarget &Subtarget) {
53875 bool IsStrict = N->isStrictFPOpcode();
53876 SDValue Op0 = N->getOperand(IsStrict ? 1 : 0);
53877 EVT VT = N->getValueType(0);
53878 EVT InVT = Op0.getValueType();
53879
53880 // UINT_TO_FP(vXi1~15) -> UINT_TO_FP(ZEXT(vXi1~15 to vXi16))
53881 // UINT_TO_FP(vXi17~31) -> UINT_TO_FP(ZEXT(vXi17~31 to vXi32))
53882 // UINT_TO_FP(vXi33~63) -> UINT_TO_FP(ZEXT(vXi33~63 to vXi64))
53883 if (InVT.isVector() && VT.getVectorElementType() == MVT::f16) {
53884 unsigned ScalarSize = InVT.getScalarSizeInBits();
53885 if (ScalarSize == 16 || ScalarSize == 32 || ScalarSize >= 64)
53886 return SDValue();
53887 SDLoc dl(N);
53888 EVT DstVT = EVT::getVectorVT(*DAG.getContext(),
53889 ScalarSize < 16 ? MVT::i16
53890 : ScalarSize < 32 ? MVT::i32
53891 : MVT::i64,
53892 InVT.getVectorNumElements());
53893 SDValue P = DAG.getNode(ISD::ZERO_EXTEND, dl, DstVT, Op0);
53894 if (IsStrict)
53895 return DAG.getNode(ISD::STRICT_UINT_TO_FP, dl, {VT, MVT::Other},
53896 {N->getOperand(0), P});
53897 return DAG.getNode(ISD::UINT_TO_FP, dl, VT, P);
53898 }
53899
53900 // UINT_TO_FP(vXi1) -> SINT_TO_FP(ZEXT(vXi1 to vXi32))
53901 // UINT_TO_FP(vXi8) -> SINT_TO_FP(ZEXT(vXi8 to vXi32))
53902 // UINT_TO_FP(vXi16) -> SINT_TO_FP(ZEXT(vXi16 to vXi32))
53903 if (InVT.isVector() && InVT.getScalarSizeInBits() < 32 &&
53904 VT.getScalarType() != MVT::f16) {
53905 SDLoc dl(N);
53906 EVT DstVT = InVT.changeVectorElementType(MVT::i32);
53907 SDValue P = DAG.getNode(ISD::ZERO_EXTEND, dl, DstVT, Op0);
53908
53909 // UINT_TO_FP isn't legal without AVX512 so use SINT_TO_FP.
53910 if (IsStrict)
53911 return DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {VT, MVT::Other},
53912 {N->getOperand(0), P});
53913 return DAG.getNode(ISD::SINT_TO_FP, dl, VT, P);
53914 }
53915
53916 // Since UINT_TO_FP is legal (it's marked custom), dag combiner won't
53917 // optimize it to a SINT_TO_FP when the sign bit is known zero. Perform
53918 // the optimization here.
53919 if (DAG.SignBitIsZero(Op0)) {
53920 if (IsStrict)
53921 return DAG.getNode(ISD::STRICT_SINT_TO_FP, SDLoc(N), {VT, MVT::Other},
53922 {N->getOperand(0), Op0});
53923 return DAG.getNode(ISD::SINT_TO_FP, SDLoc(N), VT, Op0);
53924 }
53925
53926 return SDValue();
53927}
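
Note on the final fold above: when DAG.SignBitIsZero proves the sign bit clear, signed and unsigned conversion agree, so the cheaper SINT_TO_FP can be substituted. A minimal scalar check with illustrative values:

    #include <cassert>
    #include <cstdint>

    int main() {
      // Any value with the sign bit clear converts identically both ways.
      for (uint32_t u : {0u, 1u, 123456u, 0x7fffffffu}) {
        double fromUnsigned = (double)u;
        double fromSigned = (double)(int32_t)u;   // safe: sign bit is zero
        assert(fromUnsigned == fromSigned);
      }
      return 0;
    }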
53928
53929static SDValue combineSIntToFP(SDNode *N, SelectionDAG &DAG,
53930 TargetLowering::DAGCombinerInfo &DCI,
53931 const X86Subtarget &Subtarget) {
53932 // First try to optimize away the conversion entirely when it's
53933 // conditionally from a constant. Vectors only.
53934 bool IsStrict = N->isStrictFPOpcode();
53935 if (SDValue Res = combineVectorCompareAndMaskUnaryOp(N, DAG))
53936 return Res;
53937
53938 // Now move on to more general possibilities.
53939 SDValue Op0 = N->getOperand(IsStrict ? 1 : 0);
53940 EVT VT = N->getValueType(0);
53941 EVT InVT = Op0.getValueType();
53942
53943 // SINT_TO_FP(vXi1~15) -> SINT_TO_FP(SEXT(vXi1~15 to vXi16))
53944 // SINT_TO_FP(vXi17~31) -> SINT_TO_FP(SEXT(vXi17~31 to vXi32))
53945 // SINT_TO_FP(vXi33~63) -> SINT_TO_FP(SEXT(vXi33~63 to vXi64))
53946 if (InVT.isVector() && VT.getVectorElementType() == MVT::f16) {
53947 unsigned ScalarSize = InVT.getScalarSizeInBits();
53948 if (ScalarSize == 16 || ScalarSize == 32 || ScalarSize >= 64)
53949 return SDValue();
53950 SDLoc dl(N);
53951 EVT DstVT = EVT::getVectorVT(*DAG.getContext(),
53952 ScalarSize < 16 ? MVT::i16
53953 : ScalarSize < 32 ? MVT::i32
53954 : MVT::i64,
53955 InVT.getVectorNumElements());
53956 SDValue P = DAG.getNode(ISD::SIGN_EXTEND, dl, DstVT, Op0);
53957 if (IsStrict)
53958 return DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {VT, MVT::Other},
53959 {N->getOperand(0), P});
53960 return DAG.getNode(ISD::SINT_TO_FP, dl, VT, P);
53961 }
53962
53963 // SINT_TO_FP(vXi1) -> SINT_TO_FP(SEXT(vXi1 to vXi32))
53964 // SINT_TO_FP(vXi8) -> SINT_TO_FP(SEXT(vXi8 to vXi32))
53965 // SINT_TO_FP(vXi16) -> SINT_TO_FP(SEXT(vXi16 to vXi32))
53966 if (InVT.isVector() && InVT.getScalarSizeInBits() < 32 &&
53967 VT.getScalarType() != MVT::f16) {
53968 SDLoc dl(N);
53969 EVT DstVT = InVT.changeVectorElementType(MVT::i32);
53970 SDValue P = DAG.getNode(ISD::SIGN_EXTEND, dl, DstVT, Op0);
53971 if (IsStrict)
53972 return DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {VT, MVT::Other},
53973 {N->getOperand(0), P});
53974 return DAG.getNode(ISD::SINT_TO_FP, dl, VT, P);
53975 }
53976
53977 // Without AVX512DQ we only support i64 to float scalar conversion. For both
53978 // vectors and scalars, see if we know that the upper bits are all the sign
53979 // bit, in which case we can truncate the input to i32 and convert from that.
53980 if (InVT.getScalarSizeInBits() > 32 && !Subtarget.hasDQI()) {
53981 unsigned BitWidth = InVT.getScalarSizeInBits();
53982 unsigned NumSignBits = DAG.ComputeNumSignBits(Op0);
53983 if (NumSignBits >= (BitWidth - 31)) {
53984 EVT TruncVT = MVT::i32;
53985 if (InVT.isVector())
53986 TruncVT = InVT.changeVectorElementType(TruncVT);
53987 SDLoc dl(N);
53988 if (DCI.isBeforeLegalize() || TruncVT != MVT::v2i32) {
53989 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, dl, TruncVT, Op0);
53990 if (IsStrict)
53991 return DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {VT, MVT::Other},
53992 {N->getOperand(0), Trunc});
53993 return DAG.getNode(ISD::SINT_TO_FP, dl, VT, Trunc);
53994 }
53995 // If we're after legalize and the type is v2i32 we need to shuffle and
53996 // use CVTSI2P.
53997 assert(InVT == MVT::v2i64 && "Unexpected VT!");
53998 SDValue Cast = DAG.getBitcast(MVT::v4i32, Op0);
53999 SDValue Shuf = DAG.getVectorShuffle(MVT::v4i32, dl, Cast, Cast,
54000 { 0, 2, -1, -1 });
54001 if (IsStrict)
54002 return DAG.getNode(X86ISD::STRICT_CVTSI2P, dl, {VT, MVT::Other},
54003 {N->getOperand(0), Shuf});
54004 return DAG.getNode(X86ISD::CVTSI2P, dl, VT, Shuf);
54005 }
54006 }
54007
54008 // Transform (SINT_TO_FP (i64 ...)) into an x87 operation if we have
54009 // a 32-bit target where SSE doesn't support i64->FP operations.
54010 if (!Subtarget.useSoftFloat() && Subtarget.hasX87() &&
54011 Op0.getOpcode() == ISD::LOAD) {
54012 LoadSDNode *Ld = cast<LoadSDNode>(Op0.getNode());
54013
54014 // This transformation is not supported if the result type is f16 or f128.
54015 if (VT == MVT::f16 || VT == MVT::f128)
54016 return SDValue();
54017
54018 // If we have AVX512DQ we can use packed conversion instructions unless
54019 // the VT is f80.
54020 if (Subtarget.hasDQI() && VT != MVT::f80)
54021 return SDValue();
54022
54023 if (Ld->isSimple() && !VT.isVector() && ISD::isNormalLoad(Op0.getNode()) &&
54024 Op0.hasOneUse() && !Subtarget.is64Bit() && InVT == MVT::i64) {
54025 std::pair<SDValue, SDValue> Tmp =
54026 Subtarget.getTargetLowering()->BuildFILD(
54027 VT, InVT, SDLoc(N), Ld->getChain(), Ld->getBasePtr(),
54028 Ld->getPointerInfo(), Ld->getOriginalAlign(), DAG);
54029 DAG.ReplaceAllUsesOfValueWith(Op0.getValue(1), Tmp.second);
54030 return Tmp.first;
54031 }
54032 }
54033
54034 if (IsStrict)
54035 return SDValue();
54036
54037 if (SDValue V = combineToFPTruncExtElt(N, DAG))
54038 return V;
54039
54040 return SDValue();
54041}
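
Note on the truncation path above: if an i64 input has at least BitWidth - 31 = 33 sign bits, its value already fits in an i32, so converting the truncated value gives the same result. A small scalar check (illustrative values):

    #include <cassert>
    #include <cstdint>

    int main() {
      // If an i64 has at least 33 sign bits, it is representable as an i32,
      // so sitofp(i64 x) == sitofp(trunc i64 x to i32).
      for (int64_t x : {int64_t(0), int64_t(-1), int64_t(INT32_MIN), int64_t(INT32_MAX)}) {
        double wide = (double)x;
        double narrow = (double)(int32_t)x;   // truncation loses nothing here
        assert(wide == narrow);
      }
      return 0;
    }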
54042
54043static bool needCarryOrOverflowFlag(SDValue Flags) {
54044 assert(Flags.getValueType() == MVT::i32 && "Unexpected VT!");
54045
54046 for (const SDNode *User : Flags->uses()) {
54047 X86::CondCode CC;
54048 switch (User->getOpcode()) {
54049 default:
54050 // Be conservative.
54051 return true;
54052 case X86ISD::SETCC:
54053 case X86ISD::SETCC_CARRY:
54054 CC = (X86::CondCode)User->getConstantOperandVal(0);
54055 break;
54056 case X86ISD::BRCOND:
54057 case X86ISD::CMOV:
54058 CC = (X86::CondCode)User->getConstantOperandVal(2);
54059 break;
54060 }
54061
54062 switch (CC) {
54063 default: break;
54064 case X86::COND_A: case X86::COND_AE:
54065 case X86::COND_B: case X86::COND_BE:
54066 case X86::COND_O: case X86::COND_NO:
54067 case X86::COND_G: case X86::COND_GE:
54068 case X86::COND_L: case X86::COND_LE:
54069 return true;
54070 }
54071 }
54072
54073 return false;
54074}
54075
54076static bool onlyZeroFlagUsed(SDValue Flags) {
54077 assert(Flags.getValueType() == MVT::i32 && "Unexpected VT!");
54078
54079 for (const SDNode *User : Flags->uses()) {
54080 unsigned CCOpNo;
54081 switch (User->getOpcode()) {
54082 default:
54083 // Be conservative.
54084 return false;
54085 case X86ISD::SETCC:
54086 case X86ISD::SETCC_CARRY:
54087 CCOpNo = 0;
54088 break;
54089 case X86ISD::BRCOND:
54090 case X86ISD::CMOV:
54091 CCOpNo = 2;
54092 break;
54093 }
54094
54095 X86::CondCode CC = (X86::CondCode)User->getConstantOperandVal(CCOpNo);
54096 if (CC != X86::COND_E && CC != X86::COND_NE)
54097 return false;
54098 }
54099
54100 return true;
54101}
54102
54103/// If this is an add or subtract where one operand is produced by a cmp+setcc,
54104/// then try to convert it to an ADC or SBB. This replaces TEST+SET+{ADD/SUB}
54105/// with CMP+{ADC, SBB}.
54106/// Also try (ADD/SUB)+(AND(SRL,1)) bit extraction pattern with BT+{ADC, SBB}.
54107static SDValue combineAddOrSubToADCOrSBB(bool IsSub, const SDLoc &DL, EVT VT,
54108 SDValue X, SDValue Y,
54109 SelectionDAG &DAG,
54110 bool ZeroSecondOpOnly = false) {
54111 if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
54112 return SDValue();
54113
54114 // Look through a one-use zext.
54115 if (Y.getOpcode() == ISD::ZERO_EXTEND && Y.hasOneUse())
54116 Y = Y.getOperand(0);
54117
54118 X86::CondCode CC;
54119 SDValue EFLAGS;
54120 if (Y.getOpcode() == X86ISD::SETCC && Y.hasOneUse()) {
54121 CC = (X86::CondCode)Y.getConstantOperandVal(0);
54122 EFLAGS = Y.getOperand(1);
54123 } else if (Y.getOpcode() == ISD::AND && isOneConstant(Y.getOperand(1)) &&
54124 Y.hasOneUse()) {
54125 EFLAGS = LowerAndToBT(Y, ISD::SETNE, DL, DAG, CC);
54126 }
54127
54128 if (!EFLAGS)
54129 return SDValue();
54130
54131 // If X is -1 or 0, then we have an opportunity to avoid constants required in
54132 // the general case below.
54133 auto *ConstantX = dyn_cast<ConstantSDNode>(X);
54134 if (ConstantX && !ZeroSecondOpOnly) {
54135 if ((!IsSub && CC == X86::COND_AE && ConstantX->isAllOnes()) ||
54136 (IsSub && CC == X86::COND_B && ConstantX->isZero())) {
54137 // This is a complicated way to get -1 or 0 from the carry flag:
54138 // -1 + SETAE --> -1 + (!CF) --> CF ? -1 : 0 --> SBB %eax, %eax
54139 // 0 - SETB --> 0 - (CF) --> CF ? -1 : 0 --> SBB %eax, %eax
54140 return DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
54141 DAG.getTargetConstant(X86::COND_B, DL, MVT::i8),
54142 EFLAGS);
54143 }
54144
54145 if ((!IsSub && CC == X86::COND_BE && ConstantX->isAllOnes()) ||
54146 (IsSub && CC == X86::COND_A && ConstantX->isZero())) {
54147 if (EFLAGS.getOpcode() == X86ISD::SUB && EFLAGS.hasOneUse() &&
54148 EFLAGS.getValueType().isInteger() &&
54149 !isa<ConstantSDNode>(EFLAGS.getOperand(1))) {
54150 // Swap the operands of a SUB, and we have the same pattern as above.
54151 // -1 + SETBE (SUB A, B) --> -1 + SETAE (SUB B, A) --> SUB + SBB
54152 // 0 - SETA (SUB A, B) --> 0 - SETB (SUB B, A) --> SUB + SBB
54153 SDValue NewSub = DAG.getNode(
54154 X86ISD::SUB, SDLoc(EFLAGS), EFLAGS.getNode()->getVTList(),
54155 EFLAGS.getOperand(1), EFLAGS.getOperand(0));
54156 SDValue NewEFLAGS = SDValue(NewSub.getNode(), EFLAGS.getResNo());
54157 return DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
54158 DAG.getTargetConstant(X86::COND_B, DL, MVT::i8),
54159 NewEFLAGS);
54160 }
54161 }
54162 }
54163
54164 if (CC == X86::COND_B) {
54165 // X + SETB Z --> adc X, 0
54166 // X - SETB Z --> sbb X, 0
54167 return DAG.getNode(IsSub ? X86ISD::SBB : X86ISD::ADC, DL,
54168 DAG.getVTList(VT, MVT::i32), X,
54169 DAG.getConstant(0, DL, VT), EFLAGS);
54170 }
54171
54172 if (ZeroSecondOpOnly)
54173 return SDValue();
54174
54175 if (CC == X86::COND_A) {
54176 // Try to convert COND_A into COND_B in an attempt to facilitate
54177 // materializing "setb reg".
54178 //
54179 // Do not flip "e > c", where "c" is a constant, because Cmp instruction
54180 // cannot take an immediate as its first operand.
54181 //
54182 if (EFLAGS.getOpcode() == X86ISD::SUB && EFLAGS.getNode()->hasOneUse() &&
54183 EFLAGS.getValueType().isInteger() &&
54184 !isa<ConstantSDNode>(EFLAGS.getOperand(1))) {
54185 SDValue NewSub =
54186 DAG.getNode(X86ISD::SUB, SDLoc(EFLAGS), EFLAGS.getNode()->getVTList(),
54187 EFLAGS.getOperand(1), EFLAGS.getOperand(0));
54188 SDValue NewEFLAGS = NewSub.getValue(EFLAGS.getResNo());
54189 return DAG.getNode(IsSub ? X86ISD::SBB : X86ISD::ADC, DL,
54190 DAG.getVTList(VT, MVT::i32), X,
54191 DAG.getConstant(0, DL, VT), NewEFLAGS);
54192 }
54193 }
54194
54195 if (CC == X86::COND_AE) {
54196 // X + SETAE --> sbb X, -1
54197 // X - SETAE --> adc X, -1
54198 return DAG.getNode(IsSub ? X86ISD::ADC : X86ISD::SBB, DL,
54199 DAG.getVTList(VT, MVT::i32), X,
54200 DAG.getConstant(-1, DL, VT), EFLAGS);
54201 }
54202
54203 if (CC == X86::COND_BE) {
54204 // X + SETBE --> sbb X, -1
54205 // X - SETBE --> adc X, -1
54206 // Try to convert COND_BE into COND_AE in an attempt to facilitate
54207 // materializing "setae reg".
54208 //
54209 // Do not flip "e <= c", where "c" is a constant, because Cmp instruction
54210 // cannot take an immediate as its first operand.
54211 //
54212 if (EFLAGS.getOpcode() == X86ISD::SUB && EFLAGS.getNode()->hasOneUse() &&
54213 EFLAGS.getValueType().isInteger() &&
54214 !isa<ConstantSDNode>(EFLAGS.getOperand(1))) {
54215 SDValue NewSub =
54216 DAG.getNode(X86ISD::SUB, SDLoc(EFLAGS), EFLAGS.getNode()->getVTList(),
54217 EFLAGS.getOperand(1), EFLAGS.getOperand(0));
54218 SDValue NewEFLAGS = NewSub.getValue(EFLAGS.getResNo());
54219 return DAG.getNode(IsSub ? X86ISD::ADC : X86ISD::SBB, DL,
54220 DAG.getVTList(VT, MVT::i32), X,
54221 DAG.getConstant(-1, DL, VT), NewEFLAGS);
54222 }
54223 }
54224
54225 if (CC != X86::COND_E && CC != X86::COND_NE)
54226 return SDValue();
54227
54228 if (EFLAGS.getOpcode() != X86ISD::CMP || !EFLAGS.hasOneUse() ||
54229 !X86::isZeroNode(EFLAGS.getOperand(1)) ||
54230 !EFLAGS.getOperand(0).getValueType().isInteger())
54231 return SDValue();
54232
54233 SDValue Z = EFLAGS.getOperand(0);
54234 EVT ZVT = Z.getValueType();
54235
54236 // If X is -1 or 0, then we have an opportunity to avoid constants required in
54237 // the general case below.
54238 if (ConstantX) {
54239 // 'neg' sets the carry flag when Z != 0, so create 0 or -1 using 'sbb' with
54240 // fake operands:
54241 // 0 - (Z != 0) --> sbb %eax, %eax, (neg Z)
54242 // -1 + (Z == 0) --> sbb %eax, %eax, (neg Z)
54243 if ((IsSub && CC == X86::COND_NE && ConstantX->isZero()) ||
54244 (!IsSub && CC == X86::COND_E && ConstantX->isAllOnes())) {
54245 SDValue Zero = DAG.getConstant(0, DL, ZVT);
54246 SDVTList X86SubVTs = DAG.getVTList(ZVT, MVT::i32);
54247 SDValue Neg = DAG.getNode(X86ISD::SUB, DL, X86SubVTs, Zero, Z);
54248 return DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
54249 DAG.getTargetConstant(X86::COND_B, DL, MVT::i8),
54250 SDValue(Neg.getNode(), 1));
54251 }
54252
54253 // cmp with 1 sets the carry flag when Z == 0, so create 0 or -1 using 'sbb'
54254 // with fake operands:
54255 // 0 - (Z == 0) --> sbb %eax, %eax, (cmp Z, 1)
54256 // -1 + (Z != 0) --> sbb %eax, %eax, (cmp Z, 1)
54257 if ((IsSub && CC == X86::COND_E && ConstantX->isZero()) ||
54258 (!IsSub && CC == X86::COND_NE && ConstantX->isAllOnes())) {
54259 SDValue One = DAG.getConstant(1, DL, ZVT);
54260 SDVTList X86SubVTs = DAG.getVTList(ZVT, MVT::i32);
54261 SDValue Cmp1 = DAG.getNode(X86ISD::SUB, DL, X86SubVTs, Z, One);
54262 return DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
54263 DAG.getTargetConstant(X86::COND_B, DL, MVT::i8),
54264 Cmp1.getValue(1));
54265 }
54266 }
54267
54268 // (cmp Z, 1) sets the carry flag if Z is 0.
54269 SDValue One = DAG.getConstant(1, DL, ZVT);
54270 SDVTList X86SubVTs = DAG.getVTList(ZVT, MVT::i32);
54271 SDValue Cmp1 = DAG.getNode(X86ISD::SUB, DL, X86SubVTs, Z, One);
54272
54273 // Add the flags type for ADC/SBB nodes.
54274 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
54275
54276 // X - (Z != 0) --> sub X, (zext(setne Z, 0)) --> adc X, -1, (cmp Z, 1)
54277 // X + (Z != 0) --> add X, (zext(setne Z, 0)) --> sbb X, -1, (cmp Z, 1)
54278 if (CC == X86::COND_NE)
54279 return DAG.getNode(IsSub ? X86ISD::ADC : X86ISD::SBB, DL, VTs, X,
54280 DAG.getConstant(-1ULL, DL, VT), Cmp1.getValue(1));
54281
54282 // X - (Z == 0) --> sub X, (zext(sete Z, 0)) --> sbb X, 0, (cmp Z, 1)
54283 // X + (Z == 0) --> add X, (zext(sete Z, 0)) --> adc X, 0, (cmp Z, 1)
54284 return DAG.getNode(IsSub ? X86ISD::SBB : X86ISD::ADC, DL, VTs, X,
54285 DAG.getConstant(0, DL, VT), Cmp1.getValue(1));
54286}
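
Note on the basic rewrites above: X + SETB Z literally adds the carry flag, i.e. adc X, 0, while X + SETAE adds !CF, which sbb X, -1 computes as X - (-1) - CF == X + 1 - CF. A scalar model with the carry flag as a bool (helper names are illustrative):

    #include <cassert>
    #include <cstdint>

    // adc(X, imm, CF) = X + imm + CF;  sbb(X, imm, CF) = X - imm - CF.
    uint32_t adc(uint32_t x, uint32_t imm, bool cf) { return x + imm + (cf ? 1u : 0u); }
    uint32_t sbb(uint32_t x, uint32_t imm, bool cf) { return x - imm - (cf ? 1u : 0u); }

    int main() {
      for (bool cf : {false, true}) {
        uint32_t x = 100;
        // X + SETB  (SETB  == CF)  -->  adc X, 0
        assert(x + (cf ? 1u : 0u) == adc(x, 0, cf));
        // X + SETAE (SETAE == !CF) -->  sbb X, -1   (i.e. X + 1 - CF)
        assert(x + (cf ? 0u : 1u) == sbb(x, (uint32_t)-1, cf));
      }
      return 0;
    }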
54287
54288/// If this is an add or subtract where one operand is produced by a cmp+setcc,
54289/// then try to convert it to an ADC or SBB. This replaces TEST+SET+{ADD/SUB}
54290/// with CMP+{ADC, SBB}.
54291static SDValue combineAddOrSubToADCOrSBB(SDNode *N, SelectionDAG &DAG) {
54292 bool IsSub = N->getOpcode() == ISD::SUB;
54293 SDValue X = N->getOperand(0);
54294 SDValue Y = N->getOperand(1);
54295 EVT VT = N->getValueType(0);
54296 SDLoc DL(N);
54297
54298 if (SDValue ADCOrSBB = combineAddOrSubToADCOrSBB(IsSub, DL, VT, X, Y, DAG))
54299 return ADCOrSBB;
54300
54301 // Commute and try again (negate the result for subtracts).
54302 if (SDValue ADCOrSBB = combineAddOrSubToADCOrSBB(IsSub, DL, VT, Y, X, DAG)) {
54303 if (IsSub)
54304 ADCOrSBB =
54305 DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), ADCOrSBB);
54306 return ADCOrSBB;
54307 }
54308
54309 return SDValue();
54310}
54311
54312static SDValue combineCMP(SDNode *N, SelectionDAG &DAG) {
54313 // Only handle test patterns.
54314 if (!isNullConstant(N->getOperand(1)))
54315 return SDValue();
54316
54317 // If we have a CMP of a truncated binop, see if we can make a smaller binop
54318 // and use its flags directly.
54319 // TODO: Maybe we should try promoting compares that only use the zero flag
54320 // first if we can prove the upper bits with computeKnownBits?
54321 SDLoc dl(N);
54322 SDValue Op = N->getOperand(0);
54323 EVT VT = Op.getValueType();
54324
54325 // If we have a constant logical shift that's only used in a comparison
54326 // against zero turn it into an equivalent AND. This allows turning it into
54327 // a TEST instruction later.
54328 if ((Op.getOpcode() == ISD::SRL || Op.getOpcode() == ISD::SHL) &&
54329 Op.hasOneUse() && isa<ConstantSDNode>(Op.getOperand(1)) &&
54330 onlyZeroFlagUsed(SDValue(N, 0))) {
54331 unsigned BitWidth = VT.getSizeInBits();
54332 const APInt &ShAmt = Op.getConstantOperandAPInt(1);
54333 if (ShAmt.ult(BitWidth)) { // Avoid undefined shifts.
54334 unsigned MaskBits = BitWidth - ShAmt.getZExtValue();
54335 APInt Mask = Op.getOpcode() == ISD::SRL
54336 ? APInt::getHighBitsSet(BitWidth, MaskBits)
54337 : APInt::getLowBitsSet(BitWidth, MaskBits);
54338 if (Mask.isSignedIntN(32)) {
54339 Op = DAG.getNode(ISD::AND, dl, VT, Op.getOperand(0),
54340 DAG.getConstant(Mask, dl, VT));
54341 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
54342 DAG.getConstant(0, dl, VT));
54343 }
54344 }
54345 }
54346
54347 // Peek through any zero-extend if we're only testing for a zero result.
54348 if (Op.getOpcode() == ISD::ZERO_EXTEND && onlyZeroFlagUsed(SDValue(N, 0))) {
54349 SDValue Src = Op.getOperand(0);
54350 EVT SrcVT = Src.getValueType();
54351 if (SrcVT.getScalarSizeInBits() >= 8 &&
54352 DAG.getTargetLoweringInfo().isTypeLegal(SrcVT))
54353 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Src,
54354 DAG.getConstant(0, dl, SrcVT));
54355 }
54356
54357 // Look for a truncate.
54358 if (Op.getOpcode() != ISD::TRUNCATE)
54359 return SDValue();
54360
54361 SDValue Trunc = Op;
54362 Op = Op.getOperand(0);
54363
54364 // See if we can compare with zero against the truncation source,
54365 // which should help using the Z flag from many ops. Only do this for
54366 // i32 truncated op to prevent partial-reg compares of promoted ops.
54367 EVT OpVT = Op.getValueType();
54368 APInt UpperBits =
54369 APInt::getBitsSetFrom(OpVT.getSizeInBits(), VT.getSizeInBits());
54370 if (OpVT == MVT::i32 && DAG.MaskedValueIsZero(Op, UpperBits) &&
54371 onlyZeroFlagUsed(SDValue(N, 0))) {
54372 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
54373 DAG.getConstant(0, dl, OpVT));
54374 }
54375
54376 // After this the truncate and arithmetic op must have a single use.
54377 if (!Trunc.hasOneUse() || !Op.hasOneUse())
54378 return SDValue();
54379
54380 unsigned NewOpc;
54381 switch (Op.getOpcode()) {
54382 default: return SDValue();
54383 case ISD::AND:
54384 // Skip and with constant. We have special handling for and with immediate
54385 // during isel to generate test instructions.
54386 if (isa<ConstantSDNode>(Op.getOperand(1)))
54387 return SDValue();
54388 NewOpc = X86ISD::AND;
54389 break;
54390 case ISD::OR: NewOpc = X86ISD::OR; break;
54391 case ISD::XOR: NewOpc = X86ISD::XOR; break;
54392 case ISD::ADD:
54393 // If the carry or overflow flag is used, we can't truncate.
54394 if (needCarryOrOverflowFlag(SDValue(N, 0)))
54395 return SDValue();
54396 NewOpc = X86ISD::ADD;
54397 break;
54398 case ISD::SUB:
54399 // If the carry or overflow flag is used, we can't truncate.
54400 if (needCarryOrOverflowFlag(SDValue(N, 0)))
54401 return SDValue();
54402 NewOpc = X86ISD::SUB;
54403 break;
54404 }
54405
54406 // We found an op we can narrow. Truncate its inputs.
54407 SDValue Op0 = DAG.getNode(ISD::TRUNCATE, dl, VT, Op.getOperand(0));
54408 SDValue Op1 = DAG.getNode(ISD::TRUNCATE, dl, VT, Op.getOperand(1));
54409
54410 // Use a X86 specific opcode to avoid DAG combine messing with it.
54411 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
54412 Op = DAG.getNode(NewOpc, dl, VTs, Op0, Op1);
54413
54414 // For AND, keep a CMP so that we can match the test pattern.
54415 if (NewOpc == X86ISD::AND)
54416 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
54417 DAG.getConstant(0, dl, VT));
54418
54419 // Return the flags.
54420 return Op.getValue(1);
54421}
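
Note on the shift fold above: (x >> c) == 0 exactly when the high BitWidth - c bits of x are zero, and (x << c) == 0 exactly when the low BitWidth - c bits are zero, which is why the compare can be replaced by an AND with the corresponding mask. A 32-bit check of both directions (illustrative shift amount and values):

    #include <cassert>
    #include <cstdint>

    int main() {
      const unsigned ShAmt = 20;                 // MaskBits = 32 - ShAmt = 12
      for (uint32_t x : {0u, 1u, 0xABCu, 0xFFF00000u, 0xDEADBEEFu}) {
        // SRL: (x >> ShAmt) == 0  <=>  (x & getHighBitsSet(32, 12)) == 0
        uint32_t HighMask = ~0u << ShAmt;
        assert(((x >> ShAmt) == 0) == ((x & HighMask) == 0));
        // SHL: (x << ShAmt) == 0  <=>  (x & getLowBitsSet(32, 12)) == 0
        uint32_t LowMask = ~0u >> ShAmt;
        assert(((uint32_t)(x << ShAmt) == 0) == ((x & LowMask) == 0));
      }
      return 0;
    }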
54422
54423static SDValue combineX86AddSub(SDNode *N, SelectionDAG &DAG,
54424 TargetLowering::DAGCombinerInfo &DCI) {
54425 assert((X86ISD::ADD == N->getOpcode() || X86ISD::SUB == N->getOpcode()) &&
54426 "Expected X86ISD::ADD or X86ISD::SUB");
54427
54428 SDLoc DL(N);
54429 SDValue LHS = N->getOperand(0);
54430 SDValue RHS = N->getOperand(1);
54431 MVT VT = LHS.getSimpleValueType();
54432 bool IsSub = X86ISD::SUB == N->getOpcode();
54433 unsigned GenericOpc = IsSub ? ISD::SUB : ISD::ADD;
54434
54435 // If we don't use the flag result, simplify back to a generic ADD/SUB.
54436 if (!N->hasAnyUseOfValue(1)) {
54437 SDValue Res = DAG.getNode(GenericOpc, DL, VT, LHS, RHS);
54438 return DAG.getMergeValues({Res, DAG.getConstant(0, DL, MVT::i32)}, DL);
54439 }
54440
54441 // Fold any similar generic ADD/SUB opcodes to reuse this node.
54442 auto MatchGeneric = [&](SDValue N0, SDValue N1, bool Negate) {
54443 SDValue Ops[] = {N0, N1};
54444 SDVTList VTs = DAG.getVTList(N->getValueType(0));
54445 if (SDNode *GenericAddSub = DAG.getNodeIfExists(GenericOpc, VTs, Ops)) {
54446 SDValue Op(N, 0);
54447 if (Negate)
54448 Op = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), Op);
54449 DCI.CombineTo(GenericAddSub, Op);
54450 }
54451 };
54452 MatchGeneric(LHS, RHS, false);
54453 MatchGeneric(RHS, LHS, X86ISD::SUB == N->getOpcode());
54454
54455 // TODO: Can we drop the ZeroSecondOpOnly limit? This is to guarantee that the
54456 // EFLAGS result doesn't change.
54457 return combineAddOrSubToADCOrSBB(IsSub, DL, VT, LHS, RHS, DAG,
54458 /*ZeroSecondOpOnly*/ true);
54459}
54460
54461static SDValue combineSBB(SDNode *N, SelectionDAG &DAG) {
54462 SDValue LHS = N->getOperand(0);
54463 SDValue RHS = N->getOperand(1);
54464 SDValue BorrowIn = N->getOperand(2);
54465
54466 if (SDValue Flags = combineCarryThroughADD(BorrowIn, DAG)) {
54467 MVT VT = N->getSimpleValueType(0);
54468 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
54469 return DAG.getNode(X86ISD::SBB, SDLoc(N), VTs, LHS, RHS, Flags);
54470 }
54471
54472 // Fold SBB(SUB(X,Y),0,Carry) -> SBB(X,Y,Carry)
54473 // iff the flag result is dead.
54474 if (LHS.getOpcode() == ISD::SUB && isNullConstant(RHS) &&
54475 !N->hasAnyUseOfValue(1))
54476 return DAG.getNode(X86ISD::SBB, SDLoc(N), N->getVTList(), LHS.getOperand(0),
54477 LHS.getOperand(1), BorrowIn);
54478
54479 return SDValue();
54480}
54481
54482// Optimize RES, EFLAGS = X86ISD::ADC LHS, RHS, EFLAGS
54483static SDValue combineADC(SDNode *N, SelectionDAG &DAG,
54484 TargetLowering::DAGCombinerInfo &DCI) {
54485 SDValue LHS = N->getOperand(0);
54486 SDValue RHS = N->getOperand(1);
54487 SDValue CarryIn = N->getOperand(2);
54488 auto *LHSC = dyn_cast<ConstantSDNode>(LHS);
54489 auto *RHSC = dyn_cast<ConstantSDNode>(RHS);
54490
54491 // Canonicalize constant to RHS.
54492 if (LHSC && !RHSC)
54493 return DAG.getNode(X86ISD::ADC, SDLoc(N), N->getVTList(), RHS, LHS,
54494 CarryIn);
54495
54496 // If the LHS and RHS of the ADC node are zero, then it can't overflow and
54497 // the result is either zero or one (depending on the input carry bit).
54498 // Strength reduce this down to a "set on carry" aka SETCC_CARRY&1.
54499 if (LHSC && RHSC && LHSC->isZero() && RHSC->isZero() &&
54500 // We don't have a good way to replace an EFLAGS use, so only do this when
54501 // dead right now.
54502 SDValue(N, 1).use_empty()) {
54503 SDLoc DL(N);
54504 EVT VT = N->getValueType(0);
54505 SDValue CarryOut = DAG.getConstant(0, DL, N->getValueType(1));
54506 SDValue Res1 = DAG.getNode(
54507 ISD::AND, DL, VT,
54508 DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
54509 DAG.getTargetConstant(X86::COND_B, DL, MVT::i8), CarryIn),
54510 DAG.getConstant(1, DL, VT));
54511 return DCI.CombineTo(N, Res1, CarryOut);
54512 }
54513
54514 // Fold ADC(C1,C2,Carry) -> ADC(0,C1+C2,Carry)
54515 // iff the flag result is dead.
54516 // TODO: Allow flag result if C1+C2 doesn't signed/unsigned overflow.
54517 if (LHSC && RHSC && !LHSC->isZero() && !N->hasAnyUseOfValue(1)) {
54518 SDLoc DL(N);
54519 APInt Sum = LHSC->getAPIntValue() + RHSC->getAPIntValue();
54520 return DAG.getNode(X86ISD::ADC, DL, N->getVTList(),
54521 DAG.getConstant(0, DL, LHS.getValueType()),
54522 DAG.getConstant(Sum, DL, LHS.getValueType()), CarryIn);
54523 }
54524
54525 if (SDValue Flags = combineCarryThroughADD(CarryIn, DAG)) {
54526 MVT VT = N->getSimpleValueType(0);
54527 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
54528 return DAG.getNode(X86ISD::ADC, SDLoc(N), VTs, LHS, RHS, Flags);
54529 }
54530
54531 // Fold ADC(ADD(X,Y),0,Carry) -> ADC(X,Y,Carry)
54532 // iff the flag result is dead.
54533 if (LHS.getOpcode() == ISD::ADD && RHSC && RHSC->isZero() &&
54534 !N->hasAnyUseOfValue(1))
54535 return DAG.getNode(X86ISD::ADC, SDLoc(N), N->getVTList(), LHS.getOperand(0),
54536 LHS.getOperand(1), CarryIn);
54537
54538 return SDValue();
54539}
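
Note on the strength reduction above: adc 0, 0, CF can only produce 0 or 1, i.e. the carry bit itself, so it is replaced by SETCC_CARRY (all-ones when CF is set) masked with 1. A scalar sketch of that equivalence:

    #include <cassert>
    #include <cstdint>

    int main() {
      // adc(0, 0, CF) == CF ? 1 : 0 == (SETCC_CARRY & 1), where SETCC_CARRY
      // produces all-ones when CF is set and zero otherwise.
      for (bool cf : {false, true}) {
        uint32_t adcResult = 0u + 0u + (cf ? 1u : 0u);
        uint32_t setccCarry = cf ? ~0u : 0u;   // sbb %eax, %eax style all-ones/zero
        assert(adcResult == (setccCarry & 1u));
      }
      return 0;
    }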
54540
54541static SDValue matchPMADDWD(SelectionDAG &DAG, SDValue Op0, SDValue Op1,
54542 const SDLoc &DL, EVT VT,
54543 const X86Subtarget &Subtarget) {
54544 // Example of pattern we try to detect:
54545 // t := (v8i32 mul (sext (v8i16 x0), (sext (v8i16 x1))))
54546 //(add (build_vector (extract_elt t, 0),
54547 // (extract_elt t, 2),
54548 // (extract_elt t, 4),
54549 // (extract_elt t, 6)),
54550 // (build_vector (extract_elt t, 1),
54551 // (extract_elt t, 3),
54552 // (extract_elt t, 5),
54553 // (extract_elt t, 7)))
54554
54555 if (!Subtarget.hasSSE2())
54556 return SDValue();
54557
54558 if (Op0.getOpcode() != ISD::BUILD_VECTOR ||
54559 Op1.getOpcode() != ISD::BUILD_VECTOR)
54560 return SDValue();
54561
54562 if (!VT.isVector() || VT.getVectorElementType() != MVT::i32 ||
54563 VT.getVectorNumElements() < 4 ||
54564 !isPowerOf2_32(VT.getVectorNumElements()))
54565 return SDValue();
54566
54567 // Check if one of Op0,Op1 is of the form:
54568 // (build_vector (extract_elt Mul, 0),
54569 // (extract_elt Mul, 2),
54570 // (extract_elt Mul, 4),
54571 // ...
54572 // the other is of the form:
54573 // (build_vector (extract_elt Mul, 1),
54574 // (extract_elt Mul, 3),
54575 // (extract_elt Mul, 5),
54576 // ...
54577 // and identify Mul.
54578 SDValue Mul;
54579 for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; i += 2) {
54580 SDValue Op0L = Op0->getOperand(i), Op1L = Op1->getOperand(i),
54581 Op0H = Op0->getOperand(i + 1), Op1H = Op1->getOperand(i + 1);
54582 // TODO: Be more tolerant to undefs.
54583 if (Op0L.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
54584 Op1L.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
54585 Op0H.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
54586 Op1H.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
54587 return SDValue();
54588 auto *Const0L = dyn_cast<ConstantSDNode>(Op0L->getOperand(1));
54589 auto *Const1L = dyn_cast<ConstantSDNode>(Op1L->getOperand(1));
54590 auto *Const0H = dyn_cast<ConstantSDNode>(Op0H->getOperand(1));
54591 auto *Const1H = dyn_cast<ConstantSDNode>(Op1H->getOperand(1));
54592 if (!Const0L || !Const1L || !Const0H || !Const1H)
54593 return SDValue();
54594 unsigned Idx0L = Const0L->getZExtValue(), Idx1L = Const1L->getZExtValue(),
54595 Idx0H = Const0H->getZExtValue(), Idx1H = Const1H->getZExtValue();
54596 // Commutativity of mul allows factors of a product to reorder.
54597 if (Idx0L > Idx1L)
54598 std::swap(Idx0L, Idx1L);
54599 if (Idx0H > Idx1H)
54600 std::swap(Idx0H, Idx1H);
54601 // Commutativity of add allows pairs of factors to reorder.
54602 if (Idx0L > Idx0H) {
54603 std::swap(Idx0L, Idx0H);
54604 std::swap(Idx1L, Idx1H);
54605 }
54606 if (Idx0L != 2 * i || Idx1L != 2 * i + 1 || Idx0H != 2 * i + 2 ||
54607 Idx1H != 2 * i + 3)
54608 return SDValue();
54609 if (!Mul) {
54610 // First time an extract_elt's source vector is visited. Must be a MUL
54611 // with 2X the number of vector elements of the BUILD_VECTOR.
54612 // Both extracts must be from the same MUL.
54613 Mul = Op0L->getOperand(0);
54614 if (Mul->getOpcode() != ISD::MUL ||
54615 Mul.getValueType().getVectorNumElements() != 2 * e)
54616 return SDValue();
54617 }
54618 // Check that the extract is from the same MUL previously seen.
54619 if (Mul != Op0L->getOperand(0) || Mul != Op1L->getOperand(0) ||
54620 Mul != Op0H->getOperand(0) || Mul != Op1H->getOperand(0))
54621 return SDValue();
54622 }
54623
54624 // Check if the Mul source can be safely shrunk.
54625 ShrinkMode Mode;
54626 if (!canReduceVMulWidth(Mul.getNode(), DAG, Mode) ||
54627 Mode == ShrinkMode::MULU16)
54628 return SDValue();
54629
54630 EVT TruncVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16,
54631 VT.getVectorNumElements() * 2);
54632 SDValue N0 = DAG.getNode(ISD::TRUNCATE, DL, TruncVT, Mul.getOperand(0));
54633 SDValue N1 = DAG.getNode(ISD::TRUNCATE, DL, TruncVT, Mul.getOperand(1));
54634
54635 auto PMADDBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
54636 ArrayRef<SDValue> Ops) {
54637 EVT InVT = Ops[0].getValueType();
54638 assert(InVT == Ops[1].getValueType() && "Operands' types mismatch");
54639 EVT ResVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
54640 InVT.getVectorNumElements() / 2);
54641 return DAG.getNode(X86ISD::VPMADDWD, DL, ResVT, Ops[0], Ops[1]);
54642 };
54643 return SplitOpsAndApply(DAG, Subtarget, DL, VT, { N0, N1 }, PMADDBuilder);
54644}
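
Note on the pattern above: it is exactly what PMADDWD computes. Each i32 output lane is a[2i]*b[2i] + a[2i+1]*b[2i+1], with the i16 inputs sign-extended before the multiply. A scalar model of one output lane (the helper name and values are illustrative):

    #include <cassert>
    #include <cstdint>

    // One i32 lane of PMADDWD: a[2i]*b[2i] + a[2i+1]*b[2i+1], with i16 inputs
    // sign-extended before the multiply.
    int32_t pmaddwdLane(const int16_t *a, const int16_t *b, unsigned i) {
      return (int32_t)a[2 * i] * b[2 * i] + (int32_t)a[2 * i + 1] * b[2 * i + 1];
    }

    int main() {
      int16_t a[4] = {3, -2, 100, 7};
      int16_t b[4] = {5, 9, -4, 11};
      // Lane 0 pairs elements {0,1}; lane 1 pairs elements {2,3}.
      assert(pmaddwdLane(a, b, 0) == 3 * 5 + (-2) * 9);
      assert(pmaddwdLane(a, b, 1) == 100 * (-4) + 7 * 11);
      return 0;
    }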
54645
54646// Attempt to turn this pattern into PMADDWD.
54647// (add (mul (sext (build_vector)), (sext (build_vector))),
54648// (mul (sext (build_vector)), (sext (build_vector)))
54649static SDValue matchPMADDWD_2(SelectionDAG &DAG, SDValue N0, SDValue N1,
54650 const SDLoc &DL, EVT VT,
54651 const X86Subtarget &Subtarget) {
54652 if (!Subtarget.hasSSE2())
54653 return SDValue();
54654
54655 if (N0.getOpcode() != ISD::MUL || N1.getOpcode() != ISD::MUL)
54656 return SDValue();
54657
54658 if (!VT.isVector() || VT.getVectorElementType() != MVT::i32 ||
54659 VT.getVectorNumElements() < 4 ||
54660 !isPowerOf2_32(VT.getVectorNumElements()))
54661 return SDValue();
54662
54663 SDValue N00 = N0.getOperand(0);
54664 SDValue N01 = N0.getOperand(1);
54665 SDValue N10 = N1.getOperand(0);
54666 SDValue N11 = N1.getOperand(1);
54667
54668 // All inputs need to be sign extends.
54669 // TODO: Support ZERO_EXTEND from known positive?
54670 if (N00.getOpcode() != ISD::SIGN_EXTEND ||
54671 N01.getOpcode() != ISD::SIGN_EXTEND ||
54672 N10.getOpcode() != ISD::SIGN_EXTEND ||
54673 N11.getOpcode() != ISD::SIGN_EXTEND)
54674 return SDValue();
54675
54676 // Peek through the extends.
54677 N00 = N00.getOperand(0);
54678 N01 = N01.getOperand(0);
54679 N10 = N10.getOperand(0);
54680 N11 = N11.getOperand(0);
54681
54682 // Must be extending from vXi16.
54683 EVT InVT = N00.getValueType();
54684 if (InVT.getVectorElementType() != MVT::i16 || N01.getValueType() != InVT ||
54685 N10.getValueType() != InVT || N11.getValueType() != InVT)
54686 return SDValue();
54687
54688 // All inputs should be build_vectors.
54689 if (N00.getOpcode() != ISD::BUILD_VECTOR ||
54690 N01.getOpcode() != ISD::BUILD_VECTOR ||
54691 N10.getOpcode() != ISD::BUILD_VECTOR ||
54692 N11.getOpcode() != ISD::BUILD_VECTOR)
54693 return SDValue();
54694
54695 // For each element, we need to ensure we have an odd element from one vector
54696 // multiplied by the odd element of another vector and the even element from
54697 // one of the same vectors being multiplied by the even element from the
54698 // other vector. So we need to make sure for each element i, this operator
54699 // is being performed:
54700 // A[2 * i] * B[2 * i] + A[2 * i + 1] * B[2 * i + 1]
54701 SDValue In0, In1;
54702 for (unsigned i = 0; i != N00.getNumOperands(); ++i) {
54703 SDValue N00Elt = N00.getOperand(i);
54704 SDValue N01Elt = N01.getOperand(i);
54705 SDValue N10Elt = N10.getOperand(i);
54706 SDValue N11Elt = N11.getOperand(i);
54707 // TODO: Be more tolerant to undefs.
54708 if (N00Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
54709 N01Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
54710 N10Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
54711 N11Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
54712 return SDValue();
54713 auto *ConstN00Elt = dyn_cast<ConstantSDNode>(N00Elt.getOperand(1));
54714 auto *ConstN01Elt = dyn_cast<ConstantSDNode>(N01Elt.getOperand(1));
54715 auto *ConstN10Elt = dyn_cast<ConstantSDNode>(N10Elt.getOperand(1));
54716 auto *ConstN11Elt = dyn_cast<ConstantSDNode>(N11Elt.getOperand(1));
54717 if (!ConstN00Elt || !ConstN01Elt || !ConstN10Elt || !ConstN11Elt)
54718 return SDValue();
54719 unsigned IdxN00 = ConstN00Elt->getZExtValue();
54720 unsigned IdxN01 = ConstN01Elt->getZExtValue();
54721 unsigned IdxN10 = ConstN10Elt->getZExtValue();
54722 unsigned IdxN11 = ConstN11Elt->getZExtValue();
54723 // Add is commutative so indices can be reordered.
54724 if (IdxN00 > IdxN10) {
54725 std::swap(IdxN00, IdxN10);
54726 std::swap(IdxN01, IdxN11);
54727 }
54728 // N0 indices must be the even element. N1 indices must be the next odd element.
54729 if (IdxN00 != 2 * i || IdxN10 != 2 * i + 1 ||
54730 IdxN01 != 2 * i || IdxN11 != 2 * i + 1)
54731 return SDValue();
54732 SDValue N00In = N00Elt.getOperand(0);
54733 SDValue N01In = N01Elt.getOperand(0);
54734 SDValue N10In = N10Elt.getOperand(0);
54735 SDValue N11In = N11Elt.getOperand(0);
54736
54737 // First time we find an input capture it.
54738 if (!In0) {
54739 In0 = N00In;
54740 In1 = N01In;
54741
54742 // The input vectors must be at least as wide as the output.
54743 // If they are larger than the output, we extract subvector below.
54744 if (In0.getValueSizeInBits() < VT.getSizeInBits() ||
54745 In1.getValueSizeInBits() < VT.getSizeInBits())
54746 return SDValue();
54747 }
54748 // Mul is commutative so the input vectors can be in any order.
54749 // Canonicalize to make the compares easier.
54750 if (In0 != N00In)
54751 std::swap(N00In, N01In);
54752 if (In0 != N10In)
54753 std::swap(N10In, N11In);
54754 if (In0 != N00In || In1 != N01In || In0 != N10In || In1 != N11In)
54755 return SDValue();
54756 }
54757
54758 auto PMADDBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
54759 ArrayRef<SDValue> Ops) {
54760 EVT OpVT = Ops[0].getValueType();
54761 assert(OpVT.getScalarType() == MVT::i16 &&
54762 "Unexpected scalar element type");
54763 assert(OpVT == Ops[1].getValueType() && "Operands' types mismatch");
54764 EVT ResVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
54765 OpVT.getVectorNumElements() / 2);
54766 return DAG.getNode(X86ISD::VPMADDWD, DL, ResVT, Ops[0], Ops[1]);
54767 };
54768
54769 // If the output is narrower than an input, extract the low part of the input
54770 // vector.
54771 EVT OutVT16 = EVT::getVectorVT(*DAG.getContext(), MVT::i16,
54772 VT.getVectorNumElements() * 2);
54773 if (OutVT16.bitsLT(In0.getValueType())) {
54774 In0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OutVT16, In0,
54775 DAG.getIntPtrConstant(0, DL));
54776 }
54777 if (OutVT16.bitsLT(In1.getValueType())) {
54778 In1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OutVT16, In1,
54779 DAG.getIntPtrConstant(0, DL));
54780 }
54781 return SplitOpsAndApply(DAG, Subtarget, DL, VT, { In0, In1 },
54782 PMADDBuilder);
54783}
54784
54785// ADD(VPMADDWD(X,Y),VPMADDWD(Z,W)) -> VPMADDWD(SHUFFLE(X,Z), SHUFFLE(Y,W))
54786// If upper element in each pair of both VPMADDWD are zero then we can merge
54787// the operand elements and use the implicit add of VPMADDWD.
54788// TODO: Add support for VPMADDUBSW (which isn't commutable).
54789static SDValue combineAddOfPMADDWD(SelectionDAG &DAG, SDValue N0, SDValue N1,
54790 const SDLoc &DL, EVT VT) {
54791 if (N0.getOpcode() != N1.getOpcode() || N0.getOpcode() != X86ISD::VPMADDWD)
54792 return SDValue();
54793
54794 // TODO: Add 256/512-bit support once VPMADDWD combines with shuffles.
54795 if (VT.getSizeInBits() > 128)
54796 return SDValue();
54797
54798 unsigned NumElts = VT.getVectorNumElements();
54799 MVT OpVT = N0.getOperand(0).getSimpleValueType();
54800 APInt DemandedBits = APInt::getAllOnes(OpVT.getScalarSizeInBits());
54801 APInt DemandedHiElts = APInt::getSplat(2 * NumElts, APInt(2, 2));
54802
54803 bool Op0HiZero =
54804 DAG.MaskedValueIsZero(N0.getOperand(0), DemandedBits, DemandedHiElts) ||
54805 DAG.MaskedValueIsZero(N0.getOperand(1), DemandedBits, DemandedHiElts);
54806 bool Op1HiZero =
54807 DAG.MaskedValueIsZero(N1.getOperand(0), DemandedBits, DemandedHiElts) ||
54808 DAG.MaskedValueIsZero(N1.getOperand(1), DemandedBits, DemandedHiElts);
54809
54810 // TODO: Check for zero lower elements once we have actual codegen that
54811 // creates them.
54812 if (!Op0HiZero || !Op1HiZero)
54813 return SDValue();
54814
54815 // Create a shuffle mask packing the lower elements from each VPMADDWD.
54816 SmallVector<int> Mask;
54817 for (int i = 0; i != (int)NumElts; ++i) {
54818 Mask.push_back(2 * i);
54819 Mask.push_back(2 * (i + NumElts));
54820 }
54821
54822 SDValue LHS =
54823 DAG.getVectorShuffle(OpVT, DL, N0.getOperand(0), N1.getOperand(0), Mask);
54824 SDValue RHS =
54825 DAG.getVectorShuffle(OpVT, DL, N0.getOperand(1), N1.getOperand(1), Mask);
54826 return DAG.getNode(X86ISD::VPMADDWD, DL, VT, LHS, RHS);
54827}
54828
54829/// CMOV of constants requires materializing constant operands in registers.
54830/// Try to fold those constants into an 'add' instruction to reduce instruction
54831 /// count. We do this with CMOV rather than the generic 'select' because there are
54832/// earlier folds that may be used to turn select-of-constants into logic hacks.
54833static SDValue pushAddIntoCmovOfConsts(SDNode *N, SelectionDAG &DAG,
54834 const X86Subtarget &Subtarget) {
54835 // If an operand is zero, add-of-0 gets simplified away, so that's clearly
54836 // better because we eliminate 1-2 instructions. This transform is still
54837 // an improvement without zero operands because we trade 2 move constants and
54838 // 1 add for 2 adds (LEA) as long as the constants can be represented as
54839 // immediate asm operands (fit in 32-bits).
54840 auto isSuitableCmov = [](SDValue V) {
54841 if (V.getOpcode() != X86ISD::CMOV || !V.hasOneUse())
54842 return false;
54843 if (!isa<ConstantSDNode>(V.getOperand(0)) ||
54844 !isa<ConstantSDNode>(V.getOperand(1)))
54845 return false;
54846 return isNullConstant(V.getOperand(0)) || isNullConstant(V.getOperand(1)) ||
54847 (V.getConstantOperandAPInt(0).isSignedIntN(32) &&
54848 V.getConstantOperandAPInt(1).isSignedIntN(32));
54849 };
54850
54851 // Match an appropriate CMOV as the first operand of the add.
54852 SDValue Cmov = N->getOperand(0);
54853 SDValue OtherOp = N->getOperand(1);
54854 if (!isSuitableCmov(Cmov))
54855 std::swap(Cmov, OtherOp);
54856 if (!isSuitableCmov(Cmov))
54857 return SDValue();
54858
54859 // Don't remove a load folding opportunity for the add. That would neutralize
54860 // any improvements from removing constant materializations.
54861 if (X86::mayFoldLoad(OtherOp, Subtarget))
54862 return SDValue();
54863
54864 EVT VT = N->getValueType(0);
54865 SDLoc DL(N);
54866 SDValue FalseOp = Cmov.getOperand(0);
54867 SDValue TrueOp = Cmov.getOperand(1);
54868
54869 // We will push the add through the select, but we can potentially do better
54870 // if we know there is another add in the sequence and this is pointer math.
54871 // In that case, we can absorb an add into the trailing memory op and avoid
54872 // a 3-operand LEA which is likely slower than a 2-operand LEA.
54873 // TODO: If target has "slow3OpsLEA", do this even without the trailing memop?
54874 if (OtherOp.getOpcode() == ISD::ADD && OtherOp.hasOneUse() &&
54875 !isa<ConstantSDNode>(OtherOp.getOperand(0)) &&
54876 all_of(N->uses(), [&](SDNode *Use) {
54877 auto *MemNode = dyn_cast<MemSDNode>(Use);
54878 return MemNode && MemNode->getBasePtr().getNode() == N;
54879 })) {
54880 // add (cmov C1, C2), add (X, Y) --> add (cmov (add X, C1), (add X, C2)), Y
54881 // TODO: We are arbitrarily choosing op0 as the 1st piece of the sum, but
54882 // it is possible that choosing op1 might be better.
54883 SDValue X = OtherOp.getOperand(0), Y = OtherOp.getOperand(1);
54884 FalseOp = DAG.getNode(ISD::ADD, DL, VT, X, FalseOp);
54885 TrueOp = DAG.getNode(ISD::ADD, DL, VT, X, TrueOp);
54886 Cmov = DAG.getNode(X86ISD::CMOV, DL, VT, FalseOp, TrueOp,
54887 Cmov.getOperand(2), Cmov.getOperand(3));
54888 return DAG.getNode(ISD::ADD, DL, VT, Cmov, Y);
54889 }
54890
54891 // add (cmov C1, C2), OtherOp --> cmov (add OtherOp, C1), (add OtherOp, C2)
54892 FalseOp = DAG.getNode(ISD::ADD, DL, VT, OtherOp, FalseOp);
54893 TrueOp = DAG.getNode(ISD::ADD, DL, VT, OtherOp, TrueOp);
54894 return DAG.getNode(X86ISD::CMOV, DL, VT, FalseOp, TrueOp, Cmov.getOperand(2),
54895 Cmov.getOperand(3));
54896}
54897
54898static SDValue combineAdd(SDNode *N, SelectionDAG &DAG,
54899 TargetLowering::DAGCombinerInfo &DCI,
54900 const X86Subtarget &Subtarget) {
54901 EVT VT = N->getValueType(0);
54902 SDValue Op0 = N->getOperand(0);
54903 SDValue Op1 = N->getOperand(1);
54904 SDLoc DL(N);
54905
54906 if (SDValue Select = pushAddIntoCmovOfConsts(N, DAG, Subtarget))
54907 return Select;
54908
54909 if (SDValue MAdd = matchPMADDWD(DAG, Op0, Op1, DL, VT, Subtarget))
54910 return MAdd;
54911 if (SDValue MAdd = matchPMADDWD_2(DAG, Op0, Op1, DL, VT, Subtarget))
54912 return MAdd;
54913 if (SDValue MAdd = combineAddOfPMADDWD(DAG, Op0, Op1, DL, VT))
54914 return MAdd;
54915
54916 // Try to synthesize horizontal adds from adds of shuffles.
54917 if (SDValue V = combineToHorizontalAddSub(N, DAG, Subtarget))
54918 return V;
54919
54920 // If vectors of i1 are legal, turn (add (zext (vXi1 X)), Y) into
54921 // (sub Y, (sext (vXi1 X))).
54922 // FIXME: We have the (sub Y, (zext (vXi1 X))) -> (add (sext (vXi1 X)), Y) in
54923 // generic DAG combine without a legal type check, but adding this there
54924 // caused regressions.
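       // (This works because a zero-extended vXi1 value Z is 0/1 per lane while its
       //  sign-extended form S is 0/-1, i.e. Z == -S, so add(Z, Y) == sub(Y, S).)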
54925 if (VT.isVector()) {
54926 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
54927 if (Op0.getOpcode() == ISD::ZERO_EXTEND &&
54928 Op0.getOperand(0).getValueType().getVectorElementType() == MVT::i1 &&
54929 TLI.isTypeLegal(Op0.getOperand(0).getValueType())) {
54930 SDValue SExt = DAG.getNode(ISD::SIGN_EXTEND, DL, VT, Op0.getOperand(0));
54931 return DAG.getNode(ISD::SUB, DL, VT, Op1, SExt);
54932 }
54933
54934 if (Op1.getOpcode() == ISD::ZERO_EXTEND &&
54935 Op1.getOperand(0).getValueType().getVectorElementType() == MVT::i1 &&
54936 TLI.isTypeLegal(Op1.getOperand(0).getValueType())) {
54937 SDValue SExt = DAG.getNode(ISD::SIGN_EXTEND, DL, VT, Op1.getOperand(0));
54938 return DAG.getNode(ISD::SUB, DL, VT, Op0, SExt);
54939 }
54940 }
54941
54942 // Fold ADD(ADC(Y,0,W),X) -> ADC(X,Y,W)
54943 if (Op0.getOpcode() == X86ISD::ADC && Op0->hasOneUse() &&
54944 X86::isZeroNode(Op0.getOperand(1))) {
54945    assert(!Op0->hasAnyUseOfValue(1) && "Overflow bit in use");
54946 return DAG.getNode(X86ISD::ADC, SDLoc(Op0), Op0->getVTList(), Op1,
54947 Op0.getOperand(0), Op0.getOperand(2));
54948 }
54949
54950 return combineAddOrSubToADCOrSBB(N, DAG);
54951}
54952
54953// Try to fold (sub Y, cmovns X, -X) -> (add Y, cmovns -X, X) if the cmov
54954// condition comes from the subtract node that produced -X. This matches the
54955// cmov expansion for absolute value. By swapping the operands we convert abs
54956// to nabs.
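     // Illustrative shape of the pattern:
     //   NegX = sub 0, X              // also defines the flags tested by the cmov
     //   Abs  = cmov X, NegX, S/NS    // abs(X) (operand order may vary)
     //   Res  = sub Y, Abs
     // Swapping the cmov operands turns abs(X) into -abs(X), so the outer sub
     // becomes an add:
     //   Res  = add Y, (cmov NegX, X, S/NS)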
54957static SDValue combineSubABS(SDNode *N, SelectionDAG &DAG) {
54958 SDValue N0 = N->getOperand(0);
54959 SDValue N1 = N->getOperand(1);
54960
54961 if (N1.getOpcode() != X86ISD::CMOV || !N1.hasOneUse())
54962 return SDValue();
54963
54964 X86::CondCode CC = (X86::CondCode)N1.getConstantOperandVal(2);
54965 if (CC != X86::COND_S && CC != X86::COND_NS)
54966 return SDValue();
54967
54968 // Condition should come from a negate operation.
54969 SDValue Cond = N1.getOperand(3);
54970 if (Cond.getOpcode() != X86ISD::SUB || !isNullConstant(Cond.getOperand(0)))
54971 return SDValue();
54972  assert(Cond.getResNo() == 1 && "Unexpected result number");
54973
54974 // Get the X and -X from the negate.
54975 SDValue NegX = Cond.getValue(0);
54976 SDValue X = Cond.getOperand(1);
54977
54978 SDValue FalseOp = N1.getOperand(0);
54979 SDValue TrueOp = N1.getOperand(1);
54980
54981 // Cmov operands should be X and NegX. Order doesn't matter.
54982 if (!(TrueOp == X && FalseOp == NegX) && !(TrueOp == NegX && FalseOp == X))
54983 return SDValue();
54984
54985 // Build a new CMOV with the operands swapped.
54986 SDLoc DL(N);
54987 MVT VT = N->getSimpleValueType(0);
54988 SDValue Cmov = DAG.getNode(X86ISD::CMOV, DL, VT, TrueOp, FalseOp,
54989 N1.getOperand(2), Cond);
54990 // Convert sub to add.
54991 return DAG.getNode(ISD::ADD, DL, VT, N0, Cmov);
54992}
54993
54994static SDValue combineSub(SDNode *N, SelectionDAG &DAG,
54995 TargetLowering::DAGCombinerInfo &DCI,
54996 const X86Subtarget &Subtarget) {
54997 SDValue Op0 = N->getOperand(0);
54998 SDValue Op1 = N->getOperand(1);
54999
55000 // TODO: Add NoOpaque handling to isConstantIntBuildVectorOrConstantInt.
55001 auto IsNonOpaqueConstant = [&](SDValue Op) {
55002 if (SDNode *C = DAG.isConstantIntBuildVectorOrConstantInt(Op)) {
55003 if (auto *Cst = dyn_cast<ConstantSDNode>(C))
55004 return !Cst->isOpaque();
55005 return true;
55006 }
55007 return false;
55008 };
55009
55010 // X86 can't encode an immediate LHS of a sub. See if we can push the
55011  // negation into a preceding instruction. If the RHS of the sub is an XOR with
55012 // one use and a constant, invert the immediate, saving one register.
55013 // sub(C1, xor(X, C2)) -> add(xor(X, ~C2), C1+1)
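       // (This holds because -(X ^ C2) == ~(X ^ C2) + 1 == (X ^ ~C2) + 1, so
       //  C1 - (X ^ C2) == (X ^ ~C2) + (C1 + 1).)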
55014 if (Op1.getOpcode() == ISD::XOR && IsNonOpaqueConstant(Op0) &&
55015 IsNonOpaqueConstant(Op1.getOperand(1)) && Op1->hasOneUse()) {
55016 SDLoc DL(N);
55017 EVT VT = Op0.getValueType();
55018 SDValue NewXor = DAG.getNode(ISD::XOR, SDLoc(Op1), VT, Op1.getOperand(0),
55019 DAG.getNOT(SDLoc(Op1), Op1.getOperand(1), VT));
55020 SDValue NewAdd =
55021 DAG.getNode(ISD::ADD, DL, VT, Op0, DAG.getConstant(1, DL, VT));
55022 return DAG.getNode(ISD::ADD, DL, VT, NewXor, NewAdd);
55023 }
55024
55025 if (SDValue V = combineSubABS(N, DAG))
55026 return V;
55027
55028 // Try to synthesize horizontal subs from subs of shuffles.
55029 if (SDValue V = combineToHorizontalAddSub(N, DAG, Subtarget))
55030 return V;
55031
55032 // Fold SUB(X,ADC(Y,0,W)) -> SBB(X,Y,W)
55033 if (Op1.getOpcode() == X86ISD::ADC && Op1->hasOneUse() &&
55034 X86::isZeroNode(Op1.getOperand(1))) {
55035    assert(!Op1->hasAnyUseOfValue(1) && "Overflow bit in use");
55036 return DAG.getNode(X86ISD::SBB, SDLoc(Op1), Op1->getVTList(), Op0,
55037 Op1.getOperand(0), Op1.getOperand(2));
55038 }
55039
55040 // Fold SUB(X,SBB(Y,Z,W)) -> SUB(ADC(X,Z,W),Y)
55041 // Don't fold to ADC(0,0,W)/SETCC_CARRY pattern which will prevent more folds.
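       // (SBB(Y,Z,W) computes Y - Z - W with borrow W, so
       //  X - (Y - Z - W) == (X + Z + W) - Y, i.e. SUB(ADC(X,Z,W), Y).)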
55042 if (Op1.getOpcode() == X86ISD::SBB && Op1->hasOneUse() &&
55043 !(X86::isZeroNode(Op0) && X86::isZeroNode(Op1.getOperand(1)))) {
55044    assert(!Op1->hasAnyUseOfValue(1) && "Overflow bit in use");
55045 SDValue ADC = DAG.getNode(X86ISD::ADC, SDLoc(Op1), Op1->getVTList(), Op0,
55046 Op1.getOperand(1), Op1.getOperand(2));
55047 return DAG.getNode(ISD::SUB, SDLoc(N), Op0.getValueType(), ADC.getValue(0),
55048 Op1.getOperand(0));
55049 }
55050
55051 return combineAddOrSubToADCOrSBB(N, DAG);
55052}
55053
55054static SDValue combineVectorCompare(SDNode *N, SelectionDAG &DAG,
55055 const X86Subtarget &Subtarget) {
55056 MVT VT = N->getSimpleValueType(0);
55057 SDLoc DL(N);
55058
55059 if (N->getOperand(0) == N->getOperand(1)) {
55060 if (N->getOpcode() == X86ISD::PCMPEQ)
55061 return DAG.getConstant(-1, DL, VT);
55062 if (N->getOpcode() == X86ISD::PCMPGT)
55063 return DAG.getConstant(0, DL, VT);
55064 }
55065
55066 return SDValue();
55067}
55068
55069/// Helper that combines an array of subvector ops as if they were the operands
55070/// of a ISD::CONCAT_VECTORS node, but may have come from another source (e.g.
55071/// ISD::INSERT_SUBVECTOR). The ops are assumed to be of the same type.
55072static SDValue combineConcatVectorOps(const SDLoc &DL, MVT VT,
55073 ArrayRef<SDValue> Ops, SelectionDAG &DAG,
55074 TargetLowering::DAGCombinerInfo &DCI,
55075 const X86Subtarget &Subtarget) {
55076  assert(Subtarget.hasAVX() && "AVX assumed for concat_vectors");
55077 unsigned EltSizeInBits = VT.getScalarSizeInBits();
55078
55079 if (llvm::all_of(Ops, [](SDValue Op) { return Op.isUndef(); }))
55080 return DAG.getUNDEF(VT);
55081
55082 if (llvm::all_of(Ops, [](SDValue Op) {
55083 return ISD::isBuildVectorAllZeros(Op.getNode());
55084 }))
55085 return getZeroVector(VT, Subtarget, DAG, DL);
55086
55087 SDValue Op0 = Ops[0];
55088 bool IsSplat = llvm::all_equal(Ops);
55089
55090 // Repeated subvectors.
55091 if (IsSplat &&
55092 (VT.is256BitVector() || (VT.is512BitVector() && Subtarget.hasAVX512()))) {
55093 // If this broadcast is inserted into both halves, use a larger broadcast.
55094 if (Op0.getOpcode() == X86ISD::VBROADCAST)
55095 return DAG.getNode(Op0.getOpcode(), DL, VT, Op0.getOperand(0));
55096
55097    // If this simple subvector load or scalar/subvector broadcast_load is inserted
55098 // into both halves, use a larger broadcast_load. Update other uses to use
55099 // an extracted subvector.
55100 if (ISD::isNormalLoad(Op0.getNode()) ||
55101 Op0.getOpcode() == X86ISD::VBROADCAST_LOAD ||
55102 Op0.getOpcode() == X86ISD::SUBV_BROADCAST_LOAD) {
55103 auto *Mem = cast<MemSDNode>(Op0);
55104 unsigned Opc = Op0.getOpcode() == X86ISD::VBROADCAST_LOAD
55105 ? X86ISD::VBROADCAST_LOAD
55106 : X86ISD::SUBV_BROADCAST_LOAD;
55107 if (SDValue BcastLd =
55108 getBROADCAST_LOAD(Opc, DL, VT, Mem->getMemoryVT(), Mem, 0, DAG)) {
55109 SDValue BcastSrc =
55110 extractSubVector(BcastLd, 0, DAG, DL, Op0.getValueSizeInBits());
55111 DAG.ReplaceAllUsesOfValueWith(Op0, BcastSrc);
55112 return BcastLd;
55113 }
55114 }
55115
55116 // concat_vectors(movddup(x),movddup(x)) -> broadcast(x)
55117 if (Op0.getOpcode() == X86ISD::MOVDDUP && VT == MVT::v4f64 &&
55118 (Subtarget.hasAVX2() ||
55119 X86::mayFoldLoadIntoBroadcastFromMem(Op0.getOperand(0),
55120 VT.getScalarType(), Subtarget)))
55121 return DAG.getNode(X86ISD::VBROADCAST, DL, VT,
55122 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f64,
55123 Op0.getOperand(0),
55124 DAG.getIntPtrConstant(0, DL)));
55125
55126 // concat_vectors(scalar_to_vector(x),scalar_to_vector(x)) -> broadcast(x)
55127 if (Op0.getOpcode() == ISD::SCALAR_TO_VECTOR &&
55128 (Subtarget.hasAVX2() ||
55129 (EltSizeInBits >= 32 &&
55130 X86::mayFoldLoad(Op0.getOperand(0), Subtarget))) &&
55131 Op0.getOperand(0).getValueType() == VT.getScalarType())
55132 return DAG.getNode(X86ISD::VBROADCAST, DL, VT, Op0.getOperand(0));
55133
55134 // concat_vectors(extract_subvector(broadcast(x)),
55135 // extract_subvector(broadcast(x))) -> broadcast(x)
55136 if (Op0.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
55137 Op0.getOperand(0).getValueType() == VT) {
55138 if (Op0.getOperand(0).getOpcode() == X86ISD::VBROADCAST ||
55139 Op0.getOperand(0).getOpcode() == X86ISD::VBROADCAST_LOAD)
55140 return Op0.getOperand(0);
55141 }
55142 }
55143
55144 // concat(extract_subvector(v0,c0), extract_subvector(v1,c1)) -> vperm2x128.
55145  // Only handle concatenation of subvector high halves, which vperm2x128 is best at.
55146 // TODO: This should go in combineX86ShufflesRecursively eventually.
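       // For example (illustrative, v8f32 sources):
       //   concat(extract_subvector(v0, 4), extract_subvector(v1, 4))
       //   -> vperm2x128 v0, v1, 0x31    // imm 0x31 selects the high 128-bit half
       //                                 // of each source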
55147 if (VT.is256BitVector() && Ops.size() == 2) {
55148 SDValue Src0 = peekThroughBitcasts(Ops[0]);
55149 SDValue Src1 = peekThroughBitcasts(Ops[1]);
55150 if (Src0.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
55151 Src1.getOpcode() == ISD::EXTRACT_SUBVECTOR) {
55152 EVT SrcVT0 = Src0.getOperand(0).getValueType();
55153 EVT SrcVT1 = Src1.getOperand(0).getValueType();
55154 unsigned NumSrcElts0 = SrcVT0.getVectorNumElements();
55155 unsigned NumSrcElts1 = SrcVT1.getVectorNumElements();
55156 if (SrcVT0.is256BitVector() && SrcVT1.is256BitVector() &&
55157 Src0.getConstantOperandAPInt(1) == (NumSrcElts0 / 2) &&
55158 Src1.getConstantOperandAPInt(1) == (NumSrcElts1 / 2)) {
55159 return DAG.getNode(X86ISD::VPERM2X128, DL, VT,
55160 DAG.getBitcast(VT, Src0.getOperand(0)),
55161 DAG.getBitcast(VT, Src1.getOperand(0)),
55162 DAG.getTargetConstant(0x31, DL, MVT::i8));
55163 }
55164 }
55165 }
55166
55167 // Repeated opcode.
55168 // TODO - combineX86ShufflesRecursively should handle shuffle concatenation
55169 // but it currently struggles with different vector widths.
55170 if (llvm::all_of(Ops, [Op0](SDValue Op) {
55171 return Op.getOpcode() == Op0.getOpcode();
55172 })) {
55173 auto ConcatSubOperand = [&](MVT VT, ArrayRef<SDValue> SubOps, unsigned I) {
55174 SmallVector<SDValue> Subs;
55175 for (SDValue SubOp : SubOps)
55176 Subs.push_back(SubOp.getOperand(I));
55177 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Subs);
55178 };
55179 auto IsConcatFree = [](MVT VT, ArrayRef<SDValue> SubOps, unsigned Op) {
55180 for (unsigned I = 0, E = SubOps.size(); I != E; ++I) {
55181 SDValue Sub = SubOps[I].getOperand(Op);
55182 unsigned NumSubElts = Sub.getValueType().getVectorNumElements();
55183 if (Sub.getOpcode() != ISD::EXTRACT_SUBVECTOR ||
55184 Sub.getOperand(0).getValueType() != VT ||
55185 Sub.getConstantOperandAPInt(1) != (I * NumSubElts))
55186 return false;
55187 }
55188 return true;
55189 };
55190
55191 unsigned NumOps = Ops.size();
55192 switch (Op0.getOpcode()) {
55193 case X86ISD::VBROADCAST: {
55194 if (!IsSplat && llvm::all_of(Ops, [](SDValue Op) {
55195 return Op.getOperand(0).getValueType().is128BitVector();
55196 })) {
55197 if (VT == MVT::v4f64 || VT == MVT::v4i64)
55198 return DAG.getNode(X86ISD::UNPCKL, DL, VT,
55199 ConcatSubOperand(VT, Ops, 0),
55200 ConcatSubOperand(VT, Ops, 0));
55201 // TODO: Add pseudo v8i32 PSHUFD handling to AVX1Only targets.
55202 if (VT == MVT::v8f32 || (VT == MVT::v8i32 && Subtarget.hasInt256()))
55203 return DAG.getNode(VT == MVT::v8f32 ? X86ISD::VPERMILPI
55204 : X86ISD::PSHUFD,
55205 DL, VT, ConcatSubOperand(VT, Ops, 0),
55206 getV4X86ShuffleImm8ForMask({0, 0, 0, 0}, DL, DAG));
55207 }
55208 break;
55209 }
55210 case X86ISD::MOVDDUP:
55211 case X86ISD::MOVSHDUP:
55212 case X86ISD::MOVSLDUP: {
55213 if (!IsSplat)
55214 return DAG.getNode(Op0.getOpcode(), DL, VT,
55215 ConcatSubOperand(VT, Ops, 0));
55216 break;
55217 }
55218 case X86ISD::SHUFP: {
55219 // Add SHUFPD support if/when necessary.
55220 if (!IsSplat && VT.getScalarType() == MVT::f32 &&
55221 llvm::all_of(Ops, [Op0](SDValue Op) {
55222 return Op.getOperand(2) == Op0.getOperand(2);
55223 })) {
55224 return DAG.getNode(Op0.getOpcode(), DL, VT,
55225 ConcatSubOperand(VT, Ops, 0),
55226 ConcatSubOperand(VT, Ops, 1), Op0.getOperand(2));
55227 }
55228 break;
55229 }
55230 case X86ISD::PSHUFHW:
55231 case X86ISD::PSHUFLW:
55232 case X86ISD::PSHUFD:
55233 if (!IsSplat && NumOps == 2 && VT.is256BitVector() &&
55234 Subtarget.hasInt256() && Op0.getOperand(1) == Ops[1].getOperand(1)) {
55235 return DAG.getNode(Op0.getOpcode(), DL, VT,
55236 ConcatSubOperand(VT, Ops, 0), Op0.getOperand(1));
55237 }
55238 [[fallthrough]];
55239 case X86ISD::VPERMILPI:
55240 if (!IsSplat && NumOps == 2 && (VT == MVT::v8f32 || VT == MVT::v8i32) &&
55241 Op0.getOperand(1) == Ops[1].getOperand(1)) {
55242 SDValue Res = DAG.getBitcast(MVT::v8f32, ConcatSubOperand(VT, Ops, 0));
55243 Res = DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v8f32, Res,
55244 Op0.getOperand(1));
55245 return DAG.getBitcast(VT, Res);
55246 }
55247 if (!IsSplat && NumOps == 2 && VT == MVT::v4f64) {
55248 uint64_t Idx0 = Ops[0].getConstantOperandVal(1);
55249 uint64_t Idx1 = Ops[1].getConstantOperandVal(1);
55250 uint64_t Idx = ((Idx1 & 3) << 2) | (Idx0 & 3);
55251 return DAG.getNode(Op0.getOpcode(), DL, VT,
55252 ConcatSubOperand(VT, Ops, 0),
55253 DAG.getTargetConstant(Idx, DL, MVT::i8));
55254 }
55255 break;
55256 case X86ISD::PSHUFB:
55257 if (!IsSplat && ((VT.is256BitVector() && Subtarget.hasInt256()) ||
55258 (VT.is512BitVector() && Subtarget.useBWIRegs()))) {
55259 return DAG.getNode(Op0.getOpcode(), DL, VT,
55260 ConcatSubOperand(VT, Ops, 0),
55261 ConcatSubOperand(VT, Ops, 1));
55262 }
55263 break;
55264 case X86ISD::VPERMV3:
55265 if (!IsSplat && NumOps == 2 && VT.is512BitVector()) {
55266 MVT OpVT = Op0.getSimpleValueType();
55267 int NumSrcElts = OpVT.getVectorNumElements();
55268 SmallVector<int, 64> ConcatMask;
55269 for (unsigned i = 0; i != NumOps; ++i) {
55270 SmallVector<int, 64> SubMask;
55271 SmallVector<SDValue, 2> SubOps;
55272 if (!getTargetShuffleMask(Ops[i].getNode(), OpVT, false, SubOps,
55273 SubMask))
55274 break;
55275 for (int M : SubMask) {
55276 if (0 <= M) {
55277 M += M < NumSrcElts ? 0 : NumSrcElts;
55278 M += i * NumSrcElts;
55279 }
55280 ConcatMask.push_back(M);
55281 }
55282 }
55283 if (ConcatMask.size() == (NumOps * NumSrcElts)) {
55284 SDValue Src0 = concatSubVectors(Ops[0].getOperand(0),
55285 Ops[1].getOperand(0), DAG, DL);
55286 SDValue Src1 = concatSubVectors(Ops[0].getOperand(2),
55287 Ops[1].getOperand(2), DAG, DL);
55288 MVT IntMaskSVT = MVT::getIntegerVT(VT.getScalarSizeInBits());
55289 MVT IntMaskVT = MVT::getVectorVT(IntMaskSVT, NumOps * NumSrcElts);
55290 SDValue Mask = getConstVector(ConcatMask, IntMaskVT, DAG, DL, true);
55291 return DAG.getNode(X86ISD::VPERMV3, DL, VT, Src0, Mask, Src1);
55292 }
55293 }
55294 break;
55295 case X86ISD::VSHLI:
55296 case X86ISD::VSRLI:
55297 // Special case: SHL/SRL AVX1 V4i64 by 32-bits can lower as a shuffle.
55298 // TODO: Move this to LowerShiftByScalarImmediate?
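           // (Shifting a 64-bit lane left by 32 moves its low 32-bit half into the
           //  high half and zeroes the low half; viewed as v8i32 that is the shuffle
           //  {Z,0, Z,2, Z,4, Z,6} against a zero vector Z. SRL by 32 is the mirror
           //  image, {1,Z, 3,Z, 5,Z, 7,Z}.)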
55299 if (VT == MVT::v4i64 && !Subtarget.hasInt256() &&
55300 llvm::all_of(Ops, [](SDValue Op) {
55301 return Op.getConstantOperandAPInt(1) == 32;
55302 })) {
55303 SDValue Res = DAG.getBitcast(MVT::v8i32, ConcatSubOperand(VT, Ops, 0));
55304 SDValue Zero = getZeroVector(MVT::v8i32, Subtarget, DAG, DL);
55305 if (Op0.getOpcode() == X86ISD::VSHLI) {
55306 Res = DAG.getVectorShuffle(MVT::v8i32, DL, Res, Zero,
55307 {8, 0, 8, 2, 8, 4, 8, 6});
55308 } else {
55309 Res = DAG.getVectorShuffle(MVT::v8i32, DL, Res, Zero,
55310 {1, 8, 3, 8, 5, 8, 7, 8});
55311 }
55312 return DAG.getBitcast(VT, Res);
55313 }
55314 [[fallthrough]];
55315 case X86ISD::VSRAI:
55316 case X86ISD::VSHL:
55317 case X86ISD::VSRL:
55318 case X86ISD::VSRA:
55319 if (((VT.is256BitVector() && Subtarget.hasInt256()) ||
55320 (VT.is512BitVector() && Subtarget.useAVX512Regs() &&
55321 (EltSizeInBits >= 32 || Subtarget.useBWIRegs()))) &&
55322 llvm::all_of(Ops, [Op0](SDValue Op) {
55323 return Op0.getOperand(1) == Op.getOperand(1);
55324 })) {
55325 return DAG.getNode(Op0.getOpcode(), DL, VT,
55326 ConcatSubOperand(VT, Ops, 0), Op0.getOperand(1));
55327 }
55328 break;
55329 case X86ISD::VPERMI:
55330 case X86ISD::VROTLI:
55331 case X86ISD::VROTRI:
55332 if (VT.is512BitVector() && Subtarget.useAVX512Regs() &&
55333 llvm::all_of(Ops, [Op0](SDValue Op) {
55334 return Op0.getOperand(1) == Op.getOperand(1);
55335 })) {
55336 return DAG.getNode(Op0.getOpcode(), DL, VT,
55337 ConcatSubOperand(VT, Ops, 0), Op0.getOperand(1));
55338 }
55339 break;
55340 case ISD::AND:
55341 case ISD::OR:
55342 case ISD::XOR:
55343 case X86ISD::ANDNP:
55344 // TODO: Add 256-bit support.
55345 if (!IsSplat && VT.is512BitVector()) {
55346 MVT SrcVT = Op0.getOperand(0).getSimpleValueType();
55347 SrcVT = MVT::getVectorVT(SrcVT.getScalarType(),
55348 NumOps * SrcVT.getVectorNumElements());
55349 return DAG.getNode(Op0.getOpcode(), DL, VT,
55350 ConcatSubOperand(SrcVT, Ops, 0),
55351 ConcatSubOperand(SrcVT, Ops, 1));
55352 }
55353 break;
55354 case X86ISD::GF2P8AFFINEQB:
55355 if (!IsSplat &&
55356 (VT.is256BitVector() ||
55357 (VT.is512BitVector() && Subtarget.useAVX512Regs())) &&
55358 llvm::all_of(Ops, [Op0](SDValue Op) {
55359 return Op0.getOperand(2) == Op.getOperand(2);
55360 })) {
55361 return DAG.getNode(Op0.getOpcode(), DL, VT,
55362 ConcatSubOperand(VT, Ops, 0),
55363 ConcatSubOperand(VT, Ops, 1), Op0.getOperand(2));
55364 }
55365 break;
55366 case X86ISD::HADD:
55367 case X86ISD::HSUB:
55368 case X86ISD::FHADD:
55369 case X86ISD::FHSUB:
55370 case X86ISD::PACKSS:
55371 case X86ISD::PACKUS:
55372 if (!IsSplat && VT.is256BitVector() &&
55373 (VT.isFloatingPoint() || Subtarget.hasInt256())) {
55374 MVT SrcVT = Op0.getOperand(0).getSimpleValueType();
55375 SrcVT = MVT::getVectorVT(SrcVT.getScalarType(),
55376 NumOps * SrcVT.getVectorNumElements());
55377 return DAG.getNode(Op0.getOpcode(), DL, VT,
55378 ConcatSubOperand(SrcVT, Ops, 0),
55379 ConcatSubOperand(SrcVT, Ops, 1));
55380 }
55381 break;
55382 case X86ISD::PALIGNR:
55383 if (!IsSplat &&
55384 ((VT.is256BitVector() && Subtarget.hasInt256()) ||
55385 (VT.is512BitVector() && Subtarget.useBWIRegs())) &&
55386 llvm::all_of(Ops, [Op0](SDValue Op) {
55387 return Op0.getOperand(2) == Op.getOperand(2);
55388 })) {
55389 return DAG.getNode(Op0.getOpcode(), DL, VT,
55390 ConcatSubOperand(VT, Ops, 0),
55391 ConcatSubOperand(VT, Ops, 1), Op0.getOperand(2));
55392 }
55393 break;
55394 case ISD::VSELECT:
55395 case X86ISD::BLENDV:
55396 if (!IsSplat && VT.is256BitVector() && Ops.size() == 2 &&
55397 (VT.getScalarSizeInBits() >= 32 || Subtarget.hasInt256()) &&
55398 IsConcatFree(VT, Ops, 1) && IsConcatFree(VT, Ops, 2)) {
55399 EVT SelVT = Ops[0].getOperand(0).getValueType();
55400 SelVT = SelVT.getDoubleNumVectorElementsVT(*DAG.getContext());
55401 if (DAG.getTargetLoweringInfo().isTypeLegal(SelVT))
55402 return DAG.getNode(Op0.getOpcode(), DL, VT,
55403 ConcatSubOperand(SelVT.getSimpleVT(), Ops, 0),
55404 ConcatSubOperand(VT, Ops, 1),
55405 ConcatSubOperand(VT, Ops, 2));
55406 }
55407 break;
55408 }
55409 }
55410
55411 // Fold subvector loads into one.
55412 // If needed, look through bitcasts to get to the load.
55413 if (auto *FirstLd = dyn_cast<LoadSDNode>(peekThroughBitcasts(Op0))) {
55414 unsigned Fast;
55415 const X86TargetLowering *TLI = Subtarget.getTargetLowering();
55416 if (TLI->allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), VT,
55417 *FirstLd->getMemOperand(), &Fast) &&
55418 Fast) {
55419 if (SDValue Ld =
55420 EltsFromConsecutiveLoads(VT, Ops, DL, DAG, Subtarget, false))
55421 return Ld;
55422 }
55423 }
55424
55425 // Attempt to fold target constant loads.
55426 if (all_of(Ops, [](SDValue Op) { return getTargetConstantFromNode(Op); })) {
55427 SmallVector<APInt> EltBits;
55428 APInt UndefElts = APInt::getNullValue(VT.getVectorNumElements());
55429 for (unsigned I = 0, E = Ops.size(); I != E; ++I) {
55430 APInt OpUndefElts;
55431 SmallVector<APInt> OpEltBits;
55432 if (!getTargetConstantBitsFromNode(Ops[I], EltSizeInBits, OpUndefElts,
55433 OpEltBits, true, false))
55434 break;
55435 EltBits.append(OpEltBits);
55436 UndefElts.insertBits(OpUndefElts, I * OpUndefElts.getBitWidth());
55437 }
55438 if (EltBits.size() == VT.getVectorNumElements())
55439 return getConstVector(EltBits, UndefElts, VT, DAG, DL);
55440 }
55441
55442 return SDValue();
55443}
55444
55445static SDValue combineCONCAT_VECTORS(SDNode *N, SelectionDAG &DAG,
55446 TargetLowering::DAGCombinerInfo &DCI,
55447 const X86Subtarget &Subtarget) {
55448 EVT VT = N->getValueType(0);
55449 EVT SrcVT = N->getOperand(0).getValueType();
55450 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
55451
55452 // Don't do anything for i1 vectors.
55453 if (VT.getVectorElementType() == MVT::i1)
55454 return SDValue();
55455
55456 if (Subtarget.hasAVX() && TLI.isTypeLegal(VT) && TLI.isTypeLegal(SrcVT)) {
55457 SmallVector<SDValue, 4> Ops(N->op_begin(), N->op_end());
55458 if (SDValue R = combineConcatVectorOps(SDLoc(N), VT.getSimpleVT(), Ops, DAG,
55459 DCI, Subtarget))
55460 return R;
55461 }
55462
55463 return SDValue();
55464}
55465
55466static SDValue combineINSERT_SUBVECTOR(SDNode *N, SelectionDAG &DAG,
55467 TargetLowering::DAGCombinerInfo &DCI,
55468 const X86Subtarget &Subtarget) {
55469 if (DCI.isBeforeLegalizeOps())
55470 return SDValue();
55471
55472 MVT OpVT = N->getSimpleValueType(0);
55473
55474 bool IsI1Vector = OpVT.getVectorElementType() == MVT::i1;
55475
55476 SDLoc dl(N);
55477 SDValue Vec = N->getOperand(0);
55478 SDValue SubVec = N->getOperand(1);
55479
55480 uint64_t IdxVal = N->getConstantOperandVal(2);
55481 MVT SubVecVT = SubVec.getSimpleValueType();
55482
55483 if (Vec.isUndef() && SubVec.isUndef())
55484 return DAG.getUNDEF(OpVT);
55485
55486 // Inserting undefs/zeros into zeros/undefs is a zero vector.
55487 if ((Vec.isUndef() || ISD::isBuildVectorAllZeros(Vec.getNode())) &&
55488 (SubVec.isUndef() || ISD::isBuildVectorAllZeros(SubVec.getNode())))
55489 return getZeroVector(OpVT, Subtarget, DAG, dl);
55490
55491 if (ISD::isBuildVectorAllZeros(Vec.getNode())) {
55492 // If we're inserting into a zero vector and then into a larger zero vector,
55493 // just insert into the larger zero vector directly.
55494 if (SubVec.getOpcode() == ISD::INSERT_SUBVECTOR &&
55495 ISD::isBuildVectorAllZeros(SubVec.getOperand(0).getNode())) {
55496 uint64_t Idx2Val = SubVec.getConstantOperandVal(2);
55497 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT,
55498 getZeroVector(OpVT, Subtarget, DAG, dl),
55499 SubVec.getOperand(1),
55500 DAG.getIntPtrConstant(IdxVal + Idx2Val, dl));
55501 }
55502
55503    // If we're inserting into a zero vector and our input was extracted from an
55504    // insert into a zero vector of the same type, and the extraction was at
55505    // least as large as the original insertion, just insert the original
55506    // subvector into a zero vector.
55507 if (SubVec.getOpcode() == ISD::EXTRACT_SUBVECTOR && IdxVal == 0 &&
55508 isNullConstant(SubVec.getOperand(1)) &&
55509 SubVec.getOperand(0).getOpcode() == ISD::INSERT_SUBVECTOR) {
55510 SDValue Ins = SubVec.getOperand(0);
55511 if (isNullConstant(Ins.getOperand(2)) &&
55512 ISD::isBuildVectorAllZeros(Ins.getOperand(0).getNode()) &&
55513 Ins.getOperand(1).getValueSizeInBits().getFixedValue() <=
55514 SubVecVT.getFixedSizeInBits())
55515 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT,
55516 getZeroVector(OpVT, Subtarget, DAG, dl),
55517 Ins.getOperand(1), N->getOperand(2));
55518 }
55519 }
55520
55521 // Stop here if this is an i1 vector.
55522 if (IsI1Vector)
55523 return SDValue();
55524
55525 // If this is an insert of an extract, combine to a shuffle. Don't do this
55526 // if the insert or extract can be represented with a subregister operation.
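       // For example (illustrative, v8i32): inserting extract_subvector(X, 4) at
       // index 0 of Vec becomes shuffle(Vec, X, {12,13,14,15, 4,5,6,7}), i.e. the
       // inserted lanes come from X's upper half and the rest keep Vec's identity.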
55527 if (SubVec.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
55528 SubVec.getOperand(0).getSimpleValueType() == OpVT &&
55529 (IdxVal != 0 ||
55530 !(Vec.isUndef() || ISD::isBuildVectorAllZeros(Vec.getNode())))) {
55531 int ExtIdxVal = SubVec.getConstantOperandVal(1);
55532 if (ExtIdxVal != 0) {
55533 int VecNumElts = OpVT.getVectorNumElements();
55534 int SubVecNumElts = SubVecVT.getVectorNumElements();
55535 SmallVector<int, 64> Mask(VecNumElts);
55536 // First create an identity shuffle mask.
55537 for (int i = 0; i != VecNumElts; ++i)
55538 Mask[i] = i;
55539 // Now insert the extracted portion.
55540 for (int i = 0; i != SubVecNumElts; ++i)
55541 Mask[i + IdxVal] = i + ExtIdxVal + VecNumElts;
55542
55543 return DAG.getVectorShuffle(OpVT, dl, Vec, SubVec.getOperand(0), Mask);
55544 }
55545 }
55546
55547 // Match concat_vector style patterns.
55548 SmallVector<SDValue, 2> SubVectorOps;
55549 if (collectConcatOps(N, SubVectorOps, DAG)) {
55550 if (SDValue Fold =
55551 combineConcatVectorOps(dl, OpVT, SubVectorOps, DAG, DCI, Subtarget))
55552 return Fold;
55553
55554 // If we're inserting all zeros into the upper half, change this to
55555 // a concat with zero. We will match this to a move
55556 // with implicit upper bit zeroing during isel.
55557 // We do this here because we don't want combineConcatVectorOps to
55558 // create INSERT_SUBVECTOR from CONCAT_VECTORS.
55559 if (SubVectorOps.size() == 2 &&
55560 ISD::isBuildVectorAllZeros(SubVectorOps[1].getNode()))
55561 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT,
55562 getZeroVector(OpVT, Subtarget, DAG, dl),
55563 SubVectorOps[0], DAG.getIntPtrConstant(0, dl));
55564 }
55565
55566 // If this is a broadcast insert into an upper undef, use a larger broadcast.
55567 if (Vec.isUndef() && IdxVal != 0 && SubVec.getOpcode() == X86ISD::VBROADCAST)
55568 return DAG.getNode(X86ISD::VBROADCAST, dl, OpVT, SubVec.getOperand(0));
55569
55570 // If this is a broadcast load inserted into an upper undef, use a larger
55571 // broadcast load.
55572 if (Vec.isUndef() && IdxVal != 0 && SubVec.hasOneUse() &&
55573 SubVec.getOpcode() == X86ISD::VBROADCAST_LOAD) {
55574 auto *MemIntr = cast<MemIntrinsicSDNode>(SubVec);
55575 SDVTList Tys = DAG.getVTList(OpVT, MVT::Other);
55576 SDValue Ops[] = { MemIntr->getChain(), MemIntr->getBasePtr() };
55577 SDValue BcastLd =
55578 DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, dl, Tys, Ops,
55579 MemIntr->getMemoryVT(),
55580 MemIntr->getMemOperand());
55581 DAG.ReplaceAllUsesOfValueWith(SDValue(MemIntr, 1), BcastLd.getValue(1));
55582 return BcastLd;
55583 }
55584
55585 // If we're splatting the lower half subvector of a full vector load into the
55586 // upper half, attempt to create a subvector broadcast.
55587 if (IdxVal == (OpVT.getVectorNumElements() / 2) && SubVec.hasOneUse() &&
55588 Vec.getValueSizeInBits() == (2 * SubVec.getValueSizeInBits())) {
55589 auto *VecLd = dyn_cast<LoadSDNode>(Vec);
55590 auto *SubLd = dyn_cast<LoadSDNode>(SubVec);
55591 if (VecLd && SubLd &&
55592 DAG.areNonVolatileConsecutiveLoads(SubLd, VecLd,
55593 SubVec.getValueSizeInBits() / 8, 0))
55594 return getBROADCAST_LOAD(X86ISD::SUBV_BROADCAST_LOAD, dl, OpVT, SubVecVT,
55595 SubLd, 0, DAG);
55596 }
55597
55598 return SDValue();
55599}
55600
55601/// If we are extracting a subvector of a vector select and the select condition
55602/// is composed of concatenated vectors, try to narrow the select width. This
55603/// is a common pattern for AVX1 integer code because 256-bit selects may be
55604/// legal, but there is almost no integer math/logic available for 256-bit.
55605/// This function should only be called with legal types (otherwise, the calls
55606/// to get simple value types will assert).
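     /// Illustrative example: for a 256-bit select whose condition is a concat,
     ///   extract_subvector (vselect (concat C0, C1), T, F), 4
     /// becomes a 128-bit select on the matching halves:
     ///   bitcast (vselect C1, extract_subvector(T, 4), extract_subvector(F, 4))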
55607static SDValue narrowExtractedVectorSelect(SDNode *Ext, SelectionDAG &DAG) {
55608 SDValue Sel = Ext->getOperand(0);
55609 SmallVector<SDValue, 4> CatOps;
55610 if (Sel.getOpcode() != ISD::VSELECT ||
55611 !collectConcatOps(Sel.getOperand(0).getNode(), CatOps, DAG))
55612 return SDValue();
55613
55614 // Note: We assume simple value types because this should only be called with
55615 // legal operations/types.
55616 // TODO: This can be extended to handle extraction to 256-bits.
55617 MVT VT = Ext->getSimpleValueType(0);
55618 if (!VT.is128BitVector())
55619 return SDValue();
55620
55621 MVT SelCondVT = Sel.getOperand(0).getSimpleValueType();
55622 if (!SelCondVT.is256BitVector() && !SelCondVT.is512BitVector())
55623 return SDValue();
55624
55625 MVT WideVT = Ext->getOperand(0).getSimpleValueType();
55626 MVT SelVT = Sel.getSimpleValueType();
55627  assert((SelVT.is256BitVector() || SelVT.is512BitVector()) &&
55628         "Unexpected vector type with legal operations");
55629
55630 unsigned SelElts = SelVT.getVectorNumElements();
55631 unsigned CastedElts = WideVT.getVectorNumElements();
55632 unsigned ExtIdx = Ext->getConstantOperandVal(1);
55633 if (SelElts % CastedElts == 0) {
55634 // The select has the same or more (narrower) elements than the extract
55635 // operand. The extraction index gets scaled by that factor.
55636 ExtIdx *= (SelElts / CastedElts);
55637 } else if (CastedElts % SelElts == 0) {
55638 // The select has less (wider) elements than the extract operand. Make sure
55639 // that the extraction index can be divided evenly.
55640 unsigned IndexDivisor = CastedElts / SelElts;
55641 if (ExtIdx % IndexDivisor != 0)
55642 return SDValue();
55643 ExtIdx /= IndexDivisor;
55644 } else {
55645    llvm_unreachable("Element count of simple vector types are not divisible?");
55646 }
55647
55648 unsigned NarrowingFactor = WideVT.getSizeInBits() / VT.getSizeInBits();
55649 unsigned NarrowElts = SelElts / NarrowingFactor;
55650 MVT NarrowSelVT = MVT::getVectorVT(SelVT.getVectorElementType(), NarrowElts);
55651 SDLoc DL(Ext);
55652 SDValue ExtCond = extract128BitVector(Sel.getOperand(0), ExtIdx, DAG, DL);
55653 SDValue ExtT = extract128BitVector(Sel.getOperand(1), ExtIdx, DAG, DL);
55654 SDValue ExtF = extract128BitVector(Sel.getOperand(2), ExtIdx, DAG, DL);
55655 SDValue NarrowSel = DAG.getSelect(DL, NarrowSelVT, ExtCond, ExtT, ExtF);
55656 return DAG.getBitcast(VT, NarrowSel);
55657}
55658
55659static SDValue combineEXTRACT_SUBVECTOR(SDNode *N, SelectionDAG &DAG,
55660 TargetLowering::DAGCombinerInfo &DCI,
55661 const X86Subtarget &Subtarget) {
55662 // For AVX1 only, if we are extracting from a 256-bit and+not (which will
55663 // eventually get combined/lowered into ANDNP) with a concatenated operand,
55664 // split the 'and' into 128-bit ops to avoid the concatenate and extract.
55665 // We let generic combining take over from there to simplify the
55666 // insert/extract and 'not'.
55667 // This pattern emerges during AVX1 legalization. We handle it before lowering
55668 // to avoid complications like splitting constant vector loads.
55669
55670 // Capture the original wide type in the likely case that we need to bitcast
55671 // back to this type.
55672 if (!N->getValueType(0).isSimple())
55673 return SDValue();
55674
55675 MVT VT = N->getSimpleValueType(0);
55676 SDValue InVec = N->getOperand(0);
55677 unsigned IdxVal = N->getConstantOperandVal(1);
55678 SDValue InVecBC = peekThroughBitcasts(InVec);
55679 EVT InVecVT = InVec.getValueType();
55680 unsigned SizeInBits = VT.getSizeInBits();
55681 unsigned InSizeInBits = InVecVT.getSizeInBits();
55682 unsigned NumSubElts = VT.getVectorNumElements();
55683 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
55684
55685 if (Subtarget.hasAVX() && !Subtarget.hasAVX2() &&
55686 TLI.isTypeLegal(InVecVT) &&
55687 InSizeInBits == 256 && InVecBC.getOpcode() == ISD::AND) {
55688 auto isConcatenatedNot = [](SDValue V) {
55689 V = peekThroughBitcasts(V);
55690 if (!isBitwiseNot(V))
55691 return false;
55692 SDValue NotOp = V->getOperand(0);
55693 return peekThroughBitcasts(NotOp).getOpcode() == ISD::CONCAT_VECTORS;
55694 };
55695 if (isConcatenatedNot(InVecBC.getOperand(0)) ||
55696 isConcatenatedNot(InVecBC.getOperand(1))) {
55697 // extract (and v4i64 X, (not (concat Y1, Y2))), n -> andnp v2i64 X(n), Y1
55698 SDValue Concat = splitVectorIntBinary(InVecBC, DAG);
55699 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, SDLoc(N), VT,
55700 DAG.getBitcast(InVecVT, Concat), N->getOperand(1));
55701 }
55702 }
55703
55704 if (DCI.isBeforeLegalizeOps())
55705 return SDValue();
55706
55707 if (SDValue V = narrowExtractedVectorSelect(N, DAG))
55708 return V;
55709
55710 if (ISD::isBuildVectorAllZeros(InVec.getNode()))
55711 return getZeroVector(VT, Subtarget, DAG, SDLoc(N));
55712
55713 if (ISD::isBuildVectorAllOnes(InVec.getNode())) {
55714 if (VT.getScalarType() == MVT::i1)
55715 return DAG.getConstant(1, SDLoc(N), VT);
55716 return getOnesVector(VT, DAG, SDLoc(N));
55717 }
55718
55719 if (InVec.getOpcode() == ISD::BUILD_VECTOR)
55720 return DAG.getBuildVector(VT, SDLoc(N),
55721 InVec->ops().slice(IdxVal, NumSubElts));
55722
55723  // If we are extracting from an insert into a larger vector, replace with a
55724  // smaller insert as long as we don't access less than the original inserted
55725  // subvector. Don't do this for i1 vectors.
55726 // TODO: Relax the matching indices requirement?
55727 if (VT.getVectorElementType() != MVT::i1 &&
55728 InVec.getOpcode() == ISD::INSERT_SUBVECTOR && InVec.hasOneUse() &&
55729 IdxVal == InVec.getConstantOperandVal(2) &&
55730 InVec.getOperand(1).getValueSizeInBits() <= SizeInBits) {
55731 SDLoc DL(N);
55732 SDValue NewExt = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT,
55733 InVec.getOperand(0), N->getOperand(1));
55734 unsigned NewIdxVal = InVec.getConstantOperandVal(2) - IdxVal;
55735 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, NewExt,
55736 InVec.getOperand(1),
55737 DAG.getVectorIdxConstant(NewIdxVal, DL));
55738 }
55739
55740  // If we're extracting an upper subvector from a broadcast, we should just
55741  // extract the lowest subvector instead, which should allow
55742  // SimplifyDemandedVectorElts to do more simplifications.
55743 if (IdxVal != 0 && (InVec.getOpcode() == X86ISD::VBROADCAST ||
55744 InVec.getOpcode() == X86ISD::VBROADCAST_LOAD ||
55745 DAG.isSplatValue(InVec, /*AllowUndefs*/ false)))
55746 return extractSubVector(InVec, 0, DAG, SDLoc(N), SizeInBits);
55747
55748 // If we're extracting a broadcasted subvector, just use the lowest subvector.
55749 if (IdxVal != 0 && InVec.getOpcode() == X86ISD::SUBV_BROADCAST_LOAD &&
55750 cast<MemIntrinsicSDNode>(InVec)->getMemoryVT() == VT)
55751 return extractSubVector(InVec, 0, DAG, SDLoc(N), SizeInBits);
55752
55753 // Attempt to extract from the source of a shuffle vector.
55754 if ((InSizeInBits % SizeInBits) == 0 && (IdxVal % NumSubElts) == 0) {
55755 SmallVector<int, 32> ShuffleMask;
55756 SmallVector<int, 32> ScaledMask;
55757 SmallVector<SDValue, 2> ShuffleInputs;
55758 unsigned NumSubVecs = InSizeInBits / SizeInBits;
55759    // Decode the shuffle mask and scale it so it's shuffling whole subvectors.
55760 if (getTargetShuffleInputs(InVecBC, ShuffleInputs, ShuffleMask, DAG) &&
55761 scaleShuffleElements(ShuffleMask, NumSubVecs, ScaledMask)) {
55762 unsigned SubVecIdx = IdxVal / NumSubElts;
55763 if (ScaledMask[SubVecIdx] == SM_SentinelUndef)
55764 return DAG.getUNDEF(VT);
55765 if (ScaledMask[SubVecIdx] == SM_SentinelZero)
55766 return getZeroVector(VT, Subtarget, DAG, SDLoc(N));
55767 SDValue Src = ShuffleInputs[ScaledMask[SubVecIdx] / NumSubVecs];
55768 if (Src.getValueSizeInBits() == InSizeInBits) {
55769 unsigned SrcSubVecIdx = ScaledMask[SubVecIdx] % NumSubVecs;
55770 unsigned SrcEltIdx = SrcSubVecIdx * NumSubElts;
55771 return extractSubVector(DAG.getBitcast(InVecVT, Src), SrcEltIdx, DAG,
55772 SDLoc(N), SizeInBits);
55773 }
55774 }
55775 }
55776
55777  // If we're extracting the lowest subvector and the source has only one use,
55778  // we may be able to perform this with a smaller vector width.
55779 unsigned InOpcode = InVec.getOpcode();
55780 if (InVec.hasOneUse()) {
55781 if (IdxVal == 0 && VT == MVT::v2f64 && InVecVT == MVT::v4f64) {
55782 // v2f64 CVTDQ2PD(v4i32).
55783 if (InOpcode == ISD::SINT_TO_FP &&
55784 InVec.getOperand(0).getValueType() == MVT::v4i32) {
55785 return DAG.getNode(X86ISD::CVTSI2P, SDLoc(N), VT, InVec.getOperand(0));
55786 }
55787 // v2f64 CVTUDQ2PD(v4i32).
55788 if (InOpcode == ISD::UINT_TO_FP && Subtarget.hasVLX() &&
55789 InVec.getOperand(0).getValueType() == MVT::v4i32) {
55790 return DAG.getNode(X86ISD::CVTUI2P, SDLoc(N), VT, InVec.getOperand(0));
55791 }
55792 // v2f64 CVTPS2PD(v4f32).
55793 if (InOpcode == ISD::FP_EXTEND &&
55794 InVec.getOperand(0).getValueType() == MVT::v4f32) {
55795 return DAG.getNode(X86ISD::VFPEXT, SDLoc(N), VT, InVec.getOperand(0));
55796 }
55797 }
55798 if (IdxVal == 0 &&
55799 (InOpcode == ISD::ANY_EXTEND ||
55800 InOpcode == ISD::ANY_EXTEND_VECTOR_INREG ||
55801 InOpcode == ISD::ZERO_EXTEND ||
55802 InOpcode == ISD::ZERO_EXTEND_VECTOR_INREG ||
55803 InOpcode == ISD::SIGN_EXTEND ||
55804 InOpcode == ISD::SIGN_EXTEND_VECTOR_INREG) &&
55805 (SizeInBits == 128 || SizeInBits == 256) &&
55806 InVec.getOperand(0).getValueSizeInBits() >= SizeInBits) {
55807 SDLoc DL(N);
55808 SDValue Ext = InVec.getOperand(0);
55809 if (Ext.getValueSizeInBits() > SizeInBits)
55810 Ext = extractSubVector(Ext, 0, DAG, DL, SizeInBits);
55811 unsigned ExtOp = DAG.getOpcode_EXTEND_VECTOR_INREG(InOpcode);
55812 return DAG.getNode(ExtOp, DL, VT, Ext);
55813 }
55814 if (IdxVal == 0 && InOpcode == ISD::VSELECT &&
55815 InVec.getOperand(0).getValueType().is256BitVector() &&
55816 InVec.getOperand(1).getValueType().is256BitVector() &&
55817 InVec.getOperand(2).getValueType().is256BitVector()) {
55818 SDLoc DL(N);
55819 SDValue Ext0 = extractSubVector(InVec.getOperand(0), 0, DAG, DL, 128);
55820 SDValue Ext1 = extractSubVector(InVec.getOperand(1), 0, DAG, DL, 128);
55821 SDValue Ext2 = extractSubVector(InVec.getOperand(2), 0, DAG, DL, 128);
55822 return DAG.getNode(InOpcode, DL, VT, Ext0, Ext1, Ext2);
55823 }
55824 if (IdxVal == 0 && InOpcode == ISD::TRUNCATE && Subtarget.hasVLX() &&
55825 (VT.is128BitVector() || VT.is256BitVector())) {
55826 SDLoc DL(N);
55827 SDValue InVecSrc = InVec.getOperand(0);
55828 unsigned Scale = InVecSrc.getValueSizeInBits() / InSizeInBits;
55829 SDValue Ext = extractSubVector(InVecSrc, 0, DAG, DL, Scale * SizeInBits);
55830 return DAG.getNode(InOpcode, DL, VT, Ext);
55831 }
55832 if (InOpcode == X86ISD::MOVDDUP &&
55833 (VT.is128BitVector() || VT.is256BitVector())) {
55834 SDLoc DL(N);
55835 SDValue Ext0 =
55836 extractSubVector(InVec.getOperand(0), IdxVal, DAG, DL, SizeInBits);
55837 return DAG.getNode(InOpcode, DL, VT, Ext0);
55838 }
55839 }
55840
55841 // Always split vXi64 logical shifts where we're extracting the upper 32-bits
55842 // as this is very likely to fold into a shuffle/truncation.
55843 if ((InOpcode == X86ISD::VSHLI || InOpcode == X86ISD::VSRLI) &&
55844 InVecVT.getScalarSizeInBits() == 64 &&
55845 InVec.getConstantOperandAPInt(1) == 32) {
55846 SDLoc DL(N);
55847 SDValue Ext =
55848 extractSubVector(InVec.getOperand(0), IdxVal, DAG, DL, SizeInBits);
55849 return DAG.getNode(InOpcode, DL, VT, Ext, InVec.getOperand(1));
55850 }
55851
55852 return SDValue();
55853}
55854
55855static SDValue combineScalarToVector(SDNode *N, SelectionDAG &DAG) {
55856 EVT VT = N->getValueType(0);
55857 SDValue Src = N->getOperand(0);
55858 SDLoc DL(N);
55859
55860  // If this is a scalar_to_vector to v1i1 from an AND with 1, bypass the AND.
55861 // This occurs frequently in our masked scalar intrinsic code and our
55862 // floating point select lowering with AVX512.
55863 // TODO: SimplifyDemandedBits instead?
55864 if (VT == MVT::v1i1 && Src.getOpcode() == ISD::AND && Src.hasOneUse())
55865 if (auto *C = dyn_cast<ConstantSDNode>(Src.getOperand(1)))
55866 if (C->getAPIntValue().isOne())
55867 return DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v1i1,
55868 Src.getOperand(0));
55869
55870 // Combine scalar_to_vector of an extract_vector_elt into an extract_subvec.
55871 if (VT == MVT::v1i1 && Src.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
55872 Src.hasOneUse() && Src.getOperand(0).getValueType().isVector() &&
55873 Src.getOperand(0).getValueType().getVectorElementType() == MVT::i1)
55874 if (auto *C = dyn_cast<ConstantSDNode>(Src.getOperand(1)))
55875 if (C->isZero())
55876 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Src.getOperand(0),
55877 Src.getOperand(1));
55878
55879  // Reduce v2i64 to v4i32 if we don't need the upper bits or they are known zero.
55880 // TODO: Move to DAGCombine/SimplifyDemandedBits?
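       // e.g. (v2i64 scalar_to_vector (i64 zext i32 X)) can instead be built as
       // (v4i32 scalar_to_vector X), with VZEXT_MOVL clearing the other lanes,
       // and then bitcast back to v2i64.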
55881 if ((VT == MVT::v2i64 || VT == MVT::v2f64) && Src.hasOneUse()) {
55882 auto IsExt64 = [&DAG](SDValue Op, bool IsZeroExt) {
55883 if (Op.getValueType() != MVT::i64)
55884 return SDValue();
55885 unsigned Opc = IsZeroExt ? ISD::ZERO_EXTEND : ISD::ANY_EXTEND;
55886 if (Op.getOpcode() == Opc &&
55887 Op.getOperand(0).getScalarValueSizeInBits() <= 32)
55888 return Op.getOperand(0);
55889 unsigned Ext = IsZeroExt ? ISD::ZEXTLOAD : ISD::EXTLOAD;
55890 if (auto *Ld = dyn_cast<LoadSDNode>(Op))
55891 if (Ld->getExtensionType() == Ext &&
55892 Ld->getMemoryVT().getScalarSizeInBits() <= 32)
55893 return Op;
55894 if (IsZeroExt) {
55895 KnownBits Known = DAG.computeKnownBits(Op);
55896 if (!Known.isConstant() && Known.countMinLeadingZeros() >= 32)
55897 return Op;
55898 }
55899 return SDValue();
55900 };
55901
55902 if (SDValue AnyExt = IsExt64(peekThroughOneUseBitcasts(Src), false))
55903 return DAG.getBitcast(
55904 VT, DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v4i32,
55905 DAG.getAnyExtOrTrunc(AnyExt, DL, MVT::i32)));
55906
55907 if (SDValue ZeroExt = IsExt64(peekThroughOneUseBitcasts(Src), true))
55908 return DAG.getBitcast(
55909 VT,
55910 DAG.getNode(X86ISD::VZEXT_MOVL, DL, MVT::v4i32,
55911 DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v4i32,
55912 DAG.getZExtOrTrunc(ZeroExt, DL, MVT::i32))));
55913 }
55914
55915 // Combine (v2i64 (scalar_to_vector (i64 (bitconvert (mmx))))) to MOVQ2DQ.
55916 if (VT == MVT::v2i64 && Src.getOpcode() == ISD::BITCAST &&
55917 Src.getOperand(0).getValueType() == MVT::x86mmx)
55918 return DAG.getNode(X86ISD::MOVQ2DQ, DL, VT, Src.getOperand(0));
55919
55920 // See if we're broadcasting the scalar value, in which case just reuse that.
55921 // Ensure the same SDValue from the SDNode use is being used.
55922 if (VT.getScalarType() == Src.getValueType())
55923 for (SDNode *User : Src->uses())
55924 if (User->getOpcode() == X86ISD::VBROADCAST &&
55925 Src == User->getOperand(0)) {
55926 unsigned SizeInBits = VT.getFixedSizeInBits();
55927 unsigned BroadcastSizeInBits =
55928 User->getValueSizeInBits(0).getFixedValue();
55929 if (BroadcastSizeInBits == SizeInBits)
55930 return SDValue(User, 0);
55931 if (BroadcastSizeInBits > SizeInBits)
55932 return extractSubVector(SDValue(User, 0), 0, DAG, DL, SizeInBits);
55933 // TODO: Handle BroadcastSizeInBits < SizeInBits when we have test
55934 // coverage.
55935 }
55936
55937 return SDValue();
55938}
55939
55940// Simplify PMULDQ and PMULUDQ operations.
55941static SDValue combinePMULDQ(SDNode *N, SelectionDAG &DAG,
55942 TargetLowering::DAGCombinerInfo &DCI,
55943 const X86Subtarget &Subtarget) {
55944 SDValue LHS = N->getOperand(0);
55945 SDValue RHS = N->getOperand(1);
55946
55947 // Canonicalize constant to RHS.
55948 if (DAG.isConstantIntBuildVectorOrConstantInt(LHS) &&
55949 !DAG.isConstantIntBuildVectorOrConstantInt(RHS))
55950 return DAG.getNode(N->getOpcode(), SDLoc(N), N->getValueType(0), RHS, LHS);
55951
55952 // Multiply by zero.
55953 // Don't return RHS as it may contain UNDEFs.
55954 if (ISD::isBuildVectorAllZeros(RHS.getNode()))
55955 return DAG.getConstant(0, SDLoc(N), N->getValueType(0));
55956
55957 // PMULDQ/PMULUDQ only uses lower 32 bits from each vector element.
55958 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
55959 if (TLI.SimplifyDemandedBits(SDValue(N, 0), APInt::getAllOnes(64), DCI))
55960 return SDValue(N, 0);
55961
55962 // If the input is an extend_invec and the SimplifyDemandedBits call didn't
55963 // convert it to any_extend_invec, due to the LegalOperations check, do the
55964 // conversion directly to a vector shuffle manually. This exposes combine
55965 // opportunities missed by combineEXTEND_VECTOR_INREG not calling
55966 // combineX86ShufflesRecursively on SSE4.1 targets.
55967 // FIXME: This is basically a hack around several other issues related to
55968 // ANY_EXTEND_VECTOR_INREG.
55969 if (N->getValueType(0) == MVT::v2i64 && LHS.hasOneUse() &&
55970 (LHS.getOpcode() == ISD::ZERO_EXTEND_VECTOR_INREG ||
55971 LHS.getOpcode() == ISD::SIGN_EXTEND_VECTOR_INREG) &&
55972 LHS.getOperand(0).getValueType() == MVT::v4i32) {
55973 SDLoc dl(N);
55974 LHS = DAG.getVectorShuffle(MVT::v4i32, dl, LHS.getOperand(0),
55975 LHS.getOperand(0), { 0, -1, 1, -1 });
55976 LHS = DAG.getBitcast(MVT::v2i64, LHS);
55977 return DAG.getNode(N->getOpcode(), dl, MVT::v2i64, LHS, RHS);
55978 }
55979 if (N->getValueType(0) == MVT::v2i64 && RHS.hasOneUse() &&
55980 (RHS.getOpcode() == ISD::ZERO_EXTEND_VECTOR_INREG ||
55981 RHS.getOpcode() == ISD::SIGN_EXTEND_VECTOR_INREG) &&
55982 RHS.getOperand(0).getValueType() == MVT::v4i32) {
55983 SDLoc dl(N);
55984 RHS = DAG.getVectorShuffle(MVT::v4i32, dl, RHS.getOperand(0),
55985 RHS.getOperand(0), { 0, -1, 1, -1 });
55986 RHS = DAG.getBitcast(MVT::v2i64, RHS);
55987 return DAG.getNode(N->getOpcode(), dl, MVT::v2i64, LHS, RHS);
55988 }
55989
55990 return SDValue();
55991}
55992
55993// Simplify VPMADDUBSW/VPMADDWD operations.
55994static SDValue combineVPMADD(SDNode *N, SelectionDAG &DAG,
55995 TargetLowering::DAGCombinerInfo &DCI) {
55996 EVT VT = N->getValueType(0);
55997 SDValue LHS = N->getOperand(0);
55998 SDValue RHS = N->getOperand(1);
55999
56000 // Multiply by zero.
56001 // Don't return LHS/RHS as it may contain UNDEFs.
56002 if (ISD::isBuildVectorAllZeros(LHS.getNode()) ||
56003 ISD::isBuildVectorAllZeros(RHS.getNode()))
56004 return DAG.getConstant(0, SDLoc(N), VT);
56005
56006 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
56007 APInt DemandedElts = APInt::getAllOnes(VT.getVectorNumElements());
56008 if (TLI.SimplifyDemandedVectorElts(SDValue(N, 0), DemandedElts, DCI))
56009 return SDValue(N, 0);
56010
56011 return SDValue();
56012}
56013
56014static SDValue combineEXTEND_VECTOR_INREG(SDNode *N, SelectionDAG &DAG,
56015 TargetLowering::DAGCombinerInfo &DCI,
56016 const X86Subtarget &Subtarget) {
56017 EVT VT = N->getValueType(0);
56018 SDValue In = N->getOperand(0);
56019 unsigned Opcode = N->getOpcode();
56020 unsigned InOpcode = In.getOpcode();
56021 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
56022 SDLoc DL(N);
56023
56024 // Try to merge vector loads and extend_inreg to an extload.
56025 if (!DCI.isBeforeLegalizeOps() && ISD::isNormalLoad(In.getNode()) &&
56026 In.hasOneUse()) {
56027 auto *Ld = cast<LoadSDNode>(In);
56028 if (Ld->isSimple()) {
56029 MVT SVT = In.getSimpleValueType().getVectorElementType();
56030 ISD::LoadExtType Ext = Opcode == ISD::SIGN_EXTEND_VECTOR_INREG
56031 ? ISD::SEXTLOAD
56032 : ISD::ZEXTLOAD;
56033 EVT MemVT = VT.changeVectorElementType(SVT);
56034 if (TLI.isLoadExtLegal(Ext, VT, MemVT)) {
56035 SDValue Load = DAG.getExtLoad(
56036 Ext, DL, VT, Ld->getChain(), Ld->getBasePtr(), Ld->getPointerInfo(),
56037 MemVT, Ld->getOriginalAlign(), Ld->getMemOperand()->getFlags());
56038 DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), Load.getValue(1));
56039 return Load;
56040 }
56041 }
56042 }
56043
56044 // Fold EXTEND_VECTOR_INREG(EXTEND_VECTOR_INREG(X)) -> EXTEND_VECTOR_INREG(X).
56045 if (Opcode == InOpcode)
56046 return DAG.getNode(Opcode, DL, VT, In.getOperand(0));
56047
56048 // Fold EXTEND_VECTOR_INREG(EXTRACT_SUBVECTOR(EXTEND(X),0))
56049 // -> EXTEND_VECTOR_INREG(X).
56050 // TODO: Handle non-zero subvector indices.
56051 if (InOpcode == ISD::EXTRACT_SUBVECTOR && In.getConstantOperandVal(1) == 0 &&
56052 In.getOperand(0).getOpcode() == DAG.getOpcode_EXTEND(Opcode) &&
56053 In.getOperand(0).getOperand(0).getValueSizeInBits() ==
56054 In.getValueSizeInBits())
56055 return DAG.getNode(Opcode, DL, VT, In.getOperand(0).getOperand(0));
56056
56057 // Fold EXTEND_VECTOR_INREG(BUILD_VECTOR(X,Y,?,?)) -> BUILD_VECTOR(X,0,Y,0).
56058 // TODO: Move to DAGCombine?
56059 if (!DCI.isBeforeLegalizeOps() && Opcode == ISD::ZERO_EXTEND_VECTOR_INREG &&
56060 In.getOpcode() == ISD::BUILD_VECTOR && In.hasOneUse() &&
56061 In.getValueSizeInBits() == VT.getSizeInBits()) {
56062 unsigned NumElts = VT.getVectorNumElements();
56063 unsigned Scale = VT.getScalarSizeInBits() / In.getScalarValueSizeInBits();
56064 EVT EltVT = In.getOperand(0).getValueType();
56065 SmallVector<SDValue> Elts(Scale * NumElts, DAG.getConstant(0, DL, EltVT));
56066 for (unsigned I = 0; I != NumElts; ++I)
56067 Elts[I * Scale] = In.getOperand(I);
56068 return DAG.getBitcast(VT, DAG.getBuildVector(In.getValueType(), DL, Elts));
56069 }
56070
56071 // Attempt to combine as a shuffle on SSE41+ targets.
56072 if ((Opcode == ISD::ANY_EXTEND_VECTOR_INREG ||
56073 Opcode == ISD::ZERO_EXTEND_VECTOR_INREG) &&
56074 Subtarget.hasSSE41()) {
56075 SDValue Op(N, 0);
56076 if (TLI.isTypeLegal(VT) && TLI.isTypeLegal(In.getValueType()))
56077 if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
56078 return Res;
56079 }
56080
56081 return SDValue();
56082}
56083
56084static SDValue combineKSHIFT(SDNode *N, SelectionDAG &DAG,
56085 TargetLowering::DAGCombinerInfo &DCI) {
56086 EVT VT = N->getValueType(0);
56087
56088 if (ISD::isBuildVectorAllZeros(N->getOperand(0).getNode()))
56089 return DAG.getConstant(0, SDLoc(N), VT);
56090
56091 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
56092 APInt DemandedElts = APInt::getAllOnes(VT.getVectorNumElements());
56093 if (TLI.SimplifyDemandedVectorElts(SDValue(N, 0), DemandedElts, DCI))
56094 return SDValue(N, 0);
56095
56096 return SDValue();
56097}
56098
56099// Optimize (fp16_to_fp (fp_to_fp16 X)) to VCVTPS2PH followed by VCVTPH2PS.
56100// Done as a combine because the lowerings for fp16_to_fp and fp_to_fp16 produce
56101// extra instructions between the conversions due to going to scalar and back.
56102static SDValue combineFP16_TO_FP(SDNode *N, SelectionDAG &DAG,
56103 const X86Subtarget &Subtarget) {
56104 if (Subtarget.useSoftFloat() || !Subtarget.hasF16C())
56105 return SDValue();
56106
56107 if (N->getOperand(0).getOpcode() != ISD::FP_TO_FP16)
56108 return SDValue();
56109
56110 if (N->getValueType(0) != MVT::f32 ||
56111 N->getOperand(0).getOperand(0).getValueType() != MVT::f32)
56112 return SDValue();
56113
56114 SDLoc dl(N);
56115 SDValue Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32,
56116 N->getOperand(0).getOperand(0));
56117 Res = DAG.getNode(X86ISD::CVTPS2PH, dl, MVT::v8i16, Res,
56118 DAG.getTargetConstant(4, dl, MVT::i32));
56119 Res = DAG.getNode(X86ISD::CVTPH2PS, dl, MVT::v4f32, Res);
56120 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f32, Res,
56121 DAG.getIntPtrConstant(0, dl));
56122}
56123
56124static SDValue combineFP_EXTEND(SDNode *N, SelectionDAG &DAG,
56125 const X86Subtarget &Subtarget) {
56126 if (!Subtarget.hasF16C() || Subtarget.useSoftFloat())
56127 return SDValue();
56128
56129 if (Subtarget.hasFP16())
56130 return SDValue();
56131
56132 bool IsStrict = N->isStrictFPOpcode();
56133 EVT VT = N->getValueType(0);
56134 SDValue Src = N->getOperand(IsStrict ? 1 : 0);
56135 EVT SrcVT = Src.getValueType();
56136
56137 if (!SrcVT.isVector() || SrcVT.getVectorElementType() != MVT::f16)
56138 return SDValue();
56139
56140 if (VT.getVectorElementType() != MVT::f32 &&
56141 VT.getVectorElementType() != MVT::f64)
56142 return SDValue();
56143
56144 unsigned NumElts = VT.getVectorNumElements();
56145 if (NumElts == 1 || !isPowerOf2_32(NumElts))
56146 return SDValue();
56147
56148 SDLoc dl(N);
56149
56150 // Convert the input to vXi16.
56151 EVT IntVT = SrcVT.changeVectorElementTypeToInteger();
56152 Src = DAG.getBitcast(IntVT, Src);
56153
56154 // Widen to at least 8 input elements.
56155 if (NumElts < 8) {
56156 unsigned NumConcats = 8 / NumElts;
56157 SDValue Fill = NumElts == 4 ? DAG.getUNDEF(IntVT)
56158 : DAG.getConstant(0, dl, IntVT);
56159 SmallVector<SDValue, 4> Ops(NumConcats, Fill);
56160 Ops[0] = Src;
56161 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i16, Ops);
56162 }
56163
56164 // Destination is vXf32 with at least 4 elements.
56165 EVT CvtVT = EVT::getVectorVT(*DAG.getContext(), MVT::f32,
56166 std::max(4U, NumElts));
56167 SDValue Cvt, Chain;
56168 if (IsStrict) {
56169 Cvt = DAG.getNode(X86ISD::STRICT_CVTPH2PS, dl, {CvtVT, MVT::Other},
56170 {N->getOperand(0), Src});
56171 Chain = Cvt.getValue(1);
56172 } else {
56173 Cvt = DAG.getNode(X86ISD::CVTPH2PS, dl, CvtVT, Src);
56174 }
56175
56176 if (NumElts < 4) {
56177    assert(NumElts == 2 && "Unexpected size");
56178 Cvt = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2f32, Cvt,
56179 DAG.getIntPtrConstant(0, dl));
56180 }
56181
56182 if (IsStrict) {
56183 // Extend to the original VT if necessary.
56184 if (Cvt.getValueType() != VT) {
56185 Cvt = DAG.getNode(ISD::STRICT_FP_EXTEND, dl, {VT, MVT::Other},
56186 {Chain, Cvt});
56187 Chain = Cvt.getValue(1);
56188 }
56189 return DAG.getMergeValues({Cvt, Chain}, dl);
56190 }
56191
56192 // Extend to the original VT if necessary.
56193 return DAG.getNode(ISD::FP_EXTEND, dl, VT, Cvt);
56194}
56195
56196// Try to find a larger VBROADCAST_LOAD/SUBV_BROADCAST_LOAD that we can extract
56197// from. Limit this to cases where the loads have the same input chain and the
56198// output chains are unused. This avoids any memory ordering issues.
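// Illustrative example: given a (v4f32 VBROADCAST_LOAD<f32> %ptr) and a wider
// (v8f32 VBROADCAST_LOAD<f32> %ptr) with the same input chain and no users of
// either output chain, the narrower node is rewritten as an extract of the low
// 128 bits of the wider one.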
56199static SDValue combineBROADCAST_LOAD(SDNode *N, SelectionDAG &DAG,
56200 TargetLowering::DAGCombinerInfo &DCI) {
56201  assert((N->getOpcode() == X86ISD::VBROADCAST_LOAD ||
56202          N->getOpcode() == X86ISD::SUBV_BROADCAST_LOAD) &&
56203         "Unknown broadcast load type");
56204
56205 // Only do this if the chain result is unused.
56206 if (N->hasAnyUseOfValue(1))
56207 return SDValue();
56208
56209 auto *MemIntrin = cast<MemIntrinsicSDNode>(N);
56210
56211 SDValue Ptr = MemIntrin->getBasePtr();
56212 SDValue Chain = MemIntrin->getChain();
56213 EVT VT = N->getSimpleValueType(0);
56214 EVT MemVT = MemIntrin->getMemoryVT();
56215
56216 // Look at other users of our base pointer and try to find a wider broadcast.
56217 // The input chain and the size of the memory VT must match.
56218 for (SDNode *User : Ptr->uses())
56219 if (User != N && User->getOpcode() == N->getOpcode() &&
56220 cast<MemIntrinsicSDNode>(User)->getBasePtr() == Ptr &&
56221 cast<MemIntrinsicSDNode>(User)->getChain() == Chain &&
56222 cast<MemIntrinsicSDNode>(User)->getMemoryVT().getSizeInBits() ==
56223 MemVT.getSizeInBits() &&
56224 !User->hasAnyUseOfValue(1) &&
56225 User->getValueSizeInBits(0).getFixedValue() > VT.getFixedSizeInBits()) {
56226 SDValue Extract = extractSubVector(SDValue(User, 0), 0, DAG, SDLoc(N),
56227 VT.getSizeInBits());
56228 Extract = DAG.getBitcast(VT, Extract);
56229 return DCI.CombineTo(N, Extract, SDValue(User, 1));
56230 }
56231
56232 return SDValue();
56233}
56234
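// Illustrative example of the widening below (assuming F16C but no AVX512-FP16):
// a (v2f16 fp_round (v2f32 X)) is padded with zeroes to v4f32, converted by a
// single CVTPS2PH to v8i16, then the low v2i16 is extracted and bitcast back to
// v2f16.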
56235static SDValue combineFP_ROUND(SDNode *N, SelectionDAG &DAG,
56236 const X86Subtarget &Subtarget) {
56237 if (!Subtarget.hasF16C() || Subtarget.useSoftFloat())
56238 return SDValue();
56239
56240 if (Subtarget.hasFP16())
56241 return SDValue();
56242
56243 bool IsStrict = N->isStrictFPOpcode();
56244 EVT VT = N->getValueType(0);
56245 SDValue Src = N->getOperand(IsStrict ? 1 : 0);
56246 EVT SrcVT = Src.getValueType();
56247
56248 if (!VT.isVector() || VT.getVectorElementType() != MVT::f16 ||
56249 SrcVT.getVectorElementType() != MVT::f32)
56250 return SDValue();
56251
56252 unsigned NumElts = VT.getVectorNumElements();
56253 if (NumElts == 1 || !isPowerOf2_32(NumElts))
56254 return SDValue();
56255
56256 SDLoc dl(N);
56257
56258 // Widen to at least 4 input elements.
56259 if (NumElts < 4)
56260 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, Src,
56261 DAG.getConstantFP(0.0, dl, SrcVT));
56262
56263  // Destination is vXi16 with at least 8 elements.
56264 EVT CvtVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16,
56265 std::max(8U, NumElts));
56266 SDValue Cvt, Chain;
56267 SDValue Rnd = DAG.getTargetConstant(4, dl, MVT::i32);
56268 if (IsStrict) {
56269 Cvt = DAG.getNode(X86ISD::STRICT_CVTPS2PH, dl, {CvtVT, MVT::Other},
56270 {N->getOperand(0), Src, Rnd});
56271 Chain = Cvt.getValue(1);
56272 } else {
56273 Cvt = DAG.getNode(X86ISD::CVTPS2PH, dl, CvtVT, Src, Rnd);
56274 }
56275
56276 // Extract down to real number of elements.
56277 if (NumElts < 8) {
56278 EVT IntVT = VT.changeVectorElementTypeToInteger();
56279 Cvt = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, IntVT, Cvt,
56280 DAG.getIntPtrConstant(0, dl));
56281 }
56282
56283 Cvt = DAG.getBitcast(VT, Cvt);
56284
56285 if (IsStrict)
56286 return DAG.getMergeValues({Cvt, Chain}, dl);
56287
56288 return Cvt;
56289}
56290
56291static SDValue combineMOVDQ2Q(SDNode *N, SelectionDAG &DAG) {
56292 SDValue Src = N->getOperand(0);
56293
56294 // Turn MOVDQ2Q+simple_load into an mmx load.
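  // Illustrative sketch: an (x86mmx MOVDQ2Q (v2i64 load %p)) whose load has no
  // other users becomes a plain (x86mmx load %p), with the new load's chain
  // replacing the old one.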
56295 if (ISD::isNormalLoad(Src.getNode()) && Src.hasOneUse()) {
56296 LoadSDNode *LN = cast<LoadSDNode>(Src.getNode());
56297
56298 if (LN->isSimple()) {
56299 SDValue NewLd = DAG.getLoad(MVT::x86mmx, SDLoc(N), LN->getChain(),
56300 LN->getBasePtr(),
56301 LN->getPointerInfo(),
56302 LN->getOriginalAlign(),
56303 LN->getMemOperand()->getFlags());
56304 DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), NewLd.getValue(1));
56305 return NewLd;
56306 }
56307 }
56308
56309 return SDValue();
56310}
56311
56312static SDValue combinePDEP(SDNode *N, SelectionDAG &DAG,
56313 TargetLowering::DAGCombinerInfo &DCI) {
56314 unsigned NumBits = N->getSimpleValueType(0).getSizeInBits();
56315 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
56316 if (TLI.SimplifyDemandedBits(SDValue(N, 0), APInt::getAllOnes(NumBits), DCI))
56317 return SDValue(N, 0);
56318
56319 return SDValue();
56320}
56321
56322SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
56323 DAGCombinerInfo &DCI) const {
56324 SelectionDAG &DAG = DCI.DAG;
56325 switch (N->getOpcode()) {
56326 default: break;
56327 case ISD::SCALAR_TO_VECTOR:
56328 return combineScalarToVector(N, DAG);
56329 case ISD::EXTRACT_VECTOR_ELT:
56330 case X86ISD::PEXTRW:
56331 case X86ISD::PEXTRB:
56332 return combineExtractVectorElt(N, DAG, DCI, Subtarget);
56333 case ISD::CONCAT_VECTORS:
56334 return combineCONCAT_VECTORS(N, DAG, DCI, Subtarget);
56335 case ISD::INSERT_SUBVECTOR:
56336 return combineINSERT_SUBVECTOR(N, DAG, DCI, Subtarget);
56337 case ISD::EXTRACT_SUBVECTOR:
56338 return combineEXTRACT_SUBVECTOR(N, DAG, DCI, Subtarget);
56339 case ISD::VSELECT:
56340 case ISD::SELECT:
56341 case X86ISD::BLENDV: return combineSelect(N, DAG, DCI, Subtarget);
56342 case ISD::BITCAST: return combineBitcast(N, DAG, DCI, Subtarget);
56343 case X86ISD::CMOV: return combineCMov(N, DAG, DCI, Subtarget);
56344 case X86ISD::CMP: return combineCMP(N, DAG);
56345 case ISD::ADD: return combineAdd(N, DAG, DCI, Subtarget);
56346 case ISD::SUB: return combineSub(N, DAG, DCI, Subtarget);
56347 case X86ISD::ADD:
56348 case X86ISD::SUB: return combineX86AddSub(N, DAG, DCI);
56349 case X86ISD::SBB: return combineSBB(N, DAG);
56350 case X86ISD::ADC: return combineADC(N, DAG, DCI);
56351 case ISD::MUL: return combineMul(N, DAG, DCI, Subtarget);
56352 case ISD::SHL: return combineShiftLeft(N, DAG);
56353 case ISD::SRA: return combineShiftRightArithmetic(N, DAG, Subtarget);
56354 case ISD::SRL: return combineShiftRightLogical(N, DAG, DCI, Subtarget);
56355 case ISD::AND: return combineAnd(N, DAG, DCI, Subtarget);
56356 case ISD::OR: return combineOr(N, DAG, DCI, Subtarget);
56357 case ISD::XOR: return combineXor(N, DAG, DCI, Subtarget);
56358 case X86ISD::BEXTR:
56359 case X86ISD::BEXTRI: return combineBEXTR(N, DAG, DCI, Subtarget);
56360 case ISD::LOAD: return combineLoad(N, DAG, DCI, Subtarget);
56361 case ISD::MLOAD: return combineMaskedLoad(N, DAG, DCI, Subtarget);
56362 case ISD::STORE: return combineStore(N, DAG, DCI, Subtarget);
56363 case ISD::MSTORE: return combineMaskedStore(N, DAG, DCI, Subtarget);
56364 case X86ISD::VEXTRACT_STORE:
56365 return combineVEXTRACT_STORE(N, DAG, DCI, Subtarget);
56366 case ISD::SINT_TO_FP:
56367 case ISD::STRICT_SINT_TO_FP:
56368 return combineSIntToFP(N, DAG, DCI, Subtarget);
56369 case ISD::UINT_TO_FP:
56370 case ISD::STRICT_UINT_TO_FP:
56371 return combineUIntToFP(N, DAG, Subtarget);
56372 case ISD::FADD:
56373 case ISD::FSUB: return combineFaddFsub(N, DAG, Subtarget);
56374 case X86ISD::VFCMULC:
56375 case X86ISD::VFMULC: return combineFMulcFCMulc(N, DAG, Subtarget);
56376 case ISD::FNEG: return combineFneg(N, DAG, DCI, Subtarget);
56377 case ISD::TRUNCATE: return combineTruncate(N, DAG, Subtarget);
56378 case X86ISD::VTRUNC: return combineVTRUNC(N, DAG, DCI);
56379 case X86ISD::ANDNP: return combineAndnp(N, DAG, DCI, Subtarget);
56380 case X86ISD::FAND: return combineFAnd(N, DAG, Subtarget);
56381 case X86ISD::FANDN: return combineFAndn(N, DAG, Subtarget);
56382 case X86ISD::FXOR:
56383 case X86ISD::FOR: return combineFOr(N, DAG, DCI, Subtarget);
56384 case X86ISD::FMIN:
56385 case X86ISD::FMAX: return combineFMinFMax(N, DAG);
56386 case ISD::FMINNUM:
56387 case ISD::FMAXNUM: return combineFMinNumFMaxNum(N, DAG, Subtarget);
56388 case X86ISD::CVTSI2P:
56389 case X86ISD::CVTUI2P: return combineX86INT_TO_FP(N, DAG, DCI);
56390 case X86ISD::CVTP2SI:
56391 case X86ISD::CVTP2UI:
56392 case X86ISD::STRICT_CVTTP2SI:
56393 case X86ISD::CVTTP2SI:
56394 case X86ISD::STRICT_CVTTP2UI:
56395 case X86ISD::CVTTP2UI:
56396 return combineCVTP2I_CVTTP2I(N, DAG, DCI);
56397 case X86ISD::STRICT_CVTPH2PS:
56398 case X86ISD::CVTPH2PS: return combineCVTPH2PS(N, DAG, DCI);
56399 case X86ISD::BT: return combineBT(N, DAG, DCI);
56400 case ISD::ANY_EXTEND:
56401 case ISD::ZERO_EXTEND: return combineZext(N, DAG, DCI, Subtarget);
56402 case ISD::SIGN_EXTEND: return combineSext(N, DAG, DCI, Subtarget);
56403 case ISD::SIGN_EXTEND_INREG: return combineSignExtendInReg(N, DAG, Subtarget);
56404 case ISD::ANY_EXTEND_VECTOR_INREG:
56405 case ISD::SIGN_EXTEND_VECTOR_INREG:
56406 case ISD::ZERO_EXTEND_VECTOR_INREG:
56407 return combineEXTEND_VECTOR_INREG(N, DAG, DCI, Subtarget);
56408 case ISD::SETCC: return combineSetCC(N, DAG, DCI, Subtarget);
56409 case X86ISD::SETCC: return combineX86SetCC(N, DAG, Subtarget);
56410 case X86ISD::BRCOND: return combineBrCond(N, DAG, Subtarget);
56411 case X86ISD::PACKSS:
56412 case X86ISD::PACKUS: return combineVectorPack(N, DAG, DCI, Subtarget);
56413 case X86ISD::HADD:
56414 case X86ISD::HSUB:
56415 case X86ISD::FHADD:
56416 case X86ISD::FHSUB: return combineVectorHADDSUB(N, DAG, DCI, Subtarget);
56417 case X86ISD::VSHL:
56418 case X86ISD::VSRA:
56419 case X86ISD::VSRL:
56420 return combineVectorShiftVar(N, DAG, DCI, Subtarget);
56421 case X86ISD::VSHLI:
56422 case X86ISD::VSRAI:
56423 case X86ISD::VSRLI:
56424 return combineVectorShiftImm(N, DAG, DCI, Subtarget);
56425 case ISD::INSERT_VECTOR_ELT:
56426 case X86ISD::PINSRB:
56427 case X86ISD::PINSRW: return combineVectorInsert(N, DAG, DCI, Subtarget);
56428 case X86ISD::SHUFP: // Handle all target specific shuffles
56429 case X86ISD::INSERTPS:
56430 case X86ISD::EXTRQI:
56431 case X86ISD::INSERTQI:
56432 case X86ISD::VALIGN:
56433 case X86ISD::PALIGNR:
56434 case X86ISD::VSHLDQ:
56435 case X86ISD::VSRLDQ:
56436 case X86ISD::BLENDI:
56437 case X86ISD::UNPCKH:
56438 case X86ISD::UNPCKL:
56439 case X86ISD::MOVHLPS:
56440 case X86ISD::MOVLHPS:
56441 case X86ISD::PSHUFB:
56442 case X86ISD::PSHUFD:
56443 case X86ISD::PSHUFHW:
56444 case X86ISD::PSHUFLW:
56445 case X86ISD::MOVSHDUP:
56446 case X86ISD::MOVSLDUP:
56447 case X86ISD::MOVDDUP:
56448 case X86ISD::MOVSS:
56449 case X86ISD::MOVSD:
56450 case X86ISD::MOVSH:
56451 case X86ISD::VBROADCAST:
56452 case X86ISD::VPPERM:
56453 case X86ISD::VPERMI:
56454 case X86ISD::VPERMV:
56455 case X86ISD::VPERMV3:
56456 case X86ISD::VPERMIL2:
56457 case X86ISD::VPERMILPI:
56458 case X86ISD::VPERMILPV:
56459 case X86ISD::VPERM2X128:
56460 case X86ISD::SHUF128:
56461 case X86ISD::VZEXT_MOVL:
56462  case ISD::VECTOR_SHUFFLE: return combineShuffle(N, DAG, DCI, Subtarget);
56463 case X86ISD::FMADD_RND:
56464 case X86ISD::FMSUB:
56465 case X86ISD::STRICT_FMSUB:
56466 case X86ISD::FMSUB_RND:
56467 case X86ISD::FNMADD:
56468 case X86ISD::STRICT_FNMADD:
56469 case X86ISD::FNMADD_RND:
56470 case X86ISD::FNMSUB:
56471 case X86ISD::STRICT_FNMSUB:
56472 case X86ISD::FNMSUB_RND:
56473 case ISD::FMA:
56474 case ISD::STRICT_FMA: return combineFMA(N, DAG, DCI, Subtarget);
56475 case X86ISD::FMADDSUB_RND:
56476 case X86ISD::FMSUBADD_RND:
56477 case X86ISD::FMADDSUB:
56478 case X86ISD::FMSUBADD: return combineFMADDSUB(N, DAG, DCI);
56479 case X86ISD::MOVMSK: return combineMOVMSK(N, DAG, DCI, Subtarget);
56480 case X86ISD::MGATHER:
56481 case X86ISD::MSCATTER:
56482 return combineX86GatherScatter(N, DAG, DCI, Subtarget);
56483 case ISD::MGATHER:
56484 case ISD::MSCATTER: return combineGatherScatter(N, DAG, DCI);
56485 case X86ISD::PCMPEQ:
56486 case X86ISD::PCMPGT: return combineVectorCompare(N, DAG, Subtarget);
56487 case X86ISD::PMULDQ:
56488 case X86ISD::PMULUDQ: return combinePMULDQ(N, DAG, DCI, Subtarget);
56489 case X86ISD::VPMADDUBSW:
56490 case X86ISD::VPMADDWD: return combineVPMADD(N, DAG, DCI);
56491 case X86ISD::KSHIFTL:
56492 case X86ISD::KSHIFTR: return combineKSHIFT(N, DAG, DCI);
56493 case ISD::FP16_TO_FP: return combineFP16_TO_FP(N, DAG, Subtarget);
56494 case ISD::STRICT_FP_EXTEND:
56495 case ISD::FP_EXTEND: return combineFP_EXTEND(N, DAG, Subtarget);
56496 case ISD::STRICT_FP_ROUND:
56497 case ISD::FP_ROUND: return combineFP_ROUND(N, DAG, Subtarget);
56498 case X86ISD::VBROADCAST_LOAD:
56499 case X86ISD::SUBV_BROADCAST_LOAD: return combineBROADCAST_LOAD(N, DAG, DCI);
56500 case X86ISD::MOVDQ2Q: return combineMOVDQ2Q(N, DAG);
56501 case X86ISD::PDEP: return combinePDEP(N, DAG, DCI);
56502 }
56503
56504 return SDValue();
56505}
56506
56507bool X86TargetLowering::isTypeDesirableForOp(unsigned Opc, EVT VT) const {
56508 if (!isTypeLegal(VT))
56509 return false;
56510
56511 // There are no vXi8 shifts.
56512 if (Opc == ISD::SHL && VT.isVector() && VT.getVectorElementType() == MVT::i8)
56513 return false;
56514
56515 // TODO: Almost no 8-bit ops are desirable because they have no actual
56516 // size/speed advantages vs. 32-bit ops, but they do have a major
56517 // potential disadvantage by causing partial register stalls.
56518 //
56519 // 8-bit multiply/shl is probably not cheaper than 32-bit multiply/shl, and
56520 // we have specializations to turn 32-bit multiply/shl into LEA or other ops.
56521 // Also, see the comment in "IsDesirableToPromoteOp" - where we additionally
56522 // check for a constant operand to the multiply.
56523 if ((Opc == ISD::MUL || Opc == ISD::SHL) && VT == MVT::i8)
56524 return false;
56525
56526 // i16 instruction encodings are longer and some i16 instructions are slow,
56527 // so those are not desirable.
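  // Illustrative encoding comparison:
  //   addw $1, %ax   -> 66 83 C0 01  (4 bytes, operand-size prefix)
  //   addl $1, %eax  -> 83 C0 01     (3 bytes)
  // so promoting the i16 op to i32 typically saves the 0x66 prefix.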
56528 if (VT == MVT::i16) {
56529 switch (Opc) {
56530 default:
56531 break;
56532 case ISD::LOAD:
56533 case ISD::SIGN_EXTEND:
56534 case ISD::ZERO_EXTEND:
56535 case ISD::ANY_EXTEND:
56536 case ISD::SHL:
56537 case ISD::SRA:
56538 case ISD::SRL:
56539 case ISD::SUB:
56540 case ISD::ADD:
56541 case ISD::MUL:
56542 case ISD::AND:
56543 case ISD::OR:
56544 case ISD::XOR:
56545 return false;
56546 }
56547 }
56548
56549 // Any legal type not explicitly accounted for above here is desirable.
56550 return true;
56551}
56552
56553SDValue X86TargetLowering::expandIndirectJTBranch(const SDLoc& dl,
56554 SDValue Value, SDValue Addr,
56555 SelectionDAG &DAG) const {
56556 const Module *M = DAG.getMachineFunction().getMMI().getModule();
56557 Metadata *IsCFProtectionSupported = M->getModuleFlag("cf-protection-branch");
56558 if (IsCFProtectionSupported) {
56559    // If control-flow branch protection is enabled, we need to add a
56560    // notrack prefix to the indirect branch. To do that we create an
56561    // NT_BRIND SDNode; during ISel, the pattern converts it to a jmp
56562    // with the NoTrack prefix.
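    // For example (illustrative), the jump-table dispatch is emitted as
    // "notrack jmpq *%rax" instead of "jmpq *%rax", so the target is not
    // required to start with an ENDBR instruction.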
56563 return DAG.getNode(X86ISD::NT_BRIND, dl, MVT::Other, Value, Addr);
56564 }
56565
56566 return TargetLowering::expandIndirectJTBranch(dl, Value, Addr, DAG);
56567}
56568
56569bool X86TargetLowering::IsDesirableToPromoteOp(SDValue Op, EVT &PVT) const {
56570 EVT VT = Op.getValueType();
56571 bool Is8BitMulByConstant = VT == MVT::i8 && Op.getOpcode() == ISD::MUL &&
56572 isa<ConstantSDNode>(Op.getOperand(1));
56573
56574 // i16 is legal, but undesirable since i16 instruction encodings are longer
56575 // and some i16 instructions are slow.
56576 // 8-bit multiply-by-constant can usually be expanded to something cheaper
56577 // using LEA and/or other ALU ops.
56578 if (VT != MVT::i16 && !Is8BitMulByConstant)
56579 return false;
56580
56581 auto IsFoldableRMW = [](SDValue Load, SDValue Op) {
56582 if (!Op.hasOneUse())
56583 return false;
56584 SDNode *User = *Op->use_begin();
56585 if (!ISD::isNormalStore(User))
56586 return false;
56587 auto *Ld = cast<LoadSDNode>(Load);
56588 auto *St = cast<StoreSDNode>(User);
56589 return Ld->getBasePtr() == St->getBasePtr();
56590 };
56591
56592 auto IsFoldableAtomicRMW = [](SDValue Load, SDValue Op) {
56593 if (!Load.hasOneUse() || Load.getOpcode() != ISD::ATOMIC_LOAD)
56594 return false;
56595 if (!Op.hasOneUse())
56596 return false;
56597 SDNode *User = *Op->use_begin();
56598 if (User->getOpcode() != ISD::ATOMIC_STORE)
56599 return false;
56600 auto *Ld = cast<AtomicSDNode>(Load);
56601 auto *St = cast<AtomicSDNode>(User);
56602 return Ld->getBasePtr() == St->getBasePtr();
56603 };
56604
56605 bool Commute = false;
56606 switch (Op.getOpcode()) {
56607 default: return false;
56608 case ISD::SIGN_EXTEND:
56609 case ISD::ZERO_EXTEND:
56610 case ISD::ANY_EXTEND:
56611 break;
56612 case ISD::SHL:
56613 case ISD::SRA:
56614 case ISD::SRL: {
56615 SDValue N0 = Op.getOperand(0);
56616 // Look out for (store (shl (load), x)).
56617 if (X86::mayFoldLoad(N0, Subtarget) && IsFoldableRMW(N0, Op))
56618 return false;
56619 break;
56620 }
56621 case ISD::ADD:
56622 case ISD::MUL:
56623 case ISD::AND:
56624 case ISD::OR:
56625 case ISD::XOR:
56626 Commute = true;
56627 [[fallthrough]];
56628 case ISD::SUB: {
56629 SDValue N0 = Op.getOperand(0);
56630 SDValue N1 = Op.getOperand(1);
56631 // Avoid disabling potential load folding opportunities.
56632 if (X86::mayFoldLoad(N1, Subtarget) &&
56633 (!Commute || !isa<ConstantSDNode>(N0) ||
56634 (Op.getOpcode() != ISD::MUL && IsFoldableRMW(N1, Op))))
56635 return false;
56636 if (X86::mayFoldLoad(N0, Subtarget) &&
56637 ((Commute && !isa<ConstantSDNode>(N1)) ||
56638 (Op.getOpcode() != ISD::MUL && IsFoldableRMW(N0, Op))))
56639 return false;
56640 if (IsFoldableAtomicRMW(N0, Op) ||
56641 (Commute && IsFoldableAtomicRMW(N1, Op)))
56642 return false;
56643 }
56644 }
56645
56646 PVT = MVT::i32;
56647 return true;
56648}
56649
56650//===----------------------------------------------------------------------===//
56651// X86 Inline Assembly Support
56652//===----------------------------------------------------------------------===//
56653
56654// Helper to match a string separated by whitespace.
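// For example (illustrative): matchAsm("  bswap   $0", {"bswap", "$0"}) returns
// true, while matchAsm("bswapl $0", {"bswap", "$0"}) returns false because
// "bswap" only matches a prefix of the "bswapl" token.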
56655static bool matchAsm(StringRef S, ArrayRef<const char *> Pieces) {
56656 S = S.substr(S.find_first_not_of(" \t")); // Skip leading whitespace.
56657
56658 for (StringRef Piece : Pieces) {
56659 if (!S.startswith(Piece)) // Check if the piece matches.
56660 return false;
56661
56662 S = S.substr(Piece.size());
56663 StringRef::size_type Pos = S.find_first_not_of(" \t");
56664 if (Pos == 0) // We matched a prefix.
56665 return false;
56666
56667 S = S.substr(Pos);
56668 }
56669
56670 return S.empty();
56671}
56672
56673static bool clobbersFlagRegisters(const SmallVector<StringRef, 4> &AsmPieces) {
56674
56675 if (AsmPieces.size() == 3 || AsmPieces.size() == 4) {
56676 if (llvm::is_contained(AsmPieces, "~{cc}") &&
56677 llvm::is_contained(AsmPieces, "~{flags}") &&
56678 llvm::is_contained(AsmPieces, "~{fpsr}")) {
56679
56680 if (AsmPieces.size() == 3)
56681 return true;
56682 else if (llvm::is_contained(AsmPieces, "~{dirflag}"))
56683 return true;
56684 }
56685 }
56686 return false;
56687}
56688
56689bool X86TargetLowering::ExpandInlineAsm(CallInst *CI) const {
56690 InlineAsm *IA = cast<InlineAsm>(CI->getCalledOperand());
56691
56692 const std::string &AsmStr = IA->getAsmString();
56693
56694 IntegerType *Ty = dyn_cast<IntegerType>(CI->getType());
56695 if (!Ty || Ty->getBitWidth() % 16 != 0)
56696 return false;
56697
56698 // TODO: should remove alternatives from the asmstring: "foo {a|b}" -> "foo a"
56699 SmallVector<StringRef, 4> AsmPieces;
56700 SplitString(AsmStr, AsmPieces, ";\n");
56701
56702 switch (AsmPieces.size()) {
56703 default: return false;
56704 case 1:
56705 // FIXME: this should verify that we are targeting a 486 or better. If not,
56706 // we will turn this bswap into something that will be lowered to logical
56707 // ops instead of emitting the bswap asm. For now, we don't support 486 or
56708 // lower so don't worry about this.
56709 // bswap $0
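    // Illustrative C-level input that reaches this point:
    //   unsigned v; asm("bswap $0" : "=r"(v) : "0"(v));
    // which is replaced by a call to llvm.bswap.i32 on v.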
56710 if (matchAsm(AsmPieces[0], {"bswap", "$0"}) ||
56711 matchAsm(AsmPieces[0], {"bswapl", "$0"}) ||
56712 matchAsm(AsmPieces[0], {"bswapq", "$0"}) ||
56713 matchAsm(AsmPieces[0], {"bswap", "${0:q}"}) ||
56714 matchAsm(AsmPieces[0], {"bswapl", "${0:q}"}) ||
56715 matchAsm(AsmPieces[0], {"bswapq", "${0:q}"})) {
56716 // No need to check constraints, nothing other than the equivalent of
56717 // "=r,0" would be valid here.
56718 return IntrinsicLowering::LowerToByteSwap(CI);
56719 }
56720
56721 // rorw $$8, ${0:w} --> llvm.bswap.i16
56722 if (CI->getType()->isIntegerTy(16) &&
56723 IA->getConstraintString().compare(0, 5, "=r,0,") == 0 &&
56724 (matchAsm(AsmPieces[0], {"rorw", "$$8,", "${0:w}"}) ||
56725 matchAsm(AsmPieces[0], {"rolw", "$$8,", "${0:w}"}))) {
56726 AsmPieces.clear();
56727 StringRef ConstraintsStr = IA->getConstraintString();
56728 SplitString(StringRef(ConstraintsStr).substr(5), AsmPieces, ",");
56729 array_pod_sort(AsmPieces.begin(), AsmPieces.end());
56730 if (clobbersFlagRegisters(AsmPieces))
56731 return IntrinsicLowering::LowerToByteSwap(CI);
56732 }
56733 break;
56734 case 3:
56735 if (CI->getType()->isIntegerTy(32) &&
56736 IA->getConstraintString().compare(0, 5, "=r,0,") == 0 &&
56737 matchAsm(AsmPieces[0], {"rorw", "$$8,", "${0:w}"}) &&
56738 matchAsm(AsmPieces[1], {"rorl", "$$16,", "$0"}) &&
56739 matchAsm(AsmPieces[2], {"rorw", "$$8,", "${0:w}"})) {
56740 AsmPieces.clear();
56741 StringRef ConstraintsStr = IA->getConstraintString();
56742 SplitString(StringRef(ConstraintsStr).substr(5), AsmPieces, ",");
56743 array_pod_sort(AsmPieces.begin(), AsmPieces.end());
56744 if (clobbersFlagRegisters(AsmPieces))
56745 return IntrinsicLowering::LowerToByteSwap(CI);
56746 }
56747
56748 if (CI->getType()->isIntegerTy(64)) {
56749 InlineAsm::ConstraintInfoVector Constraints = IA->ParseConstraints();
56750 if (Constraints.size() >= 2 &&
56751 Constraints[0].Codes.size() == 1 && Constraints[0].Codes[0] == "A" &&
56752 Constraints[1].Codes.size() == 1 && Constraints[1].Codes[0] == "0") {
56753 // bswap %eax / bswap %edx / xchgl %eax, %edx -> llvm.bswap.i64
56754 if (matchAsm(AsmPieces[0], {"bswap", "%eax"}) &&
56755 matchAsm(AsmPieces[1], {"bswap", "%edx"}) &&
56756 matchAsm(AsmPieces[2], {"xchgl", "%eax,", "%edx"}))
56757 return IntrinsicLowering::LowerToByteSwap(CI);
56758 }
56759 }
56760 break;
56761 }
56762 return false;
56763}
56764
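// Maps GCC flag-output constraints to X86 condition codes. Illustrative usage:
//   int below; asm("cmp %2, %1" : "=@ccb"(below) : "r"(a), "r"(b));
// the "=@ccb" output constraint arrives here as "{@ccb}" and maps to
// X86::COND_B.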
56765static X86::CondCode parseConstraintCode(llvm::StringRef Constraint) {
56766 X86::CondCode Cond = StringSwitch<X86::CondCode>(Constraint)
56767 .Case("{@cca}", X86::COND_A)
56768 .Case("{@ccae}", X86::COND_AE)
56769 .Case("{@ccb}", X86::COND_B)
56770 .Case("{@ccbe}", X86::COND_BE)
56771 .Case("{@ccc}", X86::COND_B)
56772 .Case("{@cce}", X86::COND_E)
56773 .Case("{@ccz}", X86::COND_E)
56774 .Case("{@ccg}", X86::COND_G)
56775 .Case("{@ccge}", X86::COND_GE)
56776 .Case("{@ccl}", X86::COND_L)
56777 .Case("{@ccle}", X86::COND_LE)
56778 .Case("{@ccna}", X86::COND_BE)
56779 .Case("{@ccnae}", X86::COND_B)
56780 .Case("{@ccnb}", X86::COND_AE)
56781 .Case("{@ccnbe}", X86::COND_A)
56782 .Case("{@ccnc}", X86::COND_AE)
56783 .Case("{@ccne}", X86::COND_NE)
56784 .Case("{@ccnz}", X86::COND_NE)
56785 .Case("{@ccng}", X86::COND_LE)
56786 .Case("{@ccnge}", X86::COND_L)
56787 .Case("{@ccnl}", X86::COND_GE)
56788 .Case("{@ccnle}", X86::COND_G)
56789 .Case("{@ccno}", X86::COND_NO)
56790 .Case("{@ccnp}", X86::COND_NP)
56791 .Case("{@ccns}", X86::COND_NS)
56792 .Case("{@cco}", X86::COND_O)
56793 .Case("{@ccp}", X86::COND_P)
56794 .Case("{@ccs}", X86::COND_S)
56795 .Default(X86::COND_INVALID);
56796 return Cond;
56797}
56798
56799/// Given a constraint letter, return the type of constraint for this target.
56800X86TargetLowering::ConstraintType
56801X86TargetLowering::getConstraintType(StringRef Constraint) const {
56802 if (Constraint.size() == 1) {
56803 switch (Constraint[0]) {
56804 case 'R':
56805 case 'q':
56806 case 'Q':
56807 case 'f':
56808 case 't':
56809 case 'u':
56810 case 'y':
56811 case 'x':
56812 case 'v':
56813 case 'l':
56814 case 'k': // AVX512 masking registers.
56815 return C_RegisterClass;
56816 case 'a':
56817 case 'b':
56818 case 'c':
56819 case 'd':
56820 case 'S':
56821 case 'D':
56822 case 'A':
56823 return C_Register;
56824 case 'I':
56825 case 'J':
56826 case 'K':
56827 case 'N':
56828 case 'G':
56829 case 'L':
56830 case 'M':
56831 return C_Immediate;
56832 case 'C':
56833 case 'e':
56834 case 'Z':
56835 return C_Other;
56836 default:
56837 break;
56838 }
56839 }
56840 else if (Constraint.size() == 2) {
56841 switch (Constraint[0]) {
56842 default:
56843 break;
56844 case 'Y':
56845 switch (Constraint[1]) {
56846 default:
56847 break;
56848 case 'z':
56849 return C_Register;
56850 case 'i':
56851 case 'm':
56852 case 'k':
56853 case 't':
56854 case '2':
56855 return C_RegisterClass;
56856 }
56857 }
56858 } else if (parseConstraintCode(Constraint) != X86::COND_INVALID)
56859 return C_Other;
56860 return TargetLowering::getConstraintType(Constraint);
56861}
56862
56863/// Examine constraint type and operand type and determine a weight value.
56864/// This object must already have been set up with the operand type
56865/// and the current alternative constraint selected.
56866TargetLowering::ConstraintWeight
56867 X86TargetLowering::getSingleConstraintMatchWeight(
56868 AsmOperandInfo &info, const char *constraint) const {
56869 ConstraintWeight weight = CW_Invalid;
56870 Value *CallOperandVal = info.CallOperandVal;
56871 // If we don't have a value, we can't do a match,
56872 // but allow it at the lowest weight.
56873 if (!CallOperandVal)
56874 return CW_Default;
56875 Type *type = CallOperandVal->getType();
56876 // Look at the constraint type.
56877 switch (*constraint) {
56878 default:
56879 weight = TargetLowering::getSingleConstraintMatchWeight(info, constraint);
56880 [[fallthrough]];
56881 case 'R':
56882 case 'q':
56883 case 'Q':
56884 case 'a':
56885 case 'b':
56886 case 'c':
56887 case 'd':
56888 case 'S':
56889 case 'D':
56890 case 'A':
56891 if (CallOperandVal->getType()->isIntegerTy())
56892 weight = CW_SpecificReg;
56893 break;
56894 case 'f':
56895 case 't':
56896 case 'u':
56897 if (type->isFloatingPointTy())
56898 weight = CW_SpecificReg;
56899 break;
56900 case 'y':
56901 if (type->isX86_MMXTy() && Subtarget.hasMMX())
56902 weight = CW_SpecificReg;
56903 break;
56904 case 'Y':
56905 if (StringRef(constraint).size() != 2)
56906 break;
56907 switch (constraint[1]) {
56908 default:
56909 return CW_Invalid;
56910 // XMM0
56911 case 'z':
56912 if (((type->getPrimitiveSizeInBits() == 128) && Subtarget.hasSSE1()) ||
56913 ((type->getPrimitiveSizeInBits() == 256) && Subtarget.hasAVX()) ||
56914 ((type->getPrimitiveSizeInBits() == 512) && Subtarget.hasAVX512()))
56915 return CW_SpecificReg;
56916 return CW_Invalid;
56917 // Conditional OpMask regs (AVX512)
56918 case 'k':
56919 if ((type->getPrimitiveSizeInBits() == 64) && Subtarget.hasAVX512())
56920 return CW_Register;
56921 return CW_Invalid;
56922 // Any MMX reg
56923 case 'm':
56924 if (type->isX86_MMXTy() && Subtarget.hasMMX())
56925 return weight;
56926 return CW_Invalid;
56927 // Any SSE reg when ISA >= SSE2, same as 'x'
56928 case 'i':
56929 case 't':
56930 case '2':
56931 if (!Subtarget.hasSSE2())
56932 return CW_Invalid;
56933 break;
56934 }
56935 break;
56936 case 'v':
56937 if ((type->getPrimitiveSizeInBits() == 512) && Subtarget.hasAVX512())
56938 weight = CW_Register;
56939 [[fallthrough]];
56940 case 'x':
56941 if (((type->getPrimitiveSizeInBits() == 128) && Subtarget.hasSSE1()) ||
56942 ((type->getPrimitiveSizeInBits() == 256) && Subtarget.hasAVX()))
56943 weight = CW_Register;
56944 break;
56945 case 'k':
56946 // Enable conditional vector operations using %k<#> registers.
56947 if ((type->getPrimitiveSizeInBits() == 64) && Subtarget.hasAVX512())
56948 weight = CW_Register;
56949 break;
56950 case 'I':
56951 if (auto *C = dyn_cast<ConstantInt>(info.CallOperandVal)) {
56952 if (C->getZExtValue() <= 31)
56953 weight = CW_Constant;
56954 }
56955 break;
56956 case 'J':
56957 if (auto *C = dyn_cast<ConstantInt>(CallOperandVal)) {
56958 if (C->getZExtValue() <= 63)
56959 weight = CW_Constant;
56960 }
56961 break;
56962 case 'K':
56963 if (auto *C = dyn_cast<ConstantInt>(CallOperandVal)) {
56964 if ((C->getSExtValue() >= -0x80) && (C->getSExtValue() <= 0x7f))
56965 weight = CW_Constant;
56966 }
56967 break;
56968 case 'L':
56969 if (auto *C = dyn_cast<ConstantInt>(CallOperandVal)) {
56970 if ((C->getZExtValue() == 0xff) || (C->getZExtValue() == 0xffff))
56971 weight = CW_Constant;
56972 }
56973 break;
56974 case 'M':
56975 if (auto *C = dyn_cast<ConstantInt>(CallOperandVal)) {
56976 if (C->getZExtValue() <= 3)
56977 weight = CW_Constant;
56978 }
56979 break;
56980 case 'N':
56981 if (auto *C = dyn_cast<ConstantInt>(CallOperandVal)) {
56982 if (C->getZExtValue() <= 0xff)
56983 weight = CW_Constant;
56984 }
56985 break;
56986 case 'G':
56987 case 'C':
56988 if (isa<ConstantFP>(CallOperandVal)) {
56989 weight = CW_Constant;
56990 }
56991 break;
56992 case 'e':
56993 if (auto *C = dyn_cast<ConstantInt>(CallOperandVal)) {
56994 if ((C->getSExtValue() >= -0x80000000LL) &&
56995 (C->getSExtValue() <= 0x7fffffffLL))
56996 weight = CW_Constant;
56997 }
56998 break;
56999 case 'Z':
57000 if (auto *C = dyn_cast<ConstantInt>(CallOperandVal)) {
57001 if (C->getZExtValue() <= 0xffffffff)
57002 weight = CW_Constant;
57003 }
57004 break;
57005 }
57006 return weight;
57007}
57008
57009/// Try to replace an X constraint, which matches anything, with another that
57010/// has more specific requirements based on the type of the corresponding
57011/// operand.
57012const char *X86TargetLowering::
57013LowerXConstraint(EVT ConstraintVT) const {
57014 // FP X constraints get lowered to SSE1/2 registers if available, otherwise
57015 // 'f' like normal targets.
57016 if (ConstraintVT.isFloatingPoint()) {
57017 if (Subtarget.hasSSE1())
57018 return "x";
57019 }
57020
57021 return TargetLowering::LowerXConstraint(ConstraintVT);
57022}
57023
57024// Lower @cc targets via setcc.
57025SDValue X86TargetLowering::LowerAsmOutputForConstraint(
57026 SDValue &Chain, SDValue &Flag, const SDLoc &DL,
57027 const AsmOperandInfo &OpInfo, SelectionDAG &DAG) const {
57028 X86::CondCode Cond = parseConstraintCode(OpInfo.ConstraintCode);
57029 if (Cond == X86::COND_INVALID)
57030 return SDValue();
57031 // Check that return type is valid.
57032 if (OpInfo.ConstraintVT.isVector() || !OpInfo.ConstraintVT.isInteger() ||
57033 OpInfo.ConstraintVT.getSizeInBits() < 8)
57034 report_fatal_error("Flag output operand is of invalid type");
57035
57036 // Get EFLAGS register. Only update chain when copyfrom is glued.
57037 if (Flag.getNode()) {
57038 Flag = DAG.getCopyFromReg(Chain, DL, X86::EFLAGS, MVT::i32, Flag);
57039 Chain = Flag.getValue(1);
57040 } else
57041 Flag = DAG.getCopyFromReg(Chain, DL, X86::EFLAGS, MVT::i32);
57042 // Extract CC code.
57043 SDValue CC = getSETCC(Cond, Flag, DL, DAG);
57044 // Extend to 32-bits
57045 SDValue Result = DAG.getNode(ISD::ZERO_EXTEND, DL, OpInfo.ConstraintVT, CC);
57046
57047 return Result;
57048}
57049
57050/// Lower the specified operand into the Ops vector.
57051/// If it is invalid, don't add anything to Ops.
57052void X86TargetLowering::LowerAsmOperandForConstraint(SDValue Op,
57053 std::string &Constraint,
57054 std::vector<SDValue>&Ops,
57055 SelectionDAG &DAG) const {
57056 SDValue Result;
57057
57058 // Only support length 1 constraints for now.
57059 if (Constraint.length() > 1) return;
57060
57061 char ConstraintLetter = Constraint[0];
57062 switch (ConstraintLetter) {
57063 default: break;
57064 case 'I':
57065 if (auto *C = dyn_cast<ConstantSDNode>(Op)) {
57066 if (C->getZExtValue() <= 31) {
57067 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
57068 Op.getValueType());
57069 break;
57070 }
57071 }
57072 return;
57073 case 'J':
57074 if (auto *C = dyn_cast<ConstantSDNode>(Op)) {
57075 if (C->getZExtValue() <= 63) {
57076 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
57077 Op.getValueType());
57078 break;
57079 }
57080 }
57081 return;
57082 case 'K':
57083 if (auto *C = dyn_cast<ConstantSDNode>(Op)) {
57084 if (isInt<8>(C->getSExtValue())) {
57085 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
57086 Op.getValueType());
57087 break;
57088 }
57089 }
57090 return;
57091 case 'L':
57092 if (auto *C = dyn_cast<ConstantSDNode>(Op)) {
57093 if (C->getZExtValue() == 0xff || C->getZExtValue() == 0xffff ||
57094 (Subtarget.is64Bit() && C->getZExtValue() == 0xffffffff)) {
57095 Result = DAG.getTargetConstant(C->getSExtValue(), SDLoc(Op),
57096 Op.getValueType());
57097 break;
57098 }
57099 }
57100 return;
57101 case 'M':
57102 if (auto *C = dyn_cast<ConstantSDNode>(Op)) {
57103 if (C->getZExtValue() <= 3) {
57104 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
57105 Op.getValueType());
57106 break;
57107 }
57108 }
57109 return;
57110 case 'N':
57111 if (auto *C = dyn_cast<ConstantSDNode>(Op)) {
57112 if (C->getZExtValue() <= 255) {
57113 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
57114 Op.getValueType());
57115 break;
57116 }
57117 }
57118 return;
57119 case 'O':
57120 if (auto *C = dyn_cast<ConstantSDNode>(Op)) {
57121 if (C->getZExtValue() <= 127) {
57122 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
57123 Op.getValueType());
57124 break;
57125 }
57126 }
57127 return;
57128 case 'e': {
57129 // 32-bit signed value
57130 if (auto *C = dyn_cast<ConstantSDNode>(Op)) {
57131 if (ConstantInt::isValueValidForType(Type::getInt32Ty(*DAG.getContext()),
57132 C->getSExtValue())) {
57133 // Widen to 64 bits here to get it sign extended.
57134 Result = DAG.getTargetConstant(C->getSExtValue(), SDLoc(Op), MVT::i64);
57135 break;
57136 }
57137 // FIXME gcc accepts some relocatable values here too, but only in certain
57138 // memory models; it's complicated.
57139 }
57140 return;
57141 }
57142 case 'Z': {
57143 // 32-bit unsigned value
57144 if (auto *C = dyn_cast<ConstantSDNode>(Op)) {
57145 if (ConstantInt::isValueValidForType(Type::getInt32Ty(*DAG.getContext()),
57146 C->getZExtValue())) {
57147 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
57148 Op.getValueType());
57149 break;
57150 }
57151 }
57152 // FIXME gcc accepts some relocatable values here too, but only in certain
57153 // memory models; it's complicated.
57154 return;
57155 }
57156 case 'i': {
57157 // Literal immediates are always ok.
57158 if (auto *CST = dyn_cast<ConstantSDNode>(Op)) {
57159 bool IsBool = CST->getConstantIntValue()->getBitWidth() == 1;
57160 BooleanContent BCont = getBooleanContents(MVT::i64);
57161 ISD::NodeType ExtOpc = IsBool ? getExtendForContent(BCont)
57162 : ISD::SIGN_EXTEND;
57163 int64_t ExtVal = ExtOpc == ISD::ZERO_EXTEND ? CST->getZExtValue()
57164 : CST->getSExtValue();
57165 Result = DAG.getTargetConstant(ExtVal, SDLoc(Op), MVT::i64);
57166 break;
57167 }
57168
57169    // In any sort of PIC mode, addresses need to be computed at runtime by
57170 // adding in a register or some sort of table lookup. These can't
57171 // be used as immediates. BlockAddresses and BasicBlocks are fine though.
57172 if ((Subtarget.isPICStyleGOT() || Subtarget.isPICStyleStubPIC()) &&
57173 !(isa<BlockAddressSDNode>(Op) || isa<BasicBlockSDNode>(Op)))
57174 return;
57175
57176 // If we are in non-pic codegen mode, we allow the address of a global (with
57177 // an optional displacement) to be used with 'i'.
57178 if (auto *GA = dyn_cast<GlobalAddressSDNode>(Op))
57179 // If we require an extra load to get this address, as in PIC mode, we
57180 // can't accept it.
57181 if (isGlobalStubReference(
57182 Subtarget.classifyGlobalReference(GA->getGlobal())))
57183 return;
57184 break;
57185 }
57186 }
57187
57188 if (Result.getNode()) {
57189 Ops.push_back(Result);
57190 return;
57191 }
57192 return TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
57193}
57194
57195/// Check if \p RC is a general purpose register class.
57196/// I.e., GR* or one of their variant.
57197static bool isGRClass(const TargetRegisterClass &RC) {
57198 return RC.hasSuperClassEq(&X86::GR8RegClass) ||
57199 RC.hasSuperClassEq(&X86::GR16RegClass) ||
57200 RC.hasSuperClassEq(&X86::GR32RegClass) ||
57201 RC.hasSuperClassEq(&X86::GR64RegClass) ||
57202 RC.hasSuperClassEq(&X86::LOW32_ADDR_ACCESS_RBPRegClass);
57203}
57204
57205/// Check if \p RC is a vector register class.
57206/// I.e., FR* / VR* or one of their variant.
57207static bool isFRClass(const TargetRegisterClass &RC) {
57208 return RC.hasSuperClassEq(&X86::FR16XRegClass) ||
57209 RC.hasSuperClassEq(&X86::FR32XRegClass) ||
57210 RC.hasSuperClassEq(&X86::FR64XRegClass) ||
57211 RC.hasSuperClassEq(&X86::VR128XRegClass) ||
57212 RC.hasSuperClassEq(&X86::VR256XRegClass) ||
57213 RC.hasSuperClassEq(&X86::VR512RegClass);
57214}
57215
57216/// Check if \p RC is a mask register class.
57217/// I.e., VK* or one of their variant.
57218static bool isVKClass(const TargetRegisterClass &RC) {
57219 return RC.hasSuperClassEq(&X86::VK1RegClass) ||
57220 RC.hasSuperClassEq(&X86::VK2RegClass) ||
57221 RC.hasSuperClassEq(&X86::VK4RegClass) ||
57222 RC.hasSuperClassEq(&X86::VK8RegClass) ||
57223 RC.hasSuperClassEq(&X86::VK16RegClass) ||
57224 RC.hasSuperClassEq(&X86::VK32RegClass) ||
57225 RC.hasSuperClassEq(&X86::VK64RegClass);
57226}
57227
57228std::pair<unsigned, const TargetRegisterClass *>
57229X86TargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
57230 StringRef Constraint,
57231 MVT VT) const {
57232 // First, see if this is a constraint that directly corresponds to an LLVM
57233 // register class.
57234 if (Constraint.size() == 1) {
57235 // GCC Constraint Letters
57236 switch (Constraint[0]) {
57237 default: break;
57238 // 'A' means [ER]AX + [ER]DX.
57239 case 'A':
57240 if (Subtarget.is64Bit())
57241 return std::make_pair(X86::RAX, &X86::GR64_ADRegClass);
57242      assert((Subtarget.is32Bit() || Subtarget.is16Bit()) &&
57243             "Expecting 64, 32 or 16 bit subtarget");
57244 return std::make_pair(X86::EAX, &X86::GR32_ADRegClass);
57245
57246 // TODO: Slight differences here in allocation order and leaving
57247 // RIP in the class. Do they matter any more here than they do
57248 // in the normal allocation?
57249 case 'k':
57250 if (Subtarget.hasAVX512()) {
57251 if (VT == MVT::i1)
57252 return std::make_pair(0U, &X86::VK1RegClass);
57253 if (VT == MVT::i8)
57254 return std::make_pair(0U, &X86::VK8RegClass);
57255 if (VT == MVT::i16)
57256 return std::make_pair(0U, &X86::VK16RegClass);
57257 }
57258 if (Subtarget.hasBWI()) {
57259 if (VT == MVT::i32)
57260 return std::make_pair(0U, &X86::VK32RegClass);
57261 if (VT == MVT::i64)
57262 return std::make_pair(0U, &X86::VK64RegClass);
57263 }
57264 break;
57265 case 'q': // GENERAL_REGS in 64-bit mode, Q_REGS in 32-bit mode.
57266 if (Subtarget.is64Bit()) {
57267 if (VT == MVT::i8 || VT == MVT::i1)
57268 return std::make_pair(0U, &X86::GR8RegClass);
57269 if (VT == MVT::i16)
57270 return std::make_pair(0U, &X86::GR16RegClass);
57271 if (VT == MVT::i32 || VT == MVT::f32)
57272 return std::make_pair(0U, &X86::GR32RegClass);
57273 if (VT != MVT::f80 && !VT.isVector())
57274 return std::make_pair(0U, &X86::GR64RegClass);
57275 break;
57276 }
57277 [[fallthrough]];
57278 // 32-bit fallthrough
57279 case 'Q': // Q_REGS
57280 if (VT == MVT::i8 || VT == MVT::i1)
57281 return std::make_pair(0U, &X86::GR8_ABCD_LRegClass);
57282 if (VT == MVT::i16)
57283 return std::make_pair(0U, &X86::GR16_ABCDRegClass);
57284 if (VT == MVT::i32 || VT == MVT::f32 ||
57285 (!VT.isVector() && !Subtarget.is64Bit()))
57286 return std::make_pair(0U, &X86::GR32_ABCDRegClass);
57287 if (VT != MVT::f80 && !VT.isVector())
57288 return std::make_pair(0U, &X86::GR64_ABCDRegClass);
57289 break;
57290 case 'r': // GENERAL_REGS
57291 case 'l': // INDEX_REGS
57292 if (VT == MVT::i8 || VT == MVT::i1)
57293 return std::make_pair(0U, &X86::GR8RegClass);
57294 if (VT == MVT::i16)
57295 return std::make_pair(0U, &X86::GR16RegClass);
57296 if (VT == MVT::i32 || VT == MVT::f32 ||
57297 (!VT.isVector() && !Subtarget.is64Bit()))
57298 return std::make_pair(0U, &X86::GR32RegClass);
57299 if (VT != MVT::f80 && !VT.isVector())
57300 return std::make_pair(0U, &X86::GR64RegClass);
57301 break;
57302 case 'R': // LEGACY_REGS
57303 if (VT == MVT::i8 || VT == MVT::i1)
57304 return std::make_pair(0U, &X86::GR8_NOREXRegClass);
57305 if (VT == MVT::i16)
57306 return std::make_pair(0U, &X86::GR16_NOREXRegClass);
57307 if (VT == MVT::i32 || VT == MVT::f32 ||
57308 (!VT.isVector() && !Subtarget.is64Bit()))
57309 return std::make_pair(0U, &X86::GR32_NOREXRegClass);
57310 if (VT != MVT::f80 && !VT.isVector())
57311 return std::make_pair(0U, &X86::GR64_NOREXRegClass);
57312 break;
57313 case 'f': // FP Stack registers.
57314 // If SSE is enabled for this VT, use f80 to ensure the isel moves the
57315 // value to the correct fpstack register class.
57316 if (VT == MVT::f32 && !isScalarFPTypeInSSEReg(VT))
57317 return std::make_pair(0U, &X86::RFP32RegClass);
57318 if (VT == MVT::f64 && !isScalarFPTypeInSSEReg(VT))
57319 return std::make_pair(0U, &X86::RFP64RegClass);
57320 if (VT == MVT::f32 || VT == MVT::f64 || VT == MVT::f80)
57321 return std::make_pair(0U, &X86::RFP80RegClass);
57322 break;
57323 case 'y': // MMX_REGS if MMX allowed.
57324 if (!Subtarget.hasMMX()) break;
57325 return std::make_pair(0U, &X86::VR64RegClass);
57326 case 'v':
57327 case 'x': // SSE_REGS if SSE1 allowed or AVX_REGS if AVX allowed
57328 if (!Subtarget.hasSSE1()) break;
57329 bool VConstraint = (Constraint[0] == 'v');
57330
57331 switch (VT.SimpleTy) {
57332 default: break;
57333 // Scalar SSE types.
57334 case MVT::f16:
57335 if (VConstraint && Subtarget.hasFP16())
57336 return std::make_pair(0U, &X86::FR16XRegClass);
57337 break;
57338 case MVT::f32:
57339 case MVT::i32:
57340 if (VConstraint && Subtarget.hasVLX())
57341 return std::make_pair(0U, &X86::FR32XRegClass);
57342 return std::make_pair(0U, &X86::FR32RegClass);
57343 case MVT::f64:
57344 case MVT::i64:
57345 if (VConstraint && Subtarget.hasVLX())
57346 return std::make_pair(0U, &X86::FR64XRegClass);
57347 return std::make_pair(0U, &X86::FR64RegClass);
57348 case MVT::i128:
57349 if (Subtarget.is64Bit()) {
57350 if (VConstraint && Subtarget.hasVLX())
57351 return std::make_pair(0U, &X86::VR128XRegClass);
57352 return std::make_pair(0U, &X86::VR128RegClass);
57353 }
57354 break;
57355 // Vector types and fp128.
57356 case MVT::v8f16:
57357 if (!Subtarget.hasFP16())
57358 break;
57359 [[fallthrough]];
57360 case MVT::f128:
57361 case MVT::v16i8:
57362 case MVT::v8i16:
57363 case MVT::v4i32:
57364 case MVT::v2i64:
57365 case MVT::v4f32:
57366 case MVT::v2f64:
57367 if (VConstraint && Subtarget.hasVLX())
57368 return std::make_pair(0U, &X86::VR128XRegClass);
57369 return std::make_pair(0U, &X86::VR128RegClass);
57370 // AVX types.
57371 case MVT::v16f16:
57372 if (!Subtarget.hasFP16())
57373 break;
57374 [[fallthrough]];
57375 case MVT::v32i8:
57376 case MVT::v16i16:
57377 case MVT::v8i32:
57378 case MVT::v4i64:
57379 case MVT::v8f32:
57380 case MVT::v4f64:
57381 if (VConstraint && Subtarget.hasVLX())
57382 return std::make_pair(0U, &X86::VR256XRegClass);
57383 if (Subtarget.hasAVX())
57384 return std::make_pair(0U, &X86::VR256RegClass);
57385 break;
57386 case MVT::v32f16:
57387 if (!Subtarget.hasFP16())
57388 break;
57389 [[fallthrough]];
57390 case MVT::v64i8:
57391 case MVT::v32i16:
57392 case MVT::v8f64:
57393 case MVT::v16f32:
57394 case MVT::v16i32:
57395 case MVT::v8i64:
57396 if (!Subtarget.hasAVX512()) break;
57397 if (VConstraint)
57398 return std::make_pair(0U, &X86::VR512RegClass);
57399 return std::make_pair(0U, &X86::VR512_0_15RegClass);
57400 }
57401 break;
57402 }
57403 } else if (Constraint.size() == 2 && Constraint[0] == 'Y') {
57404 switch (Constraint[1]) {
57405 default:
57406 break;
57407 case 'i':
57408 case 't':
57409 case '2':
57410 return getRegForInlineAsmConstraint(TRI, "x", VT);
57411 case 'm':
57412 if (!Subtarget.hasMMX()) break;
57413 return std::make_pair(0U, &X86::VR64RegClass);
57414 case 'z':
57415 if (!Subtarget.hasSSE1()) break;
57416 switch (VT.SimpleTy) {
57417 default: break;
57418 // Scalar SSE types.
57419 case MVT::f16:
57420 if (!Subtarget.hasFP16())
57421 break;
57422 return std::make_pair(X86::XMM0, &X86::FR16XRegClass);
57423 case MVT::f32:
57424 case MVT::i32:
57425 return std::make_pair(X86::XMM0, &X86::FR32RegClass);
57426 case MVT::f64:
57427 case MVT::i64:
57428 return std::make_pair(X86::XMM0, &X86::FR64RegClass);
57429 case MVT::v8f16:
57430 if (!Subtarget.hasFP16())
57431 break;
57432 [[fallthrough]];
57433 case MVT::f128:
57434 case MVT::v16i8:
57435 case MVT::v8i16:
57436 case MVT::v4i32:
57437 case MVT::v2i64:
57438 case MVT::v4f32:
57439 case MVT::v2f64:
57440 return std::make_pair(X86::XMM0, &X86::VR128RegClass);
57441 // AVX types.
57442 case MVT::v16f16:
57443 if (!Subtarget.hasFP16())
57444 break;
57445 [[fallthrough]];
57446 case MVT::v32i8:
57447 case MVT::v16i16:
57448 case MVT::v8i32:
57449 case MVT::v4i64:
57450 case MVT::v8f32:
57451 case MVT::v4f64:
57452 if (Subtarget.hasAVX())
57453 return std::make_pair(X86::YMM0, &X86::VR256RegClass);
57454 break;
57455 case MVT::v32f16:
57456 if (!Subtarget.hasFP16())
57457 break;
57458 [[fallthrough]];
57459 case MVT::v64i8:
57460 case MVT::v32i16:
57461 case MVT::v8f64:
57462 case MVT::v16f32:
57463 case MVT::v16i32:
57464 case MVT::v8i64:
57465 if (Subtarget.hasAVX512())
57466 return std::make_pair(X86::ZMM0, &X86::VR512_0_15RegClass);
57467 break;
57468 }
57469 break;
57470 case 'k':
57471      // This register class doesn't allocate k0 for masked vector operations.
57472 if (Subtarget.hasAVX512()) {
57473 if (VT == MVT::i1)
57474 return std::make_pair(0U, &X86::VK1WMRegClass);
57475 if (VT == MVT::i8)
57476 return std::make_pair(0U, &X86::VK8WMRegClass);
57477 if (VT == MVT::i16)
57478 return std::make_pair(0U, &X86::VK16WMRegClass);
57479 }
57480 if (Subtarget.hasBWI()) {
57481 if (VT == MVT::i32)
57482 return std::make_pair(0U, &X86::VK32WMRegClass);
57483 if (VT == MVT::i64)
57484 return std::make_pair(0U, &X86::VK64WMRegClass);
57485 }
57486 break;
57487 }
57488 }
57489
57490 if (parseConstraintCode(Constraint) != X86::COND_INVALID)
57491 return std::make_pair(0U, &X86::GR32RegClass);
57492
57493 // Use the default implementation in TargetLowering to convert the register
57494 // constraint into a member of a register class.
57495 std::pair<Register, const TargetRegisterClass*> Res;
57496 Res = TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
57497
57498 // Not found as a standard register?
57499 if (!Res.second) {
57500 // Only match x87 registers if the VT is one SelectionDAGBuilder can convert
57501 // to/from f80.
57502 if (VT == MVT::Other || VT == MVT::f32 || VT == MVT::f64 || VT == MVT::f80) {
57503      // Map st(0) .. st(7) to the corresponding FP0 .. FP7 registers.
57504 if (Constraint.size() == 7 && Constraint[0] == '{' &&
57505 tolower(Constraint[1]) == 's' && tolower(Constraint[2]) == 't' &&
57506 Constraint[3] == '(' &&
57507 (Constraint[4] >= '0' && Constraint[4] <= '7') &&
57508 Constraint[5] == ')' && Constraint[6] == '}') {
57509 // st(7) is not allocatable and thus not a member of RFP80. Return
57510 // singleton class in cases where we have a reference to it.
57511 if (Constraint[4] == '7')
57512 return std::make_pair(X86::FP7, &X86::RFP80_7RegClass);
57513 return std::make_pair(X86::FP0 + Constraint[4] - '0',
57514 &X86::RFP80RegClass);
57515 }
57516
57517 // GCC allows "st(0)" to be called just plain "st".
57518 if (StringRef("{st}").equals_insensitive(Constraint))
57519 return std::make_pair(X86::FP0, &X86::RFP80RegClass);
57520 }
57521
57522 // flags -> EFLAGS
57523 if (StringRef("{flags}").equals_insensitive(Constraint))
57524 return std::make_pair(X86::EFLAGS, &X86::CCRRegClass);
57525
57526 // dirflag -> DF
57527 // Only allow for clobber.
57528 if (StringRef("{dirflag}").equals_insensitive(Constraint) &&
57529 VT == MVT::Other)
57530 return std::make_pair(X86::DF, &X86::DFCCRRegClass);
57531
57532 // fpsr -> FPSW
57533 if (StringRef("{fpsr}").equals_insensitive(Constraint))
57534 return std::make_pair(X86::FPSW, &X86::FPCCRRegClass);
57535
57536 return Res;
57537 }
57538
57539 // Make sure it isn't a register that requires 64-bit mode.
57540 if (!Subtarget.is64Bit() &&
57541 (isFRClass(*Res.second) || isGRClass(*Res.second)) &&
57542 TRI->getEncodingValue(Res.first) >= 8) {
57543 // Register requires REX prefix, but we're in 32-bit mode.
57544 return std::make_pair(0, nullptr);
57545 }
57546
57547 // Make sure it isn't a register that requires AVX512.
57548 if (!Subtarget.hasAVX512() && isFRClass(*Res.second) &&
57549 TRI->getEncodingValue(Res.first) & 0x10) {
57550 // Register requires EVEX prefix.
57551 return std::make_pair(0, nullptr);
57552 }
57553
57554 // Otherwise, check to see if this is a register class of the wrong value
57555 // type. For example, we want to map "{ax},i32" -> {eax}, we don't want it to
57556 // turn into {ax},{dx}.
57557 // MVT::Other is used to specify clobber names.
57558 if (TRI->isTypeLegalForClass(*Res.second, VT) || VT == MVT::Other)
57559 return Res; // Correct type already, nothing to do.
57560
57561  // Get a matching integer of the correct size, i.e. "ax" with MVT::i32 should
57562  // return "eax". This should even work for things like getting 64-bit integer
57563  // registers when given an f64 type.
57564 const TargetRegisterClass *Class = Res.second;
57565 // The generic code will match the first register class that contains the
57566 // given register. Thus, based on the ordering of the tablegened file,
57567 // the "plain" GR classes might not come first.
57568 // Therefore, use a helper method.
57569 if (isGRClass(*Class)) {
57570 unsigned Size = VT.getSizeInBits();
57571 if (Size == 1) Size = 8;
57572 Register DestReg = getX86SubSuperRegisterOrZero(Res.first, Size);
57573 if (DestReg > 0) {
57574 bool is64Bit = Subtarget.is64Bit();
57575 const TargetRegisterClass *RC =
57576 Size == 8 ? (is64Bit ? &X86::GR8RegClass : &X86::GR8_NOREXRegClass)
57577 : Size == 16 ? (is64Bit ? &X86::GR16RegClass : &X86::GR16_NOREXRegClass)
57578 : Size == 32 ? (is64Bit ? &X86::GR32RegClass : &X86::GR32_NOREXRegClass)
57579 : Size == 64 ? (is64Bit ? &X86::GR64RegClass : nullptr)
57580 : nullptr;
57581 if (Size == 64 && !is64Bit) {
57582 // Model GCC's behavior here and select a fixed pair of 32-bit
57583 // registers.
57584 switch (DestReg) {
57585 case X86::RAX:
57586 return std::make_pair(X86::EAX, &X86::GR32_ADRegClass);
57587 case X86::RDX:
57588 return std::make_pair(X86::EDX, &X86::GR32_DCRegClass);
57589 case X86::RCX:
57590 return std::make_pair(X86::ECX, &X86::GR32_CBRegClass);
57591 case X86::RBX:
57592 return std::make_pair(X86::EBX, &X86::GR32_BSIRegClass);
57593 case X86::RSI:
57594 return std::make_pair(X86::ESI, &X86::GR32_SIDIRegClass);
57595 case X86::RDI:
57596 return std::make_pair(X86::EDI, &X86::GR32_DIBPRegClass);
57597 case X86::RBP:
57598 return std::make_pair(X86::EBP, &X86::GR32_BPSPRegClass);
57599 default:
57600 return std::make_pair(0, nullptr);
57601 }
57602 }
57603 if (RC && RC->contains(DestReg))
57604 return std::make_pair(DestReg, RC);
57605 return Res;
57606 }
57607 // No register found/type mismatch.
57608 return std::make_pair(0, nullptr);
57609 } else if (isFRClass(*Class)) {
57610 // Handle references to XMM physical registers that got mapped into the
57611 // wrong class. This can happen with constraints like {xmm0} where the
57612 // target independent register mapper will just pick the first match it can
57613 // find, ignoring the required type.
57614
57615 // TODO: Handle f128 and i128 in FR128RegClass after it is tested well.
57616 if (VT == MVT::f16)
57617 Res.second = &X86::FR16XRegClass;
57618 else if (VT == MVT::f32 || VT == MVT::i32)
57619 Res.second = &X86::FR32XRegClass;
57620 else if (VT == MVT::f64 || VT == MVT::i64)
57621 Res.second = &X86::FR64XRegClass;
57622 else if (TRI->isTypeLegalForClass(X86::VR128XRegClass, VT))
57623 Res.second = &X86::VR128XRegClass;
57624 else if (TRI->isTypeLegalForClass(X86::VR256XRegClass, VT))
57625 Res.second = &X86::VR256XRegClass;
57626 else if (TRI->isTypeLegalForClass(X86::VR512RegClass, VT))
57627 Res.second = &X86::VR512RegClass;
57628 else {
57629 // Type mismatch and not a clobber: Return an error;
57630 Res.first = 0;
57631 Res.second = nullptr;
57632 }
57633 } else if (isVKClass(*Class)) {
57634 if (VT == MVT::i1)
57635 Res.second = &X86::VK1RegClass;
57636 else if (VT == MVT::i8)
57637 Res.second = &X86::VK8RegClass;
57638 else if (VT == MVT::i16)
57639 Res.second = &X86::VK16RegClass;
57640 else if (VT == MVT::i32)
57641 Res.second = &X86::VK32RegClass;
57642 else if (VT == MVT::i64)
57643 Res.second = &X86::VK64RegClass;
57644 else {
57645 // Type mismatch and not a clobber: Return an error;
57646 Res.first = 0;
57647 Res.second = nullptr;
57648 }
57649 }
57650
57651 return Res;
57652}
57653
57654bool X86TargetLowering::isIntDivCheap(EVT VT, AttributeList Attr) const {
57655 // Integer division on x86 is expensive. However, when aggressively optimizing
57656 // for code size, we prefer to use a div instruction, as it is usually smaller
57657 // than the alternative sequence.
57658 // The exception to this is vector division. Since x86 doesn't have vector
57659 // integer division, leaving the division as-is is a loss even in terms of
57660 // size, because it will have to be scalarized, while the alternative code
57661 // sequence can be performed in vector form.
57662 bool OptSize = Attr.hasFnAttr(Attribute::MinSize);
57663 return OptSize && !VT.isVector();
57664}
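// --- Hedged illustration, not part of the file above ------------------------
// Source-level sketch of the policy above: when a function carries the minsize
// attribute (e.g. built at -Oz), scalar division stays a real div instruction
// because it is smaller, while a normally optimized function typically gets a
// multiply-by-magic-constant expansion instead. Both helper names are
// hypothetical.
__attribute__((minsize)) static unsigned div10Small(unsigned X) {
  return X / 10; // expected to keep the (small but slow) div instruction
}
static unsigned div10Fast(unsigned X) {
  return X / 10; // expected to be expanded into a multiply + shift
}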
57665
57666void X86TargetLowering::initializeSplitCSR(MachineBasicBlock *Entry) const {
57667 if (!Subtarget.is64Bit())
57668 return;
57669
57670 // Update IsSplitCSR in X86MachineFunctionInfo.
57671 X86MachineFunctionInfo *AFI =
57672 Entry->getParent()->getInfo<X86MachineFunctionInfo>();
57673 AFI->setIsSplitCSR(true);
57674}
57675
57676void X86TargetLowering::insertCopiesSplitCSR(
57677 MachineBasicBlock *Entry,
57678 const SmallVectorImpl<MachineBasicBlock *> &Exits) const {
57679 const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
57680 const MCPhysReg *IStart = TRI->getCalleeSavedRegsViaCopy(Entry->getParent());
57681 if (!IStart)
57682 return;
57683
57684 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
57685 MachineRegisterInfo *MRI = &Entry->getParent()->getRegInfo();
57686 MachineBasicBlock::iterator MBBI = Entry->begin();
57687 for (const MCPhysReg *I = IStart; *I; ++I) {
57688 const TargetRegisterClass *RC = nullptr;
57689 if (X86::GR64RegClass.contains(*I))
57690 RC = &X86::GR64RegClass;
57691 else
57692 llvm_unreachable("Unexpected register class in CSRsViaCopy!");
57693
57694 Register NewVR = MRI->createVirtualRegister(RC);
57695 // Create copy from CSR to a virtual register.
57696 // FIXME: this currently does not emit CFI pseudo-instructions, it works
57697 // fine for CXX_FAST_TLS since the C++-style TLS access functions should be
57698 // nounwind. If we want to generalize this later, we may need to emit
57699 // CFI pseudo-instructions.
57700 assert(
57701 Entry->getParent()->getFunction().hasFnAttribute(Attribute::NoUnwind) &&
57702 "Function should be nounwind in insertCopiesSplitCSR!");
57703 Entry->addLiveIn(*I);
57704 BuildMI(*Entry, MBBI, DebugLoc(), TII->get(TargetOpcode::COPY), NewVR)
57705 .addReg(*I);
57706
57707 // Insert the copy-back instructions right before the terminator.
57708 for (auto *Exit : Exits)
57709 BuildMI(*Exit, Exit->getFirstTerminator(), DebugLoc(),
57710 TII->get(TargetOpcode::COPY), *I)
57711 .addReg(NewVR);
57712 }
57713}
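// --- Hedged note, not part of the file above ---------------------------------
// The split-CSR hooks above are exercised by functions using the CXX_FAST_TLS
// calling convention, e.g. the access wrapper clang emits for a dynamically
// initialized thread_local on Darwin-style targets. A hypothetical trigger:
//
//   #include <string>
//   static std::string makeDefaultName() { return "worker"; } // hypothetical
//   thread_local std::string TLSName = makeDefaultName();
//
// For such a wrapper, getCalleeSavedRegsViaCopy() returns a non-null list and
// the entry/exit COPYs above carry the callee-saved GR64 registers through
// virtual registers instead of ordinary prologue/epilogue spills.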
57714
57715bool X86TargetLowering::supportSwiftError() const {
57716 return Subtarget.is64Bit();
57717}
57718
57719/// Returns true if stack probing through a function call is requested.
57720bool X86TargetLowering::hasStackProbeSymbol(const MachineFunction &MF) const {
57721 return !getStackProbeSymbolName(MF).empty();
57722}
57723
57724/// Returns true if stack probing through inline assembly is requested.
57725bool X86TargetLowering::hasInlineStackProbe(const MachineFunction &MF) const {
57726
57727 // No inline stack probes for Windows; it has its own mechanism.
57728 if (Subtarget.isOSWindows() ||
57729 MF.getFunction().hasFnAttribute("no-stack-arg-probe"))
57730 return false;
57731
57732 // If the function specifically requests inline stack probes, emit them.
57733 if (MF.getFunction().hasFnAttribute("probe-stack"))
57734 return MF.getFunction().getFnAttribute("probe-stack").getValueAsString() ==
57735 "inline-asm";
57736
57737 return false;
57738}
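// --- Hedged example, not part of the file above ------------------------------
// Minimal sketch of how a front end or pass would request the behavior the two
// hooks above test for, using the same IR attribute strings they read. The
// helper name is hypothetical; the attribute names come from the code above.
static void requestInlineStackProbes(Function &F) {
  // On non-Windows targets this makes hasInlineStackProbe() return true, which
  // in turn makes getStackProbeSymbolName() return the empty string.
  F.addFnAttr("probe-stack", "inline-asm");
}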
57739
57740/// Returns the name of the symbol used to emit stack probes or the empty
57741/// string if not applicable.
57742StringRef
57743X86TargetLowering::getStackProbeSymbolName(const MachineFunction &MF) const {
57744 // Inline stack probes disable the stack probe call.
57745 if (hasInlineStackProbe(MF))
57746 return "";
57747
57748 // If the function specifically requests stack probes, emit them.
57749 if (MF.getFunction().hasFnAttribute("probe-stack"))
57750 return MF.getFunction().getFnAttribute("probe-stack").getValueAsString();
57751
57752 // Generally, if we aren't on Windows, the platform ABI does not include
57753 // support for stack probes, so don't emit them.
57754 if (!Subtarget.isOSWindows() || Subtarget.isTargetMachO() ||
57755 MF.getFunction().hasFnAttribute("no-stack-arg-probe"))
57756 return "";
57757
57758 // We need a stack probe to conform to the Windows ABI. Choose the right
57759 // symbol.
57760 if (Subtarget.is64Bit())
57761 return Subtarget.isTargetCygMing() ? "___chkstk_ms" : "__chkstk";
57762 return Subtarget.isTargetCygMing() ? "_alloca" : "_chkstk";
57763}
57764
57765unsigned
57766X86TargetLowering::getStackProbeSize(const MachineFunction &MF) const {
57767 // The default stack probe size is 4096 bytes if the function has no
57768 // "stack-probe-size" attribute.
57769 return MF.getFunction().getFnAttributeAsParsedInteger("stack-probe-size",
57770 4096);
57771}
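// --- Hedged arithmetic sketch, not part of the file above --------------------
// Rough illustration only (not the backend's actual emission logic): with the
// default 4096-byte interval returned above, a frame allocating FrameSize bytes
// needs roughly one probe per started interval so each guard page gets touched
// in order. The helper name is hypothetical.
static uint64_t approxProbeCount(uint64_t FrameSize, uint64_t ProbeSize = 4096) {
  return ProbeSize ? (FrameSize + ProbeSize - 1) / ProbeSize : 0; // ceil division
}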
57772
57773Align X86TargetLowering::getPrefLoopAlignment(MachineLoop *ML) const {
57774 if (ML->isInnermost() &&
57775 ExperimentalPrefInnermostLoopAlignment.getNumOccurrences())
57776 return Align(1ULL << ExperimentalPrefInnermostLoopAlignment);
57777 return TargetLowering::getPrefLoopAlignment();
57778}